All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC 1/3] Unified NMI delayed call mechanism
@ 2010-06-12  9:28 Huang Ying
  2010-06-12  9:28 ` [RFC 2/3] Use unified NMI delayed call mechanism in MCE handler Huang Ying
                   ` (3 more replies)
  0 siblings, 4 replies; 31+ messages in thread
From: Huang Ying @ 2010-06-12  9:28 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin; +Cc: linux-kernel, Andi Kleen, Huang Ying

NMI can be triggered even when IRQ is masked. So it is not safe for
NMI handler to call some functions. One solution is to delay the call
via self interrupt, so that the delayed call can be done once the
interrupt is enabled again. This has been implemented in MCE and perf
event. This patch provides a unified version and make it easier for
other NMI semantic handler to take use of the delayed call.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/x86/include/asm/entry_arch.h  |    1 
 arch/x86/include/asm/hw_irq.h      |    1 
 arch/x86/include/asm/irq_vectors.h |    5 +
 arch/x86/include/asm/nmi.h         |    7 ++
 arch/x86/kernel/entry_64.S         |    3 +
 arch/x86/kernel/irqinit.c          |    3 +
 arch/x86/kernel/traps.c            |  104 +++++++++++++++++++++++++++++++++++++
 7 files changed, 124 insertions(+)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -65,4 +65,5 @@ BUILD_INTERRUPT(threshold_interrupt,THRE
 BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
 #endif
 
+BUILD_INTERRUPT(nmi_delayed_call_interrupt,NMI_DELAYED_CALL_VECTOR)
 #endif
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -35,6 +35,7 @@ extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
 extern void mce_self_interrupt(void);
+extern void nmi_delayed_call_interrupt(void);
 
 extern void invalidate_interrupt(void);
 extern void invalidate_interrupt0(void);
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,11 @@
  */
 #define MCE_SELF_VECTOR			0xeb
 
+/*
+ * Self IPI vector for NMI delayed call
+ */
+#define NMI_DELAYED_CALL_VECTOR		0xe9
+
 #define NR_VECTORS			 256
 
 #define FPU_IRQ				  13
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -75,4 +75,11 @@ void enable_lapic_nmi_watchdog(void);
 void stop_nmi(void);
 void restart_nmi(void);
 
+#define NMI_DELAYED_CALL_ID_INVALID	-1
+
+typedef void (*nmi_delayed_call_func_t)(void);
+int nmi_delayed_call_register(nmi_delayed_call_func_t func);
+void nmi_delayed_call_unregister(int id);
+void nmi_delayed_call_schedule(int id);
+
 #endif /* _ASM_X86_NMI_H */
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1009,6 +1009,9 @@ apicinterrupt MCE_SELF_VECTOR \
 	mce_self_interrupt smp_mce_self_interrupt
 #endif
 
+apicinterrupt NMI_DELAYED_CALL_VECTOR \
+	nmi_delayed_call_interrupt smp_nmi_delayed_call_interrupt
+
 #ifdef CONFIG_SMP
 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
 	call_function_single_interrupt smp_call_function_single_interrupt
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -212,6 +212,9 @@ static void __init apic_intr_init(void)
 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
 	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
 #endif
+#if defined(CONFIG_X86_LOCAL_APIC)
+	alloc_intr_gate(NMI_DELAYED_CALL_VECTOR, nmi_delayed_call_interrupt);
+#endif
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -888,3 +888,107 @@ void __init trap_init(void)
 
 	x86_init.irqs.trap_init();
 }
+
+#define NMI_DELAYED_CALL_ID_MAX		32
+#define NMI_DELAYED_CALL_RESTART_MAX	5
+
+static nmi_delayed_call_func_t nmi_delayed_call_funcs[NMI_DELAYED_CALL_ID_MAX];
+static DEFINE_SPINLOCK(nmi_delayed_call_lock);
+
+static DEFINE_PER_CPU(unsigned long, nmi_delayed_call_pending);
+
+static void nmi_delayed_call_run(void)
+{
+	int cpu, restart = NMI_DELAYED_CALL_RESTART_MAX;
+	unsigned long pending, *ppending;
+	nmi_delayed_call_func_t *pfunc, func;
+
+	cpu = smp_processor_id();
+	ppending = per_cpu_ptr(&nmi_delayed_call_pending, cpu);
+	while (*ppending && restart--) {
+		pending = xchg(ppending, 0);
+		pfunc = nmi_delayed_call_funcs;
+		do {
+			if (pending & 1) {
+				func = *pfunc;
+				if (func)
+					func();
+			}
+			pfunc++;
+			pending >>= 1;
+		} while (pending);
+	}
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+asmlinkage void smp_nmi_delayed_call_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	irq_enter();
+	nmi_delayed_call_run();
+	irq_exit();
+}
+#endif
+
+int nmi_delayed_call_register(nmi_delayed_call_func_t func)
+{
+	unsigned long flags;
+	int i, id = NMI_DELAYED_CALL_ID_INVALID;
+
+	spin_lock_irqsave(&nmi_delayed_call_lock, flags);
+	for (i = 0; i < NMI_DELAYED_CALL_ID_MAX; i++) {
+		if (!nmi_delayed_call_funcs[i]) {
+			nmi_delayed_call_funcs[i] = func;
+			id = i;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&nmi_delayed_call_lock, flags);
+	return id;
+}
+EXPORT_SYMBOL_GPL(nmi_delayed_call_register);
+
+/* Corresponding NMI handler should complete before invoking this
+ * function */
+void nmi_delayed_call_unregister(int id)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&nmi_delayed_call_lock, flags);
+	nmi_delayed_call_funcs[id] = NULL;
+	spin_unlock_irqrestore(&nmi_delayed_call_lock, flags);
+}
+EXPORT_SYMBOL_GPL(nmi_delayed_call_unregister);
+
+void nmi_delayed_call_schedule(int id)
+{
+	int cpu;
+
+	if (id == NMI_DELAYED_CALL_ID_INVALID)
+		return;
+	BUG_ON(id < 0 || id >= NMI_DELAYED_CALL_ID_MAX);
+
+	cpu = smp_processor_id();
+	set_bit(id, per_cpu_ptr(&nmi_delayed_call_pending, cpu));
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/* Without APIC do not schedule */
+	if (!cpu_has_apic)
+		return;
+
+	/*
+	 * In nmi we cannot use kernel services safely. Trigger an
+	 * self interrupt through the APIC to instead do the
+	 * notification after interrupts are reenabled again.
+	 */
+	apic->send_IPI_self(NMI_DELAYED_CALL_VECTOR);
+
+	/*
+	 * Wait for idle afterwards again so that we don't leave the
+	 * APIC in a non idle state because the normal APIC writes
+	 * cannot exclude us.
+	 */
+	apic_wait_icr_idle();
+#endif
+}
+EXPORT_SYMBOL_GPL(nmi_delayed_call_schedule);

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [RFC 2/3] Use unified NMI delayed call mechanism in MCE handler
  2010-06-12  9:28 [RFC 1/3] Unified NMI delayed call mechanism Huang Ying
@ 2010-06-12  9:28 ` Huang Ying
  2010-06-12  9:28 ` [RFC 3/3] Use unified NMI delayed call mechanism in perf event NMI handler Huang Ying
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 31+ messages in thread
From: Huang Ying @ 2010-06-12  9:28 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin; +Cc: linux-kernel, Andi Kleen, Huang Ying

The original self interrupt mechanism in MCE handler is replaced by
the unified delayed call mechanism.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/x86/include/asm/entry_arch.h  |    4 --
 arch/x86/include/asm/irq_vectors.h |    5 ---
 arch/x86/kernel/cpu/mcheck/mce.c   |   53 ++++++-------------------------------
 arch/x86/kernel/entry_64.S         |    5 ---
 arch/x86/kernel/irqinit.c          |    3 --
 5 files changed, 10 insertions(+), 60 deletions(-)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -61,9 +61,5 @@ BUILD_INTERRUPT(thermal_interrupt,THERMA
 BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
 #endif
 
-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
 BUILD_INTERRUPT(nmi_delayed_call_interrupt,NMI_DELAYED_CALL_VECTOR)
 #endif
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -121,11 +121,6 @@
 #define UV_BAU_MESSAGE			0xea
 
 /*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR			0xeb
-
-/*
  * Self IPI vector for NMI delayed call
  */
 #define NMI_DELAYED_CALL_VECTOR		0xe9
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -480,60 +480,22 @@ static inline void mce_get_rip(struct mc
 		m->ip = mce_rdmsrl(rip_msr);
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static int mce_delayed_call_id = NMI_DELAYED_CALL_ID_INVALID;
+
+static void __mce_report_event(void)
 {
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
 	mce_notify_irq();
 	mce_schedule_work();
-	irq_exit();
 }
-#endif
 
 static void mce_report_event(struct pt_regs *regs)
 {
 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
-		mce_notify_irq();
-		/*
-		 * Triggering the work queue here is just an insurance
-		 * policy in case the syscall exit notify handler
-		 * doesn't run soon enough or ends up running on the
-		 * wrong CPU (can happen when audit sleeps)
-		 */
-		mce_schedule_work();
+		__mce_report_event();
 		return;
 	}
 
-#ifdef CONFIG_X86_LOCAL_APIC
-	/*
-	 * Without APIC do not notify. The event will be picked
-	 * up eventually.
-	 */
-	if (!cpu_has_apic)
-		return;
-
-	/*
-	 * When interrupts are disabled we cannot use
-	 * kernel services safely. Trigger an self interrupt
-	 * through the APIC to instead do the notification
-	 * after interrupts are reenabled again.
-	 */
-	apic->send_IPI_self(MCE_SELF_VECTOR);
-
-	/*
-	 * Wait for idle afterwards again so that we don't leave the
-	 * APIC in a non idle state because the normal APIC writes
-	 * cannot exclude us.
-	 */
-	apic_wait_icr_idle();
-#endif
+	nmi_delayed_call_schedule(mce_delayed_call_id);
 }
 
 DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -1731,6 +1693,11 @@ __setup("mce", mcheck_enable);
 
 int __init mcheck_init(void)
 {
+	mce_delayed_call_id = nmi_delayed_call_register(__mce_report_event);
+	if (mce_delayed_call_id == NMI_DELAYED_CALL_ID_INVALID)
+		pr_err(
+		"Failed to register mce delayed event reporting function!\n");
+
 	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
 
 	mcheck_intel_therm_init();
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1004,11 +1004,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
 
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
-	mce_self_interrupt smp_mce_self_interrupt
-#endif
-
 apicinterrupt NMI_DELAYED_CALL_VECTOR \
 	nmi_delayed_call_interrupt smp_nmi_delayed_call_interrupt
 
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -209,9 +209,6 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_X86_MCE_THRESHOLD
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
-	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
 #if defined(CONFIG_X86_LOCAL_APIC)
 	alloc_intr_gate(NMI_DELAYED_CALL_VECTOR, nmi_delayed_call_interrupt);
 #endif

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [RFC 3/3] Use unified NMI delayed call mechanism in perf event NMI handler
  2010-06-12  9:28 [RFC 1/3] Unified NMI delayed call mechanism Huang Ying
  2010-06-12  9:28 ` [RFC 2/3] Use unified NMI delayed call mechanism in MCE handler Huang Ying
@ 2010-06-12  9:28 ` Huang Ying
  2010-06-12 10:25 ` [RFC 1/3] Unified NMI delayed call mechanism Ingo Molnar
  2010-06-18 11:55 ` Peter Zijlstra
  3 siblings, 0 replies; 31+ messages in thread
From: Huang Ying @ 2010-06-12  9:28 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin; +Cc: linux-kernel, Andi Kleen, Huang Ying

The original self interrupt mechanism in perf event NMI handler is
replaced by the unified delayed call mechanism.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/x86/include/asm/entry_arch.h  |    4 ----
 arch/x86/include/asm/irq_vectors.h |    5 -----
 arch/x86/kernel/cpu/perf_event.c   |   13 ++++++++-----
 arch/x86/kernel/entry_64.S         |    5 -----
 arch/x86/kernel/irqinit.c          |    6 ------
 5 files changed, 8 insertions(+), 25 deletions(-)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,10 +49,6 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOC
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
-#endif
-
 #ifdef CONFIG_X86_THERMAL_VECTOR
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -113,11 +113,6 @@
  */
 #define X86_PLATFORM_IPI_VECTOR		0xed
 
-/*
- * Performance monitoring pending work vector:
- */
-#define LOCAL_PENDING_VECTOR		0xec
-
 #define UV_BAU_MESSAGE			0xea
 
 /*
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1160,13 +1160,12 @@ static int x86_pmu_handle_irq(struct pt_
 	return handled;
 }
 
-void smp_perf_pending_interrupt(struct pt_regs *regs)
+static int perf_delayed_call_id = NMI_DELAYED_CALL_ID_INVALID;
+
+void perf_do_pending(void)
 {
-	irq_enter();
-	ack_APIC_irq();
 	inc_irq_stat(apic_pending_irqs);
 	perf_event_do_pending();
-	irq_exit();
 }
 
 void set_perf_event_pending(void)
@@ -1175,7 +1174,7 @@ void set_perf_event_pending(void)
 	if (!x86_pmu.apic || !x86_pmu_initialized())
 		return;
 
-	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+	nmi_delayed_call_schedule(perf_delayed_call_id);
 #endif
 }
 
@@ -1363,6 +1362,10 @@ void __init init_hw_perf_events(void)
 		}
 	}
 
+	perf_delayed_call_id = nmi_delayed_call_register(perf_do_pending);
+	if (perf_delayed_call_id == NMI_DELAYED_CALL_ID_INVALID)
+		pr_err("Failed to register NMI delayed call for perf.\n");
+
 	pr_info("... version:                %d\n",     x86_pmu.version);
 	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
 	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1021,11 +1021,6 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
-	perf_pending_interrupt smp_perf_pending_interrupt
-#endif
-
 /*
  * Exception entry points.
  */
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -223,12 +223,6 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
-	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
-# endif
-
 #endif
 }
 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-12  9:28 [RFC 1/3] Unified NMI delayed call mechanism Huang Ying
  2010-06-12  9:28 ` [RFC 2/3] Use unified NMI delayed call mechanism in MCE handler Huang Ying
  2010-06-12  9:28 ` [RFC 3/3] Use unified NMI delayed call mechanism in perf event NMI handler Huang Ying
@ 2010-06-12 10:25 ` Ingo Molnar
  2010-06-13  1:54   ` Huang Ying
  2010-06-14  3:45   ` Hidetoshi Seto
  2010-06-18 11:55 ` Peter Zijlstra
  3 siblings, 2 replies; 31+ messages in thread
From: Ingo Molnar @ 2010-06-12 10:25 UTC (permalink / raw)
  To: Huang Ying, Fr??d??ric Weisbecker, Don Zickus, Peter Zijlstra
  Cc: H. Peter Anvin, linux-kernel, Andi Kleen


* Huang Ying <ying.huang@intel.com> wrote:

> NMI can be triggered even when IRQ is masked. So it is not safe for NMI 
> handler to call some functions. One solution is to delay the call via self 
> interrupt, so that the delayed call can be done once the interrupt is 
> enabled again. This has been implemented in MCE and perf event. This patch 
> provides a unified version and make it easier for other NMI semantic handler 
> to take use of the delayed call.
> 
> Signed-off-by: Huang Ying <ying.huang@intel.com>
> ---
>  arch/x86/include/asm/entry_arch.h  |    1 
>  arch/x86/include/asm/hw_irq.h      |    1 
>  arch/x86/include/asm/irq_vectors.h |    5 +
>  arch/x86/include/asm/nmi.h         |    7 ++
>  arch/x86/kernel/entry_64.S         |    3 +
>  arch/x86/kernel/irqinit.c          |    3 +
>  arch/x86/kernel/traps.c            |  104 +++++++++++++++++++++++++++++++++++++
>  7 files changed, 124 insertions(+)

Instead of introducing this extra intermediate facility please use the same 
approach the unified NMI watchdog is using (see latest -tip): a perf event 
callback gives all the extra functionality needed.

The MCE code needs to be updated to use that - and then it will be integrated 
into the events framework.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-12 10:25 ` [RFC 1/3] Unified NMI delayed call mechanism Ingo Molnar
@ 2010-06-13  1:54   ` Huang Ying
  2010-06-14  3:45   ` Hidetoshi Seto
  1 sibling, 0 replies; 31+ messages in thread
From: Huang Ying @ 2010-06-13  1:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Fr??d??ric Weisbecker, Don Zickus, Peter Zijlstra,
	H. Peter Anvin, linux-kernel, Andi Kleen

On Sat, 2010-06-12 at 18:25 +0800, Ingo Molnar wrote:
> * Huang Ying <ying.huang@intel.com> wrote:
> 
> > NMI can be triggered even when IRQ is masked. So it is not safe for NMI 
> > handler to call some functions. One solution is to delay the call via self 
> > interrupt, so that the delayed call can be done once the interrupt is 
> > enabled again. This has been implemented in MCE and perf event. This patch 
> > provides a unified version and make it easier for other NMI semantic handler 
> > to take use of the delayed call.
> > 
> > Signed-off-by: Huang Ying <ying.huang@intel.com>
> > ---
> >  arch/x86/include/asm/entry_arch.h  |    1 
> >  arch/x86/include/asm/hw_irq.h      |    1 
> >  arch/x86/include/asm/irq_vectors.h |    5 +
> >  arch/x86/include/asm/nmi.h         |    7 ++
> >  arch/x86/kernel/entry_64.S         |    3 +
> >  arch/x86/kernel/irqinit.c          |    3 +
> >  arch/x86/kernel/traps.c            |  104 +++++++++++++++++++++++++++++++++++++
> >  7 files changed, 124 insertions(+)
> 
> Instead of introducing this extra intermediate facility please use the same 
> approach the unified NMI watchdog is using (see latest -tip): a perf event 
> callback gives all the extra functionality needed.

Sorry, if my understanding is correct, the perf event overflow callback
should be run in NMI context instead of a delayed context (such as IRQ,
soft_irq, process context). That is, the backtrace of
watchdog_overflow_callback should be something as follow:

x86_pmu_handle_irq
  perf_event_overflow
    __perf_event_overflow
      watchdog_overflow_callback

I do not find the delayed mechanism here.

> The MCE code needs to be updated to use that - and then it will be integrated
> into the events framework.

MCE is NMI-like, and there are other NMI users too. I think some of them
will need some kind of delayed call mechanism. In fact, perf itself uses
self-made NMI delayed call mechanism too, I just want to generalize it
for other users too.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-12 10:25 ` [RFC 1/3] Unified NMI delayed call mechanism Ingo Molnar
  2010-06-13  1:54   ` Huang Ying
@ 2010-06-14  3:45   ` Hidetoshi Seto
  2010-06-14 13:54     ` Don Zickus
  2010-06-18  9:48     ` Ingo Molnar
  1 sibling, 2 replies; 31+ messages in thread
From: Hidetoshi Seto @ 2010-06-14  3:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Huang Ying, Fr??d??ric Weisbecker, Don Zickus, Peter Zijlstra,
	H. Peter Anvin, linux-kernel, Andi Kleen

(2010/06/12 19:25), Ingo Molnar wrote:
> 
> * Huang Ying <ying.huang@intel.com> wrote:
> 
>> NMI can be triggered even when IRQ is masked. So it is not safe for NMI 
>> handler to call some functions. One solution is to delay the call via self 
>> interrupt, so that the delayed call can be done once the interrupt is 
>> enabled again. This has been implemented in MCE and perf event. This patch 
>> provides a unified version and make it easier for other NMI semantic handler 
>> to take use of the delayed call.
> 
> Instead of introducing this extra intermediate facility please use the same 
> approach the unified NMI watchdog is using (see latest -tip): a perf event 
> callback gives all the extra functionality needed.
> 
> The MCE code needs to be updated to use that - and then it will be integrated 
> into the events framework.

Hi Ingo,

I think this "NMI delayed call mechanism" could be a part of "the events
framework" that we are planning to get in kernel soon.  At least APEI will
use NMI to report some hardware events (likely error) to kernel.  So I
suppose we will go to have a delayed call as an event handler for APEI.

Generally speaking "event" can occur independently of the situation.
NMI can tell us some of external events, expecting urgent reaction for
the event, but we cannot do everything in NMI context.  Or we might have
a sudden urge to generate an internal event while interrupts are disabled.

I agree that generating a self interrupt is reasonable solution.
Note that it could be said that both of "MCE handled (=event log should
be delivered to userland asap)" and "perf events pending (=pending events
should be handled asap)" are kind of internal event that requires urgent
handling in non-NMI kernel context.  One question here is why we should
have different vectors for these events that uses same mechanism.

How about calling the vector LOCAL_EVENT_VECTOR or so?
I guess there should be better name if it is possible to inject an event
to other cpus via IPI with this vector...


Thanks,
H.Seto



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-14  3:45   ` Hidetoshi Seto
@ 2010-06-14 13:54     ` Don Zickus
  2010-06-14 14:44       ` Andi Kleen
  2010-06-18  9:48     ` Ingo Molnar
  1 sibling, 1 reply; 31+ messages in thread
From: Don Zickus @ 2010-06-14 13:54 UTC (permalink / raw)
  To: Hidetoshi Seto
  Cc: Ingo Molnar, Huang Ying, Fr??d??ric Weisbecker, Peter Zijlstra,
	H. Peter Anvin, linux-kernel, Andi Kleen

On Mon, Jun 14, 2010 at 12:45:21PM +0900, Hidetoshi Seto wrote:
> (2010/06/12 19:25), Ingo Molnar wrote:
> > 
> > * Huang Ying <ying.huang@intel.com> wrote:
> > 
> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI 
> >> handler to call some functions. One solution is to delay the call via self 
> >> interrupt, so that the delayed call can be done once the interrupt is 
> >> enabled again. This has been implemented in MCE and perf event. This patch 
> >> provides a unified version and make it easier for other NMI semantic handler 
> >> to take use of the delayed call.
> > 
> > Instead of introducing this extra intermediate facility please use the same 
> > approach the unified NMI watchdog is using (see latest -tip): a perf event 
> > callback gives all the extra functionality needed.
> > 
> > The MCE code needs to be updated to use that - and then it will be integrated 
> > into the events framework.
> 
> Hi Ingo,
> 
> I think this "NMI delayed call mechanism" could be a part of "the events
> framework" that we are planning to get in kernel soon.  At least APEI will
> use NMI to report some hardware events (likely error) to kernel.  So I
> suppose we will go to have a delayed call as an event handler for APEI.
> 
> Generally speaking "event" can occur independently of the situation.
> NMI can tell us some of external events, expecting urgent reaction for
> the event, but we cannot do everything in NMI context.  Or we might have
> a sudden urge to generate an internal event while interrupts are disabled.
> 
> I agree that generating a self interrupt is reasonable solution.
> Note that it could be said that both of "MCE handled (=event log should
> be delivered to userland asap)" and "perf events pending (=pending events
> should be handled asap)" are kind of internal event that requires urgent
> handling in non-NMI kernel context.  One question here is why we should
> have different vectors for these events that uses same mechanism.

I think the perf event subsytem can log events in NMI context already and
deliver them to userspace when the NMI is done.  This is why I think Ingo
wants MCE to be updated to sit on top of the perf event subsytem to avoid
re-invent everything again.

Then again I do not know enough about the MCE stuff to understand what you
mean when an event comes in but you can't handle it in an NMI-safe
context.  An example would be helpful.

Cheers,
Don

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-14 13:54     ` Don Zickus
@ 2010-06-14 14:44       ` Andi Kleen
  2010-06-14 15:12         ` Don Zickus
  2010-06-18 10:30         ` Ingo Molnar
  0 siblings, 2 replies; 31+ messages in thread
From: Andi Kleen @ 2010-06-14 14:44 UTC (permalink / raw)
  To: Don Zickus
  Cc: Hidetoshi Seto, Ingo Molnar, Huang Ying, Fr??d??ric Weisbecker,
	Peter Zijlstra, H. Peter Anvin, linux-kernel, Andi Kleen

> I think the perf event subsytem can log events in NMI context already and
> deliver them to userspace when the NMI is done.  This is why I think Ingo
> wants MCE to be updated to sit on top of the perf event subsytem to avoid
> re-invent everything again.

perf is not solving the problem this is trying to solve.

> Then again I do not know enough about the MCE stuff to understand what you
> mean when an event comes in but you can't handle it in an NMI-safe
> context.  An example would be helpful.

At least for MCE hwpoison recovery needs to sleep and you obviously cannot sleep in
NMI like context. The way it's done is to first do a self interrupt, then do a work queue
wakeup and finally the sleeping operations. 

perf does not fit into this because it has no way to process such an event
inside the kernel.

Anyways this just cleans up the existing mechanism to share some code.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-14 14:44       ` Andi Kleen
@ 2010-06-14 15:12         ` Don Zickus
  2010-06-18 10:30         ` Ingo Molnar
  1 sibling, 0 replies; 31+ messages in thread
From: Don Zickus @ 2010-06-14 15:12 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Hidetoshi Seto, Ingo Molnar, Huang Ying, Fr??d??ric Weisbecker,
	Peter Zijlstra, H. Peter Anvin, linux-kernel

On Mon, Jun 14, 2010 at 04:44:03PM +0200, Andi Kleen wrote:
> > I think the perf event subsytem can log events in NMI context already and
> > deliver them to userspace when the NMI is done.  This is why I think Ingo
> > wants MCE to be updated to sit on top of the perf event subsytem to avoid
> > re-invent everything again.
> 
> perf is not solving the problem this is trying to solve.
> 
> > Then again I do not know enough about the MCE stuff to understand what you
> > mean when an event comes in but you can't handle it in an NMI-safe
> > context.  An example would be helpful.
> 
> At least for MCE hwpoison recovery needs to sleep and you obviously cannot sleep in
> NMI like context. The way it's done is to first do a self interrupt, then do a work queue
> wakeup and finally the sleeping operations. 
> 
> perf does not fit into this because it has no way to process such an event
> inside the kernel.

Ah, makes sense.  Thanks for the example.

Cheers,
Don

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-14  3:45   ` Hidetoshi Seto
  2010-06-14 13:54     ` Don Zickus
@ 2010-06-18  9:48     ` Ingo Molnar
  2010-06-18 11:34       ` huang ying
  1 sibling, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2010-06-18  9:48 UTC (permalink / raw)
  To: Hidetoshi Seto
  Cc: Huang Ying, Fr??d??ric Weisbecker, Don Zickus, Peter Zijlstra,
	H. Peter Anvin, linux-kernel, Andi Kleen


* Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> wrote:

> (2010/06/12 19:25), Ingo Molnar wrote:
> > 
> > * Huang Ying <ying.huang@intel.com> wrote:
> > 
> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI 
> >> handler to call some functions. One solution is to delay the call via self 
> >> interrupt, so that the delayed call can be done once the interrupt is 
> >> enabled again. This has been implemented in MCE and perf event. This patch 
> >> provides a unified version and make it easier for other NMI semantic handler 
> >> to take use of the delayed call.
> > 
> > Instead of introducing this extra intermediate facility please use the same 
> > approach the unified NMI watchdog is using (see latest -tip): a perf event 
> > callback gives all the extra functionality needed.
> > 
> > The MCE code needs to be updated to use that - and then it will be integrated 
> > into the events framework.
> 
> Hi Ingo,
> 
> I think this "NMI delayed call mechanism" could be a part of "the events 
> framework" that we are planning to get in kernel soon. [...]

My request was to make it part of perf events - which is a generic event 
logging framework. We dont really need/want a second 'events framework'
as we have one already ;-)

> [...]  At least APEI will use NMI to report some hardware events (likely 
> error) to kernel.  So I suppose we will go to have a delayed call as an 
> event handler for APEI.

Yep, that makes sense. I wasnt arguing against the functionality itself, i was 
arguing against the illogical layering that limits its utility. By making it 
part of perf events it becomes a generic part of that framework and can be 
used by anything that deals with events and uses that framework.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-14 14:44       ` Andi Kleen
  2010-06-14 15:12         ` Don Zickus
@ 2010-06-18 10:30         ` Ingo Molnar
  1 sibling, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2010-06-18 10:30 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Don Zickus, Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker,
	Peter Zijlstra, H. Peter Anvin, linux-kernel


* Andi Kleen <andi@firstfloor.org> wrote:

> > I think the perf event subsytem can log events in NMI context already and 
> > deliver them to userspace when the NMI is done.  This is why I think Ingo 
> > wants MCE to be updated to sit on top of the perf event subsytem to avoid 
> > re-invent everything again.
> 
> perf is not solving the problem this is trying to solve.

That is why i requested to extend the events backend. That will unify more of 
the code than the first few steps achieved by these three patches - and offers 
the functionality to all code that uses the events framework.

> [...]
>
> perf does not fit into this because it has no way to process such an event 
> inside the kernel.

It 'does not fit' into the events backend only if you pretend that it is 
impossible or undesirable to have a delayed, in-context callback mechanism 
implemented there.

If you look at it more closely you'll notice that in reality it's not only 
possible but that it is also a pretty natural fit.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18  9:48     ` Ingo Molnar
@ 2010-06-18 11:34       ` huang ying
  2010-06-18 12:45         ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: huang ying @ 2010-06-18 11:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker, Don Zickus,
	Peter Zijlstra, H. Peter Anvin, linux-kernel, Andi Kleen

Hi, Ingo,

On Fri, Jun 18, 2010 at 5:48 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> wrote:
>
>> (2010/06/12 19:25), Ingo Molnar wrote:
>> >
>> > * Huang Ying <ying.huang@intel.com> wrote:
>> >
>> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
>> >> handler to call some functions. One solution is to delay the call via self
>> >> interrupt, so that the delayed call can be done once the interrupt is
>> >> enabled again. This has been implemented in MCE and perf event. This patch
>> >> provides a unified version and make it easier for other NMI semantic handler
>> >> to take use of the delayed call.
>> >
>> > Instead of introducing this extra intermediate facility please use the same
>> > approach the unified NMI watchdog is using (see latest -tip): a perf event
>> > callback gives all the extra functionality needed.
>> >
>> > The MCE code needs to be updated to use that - and then it will be integrated
>> > into the events framework.
>>
>> Hi Ingo,
>>
>> I think this "NMI delayed call mechanism" could be a part of "the events
>> framework" that we are planning to get in kernel soon. [...]
>
> My request was to make it part of perf events - which is a generic event
> logging framework. We dont really need/want a second 'events framework'
> as we have one already ;-)

This patchset is simple and straightforward, it is just a delayed
execution mechanism, not another 'events framework'. There are several
other NMI users other than perf, should we integrate all NMI users
into perf framework?

>> [...]  At least APEI will use NMI to report some hardware events (likely
>> error) to kernel.  So I suppose we will go to have a delayed call as an
>> event handler for APEI.
>
> Yep, that makes sense. I wasnt arguing against the functionality itself, i was
> arguing against the illogical layering that limits its utility. By making it
> part of perf events it becomes a generic part of that framework and can be
> used by anything that deals with events and uses that framework.

I think the the 'layering' in the patchset helps instead of 'limits'
its utility. It is designed to be as general as possible, so that it
can be used by both perf and other NMI users. Do you think so?

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-12  9:28 [RFC 1/3] Unified NMI delayed call mechanism Huang Ying
                   ` (2 preceding siblings ...)
  2010-06-12 10:25 ` [RFC 1/3] Unified NMI delayed call mechanism Ingo Molnar
@ 2010-06-18 11:55 ` Peter Zijlstra
  2010-06-18 12:25   ` Andi Kleen
  3 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2010-06-18 11:55 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen, Chris Mason

On Sat, 2010-06-12 at 17:28 +0800, Huang Ying wrote:
> +#define NMI_DELAYED_CALL_ID_MAX                32
> +#define NMI_DELAYED_CALL_RESTART_MAX   5
> +
> +static nmi_delayed_call_func_t nmi_delayed_call_funcs[NMI_DELAYED_CALL_ID_MAX];
> +static DEFINE_SPINLOCK(nmi_delayed_call_lock);
> +
> +static DEFINE_PER_CPU(unsigned long, nmi_delayed_call_pending);
> +
> +static void nmi_delayed_call_run(void)
> +{
> +       int cpu, restart = NMI_DELAYED_CALL_RESTART_MAX;
> +       unsigned long pending, *ppending;
> +       nmi_delayed_call_func_t *pfunc, func;
> +
> +       cpu = smp_processor_id();
> +       ppending = per_cpu_ptr(&nmi_delayed_call_pending, cpu);
> +       while (*ppending && restart--) {
> +               pending = xchg(ppending, 0);
> +               pfunc = nmi_delayed_call_funcs;
> +               do {
> +                       if (pending & 1) {
> +                               func = *pfunc;
> +                               if (func)
> +                                       func();
> +                       }
> +                       pfunc++;
> +                       pending >>= 1;
> +               } while (pending);
> +       }
> +}

So aside from the should this be perf or not, the above is utter
gibberish. Whoever came up with this nonsense?

Why not make a work_struct like thing and enqueue it using cmpxchg on a
percpu list, then have the interrupt process them. Read
perf_pending_queue() and __perf_pending_run().

That way you don't need this whole register/id/limit crap.

> +#ifdef CONFIG_X86_LOCAL_APIC

What's the point of the rest of this code if we don't have a lapic?

> +asmlinkage void smp_nmi_delayed_call_interrupt(struct pt_regs *regs)
> +{
> +       ack_APIC_irq();
> +       irq_enter();

You're missing inc_irq_stat() there.

> +       nmi_delayed_call_run();
> +       irq_exit();
> +}
> +#endif 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 11:55 ` Peter Zijlstra
@ 2010-06-18 12:25   ` Andi Kleen
  2010-06-18 12:48     ` Peter Zijlstra
  2010-06-18 14:37     ` Ingo Molnar
  0 siblings, 2 replies; 31+ messages in thread
From: Andi Kleen @ 2010-06-18 12:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen, Chris Mason

> So aside from the should this be perf or not, the above is utter
> gibberish. Whoever came up with this nonsense?

This is pretty much how softirqs (and before them bottom halves) work.
I believe Linus invented that scheme originally back in the early
days of Linux.

It's actually quite simple and works well

-Andi

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 11:34       ` huang ying
@ 2010-06-18 12:45         ` Ingo Molnar
  2010-06-18 13:40           ` huang ying
  0 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2010-06-18 12:45 UTC (permalink / raw)
  To: huang ying
  Cc: Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker, Don Zickus,
	Peter Zijlstra, H. Peter Anvin, linux-kernel, Andi Kleen


* huang ying <huang.ying.caritas@gmail.com> wrote:

> Hi, Ingo,
> 
> On Fri, Jun 18, 2010 at 5:48 PM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > * Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> wrote:
> >
> >> (2010/06/12 19:25), Ingo Molnar wrote:
> >> >
> >> > * Huang Ying <ying.huang@intel.com> wrote:
> >> >
> >> >> NMI can be triggered even when IRQ is masked. So it is not safe for NMI
> >> >> handler to call some functions. One solution is to delay the call via self
> >> >> interrupt, so that the delayed call can be done once the interrupt is
> >> >> enabled again. This has been implemented in MCE and perf event. This patch
> >> >> provides a unified version and make it easier for other NMI semantic handler
> >> >> to take use of the delayed call.
> >> >
> >> > Instead of introducing this extra intermediate facility please use the same
> >> > approach the unified NMI watchdog is using (see latest -tip): a perf event
> >> > callback gives all the extra functionality needed.
> >> >
> >> > The MCE code needs to be updated to use that - and then it will be integrated
> >> > into the events framework.
> >>
> >> Hi Ingo,
> >>
> >> I think this "NMI delayed call mechanism" could be a part of "the events
> >> framework" that we are planning to get in kernel soon. [...]
> >
> > My request was to make it part of perf events - which is a generic event 
> > logging framework. We dont really need/want a second 'events framework' as 
> > we have one already ;-)
> 
> This patchset is simple and straightforward, [...]

We wouldnt want to add another workqueue or memory allocation mechanism 
either, even if it was 'simple and straightforward'. We try to make things 
more generally useful.

> [...] it is just a delayed execution mechanism, not another 'events 
> framework'. There are several other NMI users other than perf, should we 
> integrate all NMI users into perf framework?

We already did so with the NMI watchdog. What other significant NMI event 
users do you have in mind?

> >> [...] ??At least APEI will use NMI to report some hardware events (likely 
> >> error) to kernel. ??So I suppose we will go to have a delayed call as an 
> >> event handler for APEI.
> >
> > Yep, that makes sense. I wasnt arguing against the functionality itself, i 
> > was arguing against the illogical layering that limits its utility. By 
> > making it part of perf events it becomes a generic part of that framework 
> > and can be used by anything that deals with events and uses that 
> > framework.
> 
> I think the the 'layering' in the patchset helps instead of 'limits' its 
> utility. It is designed to be as general as possible, so that it can be used 
> by both perf and other NMI users. Do you think so?

What other NMI users do you mean? EDAC/MCE is going to go utilize events as 
well (away from the horrible /dev/mcelog interface), the NMI watchdog already 
did it and the perf tool obviously does as well. There's a few leftovers like 
kcrash which isnt really event centric and i dont think it needs to be 
converted.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 12:25   ` Andi Kleen
@ 2010-06-18 12:48     ` Peter Zijlstra
  2010-06-18 13:09       ` Andi Kleen
  2010-06-18 14:37     ` Ingo Molnar
  1 sibling, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2010-06-18 12:48 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Chris Mason

On Fri, 2010-06-18 at 14:25 +0200, Andi Kleen wrote:
> > So aside from the should this be perf or not, the above is utter
> > gibberish. Whoever came up with this nonsense?
> 
> This is pretty much how softirqs (and before them bottom halves) work.
> I believe Linus invented that scheme originally back in the early
> days of Linux.

Doesn't mean its the right abstraction for this.

> It's actually quite simple and works well

And adds more code than it removes whilst providing a very limited
service.

You generally want to pass more information along anyway, now your
callback function needs to go look for it. Much better to pass a
work_struct like thing around that is contained in the state it needs.



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 12:48     ` Peter Zijlstra
@ 2010-06-18 13:09       ` Andi Kleen
  2010-06-18 13:12         ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Andi Kleen @ 2010-06-18 13:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Chris Mason

> You generally want to pass more information along anyway, now your
> callback function needs to go look for it. Much better to pass a
> work_struct like thing around that is contained in the state it needs.

But how would you allocate the work queue in an NMI?

If it's only a single instance (like this bit) it can be always put
into a per cpu variable.

-Andi

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 13:09       ` Andi Kleen
@ 2010-06-18 13:12         ` Peter Zijlstra
  2010-06-18 13:23           ` Andi Kleen
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2010-06-18 13:12 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Chris Mason

On Fri, 2010-06-18 at 15:09 +0200, Andi Kleen wrote:
> > You generally want to pass more information along anyway, now your
> > callback function needs to go look for it. Much better to pass a
> > work_struct like thing around that is contained in the state it needs.
> 
> But how would you allocate the work queue in an NMI?
> 
> If it's only a single instance (like this bit) it can be always put
> into a per cpu variable.

Pre-allocate. For the perf-event stuff we use the perf_event allocated
at creation time. But yeah, per-cpu storage also works.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 13:12         ` Peter Zijlstra
@ 2010-06-18 13:23           ` Andi Kleen
  2010-06-18 13:24             ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Andi Kleen @ 2010-06-18 13:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Chris Mason

On Fri, Jun 18, 2010 at 03:12:49PM +0200, Peter Zijlstra wrote:
> On Fri, 2010-06-18 at 15:09 +0200, Andi Kleen wrote:
> > > You generally want to pass more information along anyway, now your
> > > callback function needs to go look for it. Much better to pass a
> > > work_struct like thing around that is contained in the state it needs.
> > 
> > But how would you allocate the work queue in an NMI?
> > 
> > If it's only a single instance (like this bit) it can be always put
> > into a per cpu variable.
> 
> Pre-allocate. For the perf-event stuff we use the perf_event allocated
> at creation time. But yeah, per-cpu storage also works.

So you could just preallocate the bits instead ?

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 13:23           ` Andi Kleen
@ 2010-06-18 13:24             ` Peter Zijlstra
  0 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2010-06-18 13:24 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Chris Mason

On Fri, 2010-06-18 at 15:23 +0200, Andi Kleen wrote:
> On Fri, Jun 18, 2010 at 03:12:49PM +0200, Peter Zijlstra wrote:
> > On Fri, 2010-06-18 at 15:09 +0200, Andi Kleen wrote:
> > > > You generally want to pass more information along anyway, now your
> > > > callback function needs to go look for it. Much better to pass a
> > > > work_struct like thing around that is contained in the state it needs.
> > > 
> > > But how would you allocate the work queue in an NMI?
> > > 
> > > If it's only a single instance (like this bit) it can be always put
> > > into a per cpu variable.
> > 
> > Pre-allocate. For the perf-event stuff we use the perf_event allocated
> > at creation time. But yeah, per-cpu storage also works.
> 
> So you could just preallocate the bits instead ?

You mean the bits in your function array? Those are limited to 32 and
you'd need a secondary lookup to match them to your data object, not
very useful.



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 12:45         ` Ingo Molnar
@ 2010-06-18 13:40           ` huang ying
  2010-06-18 14:35             ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: huang ying @ 2010-06-18 13:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker, Don Zickus,
	Peter Zijlstra, H. Peter Anvin, linux-kernel, Andi Kleen

On Fri, Jun 18, 2010 at 8:45 PM, Ingo Molnar <mingo@elte.hu> wrote:
>> >> [...] ??At least APEI will use NMI to report some hardware events (likely
>> >> error) to kernel. ??So I suppose we will go to have a delayed call as an
>> >> event handler for APEI.
>> >
>> > Yep, that makes sense. I wasnt arguing against the functionality itself, i
>> > was arguing against the illogical layering that limits its utility. By
>> > making it part of perf events it becomes a generic part of that framework
>> > and can be used by anything that deals with events and uses that
>> > framework.
>>
>> I think the the 'layering' in the patchset helps instead of 'limits' its
>> utility. It is designed to be as general as possible, so that it can be used
>> by both perf and other NMI users. Do you think so?
>
> What other NMI users do you mean? EDAC/MCE is going to go utilize events as
> well (away from the horrible /dev/mcelog interface), the NMI watchdog already
> did it and the perf tool obviously does as well. There's a few leftovers like
> kcrash which isnt really event centric and i dont think it needs to be
> converted.

But why not just make it more general? It does not hurt anyone including perf.

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 13:40           ` huang ying
@ 2010-06-18 14:35             ` Ingo Molnar
  2010-06-18 15:16               ` huang ying
  0 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2010-06-18 14:35 UTC (permalink / raw)
  To: huang ying
  Cc: Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker, Don Zickus,
	Peter Zijlstra, H. Peter Anvin, linux-kernel, Andi Kleen


* huang ying <huang.ying.caritas@gmail.com> wrote:

> On Fri, Jun 18, 2010 at 8:45 PM, Ingo Molnar <mingo@elte.hu> wrote:
> >> >> [...] ??At least APEI will use NMI to report some hardware events (likely
> >> >> error) to kernel. ??So I suppose we will go to have a delayed call as an
> >> >> event handler for APEI.
> >> >
> >> > Yep, that makes sense. I wasnt arguing against the functionality itself, i
> >> > was arguing against the illogical layering that limits its utility. By
> >> > making it part of perf events it becomes a generic part of that framework
> >> > and can be used by anything that deals with events and uses that
> >> > framework.
> >>
> >> I think the the 'layering' in the patchset helps instead of 'limits' its
> >> utility. It is designed to be as general as possible, so that it can be used
> >> by both perf and other NMI users. Do you think so?
> >
> > What other NMI users do you mean? EDAC/MCE is going to go utilize events 
> > as well (away from the horrible /dev/mcelog interface), the NMI watchdog 
> > already did it and the perf tool obviously does as well. There's a few 
> > leftovers like kcrash which isnt really event centric and i dont think it 
> > needs to be converted.
> 
> But why not just make it more general? It does not hurt anyone including 
> perf.

Because it's not actually more generic that way - just look at the code. It's 
x86 specific, plus it ties it to NMI delivery while the concept of delayed 
execution has nothing to do with NMIs.

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 12:25   ` Andi Kleen
  2010-06-18 12:48     ` Peter Zijlstra
@ 2010-06-18 14:37     ` Ingo Molnar
  2010-06-19 14:17       ` Andi Kleen
  1 sibling, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2010-06-18 14:37 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Peter Zijlstra, Huang Ying, H.PeterA, linux-kernel, Chris Mason


* Andi Kleen <andi@firstfloor.org> wrote:

> > So aside from the should this be perf or not, the above is utter
> > gibberish. Whoever came up with this nonsense?
> 
> This is pretty much how softirqs (and before them bottom halves) work.
> I believe Linus invented that scheme originally back in the early
> days of Linux.

Nope - the softirq code was written by Alexey Kuznetsov and David S. Miller. 
But it's irrelevant, because:

> It's actually quite simple and works well

... as Peter said the last thing we want is yet another softirq vector.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 14:35             ` Ingo Molnar
@ 2010-06-18 15:16               ` huang ying
  2010-06-18 15:31                 ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: huang ying @ 2010-06-18 15:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker, Don Zickus,
	Peter Zijlstra, H. Peter Anvin, linux-kernel, Andi Kleen

On Fri, Jun 18, 2010 at 10:35 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * huang ying <huang.ying.caritas@gmail.com> wrote:
>
>> On Fri, Jun 18, 2010 at 8:45 PM, Ingo Molnar <mingo@elte.hu> wrote:
>> >> >> [...] ??At least APEI will use NMI to report some hardware events (likely
>> >> >> error) to kernel. ??So I suppose we will go to have a delayed call as an
>> >> >> event handler for APEI.
>> >> >
>> >> > Yep, that makes sense. I wasnt arguing against the functionality itself, i
>> >> > was arguing against the illogical layering that limits its utility. By
>> >> > making it part of perf events it becomes a generic part of that framework
>> >> > and can be used by anything that deals with events and uses that
>> >> > framework.
>> >>
>> >> I think the the 'layering' in the patchset helps instead of 'limits' its
>> >> utility. It is designed to be as general as possible, so that it can be used
>> >> by both perf and other NMI users. Do you think so?
>> >
>> > What other NMI users do you mean? EDAC/MCE is going to go utilize events
>> > as well (away from the horrible /dev/mcelog interface), the NMI watchdog
>> > already did it and the perf tool obviously does as well. There's a few
>> > leftovers like kcrash which isnt really event centric and i dont think it
>> > needs to be converted.
>>
>> But why not just make it more general? It does not hurt anyone including
>> perf.
>
> Because it's not actually more generic that way - just look at the code. It's
> x86 specific, plus it ties it to NMI delivery while the concept of delayed
> execution has nothing to do with NMIs.

soft_irq is a delayed mechanism for IRQ, a self interrupt can be a
delayed mechanism for NMI. If we can make soft_irq NMI-safe, we can
use soft_irq as a backup of self interrupt (for systems without APIC
and maybe for other architectures).

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 15:16               ` huang ying
@ 2010-06-18 15:31                 ` Peter Zijlstra
  2010-06-19  1:51                   ` huang ying
  2010-06-19  8:02                   ` Andi Kleen
  0 siblings, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2010-06-18 15:31 UTC (permalink / raw)
  To: huang ying
  Cc: Ingo Molnar, Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker,
	Don Zickus, H.PeterA, linux-kernel, Andi Kleen

On Fri, 2010-06-18 at 23:16 +0800, huang ying wrote:
> 
> soft_irq is a delayed mechanism for IRQ,

No its not.

>  a self interrupt can be a
> delayed mechanism for NMI. If we can make soft_irq NMI-safe, 

No you can't.

> we can
> use soft_irq as a backup of self interrupt (for systems without APIC
> and maybe for other architectures). 

Whatever would you want to do that for.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 15:31                 ` Peter Zijlstra
@ 2010-06-19  1:51                   ` huang ying
  2010-06-19  8:02                   ` Andi Kleen
  1 sibling, 0 replies; 31+ messages in thread
From: huang ying @ 2010-06-19  1:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Hidetoshi Seto, Huang Ying, Fr??d??ric Weisbecker,
	Don Zickus, H.PeterA, linux-kernel, Andi Kleen

On Fri, Jun 18, 2010 at 11:31 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, 2010-06-18 at 23:16 +0800, huang ying wrote:
>>
>> soft_irq is a delayed mechanism for IRQ,
>
> No its not.

Why? What do you think soft_irq is for?

>>  a self interrupt can be a
>> delayed mechanism for NMI. If we can make soft_irq NMI-safe,
>
> No you can't.
>
>> we can
>> use soft_irq as a backup of self interrupt (for systems without APIC
>> and maybe for other architectures).
>
> Whatever would you want to do that for.

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 15:31                 ` Peter Zijlstra
  2010-06-19  1:51                   ` huang ying
@ 2010-06-19  8:02                   ` Andi Kleen
  2010-06-19 10:53                     ` Ingo Molnar
  1 sibling, 1 reply; 31+ messages in thread
From: Andi Kleen @ 2010-06-19  8:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: huang ying, Ingo Molnar, Hidetoshi Seto, Huang Ying,
	Fr??d??ric Weisbecker, Don Zickus, H.PeterA, linux-kernel,
	Andi Kleen

> > we can
> > use soft_irq as a backup of self interrupt (for systems without APIC
> > and maybe for other architectures). 
> 
> Whatever would you want to do that for.

The idea is that the work would be done latest on the next
timer interrupt as a fallback if APIC is not available.

That's what mce does already and it's probably approbiate
for most other users too.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-19  8:02                   ` Andi Kleen
@ 2010-06-19 10:53                     ` Ingo Molnar
  2010-06-19 14:07                       ` huang ying
  2010-06-19 14:24                       ` Andi Kleen
  0 siblings, 2 replies; 31+ messages in thread
From: Ingo Molnar @ 2010-06-19 10:53 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Peter Zijlstra, huang ying, Hidetoshi Seto, Huang Ying,
	Fr??d??ric Weisbecker, Don Zickus, H.PeterA, linux-kernel


( Ugh: what's this new fad of you not quoting the name of the person who 
  wrote a mail? It makes multi-level quotes utterly unreadable as it's not
  clear who wrote what. You should also respect others by quoting their names. )

* Andi Kleen <andi@firstfloor.org> wrote:

> > > we can use soft_irq as a backup of self interrupt (for systems without 
> > > APIC and maybe for other architectures).
> > 
> > Whatever would you want to do that for.
> 
> The idea is that the work would be done latest on the next timer interrupt 
> as a fallback if APIC is not available.

Abusing the timer irq for that is an exceedingly ugly and unacceptable design, 
as machine check events have nothing to do with timers. (That approach is also 
buggy because it inserts an arbitrary delay - which could be rather long on 
nohz.)

This kind of messy, ad-hoc piggybacking only creates unmaintainable code in 
the long run.

> That's what mce does already and it's probably approbiate for most other 
> users too.

Hell no, the unfortunate and unclean practices of the MCE code must not be 
propagated elsewhere.

The proper, generic approach would be to enable softirq notifications (on x86) 
from NMI contexts as well (it's actually possible without overhead), and to 
extend user return notifiers with the logical next step: nmi return notifiers. 
If presented in such a form then those could use softirqs for atomic callbacks 
and per cpu kthreads for sleepable callbacks, etc.

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-19 10:53                     ` Ingo Molnar
@ 2010-06-19 14:07                       ` huang ying
  2010-06-19 14:24                       ` Andi Kleen
  1 sibling, 0 replies; 31+ messages in thread
From: huang ying @ 2010-06-19 14:07 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Peter Zijlstra, Hidetoshi Seto, Huang Ying,
	Fr??d??ric Weisbecker, Don Zickus, H.PeterA, linux-kernel

On Sat, Jun 19, 2010 at 6:53 PM, Ingo Molnar <mingo@elte.hu> wrote:
> The proper, generic approach would be to enable softirq notifications (on x86)
> from NMI contexts as well (it's actually possible without overhead),

Yes. I will do that. And I think self interrupt can be used as the
short-cut for soft_irq if available. The next soft_irq may be too late
if there is too few interrupts.

> and to
> extend user return notifiers with the logical next step: nmi return notifiers.
> If presented in such a form then those could use softirqs for atomic callbacks
> and per cpu kthreads for sleepable callbacks, etc.

NMI return notifiers fired in soft_irq?

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-18 14:37     ` Ingo Molnar
@ 2010-06-19 14:17       ` Andi Kleen
  0 siblings, 0 replies; 31+ messages in thread
From: Andi Kleen @ 2010-06-19 14:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Peter Zijlstra, Huang Ying, H.PeterA, linux-kernel,
	Chris Mason

> Nope - the softirq code was written by Alexey Kuznetsov and David S. Miller. 

It was based on the design of the previous bottom halves in case you
missed it. I believe BHs were there from nearly the beginning.

(softirqs were basically just per cpu bottom halves, otherwise
being very similar)

-Andi

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC 1/3] Unified NMI delayed call mechanism
  2010-06-19 10:53                     ` Ingo Molnar
  2010-06-19 14:07                       ` huang ying
@ 2010-06-19 14:24                       ` Andi Kleen
  1 sibling, 0 replies; 31+ messages in thread
From: Andi Kleen @ 2010-06-19 14:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Peter Zijlstra, huang ying, Hidetoshi Seto,
	Huang Ying, Fr??d??ric Weisbecker, Don Zickus, H.PeterA,
	linux-kernel

On Sat, Jun 19, 2010 at 12:53:52PM +0200, Ingo Molnar wrote:
> 
> ( Ugh: what's this new fad of you not quoting the name of the person who 
>   wrote a mail? It makes multi-level quotes utterly unreadable as it's not
>   clear who wrote what. You should also respect others by quoting their names. )

To give you an excuse to flame of course. I know that's 
your favourite pastime and I'm always happy to make you happy.

> 
> Abusing the timer irq for that is an exceedingly ugly and unacceptable design, 
> as machine check events have nothing to do with timers. (That approach is also 
The rationale is that this nearly never happens in practice anyways
(the operations that trigger it do near always only happen with APICs)
AFAIK that's true for all the proposed users. 

For very rare and obscure cases like this in my experience
the simplest possible code is the best,
that makes it most likely it actually works when it's needed.
I do not know of any simpler way to implement this.

Of course if there's evidence it's actually common that would
need to be revisited.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2010-06-19 14:24 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-12  9:28 [RFC 1/3] Unified NMI delayed call mechanism Huang Ying
2010-06-12  9:28 ` [RFC 2/3] Use unified NMI delayed call mechanism in MCE handler Huang Ying
2010-06-12  9:28 ` [RFC 3/3] Use unified NMI delayed call mechanism in perf event NMI handler Huang Ying
2010-06-12 10:25 ` [RFC 1/3] Unified NMI delayed call mechanism Ingo Molnar
2010-06-13  1:54   ` Huang Ying
2010-06-14  3:45   ` Hidetoshi Seto
2010-06-14 13:54     ` Don Zickus
2010-06-14 14:44       ` Andi Kleen
2010-06-14 15:12         ` Don Zickus
2010-06-18 10:30         ` Ingo Molnar
2010-06-18  9:48     ` Ingo Molnar
2010-06-18 11:34       ` huang ying
2010-06-18 12:45         ` Ingo Molnar
2010-06-18 13:40           ` huang ying
2010-06-18 14:35             ` Ingo Molnar
2010-06-18 15:16               ` huang ying
2010-06-18 15:31                 ` Peter Zijlstra
2010-06-19  1:51                   ` huang ying
2010-06-19  8:02                   ` Andi Kleen
2010-06-19 10:53                     ` Ingo Molnar
2010-06-19 14:07                       ` huang ying
2010-06-19 14:24                       ` Andi Kleen
2010-06-18 11:55 ` Peter Zijlstra
2010-06-18 12:25   ` Andi Kleen
2010-06-18 12:48     ` Peter Zijlstra
2010-06-18 13:09       ` Andi Kleen
2010-06-18 13:12         ` Peter Zijlstra
2010-06-18 13:23           ` Andi Kleen
2010-06-18 13:24             ` Peter Zijlstra
2010-06-18 14:37     ` Ingo Molnar
2010-06-19 14:17       ` Andi Kleen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.