* [RFC 1/5] Make soft_irq NMI safe
@ 2010-06-24  3:04 Huang Ying
  2010-06-24  3:04 ` [RFC 2/5] NMI return notifier Huang Ying
                   ` (5 more replies)
  0 siblings, 6 replies; 66+ messages in thread
From: Huang Ying @ 2010-06-24  3:04 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin
  Cc: linux-kernel, Andi Kleen, Peter Zijlstra, Huang Ying

NMI can be triggered even when IRQs are masked, so many kernel services
cannot be used in an NMI handler.  It is therefore necessary for an NMI
handler to be able to trigger some operations in other contexts such as
IRQ and process context.  To do this, this patch makes the soft_irq
mechanism usable from NMI handlers.
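
In essence, local_softirq_pending() is changed to evaluate to a pointer
to the per-CPU pending mask, so that a softirq can be raised with a
single atomic RMW that an intervening NMI cannot corrupt, and
__do_softirq() consumes the mask atomically.  A condensed sketch of the
idea (not the literal diff below):

	/* raise: safe from process, IRQ or NMI context, preemption off */
	#define __raise_softirq_preempt_off(nr) \
		set_bit(nr, (unsigned long *)local_softirq_pending())

	/* consume: atomically fetch-and-clear the whole pending mask */
	pending = xchg(local_softirq_pending(), 0);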

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/ia64/include/asm/hardirq.h     |    2 -
 arch/powerpc/include/asm/hardirq.h  |    2 -
 arch/powerpc/kernel/irq.c           |    2 -
 arch/s390/include/asm/hardirq.h     |    2 -
 arch/s390/kernel/irq.c              |    2 -
 arch/sh/kernel/irq.c                |    2 -
 arch/sparc/include/asm/hardirq_64.h |    2 -
 arch/sparc/kernel/irq_64.c          |    2 -
 arch/x86/include/asm/hardirq.h      |    7 ----
 arch/x86/kernel/irq_32.c            |    2 -
 arch/x86/kernel/irq_64.c            |    2 -
 block/blk-iopoll.c                  |    6 +--
 block/blk-softirq.c                 |    6 +--
 include/linux/interrupt.h           |   10 +----
 include/linux/irq_cpustat.h         |    2 -
 kernel/hrtimer.c                    |    4 +-
 kernel/softirq.c                    |   61 +++++++++++++++++-------------------
 kernel/time/tick-sched.c            |    6 +--
 net/core/dev.c                      |   14 ++++----
 19 files changed, 62 insertions(+), 74 deletions(-)

--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -18,7 +18,7 @@
 
 #define __ARCH_IRQ_STAT	1
 
-#define local_softirq_pending()		(local_cpu_data->softirq_pending)
+#define local_softirq_pending()		(&local_cpu_data->softirq_pending)
 
 extern void __iomem *ipi_base_addr;
 
--- a/arch/powerpc/include/asm/hardirq.h
+++ b/arch/powerpc/include/asm/hardirq.h
@@ -16,7 +16,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpust
 
 #define __ARCH_IRQ_STAT
 
-#define local_softirq_pending()	__get_cpu_var(irq_stat).__softirq_pending
+#define local_softirq_pending()	(&__get_cpu_var(irq_stat).__softirq_pending)
 
 static inline void ack_bad_irq(unsigned int irq)
 {
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -505,7 +505,7 @@ void do_softirq(void)
 
 	local_irq_save(flags);
 
-	if (local_softirq_pending())
+	if (*local_softirq_pending())
 		do_softirq_onstack();
 
 	local_irq_restore(flags);
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -18,7 +18,7 @@
 #include <linux/interrupt.h>
 #include <asm/lowcore.h>
 
-#define local_softirq_pending() (S390_lowcore.softirq_pending)
+#define local_softirq_pending() (&S390_lowcore.softirq_pending)
 
 #define __ARCH_IRQ_STAT
 #define __ARCH_HAS_DO_SOFTIRQ
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -70,7 +70,7 @@ asmlinkage void do_softirq(void)
 
 	local_irq_save(flags);
 
-	if (local_softirq_pending()) {
+	if (*local_softirq_pending()) {
 		/* Get current stack pointer. */
 		asm volatile("la %0,0(15)" : "=a" (old));
 		/* Check against async. stack address range. */
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -212,7 +212,7 @@ asmlinkage void do_softirq(void)
 
 	local_irq_save(flags);
 
-	if (local_softirq_pending()) {
+	if (*local_softirq_pending()) {
 		curctx = current_thread_info();
 		irqctx = softirq_ctx[smp_processor_id()];
 		irqctx->tinfo.task = curctx->task;
--- a/arch/sparc/include/asm/hardirq_64.h
+++ b/arch/sparc/include/asm/hardirq_64.h
@@ -10,7 +10,7 @@
 
 #define __ARCH_IRQ_STAT
 #define local_softirq_pending() \
-	(local_cpu_data().__softirq_pending)
+	(&local_cpu_data().__softirq_pending)
 
 void ack_bad_irq(unsigned int irq);
 
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -770,7 +770,7 @@ void do_softirq(void)
 
 	local_irq_save(flags);
 
-	if (local_softirq_pending()) {
+	if (*local_softirq_pending()) {
 		void *orig_sp, *sp = softirq_stack[smp_processor_id()];
 
 		sp += THREAD_SIZE - 192 - STACK_BIAS;
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -37,12 +37,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpust
 
 #define inc_irq_stat(member)	percpu_inc(irq_stat.member)
 
-#define local_softirq_pending()	percpu_read(irq_stat.__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING
-
-#define set_softirq_pending(x)	percpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x)	percpu_or(irq_stat.__softirq_pending, (x))
+#define local_softirq_pending()	(&__get_cpu_var(irq_stat.__softirq_pending))
 
 extern void ack_bad_irq(unsigned int irq);
 
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -168,7 +168,7 @@ asmlinkage void do_softirq(void)
 
 	local_irq_save(flags);
 
-	if (local_softirq_pending()) {
+	if (*local_softirq_pending()) {
 		curctx = current_thread_info();
 		irqctx = __get_cpu_var(softirq_ctx);
 		irqctx->tinfo.task = curctx->task;
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -74,7 +74,7 @@ asmlinkage void do_softirq(void)
 		return;
 
 	local_irq_save(flags);
-	pending = local_softirq_pending();
+	pending = *local_softirq_pending();
 	/* Switch to interrupt stack */
 	if (pending) {
 		call_softirq();
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -36,7 +36,7 @@ void blk_iopoll_sched(struct blk_iopoll
 
 	local_irq_save(flags);
 	list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
-	__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+	__raise_softirq_preempt_off(BLOCK_IOPOLL_SOFTIRQ);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(blk_iopoll_sched);
@@ -132,7 +132,7 @@ static void blk_iopoll_softirq(struct so
 	}
 
 	if (rearm)
-		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+		__raise_softirq_preempt_off(BLOCK_IOPOLL_SOFTIRQ);
 
 	local_irq_enable();
 }
@@ -202,7 +202,7 @@ static int __cpuinit blk_iopoll_cpu_noti
 		local_irq_disable();
 		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
 				 &__get_cpu_var(blk_cpu_iopoll));
-		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+		__raise_softirq_preempt_off(BLOCK_IOPOLL_SOFTIRQ);
 		local_irq_enable();
 	}
 
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -47,7 +47,7 @@ static void trigger_softirq(void *data)
 	list_add_tail(&rq->csd.list, list);
 
 	if (list->next == &rq->csd.list)
-		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+		raise_softirq_preempt_off(BLOCK_SOFTIRQ);
 
 	local_irq_restore(flags);
 }
@@ -90,7 +90,7 @@ static int __cpuinit blk_cpu_notify(stru
 		local_irq_disable();
 		list_splice_init(&per_cpu(blk_cpu_done, cpu),
 				 &__get_cpu_var(blk_cpu_done));
-		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+		raise_softirq_preempt_off(BLOCK_SOFTIRQ);
 		local_irq_enable();
 	}
 
@@ -134,7 +134,7 @@ do_local:
 		 * hasn't run yet.
 		 */
 		if (list->next == &req->csd.list)
-			raise_softirq_irqoff(BLOCK_SOFTIRQ);
+			raise_softirq_preempt_off(BLOCK_SOFTIRQ);
 	} else if (raise_blk_irq(ccpu, req))
 		goto do_local;
 
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -347,11 +347,6 @@ static inline int disable_irq_wake(unsig
 }
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
-#ifndef __ARCH_SET_SOFTIRQ_PENDING
-#define set_softirq_pending(x) (local_softirq_pending() = (x))
-#define or_softirq_pending(x)  (local_softirq_pending() |= (x))
-#endif
-
 /* Some architectures might implement lazy enabling/disabling of
  * interrupts. In some cases, such as stop_machine, we might want
  * to ensure that after a local_irq_disable(), interrupts have
@@ -402,8 +397,9 @@ asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
-extern void raise_softirq_irqoff(unsigned int nr);
+#define __raise_softirq_preempt_off(nr)					\
+	do { set_bit(nr, (unsigned long *)local_softirq_pending()); } while (0)
+extern void raise_softirq_preempt_off(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 extern void wakeup_softirqd(void);
 
--- a/include/linux/irq_cpustat.h
+++ b/include/linux/irq_cpustat.h
@@ -23,7 +23,7 @@ extern irq_cpustat_t irq_stat[];		/* def
 
   /* arch independent irq_stat fields */
 #define local_softirq_pending() \
-	__IRQ_STAT(smp_processor_id(), __softirq_pending)
+	(&__IRQ_STAT(smp_processor_id(), __softirq_pending))
 
   /* arch dependent irq_stat fields */
 #define nmi_count(cpu)		__IRQ_STAT((cpu), __nmi_count)	/* i386 */
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -695,10 +695,10 @@ static inline int hrtimer_enqueue_reprog
 	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
 		if (wakeup) {
 			raw_spin_unlock(&base->cpu_base->lock);
-			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+			raise_softirq_preempt_off(HRTIMER_SOFTIRQ);
 			raw_spin_lock(&base->cpu_base->lock);
 		} else
-			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+			__raise_softirq_preempt_off(HRTIMER_SOFTIRQ);
 
 		return 1;
 	}
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -155,7 +155,7 @@ static inline void _local_bh_enable_ip(u
  	 */
  	sub_preempt_count(SOFTIRQ_OFFSET - 1);
 
-	if (unlikely(!in_interrupt() && local_softirq_pending()))
+	if (unlikely(!in_interrupt() && *local_softirq_pending()))
 		do_softirq();
 
 	dec_preempt_count();
@@ -191,22 +191,22 @@ EXPORT_SYMBOL(local_bh_enable_ip);
 asmlinkage void __do_softirq(void)
 {
 	struct softirq_action *h;
-	__u32 pending;
+	__u32 pending, *ppending;
 	int max_restart = MAX_SOFTIRQ_RESTART;
 	int cpu;
 
-	pending = local_softirq_pending();
+	ppending = local_softirq_pending();
 	account_system_vtime(current);
 
 	__local_bh_disable((unsigned long)__builtin_return_address(0));
 	lockdep_softirq_enter();
 
 	cpu = smp_processor_id();
-restart:
-	/* Reset the pending bitmask before enabling irqs */
-	set_softirq_pending(0);
 
 	local_irq_enable();
+restart:
+	/* Atomically fetch and clear the pending bitmask */
+	pending = xchg(ppending, 0);
 
 	h = softirq_vec;
 
@@ -233,13 +233,12 @@ restart:
 		pending >>= 1;
 	} while (pending);
 
-	local_irq_disable();
-
-	pending = local_softirq_pending();
-	if (pending && --max_restart)
+	if (*ppending && --max_restart)
 		goto restart;
 
-	if (pending)
+	local_irq_disable();
+
+	if (*ppending)
 		wakeup_softirqd();
 
 	lockdep_softirq_exit();
@@ -260,7 +259,7 @@ asmlinkage void do_softirq(void)
 
 	local_irq_save(flags);
 
-	pending = local_softirq_pending();
+	pending = *local_softirq_pending();
 
 	if (pending)
 		__do_softirq();
@@ -299,7 +298,7 @@ void irq_exit(void)
 	account_system_vtime(current);
 	trace_hardirq_exit();
 	sub_preempt_count(IRQ_EXIT_OFFSET);
-	if (!in_interrupt() && local_softirq_pending())
+	if (!in_interrupt() && *local_softirq_pending())
 		invoke_softirq();
 
 	rcu_irq_exit();
@@ -312,11 +311,11 @@ void irq_exit(void)
 }
 
 /*
- * This function must run with irqs disabled!
+ * This function must run with preempt disabled!
  */
-inline void raise_softirq_irqoff(unsigned int nr)
+inline void raise_softirq_preempt_off(unsigned int nr)
 {
-	__raise_softirq_irqoff(nr);
+	__raise_softirq_preempt_off(nr);
 
 	/*
 	 * If we're in an interrupt or softirq, we're done
@@ -333,11 +332,9 @@ inline void raise_softirq_irqoff(unsigne
 
 void raise_softirq(unsigned int nr)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	raise_softirq_irqoff(nr);
-	local_irq_restore(flags);
+	preempt_disable();
+	raise_softirq_preempt_off(nr);
+	preempt_enable();
 }
 
 void open_softirq(int nr, void (*action)(struct softirq_action *))
@@ -365,7 +362,7 @@ void __tasklet_schedule(struct tasklet_s
 	t->next = NULL;
 	*__get_cpu_var(tasklet_vec).tail = t;
 	__get_cpu_var(tasklet_vec).tail = &(t->next);
-	raise_softirq_irqoff(TASKLET_SOFTIRQ);
+	raise_softirq_preempt_off(TASKLET_SOFTIRQ);
 	local_irq_restore(flags);
 }
 
@@ -379,7 +376,7 @@ void __tasklet_hi_schedule(struct taskle
 	t->next = NULL;
 	*__get_cpu_var(tasklet_hi_vec).tail = t;
 	__get_cpu_var(tasklet_hi_vec).tail = &(t->next);
-	raise_softirq_irqoff(HI_SOFTIRQ);
+	raise_softirq_preempt_off(HI_SOFTIRQ);
 	local_irq_restore(flags);
 }
 
@@ -391,7 +388,7 @@ void __tasklet_hi_schedule_first(struct
 
 	t->next = __get_cpu_var(tasklet_hi_vec).head;
 	__get_cpu_var(tasklet_hi_vec).head = t;
-	__raise_softirq_irqoff(HI_SOFTIRQ);
+	__raise_softirq_preempt_off(HI_SOFTIRQ);
 }
 
 EXPORT_SYMBOL(__tasklet_hi_schedule_first);
@@ -426,7 +423,7 @@ static void tasklet_action(struct softir
 		t->next = NULL;
 		*__get_cpu_var(tasklet_vec).tail = t;
 		__get_cpu_var(tasklet_vec).tail = &(t->next);
-		__raise_softirq_irqoff(TASKLET_SOFTIRQ);
+		__raise_softirq_preempt_off(TASKLET_SOFTIRQ);
 		local_irq_enable();
 	}
 }
@@ -461,7 +458,7 @@ static void tasklet_hi_action(struct sof
 		t->next = NULL;
 		*__get_cpu_var(tasklet_hi_vec).tail = t;
 		__get_cpu_var(tasklet_hi_vec).tail = &(t->next);
-		__raise_softirq_irqoff(HI_SOFTIRQ);
+		__raise_softirq_preempt_off(HI_SOFTIRQ);
 		local_irq_enable();
 	}
 }
@@ -561,7 +558,7 @@ static void __local_trigger(struct call_
 
 	/* Trigger the softirq only if the list was previously empty.  */
 	if (head->next == &cp->list)
-		raise_softirq_irqoff(softirq);
+		raise_softirq_preempt_off(softirq);
 }
 
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
@@ -659,7 +656,7 @@ static int __cpuinit remote_softirq_cpu_
 
 			local_head = &__get_cpu_var(softirq_work_list[i]);
 			list_splice_init(head, local_head);
-			raise_softirq_irqoff(i);
+			raise_softirq_preempt_off(i);
 		}
 		local_irq_enable();
 	}
@@ -698,7 +695,7 @@ static int run_ksoftirqd(void * __bind_c
 
 	while (!kthread_should_stop()) {
 		preempt_disable();
-		if (!local_softirq_pending()) {
+		if (!*local_softirq_pending()) {
 			preempt_enable_no_resched();
 			schedule();
 			preempt_disable();
@@ -706,7 +703,7 @@ static int run_ksoftirqd(void * __bind_c
 
 		__set_current_state(TASK_RUNNING);
 
-		while (local_softirq_pending()) {
+		while (*local_softirq_pending()) {
 			/* Preempt disable stops cpu going offline.
 			   If already offline, we'll be on wrong CPU:
 			   don't process */
@@ -781,7 +778,7 @@ static void takeover_tasklets(unsigned i
 		per_cpu(tasklet_vec, cpu).head = NULL;
 		per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
 	}
-	raise_softirq_irqoff(TASKLET_SOFTIRQ);
+	raise_softirq_preempt_off(TASKLET_SOFTIRQ);
 
 	if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
 		*__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
@@ -789,7 +786,7 @@ static void takeover_tasklets(unsigned i
 		per_cpu(tasklet_hi_vec, cpu).head = NULL;
 		per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
 	}
-	raise_softirq_irqoff(HI_SOFTIRQ);
+	raise_softirq_preempt_off(HI_SOFTIRQ);
 
 	local_irq_enable();
 }
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -304,12 +304,12 @@ void tick_nohz_stop_sched_tick(int inidl
 	if (need_resched())
 		goto end;
 
-	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+	if (unlikely(*local_softirq_pending() && cpu_online(cpu))) {
 		static int ratelimit;
 
 		if (ratelimit < 10) {
 			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-			       (unsigned int) local_softirq_pending());
+			       (unsigned int) *local_softirq_pending());
 			ratelimit++;
 		}
 		goto end;
@@ -453,7 +453,7 @@ void tick_nohz_stop_sched_tick(int inidl
 		tick_do_update_jiffies64(ktime_get());
 		cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	}
-	raise_softirq_irqoff(TIMER_SOFTIRQ);
+	raise_softirq_preempt_off(TIMER_SOFTIRQ);
 out:
 	ts->next_jiffies = next_jiffies;
 	ts->last_jiffies = last_jiffies;
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1564,7 +1564,7 @@ static inline void __netif_reschedule(st
 	q->next_sched = NULL;
 	*sd->output_queue_tailp = q;
 	sd->output_queue_tailp = &q->next_sched;
-	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	raise_softirq_preempt_off(NET_TX_SOFTIRQ);
 	local_irq_restore(flags);
 }
 
@@ -1585,7 +1585,7 @@ void dev_kfree_skb_irq(struct sk_buff *s
 		sd = &__get_cpu_var(softnet_data);
 		skb->next = sd->completion_queue;
 		sd->completion_queue = skb;
-		raise_softirq_irqoff(NET_TX_SOFTIRQ);
+		raise_softirq_preempt_off(NET_TX_SOFTIRQ);
 		local_irq_restore(flags);
 	}
 }
@@ -2218,7 +2218,7 @@ static inline void ____napi_schedule(str
 				     struct napi_struct *napi)
 {
 	list_add_tail(&napi->poll_list, &sd->poll_list);
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	__raise_softirq_preempt_off(NET_RX_SOFTIRQ);
 }
 
 #ifdef CONFIG_RPS
@@ -2397,7 +2397,7 @@ static int rps_ipi_queued(struct softnet
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
-		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+		__raise_softirq_preempt_off(NET_RX_SOFTIRQ);
 		return 1;
 	}
 #endif /* CONFIG_RPS */
@@ -2506,7 +2506,7 @@ int netif_rx_ni(struct sk_buff *skb)
 
 	preempt_disable();
 	err = netif_rx(skb);
-	if (local_softirq_pending())
+	if (*local_softirq_pending())
 		do_softirq();
 	preempt_enable();
 
@@ -3532,7 +3532,7 @@ out:
 
 softnet_break:
 	sd->time_squeeze++;
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	__raise_softirq_preempt_off(NET_RX_SOFTIRQ);
 	goto out;
 }
 
@@ -5670,7 +5670,7 @@ static int dev_cpu_callback(struct notif
 		oldsd->output_queue_tailp = &oldsd->output_queue;
 	}
 
-	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	raise_softirq_preempt_off(NET_TX_SOFTIRQ);
 	local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */


* [RFC 2/5] NMI return notifier
  2010-06-24  3:04 [RFC 1/5] Make soft_irq NMI safe Huang Ying
@ 2010-06-24  3:04 ` Huang Ying
  2010-06-24  3:04 ` [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier Huang Ying
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 66+ messages in thread
From: Huang Ying @ 2010-06-24  3:04 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin
  Cc: linux-kernel, Andi Kleen, Peter Zijlstra, Huang Ying

Many kernel services cannot be used in an NMI handler, so the NMI
handler needs a mechanism to perform such operations in other contexts
such as IRQ and process context.

This patch implements such a mechanism based on soft_irq, in a similar
way to the user return notifier.  A new soft_irq named
NMI_RETURN_NOTIFIER_SOFTIRQ is defined, and a lock-less singly linked
list is used to hold the functions that will be called from the
soft_irq after the NMI handler returns.
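
For illustration, a user of this interface would look roughly like the
following (a hedged sketch; my_on_nmi_return and my_nrn are made-up
names, not part of this patch):

	/* runs later in soft_irq (or IRQ) context, with IRQs enabled */
	static void my_on_nmi_return(struct nmi_return_notifier *nrn)
	{
		/* safe to take locks, wake up tasks, schedule work, ... */
	}

	static DEFINE_PER_CPU(struct nmi_return_notifier, my_nrn) = {
		.on_nmi_return = my_on_nmi_return,
	};

	/* from the NMI handler */
	nmi_return_notifier_schedule(&__get_cpu_var(my_nrn));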

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 include/linux/interrupt.h |    1 
 include/linux/nmi.h       |   11 +++++
 kernel/Makefile           |    2 -
 kernel/nmi.c              |   86 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/softirq.c          |    2 +
 5 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 kernel/nmi.c

--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -374,6 +374,7 @@ enum
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
 	HRTIMER_SOFTIRQ,
+	NMI_RETURN_NOTIFIER_SOFTIRQ,
 	RCU_SOFTIRQ,	/* Preferable RCU should always be the last softirq */
 
 	NR_SOFTIRQS
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -47,4 +47,15 @@ static inline bool trigger_all_cpu_backt
 }
 #endif
 
+struct nmi_return_notifier {
+	void (*on_nmi_return)(struct nmi_return_notifier *nrn);
+	void *data;
+	struct nmi_return_notifier *next;
+};
+
+void nmi_return_notifier_schedule(struct nmi_return_notifier *nrn);
+void fire_nmi_return_notifiers(void);
+struct softirq_action;
+void nmi_return_notifier_action(struct softirq_action *a);
+
 #endif
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o range.o
+	    async.o range.o nmi.o
 obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 
--- /dev/null
+++ b/kernel/nmi.c
@@ -0,0 +1,86 @@
+/*
+ * nmi.c
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/nmi.h>
+#include <linux/percpu.h>
+
+#define NMI_RETURN_NOTIFIER_TAIL	((struct nmi_return_notifier *)-1UL)
+
+static DEFINE_PER_CPU(struct nmi_return_notifier *, nmi_return_notifier_head) =
+	NMI_RETURN_NOTIFIER_TAIL;
+
+/*
+ * Some architectures can use this function to trigger the soft_irq, for
+ * example via self interrupt.
+ */
+void __weak arch_nmi_return_notifier_schedule(struct nmi_return_notifier *nrn)
+{
+}
+
+/*
+ * Schedule a notification after current CPU returns from NMI handler.
+ * Must be called in atomic context.  The notifier will be called in
+ * IRQ or soft_irq context.
+ *
+ * This function is based on perf_pending_queue().
+ */
+void nmi_return_notifier_schedule(struct nmi_return_notifier *nrn)
+{
+	struct nmi_return_notifier **head;
+
+	if (cmpxchg(&nrn->next, NULL, NMI_RETURN_NOTIFIER_TAIL) != NULL)
+		return;
+
+	head = &get_cpu_var(nmi_return_notifier_head);
+
+	do {
+		nrn->next = *head;
+	} while (cmpxchg(head, nrn->next, nrn) != nrn->next);
+
+	raise_softirq_preempt_off(NMI_RETURN_NOTIFIER_SOFTIRQ);
+
+	arch_nmi_return_notifier_schedule(nrn);
+
+	put_cpu_var(nmi_return_notifier_head);
+}
+EXPORT_SYMBOL_GPL(nmi_return_notifier_schedule);
+
+void fire_nmi_return_notifiers(void)
+{
+	struct nmi_return_notifier *nrn, *list;
+
+	list = xchg(&__get_cpu_var(nmi_return_notifier_head),
+		    NMI_RETURN_NOTIFIER_TAIL);
+	while (list != NMI_RETURN_NOTIFIER_TAIL) {
+		nrn = list;
+		list = list->next;
+		nrn->next = NULL;
+		nrn->on_nmi_return(nrn);
+	}
+}
+EXPORT_SYMBOL_GPL(fire_nmi_return_notifiers);
+
+void nmi_return_notifier_action(struct softirq_action *a)
+{
+	fire_nmi_return_notifiers();
+}
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,7 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
+#include <linux/nmi.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
@@ -687,6 +688,7 @@ void __init softirq_init(void)
 
 	open_softirq(TASKLET_SOFTIRQ, tasklet_action);
 	open_softirq(HI_SOFTIRQ, tasklet_hi_action);
+	open_softirq(NMI_RETURN_NOTIFIER_SOFTIRQ, nmi_return_notifier_action);
 }
 
 static int run_ksoftirqd(void * __bind_cpu)


* [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier
  2010-06-24  3:04 [RFC 1/5] Make soft_irq NMI safe Huang Ying
  2010-06-24  3:04 ` [RFC 2/5] NMI return notifier Huang Ying
@ 2010-06-24  3:04 ` Huang Ying
  2010-06-24  6:03   ` Peter Zijlstra
  2010-06-24  3:04 ` [RFC 4/5] x86, Use NMI return notifier in MCE Huang Ying
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-24  3:04 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin
  Cc: linux-kernel, Andi Kleen, Peter Zijlstra, Huang Ying

soft_irq is used to run the NMI return notifiers, but a long time may
pass between the soft_irq being raised in the NMI handler and it
actually running.  To solve this, a self-interrupt IPI is used to
trigger the soft_irq earlier.
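
The interrupt handler added below does no real work itself; the pending
NMI_RETURN_NOTIFIER_SOFTIRQ gets run on the way out of the interrupt,
because irq_exit() invokes pending softirqs.  An annotated sketch of
that path (the handler body is the one added by this patch):

	ack_APIC_irq();
	irq_enter();
	inc_irq_stat(apic_nmi_return_notifier_irqs);
	irq_exit();	/* !in_interrupt() && softirq pending -> invoke_softirq() */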

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/x86/include/asm/hardirq.h     |    1 +
 arch/x86/include/asm/hw_irq.h      |    1 +
 arch/x86/include/asm/irq_vectors.h |    5 +++++
 arch/x86/kernel/entry_64.S         |    5 +++++
 arch/x86/kernel/irq.c              |    7 +++++++
 arch/x86/kernel/irqinit.c          |    3 +++
 arch/x86/kernel/traps.c            |   29 +++++++++++++++++++++++++++++
 7 files changed, 51 insertions(+)

--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -11,6 +11,7 @@ typedef struct {
 #ifdef CONFIG_X86_LOCAL_APIC
 	unsigned int apic_timer_irqs;	/* arch dependent */
 	unsigned int irq_spurious_count;
+	unsigned int apic_nmi_return_notifier_irqs;
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -35,6 +35,7 @@ extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
 extern void mce_self_interrupt(void);
+extern void nmi_return_notifier_interrupt(void);
 
 extern void invalidate_interrupt(void);
 extern void invalidate_interrupt0(void);
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,11 @@
  */
 #define MCE_SELF_VECTOR			0xeb
 
+/*
+ * Self IPI vector for NMI return notifier
+ */
+#define NMI_RETURN_NOTIFIER_VECTOR	0xe9
+
 #define NR_VECTORS			 256
 
 #define FPU_IRQ				  13
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1009,6 +1009,11 @@ apicinterrupt MCE_SELF_VECTOR \
 	mce_self_interrupt smp_mce_self_interrupt
 #endif
 
+#ifdef CONFIG_X86_LOCAL_APIC
+apicinterrupt NMI_RETURN_NOTIFIER_VECTOR \
+	nmi_return_notifier_interrupt smp_nmi_return_notifier_interrupt
+#endif
+
 #ifdef CONFIG_SMP
 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
 	call_function_single_interrupt smp_call_function_single_interrupt
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,6 +63,12 @@ static int show_other_interrupts(struct
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
 	seq_printf(p, "  Spurious interrupts\n");
+
+	seq_printf(p, "%*s: ", prec, "NRN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_nmi_return_notifier_irqs);
+	seq_printf(p, "  NMI return notifier interrupts\n");
+
 	seq_printf(p, "%*s: ", prec, "PMI");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
@@ -184,6 +190,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
+	sum += irq_stats(cpu)->apic_nmi_return_notifier_irqs;
 	sum += irq_stats(cpu)->apic_perf_irqs;
 	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -212,6 +212,9 @@ static void __init apic_intr_init(void)
 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
 	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
 #endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	alloc_intr_gate(NMI_RETURN_NOTIFIER_VECTOR, nmi_return_notifier_interrupt);
+#endif
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -888,3 +888,32 @@ void __init trap_init(void)
 
 	x86_init.irqs.trap_init();
 }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+asmlinkage void smp_nmi_return_notifier_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	irq_enter();
+	inc_irq_stat(apic_nmi_return_notifier_irqs);
+	irq_exit();
+}
+
+void arch_nmi_return_notifier_schedule(struct nmi_return_notifier *nrn)
+{
+	/* Without an APIC, hope the next soft_irq is not too late */
+	if (!cpu_has_apic)
+		return;
+
+	/*
+	 * Use a self interrupt to trigger the soft_irq for NMI return
+	 * notifiers
+	 */
+	apic->send_IPI_self(NMI_RETURN_NOTIFIER_VECTOR);
+
+	/* Wait for idle afterward so that we don't leave the APIC in
+	 * a non idle state because the normal APIC writes cannot
+	 * exclude us.
+	 */
+	apic_wait_icr_idle();
+}
+#endif


* [RFC 4/5] x86, Use NMI return notifier in MCE
  2010-06-24  3:04 [RFC 1/5] Make soft_irq NMI safe Huang Ying
  2010-06-24  3:04 ` [RFC 2/5] NMI return notifier Huang Ying
  2010-06-24  3:04 ` [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier Huang Ying
@ 2010-06-24  3:04 ` Huang Ying
  2010-06-24 10:00   ` Andi Kleen
  2010-06-24  3:04 ` [RFC 5/5] Use NMI return notifier in perf pending Huang Ying
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-24  3:04 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin
  Cc: linux-kernel, Andi Kleen, Peter Zijlstra, Huang Ying

Use the general NMI return notifier mechanism to replace the self
interrupt used in the MCE handler.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/x86/include/asm/entry_arch.h  |    4 --
 arch/x86/include/asm/irq_vectors.h |    5 ---
 arch/x86/kernel/cpu/mcheck/mce.c   |   50 +++++--------------------------------
 arch/x86/kernel/entry_64.S         |    5 ---
 arch/x86/kernel/irqinit.c          |    3 --
 5 files changed, 7 insertions(+), 60 deletions(-)

--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -61,8 +61,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMA
 BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
 #endif
 
-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
 #endif
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -121,11 +121,6 @@
 #define UV_BAU_MESSAGE			0xea
 
 /*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR			0xeb
-
-/*
  * Self IPI vector for NMI return notifier
  */
 #define NMI_RETURN_NOTIFIER_VECTOR	0xe9
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -480,60 +480,24 @@ static inline void mce_get_rip(struct mc
 		m->ip = mce_rdmsrl(rip_msr);
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void __mce_report_event(struct nmi_return_notifier *nrn)
 {
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
 	mce_notify_irq();
 	mce_schedule_work();
-	irq_exit();
 }
-#endif
+
+static DEFINE_PER_CPU(struct nmi_return_notifier, mce_nrn) = {
+	.on_nmi_return	= __mce_report_event,
+};
 
 static void mce_report_event(struct pt_regs *regs)
 {
 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
-		mce_notify_irq();
-		/*
-		 * Triggering the work queue here is just an insurance
-		 * policy in case the syscall exit notify handler
-		 * doesn't run soon enough or ends up running on the
-		 * wrong CPU (can happen when audit sleeps)
-		 */
-		mce_schedule_work();
+		__mce_report_event(NULL);
 		return;
 	}
 
-#ifdef CONFIG_X86_LOCAL_APIC
-	/*
-	 * Without APIC do not notify. The event will be picked
-	 * up eventually.
-	 */
-	if (!cpu_has_apic)
-		return;
-
-	/*
-	 * When interrupts are disabled we cannot use
-	 * kernel services safely. Trigger an self interrupt
-	 * through the APIC to instead do the notification
-	 * after interrupts are reenabled again.
-	 */
-	apic->send_IPI_self(MCE_SELF_VECTOR);
-
-	/*
-	 * Wait for idle afterwards again so that we don't leave the
-	 * APIC in a non idle state because the normal APIC writes
-	 * cannot exclude us.
-	 */
-	apic_wait_icr_idle();
-#endif
+	nmi_return_notifier_schedule(&__get_cpu_var(mce_nrn));
 }
 
 DEFINE_PER_CPU(unsigned, mce_poll_count);
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1004,11 +1004,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
 
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
-	mce_self_interrupt smp_mce_self_interrupt
-#endif
-
 #ifdef CONFIG_X86_LOCAL_APIC
 apicinterrupt NMI_RETURN_NOTIFIER_VECTOR \
 	nmi_return_notifier_interrupt smp_nmi_return_notifier_interrupt
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -209,9 +209,6 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_X86_MCE_THRESHOLD
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
-	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
 #ifdef CONFIG_X86_LOCAL_APIC
 	alloc_intr_gate(NMI_RETURN_NOTIFIER_VECTOR, nmi_return_notifier_interrupt);
 #endif


* [RFC 5/5] Use NMI return notifier in perf pending
  2010-06-24  3:04 [RFC 1/5] Make soft_irq NMI safe Huang Ying
                   ` (2 preceding siblings ...)
  2010-06-24  3:04 ` [RFC 4/5] x86, Use NMI return notifier in MCE Huang Ying
@ 2010-06-24  3:04 ` Huang Ying
  2010-06-24  6:00   ` Peter Zijlstra
  2010-06-24  6:09 ` [RFC 1/5] Make soft_irq NMI safe Peter Zijlstra
  2010-06-24  6:35 ` [RFC][PATCH] irq_work Peter Zijlstra
  5 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-24  3:04 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin
  Cc: linux-kernel, Andi Kleen, Peter Zijlstra, Huang Ying

Use the general NMI return notifier mechanism to replace the self
interrupt used in perf pending.

Known issue: it is not yet clear how to deal with the SPARC architecture.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/alpha/include/asm/perf_event.h  |    1 
 arch/arm/include/asm/perf_event.h    |   12 -----
 arch/parisc/include/asm/perf_event.h |    1 
 arch/powerpc/kernel/time.c           |    5 --
 arch/s390/include/asm/perf_event.h   |    3 -
 arch/sh/include/asm/perf_event.h     |    5 --
 arch/sparc/include/asm/perf_event.h  |    2 
 arch/x86/include/asm/entry_arch.h    |    4 -
 arch/x86/include/asm/hardirq.h       |    1 
 arch/x86/include/asm/irq_vectors.h   |    5 --
 arch/x86/kernel/cpu/perf_event.c     |   19 --------
 arch/x86/kernel/entry_64.S           |    5 --
 arch/x86/kernel/irq.c                |    5 --
 arch/x86/kernel/irqinit.c            |    6 --
 include/linux/perf_event.h           |   11 ----
 kernel/perf_event.c                  |   81 +++++------------------------------
 kernel/timer.c                       |    2 
 17 files changed, 15 insertions(+), 153 deletions(-)

--- a/arch/alpha/include/asm/perf_event.h
+++ b/arch/alpha/include/asm/perf_event.h
@@ -2,7 +2,6 @@
 #define __ASM_ALPHA_PERF_EVENT_H
 
 /* Alpha only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
 
 #define PERF_EVENT_INDEX_OFFSET 0
 
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts enabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
 /* ARM performance counters start from 1 (in the cp15 accesses) so use the
  * same indexes here for consistency. */
 #define PERF_EVENT_INDEX_OFFSET 1
--- a/arch/parisc/include/asm/perf_event.h
+++ b/arch/parisc/include/asm/perf_event.h
@@ -2,6 +2,5 @@
 #define __ASM_PARISC_PERF_EVENT_H
 
 /* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
 
 #endif /* __ASM_PARISC_PERF_EVENT_H */
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -635,11 +635,6 @@ void timer_interrupt(struct pt_regs * re
 
 	calculate_steal_time();
 
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
-	}
-
 #ifdef CONFIG_PPC_ISERIES
 	if (firmware_has_feature(FW_FEATURE_ISERIES))
 		get_lppaca()->int_dword.fields.decr_int = 0;
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -4,7 +4,4 @@
  * Copyright 2009 Martin Schwidefsky, IBM Corporation.
  */
 
-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
-
 #define PERF_EVENT_INDEX_OFFSET 0
--- a/arch/sh/include/asm/perf_event.h
+++ b/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,6 @@ extern int register_sh_pmu(struct sh_pmu
 extern int reserve_pmc_hardware(void);
 extern void release_pmc_hardware(void);
 
-static inline void set_perf_event_pending(void)
-{
-	/* Nothing to see here, move along. */
-}
-
 #define PERF_EVENT_INDEX_OFFSET	0
 
 #endif /* __ASM_SH_PERF_EVENT_H */
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -1,8 +1,6 @@
 #ifndef __ASM_SPARC_PERF_EVENT_H
 #define __ASM_SPARC_PERF_EVENT_H
 
-extern void set_perf_event_pending(void);
-
 #define	PERF_EVENT_INDEX_OFFSET	0
 
 #ifdef CONFIG_PERF_EVENTS
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,10 +49,6 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOC
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
-#endif
-
 #ifdef CONFIG_X86_THERMAL_VECTOR
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -15,7 +15,6 @@ typedef struct {
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
-	unsigned int apic_pending_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -113,11 +113,6 @@
  */
 #define X86_PLATFORM_IPI_VECTOR		0xed
 
-/*
- * Performance monitoring pending work vector:
- */
-#define LOCAL_PENDING_VECTOR		0xec
-
 #define UV_BAU_MESSAGE			0xea
 
 /*
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1160,25 +1160,6 @@ static int x86_pmu_handle_irq(struct pt_
 	return handled;
 }
 
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	ack_APIC_irq();
-	inc_irq_stat(apic_pending_irqs);
-	perf_event_do_pending();
-	irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (!x86_pmu.apic || !x86_pmu_initialized())
-		return;
-
-	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
 void perf_events_lapic_init(void)
 {
 	if (!x86_pmu.apic || !x86_pmu_initialized())
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1023,11 +1023,6 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
-	perf_pending_interrupt smp_perf_pending_interrupt
-#endif
-
 /*
  * Exception entry points.
  */
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -73,10 +73,6 @@ static int show_other_interrupts(struct
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance monitoring interrupts\n");
-	seq_printf(p, "%*s: ", prec, "PND");
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
-	seq_printf(p, "  Performance pending work\n");
 #endif
 	if (x86_platform_ipi_callback) {
 		seq_printf(p, "%*s: ", prec, "PLT");
@@ -192,7 +188,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->irq_spurious_count;
 	sum += irq_stats(cpu)->apic_nmi_return_notifier_irqs;
 	sum += irq_stats(cpu)->apic_perf_irqs;
-	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
 	if (x86_platform_ipi_callback)
 		sum += irq_stats(cpu)->x86_platform_ipis;
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -223,12 +223,6 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
-	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
-# endif
-
 #endif
 }
 
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -484,6 +484,7 @@ struct perf_guest_info_callbacks {
 #include <linux/workqueue.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/nmi.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
 
@@ -608,11 +609,6 @@ struct perf_mmap_data {
 	void				*data_pages[0];
 };
 
-struct perf_pending_entry {
-	struct perf_pending_entry *next;
-	void (*func)(struct perf_pending_entry *);
-};
-
 struct perf_sample_data;
 
 typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -719,7 +715,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
-	struct perf_pending_entry	pending;
+	struct nmi_return_notifier	pending;
 
 	atomic_t			event_limit;
 
@@ -831,8 +827,6 @@ extern void perf_event_task_tick(struct
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
 extern void perf_event_print_debug(void);
 extern void __perf_disable(void);
 extern bool __perf_enable(void);
@@ -1031,7 +1025,6 @@ perf_event_task_tick(struct task_struct
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
-static inline void perf_event_do_pending(void)				{ }
 static inline void perf_event_print_debug(void)				{ }
 static inline void perf_disable(void)					{ }
 static inline void perf_enable(void)					{ }
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2829,15 +2829,17 @@ void perf_event_wakeup(struct perf_event
  *
  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
  *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
+ * The NMI bit means we cannot possibly take locks. Therefore, use
+ * nmi_return_notifier.
  */
 
-static void perf_pending_event(struct perf_pending_entry *entry)
+static void perf_pending_event(struct nmi_return_notifier *nrn)
 {
-	struct perf_event *event = container_of(entry,
+	struct perf_event *event = container_of(nrn,
 			struct perf_event, pending);
 
+	nrn->data = NULL;
+
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		__perf_event_disable(event);
@@ -2849,59 +2851,12 @@ static void perf_pending_event(struct pe
 	}
 }
 
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-	PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
-			       void (*func)(struct perf_pending_entry *))
+static void perf_pending_queue(struct nmi_return_notifier *nrn,
+			       void (*func)(struct nmi_return_notifier *))
 {
-	struct perf_pending_entry **head;
-
-	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-		return;
-
-	entry->func = func;
-
-	head = &get_cpu_var(perf_pending_head);
-
-	do {
-		entry->next = *head;
-	} while (cmpxchg(head, entry->next, entry) != entry->next);
-
-	set_perf_event_pending();
-
-	put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
-	struct perf_pending_entry *list;
-	int nr = 0;
-
-	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-	while (list != PENDING_TAIL) {
-		void (*func)(struct perf_pending_entry *);
-		struct perf_pending_entry *entry = list;
-
-		list = list->next;
-
-		func = entry->func;
-		entry->next = NULL;
-		/*
-		 * Ensure we observe the unqueue before we issue the wakeup,
-		 * so that we won't be waiting forever.
-		 * -- see perf_not_pending().
-		 */
-		smp_wmb();
-
-		func(entry);
-		nr++;
-	}
-
-	return nr;
+	nrn->on_nmi_return = func;
+	nrn->data = nrn;
+	nmi_return_notifier_schedule(nrn);
 }
 
 static inline int perf_not_pending(struct perf_event *event)
@@ -2911,15 +2866,10 @@ static inline int perf_not_pending(struc
 	 * need to wait.
 	 */
 	get_cpu();
-	__perf_pending_run();
+	fire_nmi_return_notifiers();
 	put_cpu();
 
-	/*
-	 * Ensure we see the proper queue state before going to sleep
-	 * so that we do not miss the wakeup. -- see perf_pending_handle()
-	 */
-	smp_rmb();
-	return event->pending.next == NULL;
+	return event->pending.data == NULL;
 }
 
 static void perf_pending_sync(struct perf_event *event)
@@ -2927,11 +2877,6 @@ static void perf_pending_sync(struct per
 	wait_event(event->waitq, perf_not_pending(event));
 }
 
-void perf_event_do_pending(void)
-{
-	__perf_pending_run();
-}
-
 /*
  * Callchain support -- arch specific
  */
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,6 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
-#include <linux/perf_event.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -1264,7 +1263,6 @@ void update_process_times(int user_tick)
 	run_local_timers();
 	rcu_check_callbacks(cpu, user_tick);
 	printk_tick();
-	perf_event_do_pending();
 	scheduler_tick();
 	run_posix_cpu_timers(p);
 }


* Re: [RFC 5/5] Use NMI return notifier in perf pending
  2010-06-24  3:04 ` [RFC 5/5] Use NMI return notifier in perf pending Huang Ying
@ 2010-06-24  6:00   ` Peter Zijlstra
  0 siblings, 0 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  6:00 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> Use general NMI return notifier mechanism to replace the self
> interrupt used in perf pending.
> 
> Known issue: don't know how to deal with SPARC architecture.

NAK, I want and need hardirq context for the callback.


* Re: [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier
  2010-06-24  3:04 ` [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier Huang Ying
@ 2010-06-24  6:03   ` Peter Zijlstra
  0 siblings, 0 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  6:03 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> soft_irq is used to run NMI return notifiers. But it may be quite long
> from soft_irq is raised in NMI handler to soft_irq is run actually. To
> solve the issue, a self interrupt IPI is used to trigger the soft_irq
> earlier.

This all is just gross hackery...

wth is wrong with doing what I proposed earlier and using a work_struct
like callback mechanism?




* Re: [RFC 1/5] Make soft_irq NMI safe
  2010-06-24  3:04 [RFC 1/5] Make soft_irq NMI safe Huang Ying
                   ` (3 preceding siblings ...)
  2010-06-24  3:04 ` [RFC 5/5] Use NMI return notifier in perf pending Huang Ying
@ 2010-06-24  6:09 ` Peter Zijlstra
  2010-06-24  6:45   ` Huang Ying
  2010-06-24  6:35 ` [RFC][PATCH] irq_work Peter Zijlstra
  5 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  6:09 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> +#define __raise_softirq_preempt_off(nr)                                        \
> +       do { set_bit(nr, (unsigned long *)local_softirq_pending()); } while (0) 

So that is the reason for that insane local_softirq_pending()
definition?

Quite revolting.


* [RFC][PATCH] irq_work
  2010-06-24  3:04 [RFC 1/5] Make soft_irq NMI safe Huang Ying
                   ` (4 preceding siblings ...)
  2010-06-24  6:09 ` [RFC 1/5] Make soft_irq NMI safe Peter Zijlstra
@ 2010-06-24  6:35 ` Peter Zijlstra
  2010-06-24  6:43   ` Huang Ying
                     ` (2 more replies)
  5 siblings, 3 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  6:35 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen


Something like this, but filled out with some arch code that does the
self-ipi and calls irq_work_run() should do.

No need to molest the softirq code, no need for limited vectors of any
kind.
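
arch_self_ipi() is referenced below but left to the architecture; for
illustration only, the missing x86 glue might look roughly like this
(IRQ_WORK_VECTOR and smp_irq_work_interrupt are hypothetical names, and
the usual alloc_intr_gate()/entry_64.S wiring as in the patches above
is omitted):

	/* hypothetical arch glue -- names are illustrative, not in this patch */
	void arch_self_ipi(void)
	{
		apic->send_IPI_self(IRQ_WORK_VECTOR);	/* assumed vector */
	}

	asmlinkage void smp_irq_work_interrupt(struct pt_regs *regs)
	{
		ack_APIC_irq();
		irq_enter();
		irq_work_run();		/* run queued callbacks in hardirq context */
		irq_exit();
	}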

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/irq_callback.h |   13 ++++++++
 kernel/irq_callback.c        |   66 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

Index: linux-2.6/include/linux/irq_callback.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/irq_callback.h
@@ -0,0 +1,13 @@
+#ifndef _LINUX_IRQ_CALLBACK_H
+#define _LINUX_IRQ_CALLBACK_H
+
+struct irq_work {
+	struct irq_work *next;
+	void (*func)(struct irq_work *);
+};
+
+int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *));
+void irq_work_run(void);
+void irq_work_sync(struct irq_work *entry);
+
+#endif /* _LINUX_IRQ_CALLBACK_H */
Index: linux-2.6/kernel/irq_callback.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/irq_callback.c
@@ -0,0 +1,66 @@
+
+#include <linux/irq_callback.h>
+
+#define CALLBACK_TAIL ((struct irq_work *)-1UL)
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list) = {
+	CALLBACK_TAIL,
+};
+
+int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *))
+{
+	struct irq_work **head;
+
+	if (cmpxchg(&entry->next, NULL, CALLBACK_TAIL) != NULL)
+		return 0;
+
+	entry->func = func;
+
+	head = &get_cpu_var(irq_work_list);
+
+	do {
+		entry->next = *head;
+	} while (cmpxchg(head, entry->next, entry) != entry->next);
+
+	if (entry->next == CALLBACK_TAIL)
+		arch_self_ipi();
+
+	put_cpu_var(irq_work_list);
+	return 1;
+}
+
+void irq_work_run(void)
+{
+	struct irq_work *list;
+
+	list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
+	while (list != CALLBACK_TAIL) {
+		struct irq_work *entry = list;
+
+		list = list->next;
+		entry->func(entry);
+
+		entry->next = NULL;
+		/*
+		 * matches the mb in cmpxchg() in irq_work_queue()
+		 */
+		smp_wmb();
+	}
+}
+
+static int irq_work_pending(struct irq_work *entry)
+{
+	/*
+	 * matches the wmb in irq_work_run
+	 */
+	smp_rmb();
+	return entry->next != NULL;
+}
+
+void irq_work_sync(struct irq_work *entry)
+{
+	WARN_ON_ONCE(irqs_disabled());
+
+	while (irq_work_pending(entry))
+		cpu_relax();
+}



* Re: [RFC][PATCH] irq_work
  2010-06-24  6:35 ` [RFC][PATCH] irq_work Peter Zijlstra
@ 2010-06-24  6:43   ` Huang Ying
  2010-06-24  6:47     ` Peter Zijlstra
  2010-06-25  2:12   ` Huang Ying
  2010-06-25 18:30   ` [RFC][PATCH] irq_work -v2 Peter Zijlstra
  2 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-24  6:43 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

Hi, Peter,

On Thu, 2010-06-24 at 14:35 +0800, Peter Zijlstra wrote:
> Something like this, but filled out with some arch code that does the
> self-ipi and calls irq_work_run() should do.
> 
> No need to molest the softirq code, no need for limited vectors of any
> kind.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/irq_callback.h |   13 ++++++++
>  kernel/irq_callback.c        |   66 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 79 insertions(+)
> 
> Index: linux-2.6/include/linux/irq_callback.h
> ===================================================================
> --- /dev/null
> +++ linux-2.6/include/linux/irq_callback.h
> @@ -0,0 +1,13 @@
> +#ifndef _LINUX_IRQ_CALLBACK_H
> +#define _LINUX_IRQ_CALLBACK_H
> +
> +struct irq_work {
> +	struct irq_work *next;
> +	void (*func)(struct irq_work *);
> +};
> +
> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *));
> +void irq_work_run(void);
> +void irq_work_sync(struct irq_work *entry);
> +
> +#endif /* _LINUX_IRQ_CALLBACK_H */
> Index: linux-2.6/kernel/irq_callback.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/kernel/irq_callback.c
> @@ -0,0 +1,66 @@
> +
> +#include <linux/irq_callback.h>
> +
> +#define CALLBACK_TAIL ((struct irq_work *)-1UL)
> +
> +static DEFINE_PER_CPU(struct irq_work *, irq_work_list) = {
> +	CALLBACK_TAIL,
> +};
> +
> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *))
> +{
> +	struct irq_work **head;
> +
> +	if (cmpxchg(&entry->next, NULL, CALLBACK_TAIL) != NULL)
> +		return 0;
> +
> +	entry->func = func;
> +
> +	head = &get_cpu_var(irq_work_list);
> +
> +	do {
> +		entry->next = *head;
> +	} while (cmpxchg(head, entry->next, entry) != entry->next);
> +
> +	if (entry->next == CALLBACK_TAIL)
> +		arch_self_ipi();
> +
> +	put_cpu_var(irq_work_list);
> +	return 1;
> +}
> +
> +void irq_work_run(void)
> +{
> +	struct irq_work *list;
> +
> +	list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> +	while (list != CALLBACK_TAIL) {
> +		struct irq_work *entry = list;
> +
> +		list = list->next;
> +		entry->func(entry);
> +
> +		entry->next = NULL;
> +		/*
> +		 * matches the mb in cmpxchg() in irq_work_queue()
> +		 */
> +		smp_wmb();
> +	}
> +}
> +
> +static int irq_work_pending(struct irq_work *entry)
> +{
> +	/*
> +	 * matches the wmb in irq_work_run
> +	 */
> +	smp_rmb();
> +	return entry->next != NULL;
> +}
> +
> +void irq_work_sync(struct irq_work *entry)
> +{
> +	WARN_ON_ONCE(irqs_disabled());
> +
> +	while (irq_work_pending(entry))
> +		cpu_relax();
> +}

In fact I use exactly the same method in my patches, just triggering it
with soft_irq instead of IRQ. Please take a look at
nmi_return_notifier_schedule in

[RFC 2/5] NMI return notifier

Best Regards,
Huang Ying




* Re: [RFC 1/5] Make soft_irq NMI safe
  2010-06-24  6:09 ` [RFC 1/5] Make soft_irq NMI safe Peter Zijlstra
@ 2010-06-24  6:45   ` Huang Ying
  0 siblings, 0 replies; 66+ messages in thread
From: Huang Ying @ 2010-06-24  6:45 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 14:09 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 11:04 +0800, Huang Ying wrote:
> > +#define __raise_softirq_preempt_off(nr)                                        \
> > +       do { set_bit(nr, (unsigned long *)local_softirq_pending()); } while (0) 
> 
> So that is the reason for that insane local_softirq_pending()
> definition?

I need to change local_softirq_pending() for this and the xchg() in
__do_softirq().

Best Regards,
Huang Ying




* Re: [RFC][PATCH] irq_work
  2010-06-24  6:43   ` Huang Ying
@ 2010-06-24  6:47     ` Peter Zijlstra
  2010-06-24  6:50       ` Huang Ying
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  6:47 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> Hi, Peter,

> I fact I uses exactly the similar method in my patches, just trigger it
> with soft_irq instead of IRQ. Please take a look at
> nmi_return_notifier_schedule in

But then why still use softirq? Once you have this it's completely
useless.




* Re: [RFC][PATCH] irq_work
  2010-06-24  6:47     ` Peter Zijlstra
@ 2010-06-24  6:50       ` Huang Ying
  2010-06-24  6:58         ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-24  6:50 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 14:47 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> > Hi, Peter,
> 
> > I fact I uses exactly the similar method in my patches, just trigger it
> > with soft_irq instead of IRQ. Please take a look at
> > nmi_return_notifier_schedule in
> 
> But then why still use softirq? Once you have this its completely
> useless.

Some systems have no self interrupt, for example systems without an
APIC. We need to provide a fallback for them; soft_irq can help here.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24  6:50       ` Huang Ying
@ 2010-06-24  6:58         ` Peter Zijlstra
  2010-06-24  7:04           ` Huang Ying
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  6:58 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 14:50 +0800, Huang Ying wrote:
> On Thu, 2010-06-24 at 14:47 +0800, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> > > Hi, Peter,
> > 
> > > I fact I uses exactly the similar method in my patches, just trigger it
> > > with soft_irq instead of IRQ. Please take a look at
> > > nmi_return_notifier_schedule in
> > 
> > But then why still use softirq? Once you have this its completely
> > useless.
> 
> Some systems have no self interrupt, for example the system without
> APIC. We need to provide a fallback for them. soft_irq can help here.

So there's systems that don't have self-ipi but do have NMI context?

Can't we run the callbacks from the tick or something for such legacy
muck? I really don't like the whole softirq mess.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24  6:58         ` Peter Zijlstra
@ 2010-06-24  7:04           ` Huang Ying
  2010-06-24  7:19             ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-24  7:04 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 14:58 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 14:50 +0800, Huang Ying wrote:
> > On Thu, 2010-06-24 at 14:47 +0800, Peter Zijlstra wrote:
> > > On Thu, 2010-06-24 at 14:43 +0800, Huang Ying wrote:
> > > > Hi, Peter,
> > > 
> > > > I fact I uses exactly the similar method in my patches, just trigger it
> > > > with soft_irq instead of IRQ. Please take a look at
> > > > nmi_return_notifier_schedule in
> > > 
> > > But then why still use softirq? Once you have this its completely
> > > useless.
> > 
> > Some systems have no self interrupt, for example the system without
> > APIC. We need to provide a fallback for them. soft_irq can help here.
> 
> So there's systems that don't have self-ipi but do have NMI context?

Yes. NMI has been there since the 8259 age.

> Can't we run the callbacks from the tick or something for such legacy
> muck? I really don't like the whole softirq mess.

That is possible. But on a NO_HZ system, we have no tick to rely on.
soft_irq is better here, because it will be triggered by any interrupt.

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24  7:04           ` Huang Ying
@ 2010-06-24  7:19             ` Peter Zijlstra
  2010-06-24  7:27               ` Huang Ying
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  7:19 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 15:04 +0800, Huang Ying wrote:

> Yes. NMI is there from 8259 age.

But do we really care about such systems?

> That is possible. But in NO_HZ system, we have no tick to rely on.

Of course you have, you can delay the NO_HZ state when there's pending
callbacks, that's all of 1 line.

> soft_irq is better here, because it will be triggered for any interrupt.

Well, you can do the callbacks from irq_exit() as well, that's no
problem.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24  7:19             ` Peter Zijlstra
@ 2010-06-24  7:27               ` Huang Ying
  2010-06-24  7:32                 ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-24  7:27 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 15:19 +0800, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 15:04 +0800, Huang Ying wrote:
> 
> > Yes. NMI is there from 8259 age.
> 
> But do we really care about such systems?
> 
> > That is possible. But in NO_HZ system, we have no tick to rely on.
> 
> Of course you have, you can delay the NO_HZ state when there's pending
> callbacks, that's all of 1 line.
> 
> > soft_irq is better here, because it will be triggered for any interrupt.
> 
> Well, you can do the callbacks from irq_exit() as well, that's no
> problem.

I think it is not a good idea to add overhead in such a hot path if the
overhead can be avoided.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24  7:27               ` Huang Ying
@ 2010-06-24  7:32                 ` Peter Zijlstra
  2010-06-24 10:27                   ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24  7:32 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 15:27 +0800, Huang Ying wrote:
> On Thu, 2010-06-24 at 15:19 +0800, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 15:04 +0800, Huang Ying wrote:
> > 
> > > Yes. NMI is there from 8259 age.
> > 
> > But do we really care about such systems?
> > 
> > > That is possible. But in NO_HZ system, we have no tick to rely on.
> > 
> > Of course you have, you can delay the NO_HZ state when there's pending
> > callbacks, that's all of 1 line.
> > 
> > > soft_irq is better here, because it will be triggered for any interrupt.
> > 
> > Well, you can do the callbacks from irq_exit() as well, that's no
> > problem.
> 
> I think it is not a good idea to add overhead in such a hot path if the
> overhead can be avoided.

True, but I really don't like the softirq thing, and I really don't care
about !APIC machines, I probably couldn't buy one if I wanted to, and it's
not like we have good MCE support for them now, so who cares.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC 4/5] x86, Use NMI return notifier in MCE
  2010-06-24  3:04 ` [RFC 4/5] x86, Use NMI return notifier in MCE Huang Ying
@ 2010-06-24 10:00   ` Andi Kleen
  0 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 10:00 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, Peter Zijlstra

Huang Ying <ying.huang@intel.com> writes:

Hi Ying,

>  {
>  	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
> -		mce_notify_irq();
> -		/*
> -		 * Triggering the work queue here is just an insurance
> -		 * policy in case the syscall exit notify handler
> -		 * doesn't run soon enough or ends up running on the
> -		 * wrong CPU (can happen when audit sleeps)
> -		 */
> -		mce_schedule_work();
> +		__mce_report_event(NULL);

Do we still handle the CPU switch case correctly?

The backend handler needs to run on the same CPU to process the per
CPU mce pfns.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24  7:32                 ` Peter Zijlstra
@ 2010-06-24 10:27                   ` Andi Kleen
  2010-06-24 10:30                     ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 10:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

> True, but I really don't like the softirq thing, and I really don't care
> about !APIC machines, I probably couldn't buy one if I wanted to and its
> not like we have good MCE support for them now, so who cares.

In theory you can run a machine with good MCE support in non APIC single
CPU mode. It wouldn't make much sense, but you could do it.

Anyways, I don't think we need a lot of effort to handle this case,
but it would be better to not explicitly break it either.

That's why the timer fallback in the original code was fine: this
basically never happens, and even if there is a 5s delay from tickless
that's fine.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 10:27                   ` Andi Kleen
@ 2010-06-24 10:30                     ` Peter Zijlstra
  2010-06-24 10:52                       ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 10:30 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 12:27 +0200, Andi Kleen wrote:
> > True, but I really don't like the softirq thing, and I really don't care
> > about !APIC machines, I probably couldn't buy one if I wanted to and its
> > not like we have good MCE support for them now, so who cares.
> 
> In theory you can run a machine with good MCE support in non APIC single
> CPU mode. It wouldn't make much sense, but you could do it.
> 
> Anyways, I don't think we need a lot of effort to handle this case,
> but it would be better to not explicitely break it either.
> 
> That's why the timer fallback in the original code was fine, this 
> basically never happens and even if there is a 5s delay from tickless
> that's fine.

Right, in that case I would very much prefer the simpler thing I
proposed over all this softirq stuff, we can have the tick process the
callbacks for really broken hardware (perf_events doesn't care since
without a lapic there's no pmi anyway).



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 10:30                     ` Peter Zijlstra
@ 2010-06-24 10:52                       ` Andi Kleen
  2010-06-24 10:58                         ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 10:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

> Right, in that case I would very much prefer the simpler thing I
> proposed over all this softirq stuff, we can have the tick process the
> callbacks for really broken hardware (perf_events doesn't care since
> without a lapic there's no pmi anyway).

Ying's approach will work I think. 

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 10:52                       ` Andi Kleen
@ 2010-06-24 10:58                         ` Peter Zijlstra
  2010-06-24 11:08                           ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 10:58 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 12:52 +0200, Andi Kleen wrote:
> > Right, in that case I would very much prefer the simpler thing I
> > proposed over all this softirq stuff, we can have the tick process the
> > callbacks for really broken hardware (perf_events doesn't care since
> > without a lapic there's no pmi anyway).
> 
> Ying's approach will work I think. 

Right, except that I really dislike it, it touches far too much code for
no particular reason.

And I really want hardirq context for perf callbacks, some code actually
relies on it (I used to have the fallback in the timer softirq and that
broke thing at some point).

So I'm really opposed to all the softirq molestation as I see no reason
to do that at all.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 10:58                         ` Peter Zijlstra
@ 2010-06-24 11:08                           ` Andi Kleen
  2010-06-24 11:10                             ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 11:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

> And I really want hardirq context for perf callbacks, some code actually
> relies on it (I used to have the fallback in the timer softirq and that

Surely that could be fixed?  *requiring* hard irq context sounds weird.

> broke thing at some point).

I have one case that needs to sleep (but only when interrupting user code).
The key thing in it really is to switch stacks back to the process.

-andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:08                           ` Andi Kleen
@ 2010-06-24 11:10                             ` Peter Zijlstra
  2010-06-24 11:20                               ` Andi Kleen
  2010-06-24 11:23                               ` Ingo Molnar
  0 siblings, 2 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 11:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 13:08 +0200, Andi Kleen wrote:
> > And I really want hardirq context for perf callbacks, some code actually
> > relies on it (I used to have the fallback in the timer softirq and that
> 
> Surely that could be fixed?  *requiring* hard irq context sounds weird.

Possibly, but there is no reason whatsoever to use softirq here.

> > broke thing at some point).
> 
> I have one case that needs to sleep (but only when interrupting user code)
> They key thing in it really is to switch stacks back to process.

softirq can't sleep either, you need a trampoline anyway.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:10                             ` Peter Zijlstra
@ 2010-06-24 11:20                               ` Andi Kleen
  2010-06-24 11:33                                 ` Peter Zijlstra
  2010-06-24 11:42                                 ` Peter Zijlstra
  2010-06-24 11:23                               ` Ingo Molnar
  1 sibling, 2 replies; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 11:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, Jun 24, 2010 at 01:10:52PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:08 +0200, Andi Kleen wrote:
> > > And I really want hardirq context for perf callbacks, some code actually
> > > relies on it (I used to have the fallback in the timer softirq and that
> > 
> > Surely that could be fixed?  *requiring* hard irq context sounds weird.
> 
> possibly, but there is no reason what so ever to use softirq here.

Ok so going back to the original self-irq patchkit. Unfortunately the other
reviewer hated that. How to get out of that deadlock?


> > > broke thing at some point).
> > 
> > I have one case that needs to sleep (but only when interrupting user code)
> > They key thing in it really is to switch stacks back to process.
> 
> softirq can't sleep either, you need a trampoline anywa

Not true, when you interrupt ring 3 it can sleep. You just need to make
sure to run on the right stack and fix up any irq counters.

Anyways this can be solved in a different way too; it would just fit
in there as well.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:10                             ` Peter Zijlstra
  2010-06-24 11:20                               ` Andi Kleen
@ 2010-06-24 11:23                               ` Ingo Molnar
  2010-06-24 11:34                                 ` Peter Zijlstra
  1 sibling, 1 reply; 66+ messages in thread
From: Ingo Molnar @ 2010-06-24 11:23 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, Huang Ying, H.PeterA, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, 2010-06-24 at 13:08 +0200, Andi Kleen wrote:
> > > And I really want hardirq context for perf callbacks, some code actually
> > > relies on it (I used to have the fallback in the timer softirq and that
> > 
> > Surely that could be fixed?  *requiring* hard irq context sounds weird.
> 
> possibly, but there is no reason what so ever to use softirq here.
> 
> > > broke thing at some point).
> > 
> > I have one case that needs to sleep (but only when interrupting user code)
> > They key thing in it really is to switch stacks back to process.
> 
> softirq can't sleep either, you need a trampoline anyway.

What might make sense is to offer two types of callbacks: one that is 
immediate whenever an event triggers - and another that is sleepable and is 
executed from process context.

Having an intermediate softirq level might be over-design indeed.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:20                               ` Andi Kleen
@ 2010-06-24 11:33                                 ` Peter Zijlstra
  2010-06-24 11:55                                   ` Andi Kleen
  2010-06-24 11:42                                 ` Peter Zijlstra
  1 sibling, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 11:33 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > softirq can't sleep either, you need a trampoline anywa
> 
> Not true, when you interrupt ring 3 it can sleep. You just need to make
> sure to run on the right stack and fix up any irq counters.

But that is not softirq. That would be something like what faults do,
but we don't have anything else that does that.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:23                               ` Ingo Molnar
@ 2010-06-24 11:34                                 ` Peter Zijlstra
  2010-06-24 12:35                                   ` Ingo Molnar
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 11:34 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, Huang Ying, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 13:23 +0200, Ingo Molnar wrote:

> What might make sense is to offer two types of callbacks: one that is 
> immediate whenever an event triggers - and another that is sleepable and is 
> executed from process context.

Trouble is waking that thread, you cannot wake tasks from NMI context,
so whatever you do, you'll end up with a trampoline.

You could of course offer that trampoline nicely packaged, but I'm not
sure that's really worth the effort.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:20                               ` Andi Kleen
  2010-06-24 11:33                                 ` Peter Zijlstra
@ 2010-06-24 11:42                                 ` Peter Zijlstra
  2010-06-24 11:58                                   ` Andi Kleen
  1 sibling, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 11:42 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> Ok so going back to the original self-irq patchkit. Unfortunately the other
> reviewer hated that. How to get out of that deadlock?

Well, I didn't like your original patch either.

What's wrong with going with the patch I posted today? (aside from me
getting the barriers slightly wrong and not doing the arch
implementation).

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:33                                 ` Peter Zijlstra
@ 2010-06-24 11:55                                   ` Andi Kleen
  2010-06-24 11:57                                     ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 11:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, Jun 24, 2010 at 01:33:24PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > > softirq can't sleep either, you need a trampoline anywa
> > 
> > Not true, when you interrupt ring 3 it can sleep. You just need to make
> > sure to run on the right stack and fix up any irq counters.
> 
> But that is not softirq. That would be something like what faults do,

Yes not a classical one, but can use the same infrastructure.

> but we don't have anything else that does that.

Actually we do: audit in syscalls and scheduling in interrupts and signals
all work this way. Probably more; at some point adding more code to this
path was very popular.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:55                                   ` Andi Kleen
@ 2010-06-24 11:57                                     ` Peter Zijlstra
  2010-06-24 12:02                                       ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 11:57 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > but we don't have anything else that does that.
> 
> Actually we do, audit in syscalls and scheduling in interrupts and signals 
> all work this way. Probably more at some point adding more code to this
> path was very popular. 

That's the return to user path, nothing to do with softirqs. Add a TIF
flag and call your function there.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:42                                 ` Peter Zijlstra
@ 2010-06-24 11:58                                   ` Andi Kleen
  2010-06-24 12:02                                     ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 11:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, Jun 24, 2010 at 01:42:11PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > Ok so going back to the original self-irq patchkit. Unfortunately the other
> > reviewer hated that. How to get out of that deadlock?
> 
> Well, I didn't like your original patch either.
> 
> What's wrong with going with the patch I posted today? (aside from me
> getting the barriers slightly wrong and not doing the arch
> implementation).

Well it would need to work.

Also I personally didn't see the point of the irq items list, because
there's no good way to dynamically allocate it in an NMI, so the window
would always be "fixed size" anyways and you could as well just use
per-CPU data.

That's why for simple self irq I preferred Ying's original patch.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:57                                     ` Peter Zijlstra
@ 2010-06-24 12:02                                       ` Andi Kleen
  2010-06-24 12:18                                         ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 12:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, Jun 24, 2010 at 01:57:29PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > > but we don't have anything else that does that.
> > 
> > Actually we do, audit in syscalls and scheduling in interrupts and signals 
> > all work this way. Probably more at some point adding more code to this
> > path was very popular. 
> 
> That's the return to user path, nothing to do with softirqs. Add a TIF
> flag and call your function there.

It does that, but there are some cases where it's not enough.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:58                                   ` Andi Kleen
@ 2010-06-24 12:02                                     ` Peter Zijlstra
  0 siblings, 0 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 12:02 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 13:58 +0200, Andi Kleen wrote:
> On Thu, Jun 24, 2010 at 01:42:11PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 13:20 +0200, Andi Kleen wrote:
> > > Ok so going back to the original self-irq patchkit. Unfortunately the other
> > > reviewer hated that. How to get out of that deadlock?
> > 
> > Well, I didn't like your original patch either.
> > 
> > What's wrong with going with the patch I posted today? (aside from me
> > getting the barriers slightly wrong and not doing the arch
> > implementation).
> 
> Well it would need to work.

Look at kernel/perf_event.c:perf_pending_queue()/__perf_pending_run()

> Also I personally didn't see the point of the irq items list because
> there's no good way to dynamically allocate it in a NMI, so the window
> would be always "fixed size" anyways and you could as well just use 
> per cpu data.
> 
> That's why for simple self irq I preferred Ying's original patch.

I already told you that I have an irq_work in every perf_event structure
(it's called perf_pending_entry). I cannot register an id for each
perf_event because:
  1) there's potentially more than 32 of them
  2) I'd need an id->perf_event map which is a waste of time





^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 12:02                                       ` Andi Kleen
@ 2010-06-24 12:18                                         ` Peter Zijlstra
  2010-06-24 12:38                                           ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-24 12:18 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, 2010-06-24 at 14:02 +0200, Andi Kleen wrote:
> On Thu, Jun 24, 2010 at 01:57:29PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > > > but we don't have anything else that does that.
> > > 
> > > Actually we do, audit in syscalls and scheduling in interrupts and signals 
> > > all work this way. Probably more at some point adding more code to this
> > > path was very popular. 
> > 
> > That's the return to user path, nothing to do with softirqs. Add a TIF
> > flag and call your function there.
> 
> It does that, but there are some cases where it's not enough.

care to expand on that?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 11:34                                 ` Peter Zijlstra
@ 2010-06-24 12:35                                   ` Ingo Molnar
  2010-06-24 13:02                                     ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Ingo Molnar @ 2010-06-24 12:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, H. Peter Anvin, Borislav Petkov,
	linux-kernel, mauro


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, 2010-06-24 at 13:23 +0200, Ingo Molnar wrote:
> 
> > What might make sense is to offer two types of callbacks: one that is 
> > immediate whenever an event triggers - and another that is sleepable and is 
> > executed from process context.
> 
> Trouble is waking that thread, you cannot wake tasks from NMI context,
> so whatever you do, you'll end up with a trampoline.
> 
> You could of course offer that trampoline nicely packaged, but I'm not
> sure that's really worth the effort.

Right, so there are basically three clean solutions to the 'sleepable callback' 
problem, in order of the amount of state that needs to be passed to it:

 - State-less (or idempotent) events/callbacks: use a hardirq callback to wake
   up a well-known process context.

 - If we want the task that generates an event to execute a sleeping callback:
   use a TIF flag and state in the task itself to pass along the info.

 - In the most generic case, if there's arbitrary target task and arbitrary
   state that needs to be queued, then to achieve sleepable callbacks the
   following solution can be used: the task allocates a perf ring-buffer and
   uses a TIF flag to trigger consumption of it.

   All memory allocation, wakeup, etc. is handled already by the regular perf
   events and ring-buffer codepaths.

No special, open-coded trampolining needed - the ring-buffer is the trampoline 
and the ring-buffer consumer can key off the events it receives. (and there 
can be multiple consumers of the same event source so we can have in-task 
kernel based action combined with a user-space daemon that get an event stream 
as well.)

All of these solutions use the fact that perf events are a generic event 
framework. If there's any missing details somewhere then fixes/enhancements 
can be added - right now our in-kernel event consumers are simple. But the 
design is sound.

And none of these solutions involves the incestuous low-level raping of 
softirqs.
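
As a minimal sketch of the first, state-less variant (the mce_* names
below are made up purely for illustration, using the irq_work API from
Peter's patch; this is not existing code):

static struct task_struct *mce_recover_task;	/* dedicated kthread */
static struct irq_work mce_recover_work;

/* runs in hard-irq context, where waking a task is allowed */
static void mce_recover_callback(struct irq_work *work)
{
	wake_up_process(mce_recover_task);
}

/* called from the NMI / machine-check handler */
static void mce_note_event(void)
{
	irq_work_queue(&mce_recover_work, mce_recover_callback);
}

The dedicated thread then does the sleepable part of the recovery in
process context and goes back to sleep.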

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 12:18                                         ` Peter Zijlstra
@ 2010-06-24 12:38                                           ` Andi Kleen
  2010-06-25 10:38                                             ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 12:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel

On Thu, Jun 24, 2010 at 02:18:07PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-06-24 at 14:02 +0200, Andi Kleen wrote:
> > On Thu, Jun 24, 2010 at 01:57:29PM +0200, Peter Zijlstra wrote:
> > > On Thu, 2010-06-24 at 13:55 +0200, Andi Kleen wrote:
> > > > > but we don't have anything else that does that.
> > > > 
> > > > Actually we do, audit in syscalls and scheduling in interrupts and signals 
> > > > all work this way. Probably more at some point adding more code to this
> > > > path was very popular. 
> > > 
> > > That's the return to user path, nothing to do with softirqs. Add a TIF
> > > flag and call your function there.
> > 
> > It does that, but there are some cases where it's not enough.
> 
> care to expand on that?

This is for execution context error recovery.

TIF works for user space, but it's a bit ugly because it requires adding
more data to the task_struct, because CPUs can change. The sleepable
soft irq would have avoided that (that's not a show stopper).

The other case was to recover from a *_user() error in the kernel.
I originally had some fancy code for preemptive kernels that
exploited the fact that you could sleep here

(it doesn't work for non-preemptive kernels, unfortunately, because we can't
know if locks are held and some *_user calls are expected to
never sleep)

But there were still ugly special cases for switching stacks
and the sleepable softirqs could have avoided that.

Anyways the later is not fatal either, but it would have been
nice to solve that one.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 12:35                                   ` Ingo Molnar
@ 2010-06-24 13:02                                     ` Andi Kleen
  2010-06-24 13:20                                       ` Borislav Petkov
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 13:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Andi Kleen, Huang Ying, H. Peter Anvin,
	Borislav Petkov, linux-kernel, mauro

On Thu, Jun 24, 2010 at 02:35:37PM +0200, Ingo Molnar wrote:
> All of these solutions use the fact that perf events are a generic event 
> framework. If there's any missing details somewhere then fixes/enhancements 
> can be added - right now our in-kernel event consumers are simple. But the 
> design is sound.

One immediate problem that comes to mind with the proposal 
is that if the event is of a type that cannot be dropped (e.g. an error 
that needs to be handled) then a shared ring buffer cannot guarantee that.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 13:02                                     ` Andi Kleen
@ 2010-06-24 13:20                                       ` Borislav Petkov
  2010-06-24 13:33                                         ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Borislav Petkov @ 2010-06-24 13:20 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ingo Molnar, Peter Zijlstra, Huang Ying, H. Peter Anvin,
	Borislav Petkov, linux-kernel, mauro

From: Andi Kleen <andi@firstfloor.org>
Date: Thu, Jun 24, 2010 at 03:02:34PM +0200

> On Thu, Jun 24, 2010 at 02:35:37PM +0200, Ingo Molnar wrote:
> > All of these solutions use the fact that perf events are a generic event 
> > framework. If there's any missing details somewhere then fixes/enhancements 
> > can be added - right now our in-kernel event consumers are simple. But the 
> > design is sound.
> 
> One immediate problem that comes to mind with the proposal 
> is that if the event is of a type that cannot be dropped (e.g. an error 
> that needs to be handled) then a shared ring buffer cannot guarantee that.

If it's a critical error you do all the handling in the kernel and you
don't need task context at all, no? Can you give an example of such an
error?

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 13:20                                       ` Borislav Petkov
@ 2010-06-24 13:33                                         ` Andi Kleen
  2010-06-24 13:42                                           ` Ingo Molnar
  2010-06-24 13:46                                           ` Ingo Molnar
  0 siblings, 2 replies; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 13:33 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andi Kleen, Ingo Molnar, Peter Zijlstra, Huang Ying,
	H. Peter Anvin, Borislav Petkov, linux-kernel, mauro

> If its a critical error you do all the handling in the kernel and you

I assume you mean in MCE.  And the answer is no.

MCE generally can only panic or log, everything else
needs other contexts.

> don't need task context at all, no? 

Process context is needed for various recovery schemes, all
that need to sleep for example.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 13:33                                         ` Andi Kleen
@ 2010-06-24 13:42                                           ` Ingo Molnar
  2010-06-24 13:46                                           ` Ingo Molnar
  1 sibling, 0 replies; 66+ messages in thread
From: Ingo Molnar @ 2010-06-24 13:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Borislav Petkov, Peter Zijlstra, Huang Ying, H. Peter Anvin,
	Borislav Petkov, linux-kernel, mauro


Andi,

* Andi Kleen <andi@firstfloor.org> wrote:

> > If its a critical error you do all the handling in the kernel and you

Sidenote: could you please stop doing this special new email style of cutting 
out the portion from your emails which shows whom you replied to?

Doing that:

 - Shows disrespect towards the person you are replying to. If you feel the
   content worth quoting then you should pay respect to the person having
   written that email as well, and quote his name.

 - Makes the thread flow much harder to follow for me, as I don't see from the
   mail text whom you replied to.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 13:33                                         ` Andi Kleen
  2010-06-24 13:42                                           ` Ingo Molnar
@ 2010-06-24 13:46                                           ` Ingo Molnar
  2010-06-24 14:01                                             ` Andi Kleen
  1 sibling, 1 reply; 66+ messages in thread
From: Ingo Molnar @ 2010-06-24 13:46 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Borislav Petkov, Peter Zijlstra, Huang Ying, H. Peter Anvin,
	Borislav Petkov, linux-kernel, mauro


* Andi Kleen <andi@firstfloor.org> wrote:

> > If its a critical error you do all the handling in the kernel and you
> 
> I assume you mean in MCE.  And the answer is no.
> 
> MCE generally can only panic or log, everything else
> needs other contexts.
> 
> > don't need task context at all, no? 
> 
> Process context is needed for various recovery schemes, all
> that need to sleep for example.

Please, as Peter and Boris asked you already, quote a concrete, specific 
example:

  'Specific event X occurs, kernel wants/needs to do Y. This cannot be done
   via the suggested method due to Z.'

Your generic arguments look wrong (to the extent they are specified) and it 
makes it much easier and faster to address your points if you don't blur them 
with vagaries.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 13:46                                           ` Ingo Molnar
@ 2010-06-24 14:01                                             ` Andi Kleen
  2010-06-24 15:41                                               ` Borislav Petkov
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 14:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Borislav Petkov, Peter Zijlstra, Huang Ying,
	H. Peter Anvin, Borislav Petkov, linux-kernel, mauro

> Please, as Peter and Boris asked you already, quote a concrete, specific 
> example:

It was already in my answer to Peter.

> 
>   'Specific event X occurs, kernel wants/needs to do Y. This cannot be done
>    via the suggested method due to Z.'
> 
> Your generic arguments look wrong (to the extent they are specified) and it 
> makes it much easier and faster to address your points if you dont blur them 
> by vagaries.

It's one of the fundamental properties of recoverable errors.

An error happens.
A machine check, NMI or other exception happens.
	That exception runs on the exception stack.
	The error is not fatal, but recoverable.
For example you want to kill a process, call hwpoison or do some other
	recovery action. These generally have to sleep to do anything
	interesting.
You cannot do the sleeping on the exception stack, so you push it to
another context.

Now just because an error is recoverable doesn't mean it's not critical
(I think that was the mistake Boris made). If you don't do something
(like killing or recovery) you could end up in a loop or consume
corrupted data or something else bad. 

So the error has to have a fail safe path from detection to handling.

That's quite different from logging or performance counting etc.
where dropping events on overload is normal and expected.

Normally it can only be done by using dedicated resources.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 14:01                                             ` Andi Kleen
@ 2010-06-24 15:41                                               ` Borislav Petkov
  2010-06-24 16:09                                                 ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Borislav Petkov @ 2010-06-24 15:41 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ingo Molnar, Borislav Petkov, Peter Zijlstra, Huang Ying,
	H. Peter Anvin, Borislav Petkov, linux-kernel, mauro

From: Andi Kleen <andi@firstfloor.org>
Date: Thu, Jun 24, 2010 at 10:01:43AM -0400

> > Please, as Peter and Boris asked you already, quote a concrete, specific 
> > example:
> 
> It was already in my answer to Peter.
> 
> > 
> >   'Specific event X occurs, kernel wants/needs to do Y. This cannot be done
> >    via the suggested method due to Z.'
> > 
> > Your generic arguments look wrong (to the extent they are specified) and it 
> > makes it much easier and faster to address your points if you dont blur them 
> > by vagaries.
> 
> It's one of the fundamental properties of recoverable errors.
> 
> Error happens.
> Machine check or NMI or other exception happens. 
> 	That exception runs on the exception stack
> 	The error is not fatal, but recoverable.
> For example you want to kill a process or call hwpoison or do some other
> 	recovery action. These generally have to sleep to do anything
> 	interesting.
> You cannot do the sleeping on the exception stack, so you push it to
> another context.
> 
> Now just because an error is recoverable doesn't mean it's not critical
> (I think that was the mistake Boris made).

It wasn't a mistake - I was simply trying to lure you into giving a more
concrete example so that we all land on the same page and we know what
the heck you/we/all are talking about.

> If you don't do something
> (like killing or recovery) you could end up in a loop or consume
> corrupted data or something else bad. 
> 
> So the error has to have a fail safe path from detection to handling.

So we are talking about a more involved and "could-sleep" error
recovery.

> That's quite different from logging or performance counting etc.
> where dropping events on overload is normal and expected.

So I went back and reread the whole thread, and correct me if I'm
wrong, but the whole run-softirq-after-NMI thing has one use case for now -
"could-sleep" error handling for MCEs _only_ on x86. So you're changing
a bunch of generic and x86 kernel code just for error handling. Hmm,
that's a kinda big hammer in my book.

A slimmer solution is a much better way to go, IMHO. I think Peter said
something about irq_exit(), which should be just fine.

But AFAICT an arch-specific solution would be even better, e.g.
if you call into your deferred work helper from paranoid_exit in
<arch/x86/kernel/entry_64.S>. I.e, something like

#ifdef CONFIG_X86_MCE
testl $_TIF_NEED_POST_NMI,%ebx
jnz do_post_nmi_work
#endif

Or even slimmer, rewrite the paranoidzeroentry into an MCE-specific variant
which does the added functionality. But that wouldn't be extensible if
other entities want post-NMI work later.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 15:41                                               ` Borislav Petkov
@ 2010-06-24 16:09                                                 ` Andi Kleen
  0 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2010-06-24 16:09 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andi Kleen, Ingo Molnar, Peter Zijlstra, Huang Ying,
	H. Peter Anvin, Borislav Petkov, linux-kernel, mauro

On Thu, Jun 24, 2010 at 05:41:24PM +0200, Borislav Petkov wrote:
> > If you don't do something
> > (like killing or recovery) you could end up in a loop or consume
> > corrupted data or something else bad. 
> > 
> > So the error has to have a fail safe path from detection to handling.
> 
> So we are talking about a more involved and "could-sleep" error
> recovery.

That's one case; there are others too.

> 
> > That's quite different from logging or performance counting etc.
> > where dropping events on overload is normal and expected.
> 
> So I went back and reread the whole thread, and correct me if I'm
> wrong but the whole run softirq after NMI has one use case for now -
> "could-sleep" error handling for MCEs _only_ on x86. So you're changing

Nope, there are multiple use cases. Today it's background MCE
and possibly perf if it ever decides to share code
with the rest of the kernel instead of wanting to be Bork of Linux. 
Future ones would be more MCE errors and also non-MCE errors like NMIs.

> a bunch of generic and x86 kernel code just for error handling. Hmm,
> that's a kinda big hammer in my book.

Actually no, it would just make the current code slightly cleaner
and somewhat more general.  But for most cases it works without it.
> 
> A slimmer solution is a much better way to go, IMHO. I think Peter said
> something about irq_exit(), which should be just fine.

The "slimmer solution" is there, but it has some limitations.
I merely said that softirqs would be useful for solving these limitations
(but are not strictly needed)

Anyways the slimmer solution was even originally proposed;
it was just that some of the earlier review proposed softirqs instead.
So Ying posts softirqs and then gets flamed for posting
softirqs.  Overall there wasn't much consistency in the suggestions:
three different reviewers suggested three incompatible approaches.

Anyways if there are no softirqs that's fine too, the error
handler can probably live with not having that.

> But AFAICT an arch-specific solution would be even better, e.g.
> if you call into your deferred work helper from paranoid_exit in
> <arch/x86/kernel/entry_64.S>. I.e, something like

Yes that helps for part of the error handling (in fact this
has been implemented), but that does not solve the self-interrupt
problem, which requires delaying until the next cli.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24  6:35 ` [RFC][PATCH] irq_work Peter Zijlstra
  2010-06-24  6:43   ` Huang Ying
@ 2010-06-25  2:12   ` Huang Ying
  2010-06-25  7:48     ` Peter Zijlstra
  2010-06-25  9:08     ` Andi Kleen
  2010-06-25 18:30   ` [RFC][PATCH] irq_work -v2 Peter Zijlstra
  2 siblings, 2 replies; 66+ messages in thread
From: Huang Ying @ 2010-06-25  2:12 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, H.Peter Anvin, linux-kernel, Andi Kleen

On Thu, 2010-06-24 at 14:35 +0800, Peter Zijlstra wrote:
> Something like this, but filled out with some arch code that does the
> self-ipi and calls irq_work_run() should do.
> 
> No need to molest the softirq code, no need for limited vectors of any
> kind.

Now, as far as my understanding goes, a hard-IRQ-based solution is
acceptable to everyone.

Ingo and Andi,

Do you agree?

> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/irq_callback.h |   13 ++++++++
>  kernel/irq_callback.c        |   66 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 79 insertions(+)
> 
> Index: linux-2.6/include/linux/irq_callback.h
> ===================================================================
> --- /dev/null
> +++ linux-2.6/include/linux/irq_callback.h
> @@ -0,0 +1,13 @@
> +#ifndef _LINUX_IRQ_CALLBACK_H
> +#define _LINUX_IRQ_CALLBACK_H
> +
> +struct irq_work {
> +	struct irq_work *next;
> +	void (*func)(struct irq_work *);
> +};

It is better to add "void *data" field in this struct to allow same
function can be used for multiple struct irq_work.

And I think IRQ is an implementation detail here, so irq_work is
probably not a good name. Would nmi_return_notifier or nmi_callback be better?

> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *));
> +void irq_work_run(void);
> +void irq_work_sync(struct irq_work *entry);
> +
> +#endif /* _LINUX_IRQ_CALLBACK_H */
> Index: linux-2.6/kernel/irq_callback.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6/kernel/irq_callback.c
> @@ -0,0 +1,66 @@
> +
> +#include <linux/irq_callback.h>
> +
> +#define CALLBACK_TAIL ((struct irq_work *)-1UL)
> +
> +static DEFINE_PER_CPU(struct irq_work *, irq_work_list) = {
> +	CALLBACK_TAIL,
> +};
> +
> +int irq_work_queue(struct irq_work *entry, void (*func)(struct irq_work *))
> +{
> +	struct irq_work **head;
> +
> +	if (cmpxchg(&entry->next, NULL, CALLBACK_TAIL) != NULL)
> +		return 0;
> +
> +	entry->func = func;
> +
> +	head = &get_cpu_var(irq_work_list);
> +
> +	do {
> +		entry->next = *head;
> +	} while (cmpxchg(head, entry->next, entry) != entry->next);
> +
> +	if (entry->next == CALLBACK_TAIL)
> +		arch_self_ipi();
> +
> +	put_cpu_var(irq_work_list);
> +	return 1;
> +}
> +
> +void irq_work_run(void)
> +{
> +	struct irq_work *list;
> +
> +	list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> +	while (list != CALLBACK_TAIL) {
> +		struct irq_work *entry = list;
> +
> +		list = list->next;
> +		entry->func(entry);
> +
> +		entry->next = NULL;

entry->next = NULL should be put before entry->func(entry), so that we
will not lose a notification from an NMI. And maybe check irq_work_list
several times to make sure nothing is lost.
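
Concretely, the reordering I mean would look roughly like this (just a
sketch based on the loop above, barriers left out for brevity):

	list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
	while (list != CALLBACK_TAIL) {
		struct irq_work *entry = list;

		list = list->next;
		/* release the entry before running the callback, so an
		 * NMI arriving while func runs can queue it again */
		entry->next = NULL;
		entry->func(entry);
	}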

> +		/*
> +		 * matches the mb in cmpxchg() in irq_work_queue()
> +		 */
> +		smp_wmb();
> +	}
> +}

I don't know why we need smp_wmb() here and smp_rmb() in
irq_work_pending(). The smp_<x>mb() in the original perf_pending_xxx code is
not necessary either, because smp_<x>mb() is already invoked in
wake_up_process() and __wait_event().

> +static int irq_work_pending(struct irq_work *entry)
> +{
> +	/*
> +	 * matches the wmb in irq_work_run
> +	 */
> +	smp_rmb();
> +	return entry->next != NULL;
> +}
> +
> +void irq_work_sync(struct irq_work *entry)
> +{
> +	WARN_ON_ONCE(irqs_disabled());
> +
> +	while (irq_work_pending(entry))
> +		cpu_relax();
> +}

If we move entry->next = NULL earlier in irq_work_run(), we need another
flag to signify that entry->func is still running here.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  2:12   ` Huang Ying
@ 2010-06-25  7:48     ` Peter Zijlstra
  2010-06-25  9:17       ` Huang Ying
  2010-06-25  9:08     ` Andi Kleen
  1 sibling, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-25  7:48 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.Peter Anvin, linux-kernel, Andi Kleen

On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
> 
> It is better to add "void *data" field in this struct to allow same
> function can be used for multiple struct irq_work. 

No, simply do:

struct my_foo {
  struct irq_work work;
  /* my extra data */
}

void my_func(struct irq_work *work)
{
  struct my_foo *foo = container_of(work, struct my_foo, work);

  /* tada! */
}
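
And queueing it from the NMI handler is then just (sticking with the
made-up my_foo/my_func names from above):

  /* typically embedded in some longer-lived object, not on the stack */
  static struct my_foo foo;

  /* from the NMI handler */
  irq_work_queue(&foo.work, my_func);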


> And I think IRQ is the implementation detail here, so irq_work is
> probably not a good name. nmi_return_notifier or nmi_callback is better?

Well, it's run in hard-irq context, so it's an irq work. There's nothing
that says it can only be used from NMI context.

> > +void irq_work_run(void)
> > +{
> > +     struct irq_work *list;
> > +
> > +     list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> > +     while (list != CALLBACK_TAIL) {
> > +             struct irq_work *entry = list;
> > +
> > +             list = list->next;
> > +             entry->func(entry);
> > +
> > +             entry->next = NULL;
> 
> entry->next = NULL should be put before entry->func(entry), so that we
> will not lose a notification from NMI. And maybe check irq_work_list for
> several times to make sure nothing is lost.

But then _sync() will return before it's done executing.

I think clearing after the function is done executing is the only sane
semantics (and yes, I should fix the current perf code).

You can always miss an NMI since it can always happen before the
callback gets done, and allowing another enqueue before the callback is
complete is asking for trouble.

> > +             /*
> > +              * matches the mb in cmpxchg() in irq_work_queue()
> > +              */
> > +             smp_wmb();
> > +     }
> > +}
> 
> I don't know why we need smp_wmb() here and smp_rmb() in
> irq_work_pending(). The smp_<x>mb() in original perf_pending_xxx code is
> not necessary too. Because smp_<x>mb is invoked in wake_up_process() and
> __wait_event() already.

The smp_wmb() wants to be before ->next = NULL; so that all writes are
completed before we release the entry. To that same effect _sync() and
_queue need the (r)mb.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  2:12   ` Huang Ying
  2010-06-25  7:48     ` Peter Zijlstra
@ 2010-06-25  9:08     ` Andi Kleen
  1 sibling, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2010-06-25  9:08 UTC (permalink / raw)
  To: Huang Ying
  Cc: Peter Zijlstra, Ingo Molnar, H.Peter Anvin, linux-kernel, Andi Kleen

On Fri, Jun 25, 2010 at 10:12:43AM +0800, Huang Ying wrote:
> On Thu, 2010-06-24 at 14:35 +0800, Peter Zijlstra wrote:
> > Something like this, but filled out with some arch code that does the
> > self-ipi and calls irq_work_run() should do.
> > 
> > No need to molest the softirq code, no need for limited vectors of any
> > kind.
> 
> Now, as far as my understanding goes, hard IRQ based solution is
> acceptable for everyone.
> 
> Ingo and Andi,
> 
> Do you agree?

Yes that's fine for me. The error handling issues can be solved in other
ways too.

-Andi

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  7:48     ` Peter Zijlstra
@ 2010-06-25  9:17       ` Huang Ying
  2010-06-25  9:23         ` Frederic Weisbecker
  2010-06-25  9:30         ` Peter Zijlstra
  0 siblings, 2 replies; 66+ messages in thread
From: Huang Ying @ 2010-06-25  9:17 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
> > 
> > It is better to add "void *data" field in this struct to allow same
> > function can be used for multiple struct irq_work. 
> 
> No, simply do:
> 
> struct my_foo {
>   struct irq_work work;
>   /* my extra data */
> }
> 
> void my_func(struct irq_work *work)
> {
>   struct my_foo *foo = container_of(work, struct my_foo, work);
> 
>   /* tada! */
> }

Yes. This works too. But adding a "void *data" field is helpful if you do
not embed struct irq_work into another struct.

> > And I think IRQ is the implementation detail here, so irq_work is
> > probably not a good name. nmi_return_notifier or nmi_callback is better?
> 
> Well, its ran in hard-irq context, so its an irq work. There's nothing
> that says it can only be used from NMI context.

It may be run in other contexts on some systems (those without an APIC). And I
don't think it is useful to anything other than NMI handlers. I think this is a
choice between naming after the implementation and naming after the purpose.

> > > +void irq_work_run(void)
> > > +{
> > > +     struct irq_work *list;
> > > +
> > > +     list = xchg(&__get_cpu_var(irq_work_list), CALLBACK_TAIL);
> > > +     while (list != CALLBACK_TAIL) {
> > > +             struct irq_work *entry = list;
> > > +
> > > +             list = list->next;
> > > +             entry->func(entry);
> > > +
> > > +             entry->next = NULL;
> > 
> > entry->next = NULL should be put before entry->func(entry), so that we
> > will not lose a notification from NMI. And maybe check irq_work_list for
> > several times to make sure nothing is lost.
> 
> But then _sync() will return before its done executing.

We can use another flag to signify whether it is executing, for example
bit 0 of entry->next.
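
Just to illustrate the idea (a rough sketch only; irq_work_queue() would
also have to treat a pointer with only bit 0 set as "not pending" for
re-queueing during the callback to work):

#define IRQ_WORK_BUSY	((struct irq_work *)1UL)

	/* in irq_work_run(), for each entry taken off the list: */
	entry->next = IRQ_WORK_BUSY;	/* no longer pending, but running */
	entry->func(entry);
	/* release it, unless it was queued again meanwhile */
	cmpxchg(&entry->next, IRQ_WORK_BUSY, NULL);

	/* irq_work_sync() then simply waits for NULL: */
	while (entry->next != NULL)
		cpu_relax();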

> I think clearing after the function is done executing is the only sane
> semantics (and yes, I should fix the current perf code).
> 
> You can always miss an NMI since it can always happen before the
> callback gets done, and allowing another enqueue before the callback is
> complete is asking for trouble.

If we move entry->next = NULL before entry->func(entry), we will not
miss the NMI. Can you show how it could be missed in that case?

> > > +             /*
> > > +              * matches the mb in cmpxchg() in irq_work_queue()
> > > +              */
> > > +             smp_wmb();
> > > +     }
> > > +}
> > 
> > I don't know why we need smp_wmb() here and smp_rmb() in
> > irq_work_pending(). The smp_<x>mb() in original perf_pending_xxx code is
> > not necessary too. Because smp_<x>mb is invoked in wake_up_process() and
> > __wait_event() already.
> 
> The smp_wmb() wants to be before ->next = NULL; so that all writes are
> completed before we release the entry. To that same effect _sync() and
> _queue need the (r)mb.

It is reasonable that way.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  9:17       ` Huang Ying
@ 2010-06-25  9:23         ` Frederic Weisbecker
  2010-06-25  9:30           ` Huang Ying
  2010-06-25  9:30         ` Peter Zijlstra
  1 sibling, 1 reply; 66+ messages in thread
From: Frederic Weisbecker @ 2010-06-25  9:23 UTC (permalink / raw)
  To: Huang Ying
  Cc: Peter Zijlstra, Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

2010/6/25 Huang Ying <ying.huang@intel.com>:
> On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
>> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
>> >
>> > It is better to add "void *data" field in this struct to allow same
>> > function can be used for multiple struct irq_work.
>>
>> No, simply do:
>>
>> struct my_foo {
>>   struct irq_work work;
>>   /* my extra data */
>> }
>>
>> void my_func(struct irq_work *work)
>> {
>>   struct my_foo *foo = container_of(work, struct my_foo, work);
>>
>>   /* tada! */
>> }
>
> Yes. This works too. But Adding "void *data" field is helpful if you do
> not embed struct irq_work into another struct.



That's what makes most sense. If you use work->data to put foo, then
you can also do the opposite. Now the best is to pick the choice that
gives you a real type and a typechecking, and not an error-prone and
obfuscated void *

This is the way things are made in the kernel. struct work_struct, struct list,
struct rcu_head, etc... are all embedded into a container, so that we can
use container_of.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  9:23         ` Frederic Weisbecker
@ 2010-06-25  9:30           ` Huang Ying
  2010-06-25  9:44             ` Frederic Weisbecker
  0 siblings, 1 reply; 66+ messages in thread
From: Huang Ying @ 2010-06-25  9:30 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Peter Zijlstra, Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Fri, 2010-06-25 at 17:23 +0800, Frederic Weisbecker wrote:
> 2010/6/25 Huang Ying <ying.huang@intel.com>:
> > On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
> >> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
> >> >
> >> > It is better to add "void *data" field in this struct to allow same
> >> > function can be used for multiple struct irq_work.
> >>
> >> No, simply do:
> >>
> >> struct my_foo {
> >>   struct irq_work work;
> >>   /* my extra data */
> >> }
> >>
> >> void my_func(struct irq_work *work)
> >> {
> >>   struct my_foo *foo = container_of(work, struct my_foo, work);
> >>
> >>   /* tada! */
> >> }
> >
> > Yes. This works too. But Adding "void *data" field is helpful if you do
> > not embed struct irq_work into another struct.
>
> 
> That's what makes most sense. If you use work->data to put foo, then
> you can also do the opposite. Now the best is to pick the choice that
> gives you a real type and a typechecking, and not an error-prone and
> obfuscated void *
> 
> This is the way things are made in the kernel. struct work_struct, struct list,
> struct rcu_head, etc... are all embedded into a container, so that we can
> use container_of.

container_of does not provide full type checking either.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  9:17       ` Huang Ying
  2010-06-25  9:23         ` Frederic Weisbecker
@ 2010-06-25  9:30         ` Peter Zijlstra
  2010-06-25 11:58           ` huang ying
  1 sibling, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-25  9:30 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen

On Fri, 2010-06-25 at 17:17 +0800, Huang Ying wrote:
> On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
> > On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
> > > 
> > > It is better to add "void *data" field in this struct to allow same
> > > function can be used for multiple struct irq_work. 
> > 
> > No, simply do:
> > 
> > struct my_foo {
> >   struct irq_work work;
> >   /* my extra data */
> > }
> > 
> > void my_func(struct irq_work *work)
> > {
> >   struct my_foo *foo = container_of(work, struct my_foo, work);
> > 
> >   /* tada! */
> > }
> 
> Yes. This works too. But Adding "void *data" field is helpful if you do
> not embed struct irq_work into another struct.

No, embedding is the normal way we do this in the kernel.

> > > And I think IRQ is the implementation detail here, so irq_work is
> > > probably not a good name. nmi_return_notifier or nmi_callback is better?
> > 
> > Well, it's run in hard-irq context, so it's an irq work. There's nothing
> > that says it can only be used from NMI context.
> 
> It may be run in other contexts on some system (without APIC). 

I would consider that a BUG. Use a random IRQ source to process the
callbacks on these broken platforms.

> And I
> don't think it is useful to others except NMI handler. I think this is a
> choice between naming after implementation and purpose.

There is, although I'm sure people will yell at me for even proposing
this. You can raise the IPI from an IRQ disabled section to get
something done right after it.
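(A rough illustration of that non-NMI use, borrowing the call shape from
the example below; the entry and callback names are made up:)

	/*
	 * e.g. under rq->lock with IRQs disabled, where a direct
	 * wake_up_process() would recurse on the lock:
	 */
	irq_work_queue(&defer_work, do_wakeup);	/* made-up entry/callback */
	/*
	 * ... the self-IPI is taken as soon as IRQs are re-enabled, so
	 * do_wakeup() runs right after the critical section.
	 */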

> We can use another flag to signify whether it is executing. For example
> the bit 0 of entry->next.

There's no point.

> > I think clearing after the function is done executing is the only sane
> > semantics (and yes, I should fix the current perf code).
> > 
> > You can always miss an NMI since it can always happen before the
> > callback gets done, and allowing another enqueue before the callback is
> > complete is asking for trouble.
> 
> If we move entry->next = NULL before entry->func(entry), we will not
> miss the NMI. Can you show how to miss it in this way?

<NMI>
  ...
  irq_work_queue(&my_work, func);
  ...
<EOI>
<IPI>
  irq_work_run()

  <NMI>
    irq_work_queue(&my_work, func); <FAIL>
  <EOI>

   my_func.next = NULL;
<EOI>

Really not that hard. Now imagine wrapping irq_work in some state and
reusing that state while the function is still running...

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  9:30           ` Huang Ying
@ 2010-06-25  9:44             ` Frederic Weisbecker
  0 siblings, 0 replies; 66+ messages in thread
From: Frederic Weisbecker @ 2010-06-25  9:44 UTC (permalink / raw)
  To: Huang Ying; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Andi Kleen

2010/6/25 Huang Ying <ying.huang@intel.com>:
> On Fri, 2010-06-25 at 17:23 +0800, Frederic Weisbecker wrote:
>> 2010/6/25 Huang Ying <ying.huang@intel.com>:
>> > On Fri, 2010-06-25 at 15:48 +0800, Peter Zijlstra wrote:
>> >> On Fri, 2010-06-25 at 10:12 +0800, Huang Ying wrote:
>> >> >
>> >> > It is better to add "void *data" field in this struct to allow same
>> >> > function can be used for multiple struct irq_work.
>> >>
>> >> No, simply do:
>> >>
>> >> struct my_foo {
>> >>   struct irq_work work;
>> >>   /* my extra data */
>> >> }
>> >>
>> >> void my_func(struct irq_work *work)
>> >> {
>> >>   struct my_foo *foo = container_of(work, struct my_foo, work);
>> >>
>> >>   /* tada! */
>> >> }
>> >
>> > Yes. This works too. But Adding "void *data" field is helpful if you do
>> > not embed struct irq_work into another struct.
>>
>>
>> That's what makes most sense. If you use work->data to put foo, then
>> you can also do the opposite. Now the best is to pick the choice that
>> gives you a real type and a typechecking, and not an error-prone and
>> obfuscated void *
>>
>> This is the way things are made in the kernel. struct work_struct, struct list,
>> struct rcu_head, etc... are all embedded into a container, so that we can
>> use container_of.
>
> container_of does not provide full type checking either.


You're right. There is nothing that guarantees B is contained in A;
the code is supposed to provide this guarantee, but the type does not.

That said, it's much cleaner than playing with a void *data, besides the
fact that kernel developers will quickly understand what you are doing
with such a scheme, as they are used to it.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-24 12:38                                           ` Andi Kleen
@ 2010-06-25 10:38                                             ` Peter Zijlstra
  0 siblings, 0 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-25 10:38 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H. Peter Anvin, linux-kernel

On Thu, 2010-06-24 at 14:38 +0200, Andi Kleen wrote:
>  The sleepable 
> soft irq would have avoided that (that's not a show stopper) 

I'm still not convinced sleepable softirq is a workable thing.

Softirqs:
  A) are non-preemptible
  B) are per-cpu because of A
  C) can be run from ksoftirqd context
  D) generic kernel infrastructure with identical semantics on all archs

If you were to make something like a sleepable softirq, you'd lose A
(by definition), B (sleepable implies migratable to cpus_allowed) and
possibly D (unless you want to touch all architectures).

Now from your 'requirements':

> I have one case that needs to sleep (but only when interrupting user code)

> TIF works for user space, but it's a bit ugly because it requires adding
> more data to the task_struct because CPUs can change.

Which I read as:

 1) needs to run in the task context of the task that got 'interrupted'
 2) needs to stay on the cpu it got interrupted on.

So C is out of the window too, at which point there's nothing resembling
softirqs left.

To boot, x86_64 runs softirqs from the hardirq stack:

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
        CFI_STARTPROC
        push %rbp
        CFI_ADJUST_CFA_OFFSET   8
        CFI_REL_OFFSET rbp,0
        mov  %rsp,%rbp
        CFI_DEF_CFA_REGISTER rbp
        incl PER_CPU_VAR(irq_count)
        cmove PER_CPU_VAR(irq_stack_ptr),%rsp
        push  %rbp                      # backlink for old unwinder
        call __do_softirq
        leaveq
        CFI_DEF_CFA_REGISTER    rsp
        CFI_ADJUST_CFA_OFFSET   -8
        decl PER_CPU_VAR(irq_count)
        ret 
        CFI_ENDPROC
END(call_softirq) 

Also, -rt has something that could be considered sleepable softirqs,
although we call them preemptible softirqs. It runs all softirqs from
cpu bound kthreads, which again doesn't match your requirements.

So no, I don't think your idea of sleepable softirqs is sound.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work
  2010-06-25  9:30         ` Peter Zijlstra
@ 2010-06-25 11:58           ` huang ying
  0 siblings, 0 replies; 66+ messages in thread
From: huang ying @ 2010-06-25 11:58 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Huang Ying, Ingo Molnar, hpa, linux-kernel, Andi Kleen

On Fri, Jun 25, 2010 at 5:30 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>> > You can always miss an NMI since it can always happen before the
>> > callback gets done, and allowing another enqueue before the callback is
>> > complete is asking for trouble.
>>
>> If we move entry->next = NULL before entry->func(entry), we will not
>> miss the NMI. Can you show how to miss it in this way?
>
> <NMI>
>  ...
>  irq_work_queue(&my_work, func);
>  ...
> <EOI>
> <IPI>
>  irq_work_run()
>
>  <NMI>
>    irq_work_queue(&my_work, func); <FAIL>
>  <EOI>
>
>   my_func.next = NULL;

entry->func() should follow here. You can collect all the information
(maybe some data in a ring buffer) from the NMI handler in entry->func().
But if you place entry->next = NULL after entry->func(), you will really
lose an NMI notification and the information from the NMI handler.

> <EOI>

> Really not that hard. Now imagine wrapping irq_work in some state and
> you reusing the state while the function is still running..

So I suggest using another flag to signify that the function is running,
to distinguish the two cases.

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [RFC][PATCH] irq_work -v2
  2010-06-24  6:35 ` [RFC][PATCH] irq_work Peter Zijlstra
  2010-06-24  6:43   ` Huang Ying
  2010-06-25  2:12   ` Huang Ying
@ 2010-06-25 18:30   ` Peter Zijlstra
  2010-06-25 19:30     ` Andi Kleen
  2010-06-26  1:26     ` huang ying
  2 siblings, 2 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-25 18:30 UTC (permalink / raw)
  To: Huang Ying
  Cc: Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen, tglx, davem, paulus

Utterly untested, and you really need to visit all the touched architectures
if you want to make it useful.

---
Subject: irq_work: generic hard-irq context callbacks

To allow other NMI-context users to run things from hard-IRQ context,
extract the perf_event callback mechanism.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/alpha/include/asm/perf_event.h           |    9 -
 arch/frv/lib/perf_event.c                     |   19 ---
 arch/parisc/include/asm/perf_event.h          |    7 -
 arch/s390/include/asm/perf_event.h            |   10 -
 linux-2.6/arch/alpha/Kconfig                  |    1 
 linux-2.6/arch/arm/Kconfig                    |    1 
 linux-2.6/arch/arm/include/asm/perf_event.h   |   12 -
 linux-2.6/arch/arm/kernel/perf_event.c        |    4 
 linux-2.6/arch/frv/Kconfig                    |    1 
 linux-2.6/arch/parisc/Kconfig                 |    1 
 linux-2.6/arch/powerpc/Kconfig                |    1 
 linux-2.6/arch/powerpc/kernel/time.c          |   42 +++---
 linux-2.6/arch/s390/Kconfig                   |    1 
 linux-2.6/arch/sh/Kconfig                     |    1 
 linux-2.6/arch/sh/include/asm/perf_event.h    |    7 -
 linux-2.6/arch/sparc/Kconfig                  |    2 
 linux-2.6/arch/sparc/include/asm/perf_event.h |    4 
 linux-2.6/arch/sparc/kernel/pcr.c             |    8 -
 linux-2.6/arch/x86/Kconfig                    |    1 
 linux-2.6/arch/x86/include/asm/entry_arch.h   |    4 
 linux-2.6/arch/x86/include/asm/hw_irq.h       |    2 
 linux-2.6/arch/x86/kernel/Makefile            |    1 
 linux-2.6/arch/x86/kernel/cpu/perf_event.c    |   19 ---
 linux-2.6/arch/x86/kernel/entry_64.S          |    4 
 linux-2.6/arch/x86/kernel/irq_work.c          |   26 ++++
 linux-2.6/arch/x86/kernel/irqinit.c           |    4 
 linux-2.6/include/linux/irq_work.h            |   20 +++
 linux-2.6/include/linux/perf_event.h          |   11 -
 linux-2.6/init/Kconfig                        |    8 +
 linux-2.6/kernel/Makefile                     |    2 
 linux-2.6/kernel/irq_work.c                   |  157 ++++++++++++++++++++++++++
 linux-2.6/kernel/perf_event.c                 |  102 ----------------
 linux-2.6/kernel/timer.c                      |    4 
 33 files changed, 266 insertions(+), 230 deletions(-)

Index: linux-2.6/include/linux/irq_work.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/irq_work.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_IRQ_WORK_H
+#define _LINUX_IRQ_WORK_H
+
+struct irq_work {
+	struct irq_work *next;
+	void (*func)(struct irq_work *);
+};
+
+static inline
+void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+{
+	entry->next = NULL;
+	entry->func = func;
+}
+
+int irq_work_queue(struct irq_work *entry);
+void irq_work_run(void);
+void irq_work_sync(struct irq_work *entry);
+
+#endif /* _LINUX_IRQ_WORK_H */
Index: linux-2.6/kernel/irq_work.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/irq_work.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+
+/*
+ * An entry can be in one of four states:
+ *
+ * free	     NULL, 0 -> {claimed}       : free to be used
+ * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
+ * pending   next, 3 -> {busy}          : queued, pending callback
+ * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ *
+ * We use the lower two bits of the next pointer to keep PENDING and BUSY
+ * flags.
+ */
+
+#define IRQ_WORK_PENDING	1UL
+#define IRQ_WORK_BUSY		2UL
+#define IRQ_WORK_FLAGS		3UL
+
+static inline bool irq_work_is_set(struct irq_work *entry, int flags)
+{
+	return (unsigned long)entry->next & flags;
+}
+
+static inline struct irq_work *irq_work_next(struct irq_work *entry)
+{
+	unsigned long next = (unsigned long)entry->next;
+	next &= ~IRQ_WORK_FLAGS;
+	return (struct irq_work *)next;
+}
+
+static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
+{
+	unsigned long next = (unsigned long)entry;
+	next |= flags;
+	return (struct irq_work *)next;
+}
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+
+/*
+ * Claim the entry so that no one else will poke at it.
+ */
+static bool irq_work_claim(struct irq_work *entry)
+{
+	unsigned long flags;
+
+	do {
+		flags = (unsigned long)entry->next;
+		if (flags & IRQ_WORK_PENDING)
+			return false;
+	} while (cmpxchg(&entry->next, flags, flags | IRQ_WORK_FLAGS) != flags);
+
+	return true;
+}
+
+
+void __weak arch_irq_work_raise(void)
+{
+	/*
+	 * Lame architectures will get the timer tick callback
+	 */
+}
+
+/*
+ * Queue the entry and raise the IPI if needed.
+ */
+static void __irq_work_queue(struct irq_work *entry)
+{
+	struct irq_work **head;
+
+	head = &get_cpu_var(irq_work_list);
+
+	do {
+		/*
+		 * Can assign non-atomic because we keep the flags set.
+		 */
+		entry->next = next_flags(*head, IRQ_WORK_FLAGS);
+	} while (cmpxchg(head, entry->next, entry) != entry->next);
+
+	/*
+	 * The list was empty, raise self-interrupt to start processing.
+	 */
+	if (!irq_work_next(entry))
+		arch_irq_work_raise();
+
+	put_cpu_var(irq_work_list);
+}
+
+/*
+ * Enqueue the irq_work @entry, returns true on success, failure when the
+ * @entry was already enqueued by someone else.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue(struct irq_work *entry)
+{
+	if (!irq_work_claim(entry)) {
+		/*
+		 * Already enqueued, can't do!
+		 */
+		return false;
+	}
+
+	__irq_work_queue(entry);
+	return true;
+}
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+	struct irq_work *list;
+
+	BUG_ON(!in_irq());
+	BUG_ON(!irqs_disabled());
+
+	list = xchg(&__get_cpu_var(irq_work_list), NULL);
+	while (list != NULL) {
+		struct irq_work *entry = list;
+
+		list = irq_work_next(list);
+
+		/*
+		 * Clear the PENDING bit, after this point the @entry
+		 * can be re-used.
+		 */
+		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
+		entry->func(entry);
+		/*
+		 * Clear the BUSY bit and return to the free state if
+		 * no-one else claimed it meanwhile.
+		 */
+		cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
+	}
+}
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *entry)
+{
+	WARN_ON_ONCE(irqs_disabled());
+
+	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+		cpu_relax();
+}
Index: linux-2.6/arch/alpha/Kconfig
===================================================================
--- linux-2.6.orig/arch/alpha/Kconfig
+++ linux-2.6/arch/alpha/Kconfig
@@ -9,6 +9,7 @@ config ALPHA
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_DMA_ATTRS
 	help
Index: linux-2.6/arch/alpha/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/alpha/include/asm/perf_event.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_ALPHA_PERF_EVENT_H
-#define __ASM_ALPHA_PERF_EVENT_H
-
-/* Alpha only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
-#endif /* __ASM_ALPHA_PERF_EVENT_H */
Index: linux-2.6/arch/arm/Kconfig
===================================================================
--- linux-2.6.orig/arch/arm/Kconfig
+++ linux-2.6/arch/arm/Kconfig
@@ -22,6 +22,7 @@ config ARM
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZMA
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	help
Index: linux-2.6/arch/arm/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/arm/include/asm/perf_event.h
+++ linux-2.6/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts enabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
 /* ARM performance counters start from 1 (in the cp15 accesses) so use the
  * same indexes here for consistency. */
 #define PERF_EVENT_INDEX_OFFSET 1
Index: linux-2.6/arch/frv/Kconfig
===================================================================
--- linux-2.6.orig/arch/frv/Kconfig
+++ linux-2.6/arch/frv/Kconfig
@@ -7,6 +7,7 @@ config FRV
 	default y
 	select HAVE_IDE
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 
 config ZONE_DMA
Index: linux-2.6/arch/frv/lib/perf_event.c
===================================================================
--- linux-2.6.orig/arch/frv/lib/perf_event.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance event handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/perf_event.h>
-
-/*
- * mark the performance event as pending
- */
-void set_perf_event_pending(void)
-{
-}
Index: linux-2.6/arch/parisc/Kconfig
===================================================================
--- linux-2.6.orig/arch/parisc/Kconfig
+++ linux-2.6/arch/parisc/Kconfig
@@ -16,6 +16,7 @@ config PARISC
 	select RTC_DRV_GENERIC
 	select INIT_ALL_POSSIBLE
 	select BUG
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select GENERIC_ATOMIC64 if !64BIT
 	help
Index: linux-2.6/arch/parisc/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/parisc/include/asm/perf_event.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __ASM_PARISC_PERF_EVENT_H
-#define __ASM_PARISC_PERF_EVENT_H
-
-/* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
-
-#endif /* __ASM_PARISC_PERF_EVENT_H */
Index: linux-2.6/arch/powerpc/Kconfig
===================================================================
--- linux-2.6.orig/arch/powerpc/Kconfig
+++ linux-2.6/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS if PPC64
 	select GENERIC_ATOMIC64 if PPC32
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
 
Index: linux-2.6/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/time.c
+++ linux-2.6/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
 #include <linux/delay.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <asm/trace.h>
 
 #include <asm/io.h>
@@ -532,60 +532,60 @@ void __init iSeries_time_init_early(void
 }
 #endif /* CONFIG_PPC_ISERIES */
 
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
 
 /*
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
 #ifdef CONFIG_PPC64
-static inline unsigned long test_perf_event_pending(void)
+static inline unsigned long test_irq_work_pending(void)
 {
 	unsigned long x;
 
 	asm volatile("lbz %0,%1(13)"
 		: "=r" (x)
-		: "i" (offsetof(struct paca_struct, perf_event_pending)));
+		: "i" (offsetof(struct paca_struct, irq_work_pending)));
 	return x;
 }
 
-static inline void set_perf_event_pending_flag(void)
+static inline void set_irq_work_pending_flag(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (1),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
 }
 
-static inline void clear_perf_event_pending(void)
+static inline void clear_irq_work_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (0),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
 }
 
 #else /* 32-bit */
 
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
 
-#define set_perf_event_pending_flag()	__get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending()	__get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending()	__get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag()	__get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()		__get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()	__get_cpu_var(irq_work_pending) = 0
 
 #endif /* 32 vs 64 bit */
 
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
 {
 	preempt_disable();
-	set_perf_event_pending_flag();
+	set_irq_work_pending_flag();
 	set_dec(1);
 	preempt_enable();
 }
 
-#else  /* CONFIG_PERF_EVENTS */
+#else  /* CONFIG_IRQ_WORK */
 
-#define test_perf_event_pending()	0
-#define clear_perf_event_pending()
+#define test_irq_work_pending()	0
+#define clear_irq_work_pending()
 
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
 
 /*
  * For iSeries shared processors, we have to let the hypervisor
@@ -635,9 +635,9 @@ void timer_interrupt(struct pt_regs * re
 
 	calculate_steal_time();
 
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
+	if (test_irq_work_pending()) {
+		clear_irq_work_pending();
+		irq_work_run();
 	}
 
 #ifdef CONFIG_PPC_ISERIES
Index: linux-2.6/arch/s390/Kconfig
===================================================================
--- linux-2.6.orig/arch/s390/Kconfig
+++ linux-2.6/arch/s390/Kconfig
@@ -98,6 +98,7 @@ config S390
 	select HAVE_KVM if 64BIT
 	select HAVE_ARCH_TRACEHOOK
 	select INIT_ALL_POSSIBLE
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
Index: linux-2.6/arch/s390/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/s390/include/asm/perf_event.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * Performance event support - s390 specific definitions.
- *
- * Copyright 2009 Martin Schwidefsky, IBM Corporation.
- */
-
-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
-
-#define PERF_EVENT_INDEX_OFFSET 0
Index: linux-2.6/arch/sh/Kconfig
===================================================================
--- linux-2.6.orig/arch/sh/Kconfig
+++ linux-2.6/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
Index: linux-2.6/arch/sh/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/sh/include/asm/perf_event.h
+++ linux-2.6/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,4 @@ extern int register_sh_pmu(struct sh_pmu
 extern int reserve_pmc_hardware(void);
 extern void release_pmc_hardware(void);
 
-static inline void set_perf_event_pending(void)
-{
-	/* Nothing to see here, move along. */
-}
-
-#define PERF_EVENT_INDEX_OFFSET	0
-
 #endif /* __ASM_SH_PERF_EVENT_H */
Index: linux-2.6/arch/sparc/Kconfig
===================================================================
--- linux-2.6.orig/arch/sparc/Kconfig
+++ linux-2.6/arch/sparc/Kconfig
@@ -25,6 +25,7 @@ config SPARC
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select RTC_CLASS
 	select RTC_DRV_M48T59
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_DMA_ATTRS
@@ -52,6 +53,7 @@ config SPARC64
 	select RTC_DRV_BQ4802
 	select RTC_DRV_SUN4V
 	select RTC_DRV_STARFIRE
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 
Index: linux-2.6/arch/sparc/include/asm/perf_event.h
===================================================================
--- linux-2.6.orig/arch/sparc/include/asm/perf_event.h
+++ linux-2.6/arch/sparc/include/asm/perf_event.h
@@ -1,10 +1,6 @@
 #ifndef __ASM_SPARC_PERF_EVENT_H
 #define __ASM_SPARC_PERF_EVENT_H
 
-extern void set_perf_event_pending(void);
-
-#define	PERF_EVENT_INDEX_OFFSET	0
-
 #ifdef CONFIG_PERF_EVENTS
 #include <asm/ptrace.h>
 
Index: linux-2.6/arch/sparc/kernel/pcr.c
===================================================================
--- linux-2.6.orig/arch/sparc/kernel/pcr.c
+++ linux-2.6/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
 #include <linux/init.h>
 #include <linux/irq.h>
 
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <linux/ftrace.h>
 
 #include <asm/pil.h>
@@ -43,14 +43,14 @@ void __irq_entry deferred_pcr_work_irq(i
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
-#ifdef CONFIG_PERF_EVENTS
-	perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+	irq_work_run();
 #endif
 	irq_exit();
 	set_irq_regs(old_regs);
 }
 
-void set_perf_event_pending(void)
+void arch_irq_work_raise(void)
 {
 	set_softint(1 << PIL_DEFERRED_PCR_WORK);
 }
Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -55,6 +55,7 @@ config X86
 	select HAVE_HW_BREAKPOINT
 	select HAVE_MIXED_BREAKPOINTS_REGS
 	select PERF_EVENTS
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
Index: linux-2.6/arch/x86/include/asm/entry_arch.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/entry_arch.h
+++ linux-2.6/arch/x86/include/asm/entry_arch.h
@@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOC
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt,LOCAL_PENDING_VECTOR)
 #endif
 
 #ifdef CONFIG_X86_THERMAL_VECTOR
Index: linux-2.6/arch/x86/include/asm/hw_irq.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/hw_irq.h
+++ linux-2.6/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
 extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);
 
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
Index: linux-2.6/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/kernel/Makefile
+++ linux-2.6/arch/x86/kernel/Makefile
@@ -33,6 +33,7 @@ obj-y			:= process_$(BITS).o signal.o en
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o
+obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
Index: linux-2.6/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_event.c
@@ -1170,25 +1170,6 @@ static int x86_pmu_handle_irq(struct pt_
 	return handled;
 }
 
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	ack_APIC_irq();
-	inc_irq_stat(apic_pending_irqs);
-	perf_event_do_pending();
-	irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (!x86_pmu.apic || !x86_pmu_initialized())
-		return;
-
-	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
 void perf_events_lapic_init(void)
 {
 	if (!x86_pmu.apic || !x86_pmu_initialized())
Index: linux-2.6/arch/x86/kernel/entry_64.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/entry_64.S
+++ linux-2.6/arch/x86/kernel/entry_64.S
@@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
 apicinterrupt LOCAL_PENDING_VECTOR \
-	perf_pending_interrupt smp_perf_pending_interrupt
+	irq_work_interrupt smp_irq_work_interrupt
 #endif
 
 /*
Index: linux-2.6/arch/x86/kernel/irq_work.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/kernel/irq_work.c
@@ -0,0 +1,26 @@
+
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_pending_irqs);
+	irq_work_run();
+	irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (!cpu_as_apic)
+		return;
+
+	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+	apic_wait_icr_idle();
+#endif
+}
+
+
Index: linux-2.6/arch/x86/kernel/irqinit.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/irqinit.c
+++ linux-2.6/arch/x86/kernel/irqinit.c
@@ -225,8 +225,8 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
 	/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
-	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# ifdef CONFIG_IRQ_WORK
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, irq_work_interrupt);
 # endif
 
 #endif
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -486,6 +486,7 @@ struct perf_guest_info_callbacks {
 #include <linux/workqueue.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/irq_work.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
 
@@ -629,11 +630,6 @@ struct perf_buffer {
 	void				*data_pages[0];
 };
 
-struct perf_pending_entry {
-	struct perf_pending_entry *next;
-	void (*func)(struct perf_pending_entry *);
-};
-
 struct perf_sample_data;
 
 typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -741,7 +737,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
-	struct perf_pending_entry	pending;
+	struct irq_work			pending;
 
 	atomic_t			event_limit;
 
@@ -853,8 +849,6 @@ extern void perf_event_task_tick(struct 
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
 extern void perf_event_print_debug(void);
 extern void __perf_disable(void);
 extern bool __perf_enable(void);
@@ -1028,7 +1022,6 @@ perf_event_task_tick(struct task_struct 
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
-static inline void perf_event_do_pending(void)				{ }
 static inline void perf_event_print_debug(void)				{ }
 static inline void perf_disable(void)					{ }
 static inline void perf_enable(void)					{ }
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -21,6 +21,13 @@ config CONSTRUCTORS
 	depends on !UML
 	default y
 
+config HAVE_IRQ_WORK
+	bool
+
+config IRQ_WORK
+	bool
+	depends on HAVE_IRQ_WORK
+
 menu "General setup"
 
 config EXPERIMENTAL
@@ -983,6 +990,7 @@ config PERF_EVENTS
 	default y if (PROFILING || PERF_COUNTERS)
 	depends on HAVE_PERF_EVENTS
 	select ANON_INODES
+	select IRQ_WORK
 	help
 	  Enable kernel support for various performance events provided
 	  by software and hardware.
Index: linux-2.6/kernel/Makefile
===================================================================
--- linux-2.6.orig/kernel/Makefile
+++ linux-2.6/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -101,6 +102,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
Index: linux-2.6/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/kernel/perf_event.c
+++ linux-2.6/kernel/perf_event.c
@@ -1880,12 +1880,11 @@ static void free_event_rcu(struct rcu_he
 	kfree(event);
 }
 
-static void perf_pending_sync(struct perf_event *event);
 static void perf_buffer_put(struct perf_buffer *buffer);
 
 static void free_event(struct perf_event *event)
 {
-	perf_pending_sync(event);
+	irq_work_sync(&event->pending);
 
 	if (!event->parent) {
 		atomic_dec(&nr_events);
@@ -2829,15 +2828,6 @@ void perf_event_wakeup(struct perf_event
 	}
 }
 
-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
 static void perf_pending_event(struct perf_pending_entry *entry)
 {
 	struct perf_event *event = container_of(entry,
@@ -2854,89 +2844,6 @@ static void perf_pending_event(struct pe
 	}
 }
 
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-	PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
-			       void (*func)(struct perf_pending_entry *))
-{
-	struct perf_pending_entry **head;
-
-	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-		return;
-
-	entry->func = func;
-
-	head = &get_cpu_var(perf_pending_head);
-
-	do {
-		entry->next = *head;
-	} while (cmpxchg(head, entry->next, entry) != entry->next);
-
-	set_perf_event_pending();
-
-	put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
-	struct perf_pending_entry *list;
-	int nr = 0;
-
-	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-	while (list != PENDING_TAIL) {
-		void (*func)(struct perf_pending_entry *);
-		struct perf_pending_entry *entry = list;
-
-		list = list->next;
-
-		func = entry->func;
-		entry->next = NULL;
-		/*
-		 * Ensure we observe the unqueue before we issue the wakeup,
-		 * so that we won't be waiting forever.
-		 * -- see perf_not_pending().
-		 */
-		smp_wmb();
-
-		func(entry);
-		nr++;
-	}
-
-	return nr;
-}
-
-static inline int perf_not_pending(struct perf_event *event)
-{
-	/*
-	 * If we flush on whatever cpu we run, there is a chance we don't
-	 * need to wait.
-	 */
-	get_cpu();
-	__perf_pending_run();
-	put_cpu();
-
-	/*
-	 * Ensure we see the proper queue state before going to sleep
-	 * so that we do not miss the wakeup. -- see perf_pending_handle()
-	 */
-	smp_rmb();
-	return event->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_event *event)
-{
-	wait_event(event->waitq, perf_not_pending(event));
-}
-
-void perf_event_do_pending(void)
-{
-	__perf_pending_run();
-}
-
 /*
  * Callchain support -- arch specific
  */
@@ -2996,8 +2903,7 @@ static void perf_output_wakeup(struct pe
 
 	if (handle->nmi) {
 		handle->event->pending_wakeup = 1;
-		perf_pending_queue(&handle->event->pending,
-				   perf_pending_event);
+		irq_work_queue(&handle->event->pending);
 	} else
 		perf_event_wakeup(handle->event);
 }
@@ -3988,8 +3894,7 @@ static int __perf_event_overflow(struct 
 		event->pending_kill = POLL_HUP;
 		if (nmi) {
 			event->pending_disable = 1;
-			perf_pending_queue(&event->pending,
-					   perf_pending_event);
+			irq_work_queue(&event->pending);
 		} else
 			perf_event_disable(event);
 	}
@@ -4841,6 +4746,7 @@ perf_event_alloc(struct perf_event_attr 
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
 	init_waitqueue_head(&event->waitq);
+	irq_work_init(&event->pending, perf_pending_event);
 
 	mutex_init(&event->mmap_mutex);
 
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -37,7 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -1260,7 +1260,7 @@ void update_process_times(int user_tick)
 	run_local_timers();
 	rcu_check_callbacks(cpu, user_tick);
 	printk_tick();
-	perf_event_do_pending();
+	irq_work_run();
 	scheduler_tick();
 	run_posix_cpu_timers(p);
 }
Index: linux-2.6/arch/arm/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/arch/arm/kernel/perf_event.c
+++ linux-2.6/arch/arm/kernel/perf_event.c
@@ -1045,7 +1045,7 @@ armv6pmu_handle_irq(int irq_num,
 	 * platforms that can have the PMU interrupts raised as a PMI, this
 	 * will not work.
 	 */
-	perf_event_do_pending();
+	irq_work_run();
 
 	return IRQ_HANDLED;
 }
@@ -2021,7 +2021,7 @@ static irqreturn_t armv7pmu_handle_irq(i
 	 * platforms that can have the PMU interrupts raised as a PMI, this
 	 * will not work.
 	 */
-	perf_event_do_pending();
+	irq_work_run();
 
 	return IRQ_HANDLED;
 }
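
(For readers skimming the diff, a minimal usage sketch of the API added
above; the handler names are made up and error handling is omitted:)

#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct irq_work, my_work);

/* runs in hard-IRQ context, with IRQs disabled, shortly after the NMI */
static void my_deferred_work(struct irq_work *work)
{
	/* e.g. wake a consumer, log, or de-schedule an event */
}

static void my_nmi_handler(void)
{
	/* NMI-safe: queueing only uses cmpxchg on a per-cpu list */
	irq_work_queue(&__get_cpu_var(my_work));
}

static int __init my_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		init_irq_work(&per_cpu(my_work, cpu), my_deferred_work);
	return 0;
}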


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-25 18:30   ` [RFC][PATCH] irq_work -v2 Peter Zijlstra
@ 2010-06-25 19:30     ` Andi Kleen
  2010-06-25 19:39       ` Peter Zijlstra
  2010-06-25 19:47       ` Peter Zijlstra
  2010-06-26  1:26     ` huang ying
  1 sibling, 2 replies; 66+ messages in thread
From: Andi Kleen @ 2010-06-25 19:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, Andi Kleen,
	tglx, davem, paulus

On Fri, Jun 25, 2010 at 08:30:25PM +0200, Peter Zijlstra wrote:

I'm not sure what all the logic for entry enqueued by someone
else is good for? Is that for the case you don't have enough
entries preallocated and you share them with someone else?

Normally if the sharing is per cpu that would be difficult 
to recover from because if it's due to a nest situation (for example)
you would deadlock.

For me it would seem simpler to simply not share.

> +	struct irq_work *list;
> +
> +	BUG_ON(!in_irq());
> +	BUG_ON(!irqs_disabled());
> +
> +	list = xchg(&__get_cpu_var(irq_work_list), NULL);
> +	while (list != NULL) {
> +		struct irq_work *entry = list;
> +
> +		list = irq_work_next(list);
> +
> +		/*
> +		 * Clear the PENDING bit, after this point the @entry
> +		 * can be re-used.
> +		 */
> +		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
> +		entry->func(entry);

Needs compiler memory barrier here I think.

> +		/*
> +		 * Clear the BUSY bit and return to the free state if
> +		 * no-one else claimed it meanwhile.
> +		 */
> +		cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
> +	}
> +}

-Andi

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-25 19:30     ` Andi Kleen
@ 2010-06-25 19:39       ` Peter Zijlstra
  2010-06-25 19:49         ` Peter Zijlstra
  2010-06-25 22:29         ` Andi Kleen
  2010-06-25 19:47       ` Peter Zijlstra
  1 sibling, 2 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-25 19:39 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, tglx, davem, paulus

On Fri, 2010-06-25 at 21:30 +0200, Andi Kleen wrote:
> 
> I'm not sure what all the logic for entry enqueued by someone
> else is good for? Is that for the case you don't have enough
> entries preallocated and you share them with someone else?
> 
> Normally if the sharing is per cpu that would be difficult 
> to recover from because if it's due to a nest situation (for example)
> you would deadlock.
> 
> For me it would seem simpler to simply not share.

perf has two different reasons for the callback; what I do is set the
state and enqueue, and if it's already enqueued the pending callback
will handle both.

It's cheaper than having two callback structures per event.

We can expose the claim/enqueue thing separately so that users can
choose.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-25 19:30     ` Andi Kleen
  2010-06-25 19:39       ` Peter Zijlstra
@ 2010-06-25 19:47       ` Peter Zijlstra
  1 sibling, 0 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-25 19:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, tglx, davem, paulus

On Fri, 2010-06-25 at 21:30 +0200, Andi Kleen wrote:
> > +             entry->next = next_flags(NULL, IRQ_WORK_BUSY);
> > +             entry->func(entry);
> 
> Needs compiler memory barrier here I think.
> 
> > +             /*
> > +              * Clear the BUSY bit and return to the free state if
> > +              * no-one else claimed it meanwhile.
> > +              */
> > +             cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
> > +     } 

Both the (indirect) function call and the cmpxchg imply a compiler
barrier.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-25 19:39       ` Peter Zijlstra
@ 2010-06-25 19:49         ` Peter Zijlstra
  2010-06-25 22:29         ` Andi Kleen
  1 sibling, 0 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-25 19:49 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, tglx, davem, paulus

On Fri, 2010-06-25 at 21:39 +0200, Peter Zijlstra wrote:
> On Fri, 2010-06-25 at 21:30 +0200, Andi Kleen wrote:
> > 
> > I'm not sure what all the logic for entry enqueued by someone
> > else is good for? Is that for the case you don't have enough
> > entries preallocated and you share them with someone else?
> > 
> > Normally if the sharing is per cpu that would be difficult 
> > to recover from because if it's due to a nest situation (for example)
> > you would deadlock.
> > 
> > For me it would seem simpler to simply not share.
> 
> perf has two different reasons for the callback; what I do is set the
> state and enqueue, and if it's already enqueued the pending callback
> will handle both.
> 
> It's cheaper than having two callback structures per event.
> 
> We can expose the claim/enqueue thing separately so that users can
> choose.

Also, it's possible the PMI hits again before the IRQ callback has a
chance to happen.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-25 19:39       ` Peter Zijlstra
  2010-06-25 19:49         ` Peter Zijlstra
@ 2010-06-25 22:29         ` Andi Kleen
  2010-06-26  8:36           ` Peter Zijlstra
  1 sibling, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-25 22:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel,
	tglx, davem, paulus

> perf has two different reasons for the callback; what I do is set the
> state and enqueue, and if it's already enqueued the pending callback
> will handle both.
> 
> It's cheaper than having two callback structures per event.

Again it sounds like you just need a bit...
> 
> We can expose the claim/enqueue thing separately so that users can
> choose.

Yes it would be good to separate that, because I doubt other users
will require similar hacks.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-25 18:30   ` [RFC][PATCH] irq_work -v2 Peter Zijlstra
  2010-06-25 19:30     ` Andi Kleen
@ 2010-06-26  1:26     ` huang ying
  1 sibling, 0 replies; 66+ messages in thread
From: huang ying @ 2010-06-26  1:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Huang Ying, Ingo Molnar, H.Peter Anvin, linux-kernel, Andi Kleen,
	tglx, davem, paulus

On Sat, Jun 26, 2010 at 2:30 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> +
> +static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
> +
> +/*
> + * Claim the entry so that no one else will poke at it.
> + */
> +static bool irq_work_claim(struct irq_work *entry)
> +{
> +       unsigned long flags;
> +
> +       do {
> +               flags = (unsigned long)entry->next;
> +               if (flags & IRQ_WORK_PENDING)
> +                       return false;
> +       } while (cmpxchg(&entry->next, flags, flags | IRQ_WORK_FLAGS) != flags);
> +
> +       return true;
> +}
> +
> +
> +void __weak arch_irq_work_raise(void)
> +{
> +       /*
> +        * Lame architectures will get the timer tick callback
> +        */
> +}
> +
> +/*
> + * Queue the entry and raise the IPI if needed.
> + */
> +static void __irq_work_queue(struct irq_work *entry)
> +{
> +       struct irq_work **head;
> +
> +       head = &get_cpu_var(irq_work_list);
> +
> +       do {
> +               /*
> +                * Can assign non-atomic because we keep the flags set.
> +                */
> +               entry->next = next_flags(*head, IRQ_WORK_FLAGS);
> +       } while (cmpxchg(head, entry->next, entry) != entry->next);

*head & IRQ_WORK_FLAGS == 0, but entry->next & IRQ_WORK_FLAGS ==
IRQ_WORK_FLAGS. So the cmpxchg will never succeed.

> +
> +       /*
> +        * The list was empty, raise self-interrupt to start processing.
> +        */
> +       if (!irq_work_next(entry))
> +               arch_irq_work_raise();
> +
> +       put_cpu_var(irq_work_list);
> +}
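
(One way the enqueue loop could avoid that, as a rough sketch: keep the
raw old head in a local and compare against it, instead of against the
flag-tagged value stored in entry->next:)

static void __irq_work_queue(struct irq_work *entry)
{
	struct irq_work *next, **head;

	head = &get_cpu_var(irq_work_list);

	do {
		next = *head;	/* raw pointer, no flag bits */
		/*
		 * Can assign non-atomic because we keep the flags set.
		 */
		entry->next = next_flags(next, IRQ_WORK_FLAGS);
	} while (cmpxchg(head, next, entry) != next);

	/*
	 * The list was empty, raise self-interrupt to start processing.
	 */
	if (!next)
		arch_irq_work_raise();

	put_cpu_var(irq_work_list);
}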

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-25 22:29         ` Andi Kleen
@ 2010-06-26  8:36           ` Peter Zijlstra
  2010-06-26 10:08             ` Andi Kleen
  0 siblings, 1 reply; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-26  8:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, tglx, davem, paulus

On Sat, 2010-06-26 at 00:29 +0200, Andi Kleen wrote:

> Yes it would be good to separate that, because I doubt other users
> will require similar hacks.

You're such a constructive critic..

I would think every NMI user would need them since NMI can interrupt at
any time, and if you have a limited number of irq_work structs (like 1
per cpu) you'll end up with wanting to enqueue an already enqueued one
at some point.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-26  8:36           ` Peter Zijlstra
@ 2010-06-26 10:08             ` Andi Kleen
  2010-06-26 10:32               ` Peter Zijlstra
  0 siblings, 1 reply; 66+ messages in thread
From: Andi Kleen @ 2010-06-26 10:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Huang Ying, Ingo Molnar, H.PeterA, linux-kernel,
	tglx, davem, paulus

On Sat, Jun 26, 2010 at 10:36:45AM +0200, Peter Zijlstra wrote:
> On Sat, 2010-06-26 at 00:29 +0200, Andi Kleen wrote:
> 
> > Yes it would be good to separate that, because I doubt other users
> > will require similar hacks.
> 
> You're such a constructive critic..

Well I'm only adapting to your tone (FWIW I thought your original
description of Ying's patches was bordering on unfair, not quoting
the words back to you). I find it also always interesting when
people who always dish out with full hands are quite sensitive themselves...

But yes we can agree to not use such tone, if that's a mutual agreement.

> I would think every NMI user would need them since NMI can interrupt at
> any time, and if you have a limited number of irq_work structs (like 1
> per cpu) you'll end up with wanting to enqueue an already enqueued one
> at some point.

You could as well drop the excessive event. In fact it surprises me that you 
don't simply do that in perf. The state should be in the PMU registers 
anyways, so you'll pick it up from there (and if you get NMIs so quickly that
you cannot process them you have to eventually throttle by dropping anyways)

With the reuse methodology you end up with the same problem anyways, it
just shifts it slightly.

For fatal NMIs it's more like: if the error is fatal then the NMI handler
will stop and if it's non fatal it can be dropped on overload.
For overload situations there needs to be a dropping mechanism, spinning
is not ok because you don't know if the current owner isn't on your
own CPU.

Some of the other errors cannot be dropped, but these need other
mechanisms anyways.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [RFC][PATCH] irq_work -v2
  2010-06-26 10:08             ` Andi Kleen
@ 2010-06-26 10:32               ` Peter Zijlstra
  0 siblings, 0 replies; 66+ messages in thread
From: Peter Zijlstra @ 2010-06-26 10:32 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H.PeterA, linux-kernel, tglx, davem, paulus

On Sat, 2010-06-26 at 12:08 +0200, Andi Kleen wrote:
> You could as well drop the excessive event. In fact it surprises me that you 
> don't simply do that in perf. The state should be in the PMU registers 
> anyways, so you'll pick it up from there (and if you get NMIs so quickly that
> you cannot process them you have to eventually throttle by dropping anyways)

I'm not quite seeing what you mean, the PMU state is reset on PMI, it
doesn't know about previous overflows, and it most certainly doesn't
know if for the previous event we had to wake the buffer consumers.

For non-uniform events (like basically everything but
cycles/instructions) you can get high bursts of PMIs, only when the rate
stays too high will we eventually throttle, but we can (and should) deal
with short periods of high rate PMIs.

The PMU is stopped during the PMI, we write out data to the buffer and
possibly queue a callback to wake the consumers and/or de-schedule the
event.

By setting the pending op state and enqueueing the callback we
ensure the pending op isn't lost/ignored. If there was already one in
flight it will pick up our pending state, if not we just queued a new
one.
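
(Roughly the pattern being described, sketched with the perf names that
appear in the patch earlier in this thread; simplified, not the literal
code. The NMI side sets event->pending_wakeup and/or
event->pending_disable and then calls irq_work_queue(&event->pending),
which is a no-op if the entry is already pending; the callback then
does:)

static void perf_pending_event(struct irq_work *entry)
{
	struct perf_event *event =
		container_of(entry, struct perf_event, pending);

	if (event->pending_disable) {
		event->pending_disable = 0;
		/* ... de-schedule the event ... */
	}
	if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		perf_event_wakeup(event);
	}
}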

In fact Huang argued for something very similar, so I assumed he
actually needed this too.


^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2010-06-26 10:32 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-24  3:04 [RFC 1/5] Make soft_irq NMI safe Huang Ying
2010-06-24  3:04 ` [RFC 2/5] NMI return notifier Huang Ying
2010-06-24  3:04 ` [RFC 3/5] x86, trigger NMI return notifier soft_irq earlier Huang Ying
2010-06-24  6:03   ` Peter Zijlstra
2010-06-24  3:04 ` [RFC 4/5] x86, Use NMI return notifier in MCE Huang Ying
2010-06-24 10:00   ` Andi Kleen
2010-06-24  3:04 ` [RFC 5/5] Use NMI return notifier in perf pending Huang Ying
2010-06-24  6:00   ` Peter Zijlstra
2010-06-24  6:09 ` [RFC 1/5] Make soft_irq NMI safe Peter Zijlstra
2010-06-24  6:45   ` Huang Ying
2010-06-24  6:35 ` [RFC][PATCH] irq_work Peter Zijlstra
2010-06-24  6:43   ` Huang Ying
2010-06-24  6:47     ` Peter Zijlstra
2010-06-24  6:50       ` Huang Ying
2010-06-24  6:58         ` Peter Zijlstra
2010-06-24  7:04           ` Huang Ying
2010-06-24  7:19             ` Peter Zijlstra
2010-06-24  7:27               ` Huang Ying
2010-06-24  7:32                 ` Peter Zijlstra
2010-06-24 10:27                   ` Andi Kleen
2010-06-24 10:30                     ` Peter Zijlstra
2010-06-24 10:52                       ` Andi Kleen
2010-06-24 10:58                         ` Peter Zijlstra
2010-06-24 11:08                           ` Andi Kleen
2010-06-24 11:10                             ` Peter Zijlstra
2010-06-24 11:20                               ` Andi Kleen
2010-06-24 11:33                                 ` Peter Zijlstra
2010-06-24 11:55                                   ` Andi Kleen
2010-06-24 11:57                                     ` Peter Zijlstra
2010-06-24 12:02                                       ` Andi Kleen
2010-06-24 12:18                                         ` Peter Zijlstra
2010-06-24 12:38                                           ` Andi Kleen
2010-06-25 10:38                                             ` Peter Zijlstra
2010-06-24 11:42                                 ` Peter Zijlstra
2010-06-24 11:58                                   ` Andi Kleen
2010-06-24 12:02                                     ` Peter Zijlstra
2010-06-24 11:23                               ` Ingo Molnar
2010-06-24 11:34                                 ` Peter Zijlstra
2010-06-24 12:35                                   ` Ingo Molnar
2010-06-24 13:02                                     ` Andi Kleen
2010-06-24 13:20                                       ` Borislav Petkov
2010-06-24 13:33                                         ` Andi Kleen
2010-06-24 13:42                                           ` Ingo Molnar
2010-06-24 13:46                                           ` Ingo Molnar
2010-06-24 14:01                                             ` Andi Kleen
2010-06-24 15:41                                               ` Borislav Petkov
2010-06-24 16:09                                                 ` Andi Kleen
2010-06-25  2:12   ` Huang Ying
2010-06-25  7:48     ` Peter Zijlstra
2010-06-25  9:17       ` Huang Ying
2010-06-25  9:23         ` Frederic Weisbecker
2010-06-25  9:30           ` Huang Ying
2010-06-25  9:44             ` Frederic Weisbecker
2010-06-25  9:30         ` Peter Zijlstra
2010-06-25 11:58           ` huang ying
2010-06-25  9:08     ` Andi Kleen
2010-06-25 18:30   ` [RFC][PATCH] irq_work -v2 Peter Zijlstra
2010-06-25 19:30     ` Andi Kleen
2010-06-25 19:39       ` Peter Zijlstra
2010-06-25 19:49         ` Peter Zijlstra
2010-06-25 22:29         ` Andi Kleen
2010-06-26  8:36           ` Peter Zijlstra
2010-06-26 10:08             ` Andi Kleen
2010-06-26 10:32               ` Peter Zijlstra
2010-06-25 19:47       ` Peter Zijlstra
2010-06-26  1:26     ` huang ying
