Kuppuswamy triggered a NULL pointer dereference via the AER error injection mechanism in the low level APIC code. https://lore.kernel.org/r/f54208d62407901b5de15ce8c3d078c70fc7a1d0.1582313239.git.sathyanarayanan.kuppuswamy@linux.intel.com It turned out that AER error injection is calling generic_handle_irq() from task context which is unsafe for x86 interrupts which end up being handled by the APIC. The fragile interrupt affinity handling which is imposed by the x86 hardware does not allow to call into this code except from actual interrupt context. While the pointer could be checked this would just paper over the problem and still be able to trigger warnings or silently corrupting state. This series addresses the problem: - Prevent the invocation of generic_handle_irq() from non interrupt context on affected interrupts. - Add a few missing sanity checks to the existing debugfs injection mechanism - Convert the debugfs injection into a function which can be invoked from other places. This provides a halfways safe interface to inject interrupts via the irq_retrigger mechanism which does the injection via IPI. This interface is solely for debug and testing purposes as it still can make a device interrupts stale on x86 under very obscure and hard to hit circumstances. For debug and error injection testing this is acceptable. For any other use not. - Change the AER code to use the new interface. Thanks, tglx ---- arch/x86/kernel/apic/vector.c | 6 + drivers/pci/pcie/Kconfig | 1 drivers/pci/pcie/aer_inject.c | 6 - include/linux/interrupt.h | 2 include/linux/irq.h | 13 +++ kernel/irq/Kconfig | 5 + kernel/irq/chip.c | 2 kernel/irq/debugfs.c | 28 -------- kernel/irq/internals.h | 10 ++ kernel/irq/irqdesc.c | 6 + kernel/irq/resend.c | 143 +++++++++++++++++++++++++++++++----------- 11 files changed, 153 insertions(+), 69 deletions(-)
Interrupts cannot be injected when the interrupt is not activated and when a replay is already in progress. Fixes: 536e2e34bd00 ("genirq/debugfs: Triggering of interrupts from userspace") Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: stable@vger.kernel.org --- kernel/irq/debugfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -206,8 +206,15 @@ static ssize_t irq_debug_write(struct fi chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) { - /* Can't do level nor NMIs, sorry */ + /* + * Don't allow injection when the interrupt is: + * - Level or NMI type + * - not activated + * - replaying already + */ + if (irq_settings_is_level(desc) || + !irqd_is_activated(&desc->irq_data) || + (desc->istate & (IRQS_NMI | IRQS_REPLAY)) { err = -EINVAL; } else { desc->istate |= IRQS_PENDING;
In general calling generic_handle_irq() with interrupts disabled from non interrupt context is harmless. For some interrupt controllers like the x86 trainwrecks this is outright dangerous as it might corrupt state if an interrupt affinity change is pending. Add infrastructure which allows to mark interrupts as unsafe and catch such usage in generic_handle_irq(). Reported-by: sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/irq.h | 13 +++++++++++++ kernel/irq/internals.h | 8 ++++++++ kernel/irq/irqdesc.c | 6 ++++++ kernel/irq/resend.c | 5 +++-- 4 files changed, 30 insertions(+), 2 deletions(-) --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -211,6 +211,8 @@ struct irq_data { * IRQD_CAN_RESERVE - Can use reservation mode * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change * required + * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only invoked + * from actual interrupt context. */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -234,6 +236,7 @@ enum { IRQD_DEFAULT_TRIGGER_SET = (1 << 25), IRQD_CAN_RESERVE = (1 << 26), IRQD_MSI_NOMASK_QUIRK = (1 << 27), + IRQD_HANDLE_ENFORCE_IRQCTX = (1 << 28), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -303,6 +306,16 @@ static inline bool irqd_is_single_target return __irqd_to_state(d) & IRQD_SINGLE_TARGET; } +static inline void irqd_set_handle_enforce_irqctx(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_HANDLE_ENFORCE_IRQCTX; +} + +static inline bool irqd_is_handle_enforce_irqctx(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_HANDLE_ENFORCE_IRQCTX; +} + static inline bool irqd_is_wakeup_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_STATE; --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -425,6 +425,10 @@ static inline struct cpumask *irq_desc_g { return desc->pending_mask; } +static inline bool handle_enforce_irqctx(struct irq_data *data) +{ + return irqd_is_handle_enforce_irqctx(data); +} bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) @@ -451,6 +455,10 @@ static inline bool irq_fixup_move_pendin { return false; } +static inline bool handle_enforce_irqctx(struct irq_data *data) +{ + return false; +} #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -638,9 +638,15 @@ void irq_init_desc(unsigned int irq) int generic_handle_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *data; if (!desc) return -EINVAL; + + data = irq_desc_get_irq_data(desc); + if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data))) + return -EPERM; + generic_handle_irq_desc(desc); return 0; } --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -72,8 +72,9 @@ void check_irq_resend(struct irq_desc *d desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { + if ((!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) && + !handle_enforce_irqctx(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND unsigned int irq = irq_desc_get_irq(desc);
Sathyanarayanan reported that the PCI-E AER error injection mechanism can result in a NULL pointer dereference in apic_ack_edge(): BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 RIP: 0010:apic_ack_edge+0x1e/0x40 Call Trace: handle_edge_irq+0x7d/0x1e0 generic_handle_irq+0x27/0x30 aer_inject_write+0x53a/0x720 It crashes in irq_complete_move() which dereferences get_irq_regs() which is obviously NULL when this is called from non interrupt context. Of course the pointer could be checked, but that just papers over the real issue. Invoking the low level interrupt handling mechanism from random code can wreckage the fragile interrupt affinity mechanism of x86 as interrupts can only be moved in interrupt context or with special care when a CPU goes offline and the move has to be enforced. In the best case this triggers the warning in the MSI affinity setter, but if the call happens on the correct CPU it just corrupts state and might prevent further interrupt delivery for the affected device. Mark the APIC interrupts as unsuitable for being invoked in random contexts. This prevents the AER injection from proliferating the wreckage, but that's less broken than the current state of affairs and more correct than just papering over the problem by sprinkling random checks all over the place and silently corrupting state. Reported-by: sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/apic/vector.c | 6 ++++++ 1 file changed, 6 insertions(+) --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -557,6 +557,12 @@ static int x86_vector_alloc_irqs(struct irqd->hwirq = virq + i; irqd_set_single_target(irqd); /* + * Prevent that any of these interrupts is invoked in + * non interrupt context via e.g. generic_handle_irq() + * as that can corrupt the affinity move state. + */ + irqd_set_handle_enforce_irqctx(irqd); + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. This is * required for check_timer() to work correctly as it might
In preparation for an interrupt injection interface which can be used safely by error injection mechanisms. e.g. PCI-E/ AER, add a return value to check_irq_resend() so errors can be propagated to the caller. Split out the software resend code so the ugly #ifdef in check_irq_resend() goes away and the whole thing becomes readable. Fix up the caller in debugfs. The caller in irq_startup() does not care about the return value as this is unconditionally invoked for all interrupts and the resend is best effort anyway. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- kernel/irq/debugfs.c | 3 - kernel/irq/internals.h | 2 - kernel/irq/resend.c | 83 ++++++++++++++++++++++++++++--------------------- 3 files changed, 51 insertions(+), 37 deletions(-) --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -218,8 +218,7 @@ static ssize_t irq_debug_write(struct fi err = -EINVAL; } else { desc->istate |= IRQS_PENDING; - check_irq_resend(desc); - err = 0; + err = check_irq_resend(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -108,7 +108,7 @@ irqreturn_t handle_irq_event_percpu(stru irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -void check_irq_resend(struct irq_desc *desc); +int check_irq_resend(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -47,6 +47,43 @@ static void resend_irqs(unsigned long ar /* Tasklet to handle resend: */ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static int irq_sw_resend(struct irq_desc *desc) +{ + unsigned int irq = irq_desc_get_irq(desc); + + /* + * Validate whether this interrupt can be safely injected from + * non interrupt context + */ + if (handle_enforce_irqctx(&desc->irq_data)) + return -EINVAL; + + /* + * If the interrupt is running in the thread context of the parent + * irq we need to be careful, because we cannot trigger it + * directly. + */ + if (irq_settings_is_nested_thread(desc)) { + /* + * If the parent_irq is valid, we retrigger the parent, + * otherwise we do nothing. + */ + if (!desc->parent_irq) + return -EINVAL; + irq = desc->parent_irq; + } + + /* Set it pending and activate the softirq: */ + set_bit(irq, irqs_resend); + tasklet_schedule(&resend_tasklet); + return 0; +} + +#else +static int irq_sw_resend(struct irq_desc *desc) +{ + return -EINVAL; +} #endif /* @@ -54,50 +91,28 @@ static DECLARE_TASKLET(resend_tasklet, r * * Is called with interrupts disabled and desc->lock held. */ -void check_irq_resend(struct irq_desc *desc) +int check_irq_resend(struct irq_desc *desc) { /* - * We do not resend level type interrupts. Level type - * interrupts are resent by hardware when they are still - * active. Clear the pending bit so suspend/resume does not - * get confused. + * We do not resend level type interrupts. Level type interrupts + * are resent by hardware when they are still active. Clear the + * pending bit so suspend/resume does not get confused. */ if (irq_settings_is_level(desc)) { desc->istate &= ~IRQS_PENDING; - return; + return -EINVAL; } + if (desc->istate & IRQS_REPLAY) - return; + return -EBUSY; + if (desc->istate & IRQS_PENDING) { desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; - if ((!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) && - !handle_enforce_irqctx(&desc->irq_data)) { -#ifdef CONFIG_HARDIRQS_SW_RESEND - unsigned int irq = irq_desc_get_irq(desc); - - /* - * If the interrupt is running in the thread - * context of the parent irq we need to be - * careful, because we cannot trigger it - * directly. - */ - if (irq_settings_is_nested_thread(desc)) { - /* - * If the parent_irq is valid, we - * retrigger the parent, otherwise we - * do nothing. - */ - if (!desc->parent_irq) - return; - irq = desc->parent_irq; - } - /* Set it pending and activate the softirq: */ - set_bit(irq, irqs_resend); - tasklet_schedule(&resend_tasklet); -#endif - } + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) + return irq_sw_resend(desc); } + return 0; }
The code sets IRQS_REPLAY unconditionally whether the resend happens or not. That doesn't have bad side effects right now, but inconsistent state is always a latent source of problems. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- kernel/irq/resend.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -93,6 +93,8 @@ static int irq_sw_resend(struct irq_desc */ int check_irq_resend(struct irq_desc *desc) { + int err = 0; + /* * We do not resend level type interrupts. Level type interrupts * are resent by hardware when they are still active. Clear the @@ -106,13 +108,17 @@ int check_irq_resend(struct irq_desc *de if (desc->istate & IRQS_REPLAY) return -EBUSY; - if (desc->istate & IRQS_PENDING) { - desc->istate &= ~IRQS_PENDING; - desc->istate |= IRQS_REPLAY; + if (!(desc->istate & IRQS_PENDING)) + return 0; - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) - return irq_sw_resend(desc); - } - return 0; + desc->istate &= ~IRQS_PENDING; + + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) + err = irq_sw_resend(desc); + + /* If the retrigger was successfull, mark it with the REPLAY bit */ + if (!err) + desc->istate |= IRQS_REPLAY; + return err; }
Error injection mechanisms need a half ways safe way to inject interrupts as invoking generic_handle_irq() or the actual device interrupt handler directly from e.g. a debugfs write is not guaranteed to be safe. On x86 generic_handle_irq() is unsafe due to the hardware trainwreck which is the base of x86 interrupt delivery and affinity management. Move the irq debugfs injection code into a separate function which can be used by error injection code as well. The implementation prevents at least that state is corrupted, but it cannot close a very tiny race window on x86 which might result in a stale and not serviced device interrupt under very unlikely circumstances. This is explicitly for debugging and testing and not for production use or abuse in random driver code. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/interrupt.h | 2 + kernel/irq/Kconfig | 5 ++++ kernel/irq/chip.c | 2 - kernel/irq/debugfs.c | 34 ----------------------------- kernel/irq/internals.h | 2 - kernel/irq/resend.c | 53 ++++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 61 insertions(+), 37 deletions(-) --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -248,6 +248,8 @@ extern void enable_percpu_nmi(unsigned i extern int prepare_percpu_nmi(unsigned int irq); extern void teardown_percpu_nmi(unsigned int irq); +extern int irq_inject_interrupt(unsigned int irq); + /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -43,6 +43,10 @@ config GENERIC_IRQ_MIGRATION config AUTO_IRQ_AFFINITY bool +# Interrupt injection mechanism +config GENERIC_IRQ_INJECTION + bool + # Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND bool @@ -127,6 +131,7 @@ config SPARSE_IRQ config GENERIC_IRQ_DEBUGFS bool "Expose irq internals in debugfs" depends on DEBUG_FS + select GENERIC_IRQ_INJECTION default n ---help--- --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -278,7 +278,7 @@ int irq_startup(struct irq_desc *desc, b } } if (resend) - check_irq_resend(desc); + check_irq_resend(desc, false); return ret; } --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -190,39 +190,7 @@ static ssize_t irq_debug_write(struct fi return -EFAULT; if (!strncmp(buf, "trigger", size)) { - unsigned long flags; - int err; - - /* Try the HW interface first */ - err = irq_set_irqchip_state(irq_desc_get_irq(desc), - IRQCHIP_STATE_PENDING, true); - if (!err) - return count; - - /* - * Otherwise, try to inject via the resend interface, - * which may or may not succeed. - */ - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - - /* - * Don't allow injection when the interrupt is: - * - Level or NMI type - * - not activated - * - replaying already - */ - if (irq_settings_is_level(desc) || - !irqd_is_activated(&desc->irq_data) || - (desc->istate & (IRQS_NMI | IRQS_REPLAY)) { - err = -EINVAL; - } else { - desc->istate |= IRQS_PENDING; - err = check_irq_resend(desc); - } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + int err = irq_inject_interrupt(irq_desc_get_irq(desc)); return err ? err : count; } --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -108,7 +108,7 @@ irqreturn_t handle_irq_event_percpu(stru irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -int check_irq_resend(struct irq_desc *desc); +int check_irq_resend(struct irq_desc *desc, bool inject); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -91,7 +91,7 @@ static int irq_sw_resend(struct irq_desc * * Is called with interrupts disabled and desc->lock held. */ -int check_irq_resend(struct irq_desc *desc) +int check_irq_resend(struct irq_desc *desc, bool inject) { int err = 0; @@ -108,7 +108,7 @@ int check_irq_resend(struct irq_desc *de if (desc->istate & IRQS_REPLAY) return -EBUSY; - if (!(desc->istate & IRQS_PENDING)) + if (!(desc->istate & IRQS_PENDING) && !inject) return 0; desc->istate &= ~IRQS_PENDING; @@ -122,3 +122,52 @@ int check_irq_resend(struct irq_desc *de desc->istate |= IRQS_REPLAY; return err; } + +#ifdef CONFIG_GENERIC_IRQ_INJECTION +/** + * irq_inject_interrupt - Inject an interrupt for testing/error injection + * @irq: The interrupt number + * + * This function must only be used for debug and testing purposes! + * + * Especially on x86 this can cause a premature completion of an interrupt + * affinity change causing the interrupt line to become stale. Very + * unlikely, but possible. + * + * The injection can fail for various reasons: + * - Interrupt is not activated + * - Interrupt is NMI type or currently replaying + * - Interrupt is level type + * - Interrupt does not support hardware retrigger and software resend is + * either not enabled or not possible for the interrupt. + */ +int irq_inject_interrupt(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + int err; + + /* Try the state injection hardware interface first */ + if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) + return 0; + + /* That failed, try via the resend mechanism */ + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + /* + * Only try to inject when the interrupt is: + * - not NMI type + * - activated + */ + if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data)) + err = -EINVAL; + else + err = check_irq_resend(desc, true); + + irq_put_desc_busunlock(desc, flags); + return err; +} +EXPORT_SYMBOL_GPL(irq_inject_interrupt); +#endif
The AER error injection mechanism just blindly abuses generic_handle_irq() which is really not meant for consumption by random drivers. The include of linux/irq.h should have been a red flag in the first place. Driver code, unless implementing interrupt chips or low level hypervisor functionality has absolutely no business with that. Invoking generic_handle_irq() from non interrupt handling context can have nasty side effects at least on x86 due to the hardware trainwreck which makes interrupt affinity changes a fragile beast. Sathyanarayanan triggered a NULL pointer dereference in the low level APIC code that way. While the particular pointer could be checked this would only paper over the issue because there are other ways to trigger warnings or silently corrupt state. Invoke the new irq_inject_interrupt() mechanism, which has the necessary sanity checks in place and injects the interrupt via the irq_retrigger() mechanism, which is at least halfways safe vs. the fragile x86 affinity change mechanics. It's safe on x86 as it does not corrupt state, but it still can cause a premature completion of an interrupt affinity change causing the interrupt line to become stale. Very unlikely, but possible. For regular operations this is a non issue as AER error injection is meant for debugging and testing and not for usage on production systems. People using this should better know what they are doing. Fixes: 390e2db82480 ("PCI/AER: Abstract AER interrupt handling") Reported-by: sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/pci/pcie/Kconfig | 1 + drivers/pci/pcie/aer_inject.c | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -34,6 +34,7 @@ config PCIEAER config PCIEAER_INJECT tristate "PCI Express error injection support" depends on PCIEAER + select GENERIC_IRQ_INJECTION help This enables PCI Express Root Port Advanced Error Reporting (AER) software error injector. --- a/drivers/pci/pcie/aer_inject.c +++ b/drivers/pci/pcie/aer_inject.c @@ -16,7 +16,7 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/miscdevice.h> #include <linux/pci.h> #include <linux/slab.h> @@ -468,9 +468,7 @@ static int aer_inject(struct aer_error_i } pci_info(edev->port, "Injecting errors %08x/%08x into device %s\n", einj->cor_status, einj->uncor_status, pci_name(dev)); - local_irq_disable(); - generic_handle_irq(edev->irq); - local_irq_enable(); + ret = irq_inject_interrupt(edev->irq); } else { pci_err(rpdev, "AER device not found\n"); ret = -ENODEV;
On 2020-03-06 13:03, Thomas Gleixner wrote:
> Interrupts cannot be injected when the interrupt is not activated and
> when
> a replay is already in progress.
>
> Fixes: 536e2e34bd00 ("genirq/debugfs: Triggering of interrupts from
> userspace")
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: stable@vger.kernel.org
> ---
> kernel/irq/debugfs.c | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> --- a/kernel/irq/debugfs.c
> +++ b/kernel/irq/debugfs.c
> @@ -206,8 +206,15 @@ static ssize_t irq_debug_write(struct fi
> chip_bus_lock(desc);
> raw_spin_lock_irqsave(&desc->lock, flags);
>
> - if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
> - /* Can't do level nor NMIs, sorry */
> + /*
> + * Don't allow injection when the interrupt is:
> + * - Level or NMI type
> + * - not activated
> + * - replaying already
> + */
> + if (irq_settings_is_level(desc) ||
> + !irqd_is_activated(&desc->irq_data) ||
> + (desc->istate & (IRQS_NMI | IRQS_REPLAY)) {
> err = -EINVAL;
> } else {
> desc->istate |= IRQS_PENDING;
Huh, nice catch.
Acked-by: Marc Zyngier <maz@kernel.org>
M.
--
Jazz is not dead. It just smells funny...
On 2020-03-06 13:03, Thomas Gleixner wrote: > In general calling generic_handle_irq() with interrupts disabled from > non > interrupt context is harmless. For some interrupt controllers like the > x86 > trainwrecks this is outright dangerous as it might corrupt state if an > interrupt affinity change is pending. > > Add infrastructure which allows to mark interrupts as unsafe and catch > such > usage in generic_handle_irq(). > > Reported-by: sathyanarayanan.kuppuswamy@linux.intel.com > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > --- > include/linux/irq.h | 13 +++++++++++++ > kernel/irq/internals.h | 8 ++++++++ > kernel/irq/irqdesc.c | 6 ++++++ > kernel/irq/resend.c | 5 +++-- > 4 files changed, 30 insertions(+), 2 deletions(-) > > --- a/include/linux/irq.h > +++ b/include/linux/irq.h > @@ -211,6 +211,8 @@ struct irq_data { > * IRQD_CAN_RESERVE - Can use reservation mode > * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change > * required > + * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only > invoked > + * from actual interrupt context. > */ > enum { > IRQD_TRIGGER_MASK = 0xf, > @@ -234,6 +236,7 @@ enum { > IRQD_DEFAULT_TRIGGER_SET = (1 << 25), > IRQD_CAN_RESERVE = (1 << 26), > IRQD_MSI_NOMASK_QUIRK = (1 << 27), > + IRQD_HANDLE_ENFORCE_IRQCTX = (1 << 28), > }; > > #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, > state_use_accessors) > @@ -303,6 +306,16 @@ static inline bool irqd_is_single_target > return __irqd_to_state(d) & IRQD_SINGLE_TARGET; > } > > +static inline void irqd_set_handle_enforce_irqctx(struct irq_data *d) > +{ > + __irqd_to_state(d) |= IRQD_HANDLE_ENFORCE_IRQCTX; > +} > + > +static inline bool irqd_is_handle_enforce_irqctx(struct irq_data *d) > +{ > + return __irqd_to_state(d) & IRQD_HANDLE_ENFORCE_IRQCTX; > +} > + > static inline bool irqd_is_wakeup_set(struct irq_data *d) > { > return __irqd_to_state(d) & IRQD_WAKEUP_STATE; > --- a/kernel/irq/internals.h > +++ b/kernel/irq/internals.h > @@ -425,6 +425,10 @@ static inline struct cpumask *irq_desc_g > { > return desc->pending_mask; > } > +static inline bool handle_enforce_irqctx(struct irq_data *data) > +{ > + return irqd_is_handle_enforce_irqctx(data); > +} > bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); > #else /* CONFIG_GENERIC_PENDING_IRQ */ > static inline bool irq_can_move_pcntxt(struct irq_data *data) > @@ -451,6 +455,10 @@ static inline bool irq_fixup_move_pendin > { > return false; > } > +static inline bool handle_enforce_irqctx(struct irq_data *data) > +{ > + return false; > +} > #endif /* !CONFIG_GENERIC_PENDING_IRQ */ > > #if !defined(CONFIG_IRQ_DOMAIN) || > !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) > --- a/kernel/irq/irqdesc.c > +++ b/kernel/irq/irqdesc.c > @@ -638,9 +638,15 @@ void irq_init_desc(unsigned int irq) > int generic_handle_irq(unsigned int irq) > { > struct irq_desc *desc = irq_to_desc(irq); > + struct irq_data *data; > > if (!desc) > return -EINVAL; > + > + data = irq_desc_get_irq_data(desc); > + if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data))) > + return -EPERM; It is a bit sad that there are only *two* users in the tree that actually check the return value of generic_handle_irq(). Thankfully, the WARN_ON should wake people up. > + > generic_handle_irq_desc(desc); > return 0; > } > --- a/kernel/irq/resend.c > +++ b/kernel/irq/resend.c > @@ -72,8 +72,9 @@ void check_irq_resend(struct irq_desc *d > desc->istate &= ~IRQS_PENDING; > desc->istate |= IRQS_REPLAY; > > - if (!desc->irq_data.chip->irq_retrigger || > - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { > + if ((!desc->irq_data.chip->irq_retrigger || > + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) && > + !handle_enforce_irqctx(&desc->irq_data)) { > #ifdef CONFIG_HARDIRQS_SW_RESEND > unsigned int irq = irq_desc_get_irq(desc); Acked-by: Marc Zyngier <maz@kernel.org> M. -- Jazz is not dead. It just smells funny...
On 2020-03-06 13:03, Thomas Gleixner wrote:
> In preparation for an interrupt injection interface which can be used
> safely by error injection mechanisms. e.g. PCI-E/ AER, add a return
> value
> to check_irq_resend() so errors can be propagated to the caller.
>
> Split out the software resend code so the ugly #ifdef in
> check_irq_resend()
> goes away and the whole thing becomes readable.
>
> Fix up the caller in debugfs. The caller in irq_startup() does not care
> about the return value as this is unconditionally invoked for all
> interrupts and the resend is best effort anyway.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <maz@kernel.org>
M.
--
Jazz is not dead. It just smells funny...
On 2020-03-06 13:03, Thomas Gleixner wrote:
> The code sets IRQS_REPLAY unconditionally whether the resend happens or
> not. That doesn't have bad side effects right now, but inconsistent
> state
> is always a latent source of problems.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <maz@kernel.org>
M.
--
Jazz is not dead. It just smells funny...
On 2020-03-06 13:03, Thomas Gleixner wrote:
> Error injection mechanisms need a half ways safe way to inject
> interrupts as
> invoking generic_handle_irq() or the actual device interrupt handler
> directly from e.g. a debugfs write is not guaranteed to be safe.
>
> On x86 generic_handle_irq() is unsafe due to the hardware trainwreck
> which
> is the base of x86 interrupt delivery and affinity management.
>
> Move the irq debugfs injection code into a separate function which can
> be
> used by error injection code as well.
>
> The implementation prevents at least that state is corrupted, but it
> cannot
> close a very tiny race window on x86 which might result in a stale and
> not
> serviced device interrupt under very unlikely circumstances.
>
> This is explicitly for debugging and testing and not for production use
> or
> abuse in random driver code.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <maz@kernel.org>
M.
--
Jazz is not dead. It just smells funny...
On 3/6/20 5:03 AM, Thomas Gleixner wrote: > The AER error injection mechanism just blindly abuses generic_handle_irq() > which is really not meant for consumption by random drivers. The include of > linux/irq.h should have been a red flag in the first place. Driver code, > unless implementing interrupt chips or low level hypervisor functionality > has absolutely no business with that. > > Invoking generic_handle_irq() from non interrupt handling context can have > nasty side effects at least on x86 due to the hardware trainwreck which > makes interrupt affinity changes a fragile beast. Sathyanarayanan triggered > a NULL pointer dereference in the low level APIC code that way. While the > particular pointer could be checked this would only paper over the issue > because there are other ways to trigger warnings or silently corrupt state. > > Invoke the new irq_inject_interrupt() mechanism, which has the necessary > sanity checks in place and injects the interrupt via the irq_retrigger() > mechanism, which is at least halfways safe vs. the fragile x86 affinity > change mechanics. > > It's safe on x86 as it does not corrupt state, but it still can cause a > premature completion of an interrupt affinity change causing the interrupt > line to become stale. Very unlikely, but possible. > > For regular operations this is a non issue as AER error injection is meant > for debugging and testing and not for usage on production systems. People > using this should better know what they are doing. It looks good to me. Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> Tested-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> > > Fixes: 390e2db82480 ("PCI/AER: Abstract AER interrupt handling") > Reported-by: sathyanarayanan.kuppuswamy@linux.intel.com > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > --- > drivers/pci/pcie/Kconfig | 1 + > drivers/pci/pcie/aer_inject.c | 6 ++---- > 2 files changed, 3 insertions(+), 4 deletions(-) > > --- a/drivers/pci/pcie/Kconfig > +++ b/drivers/pci/pcie/Kconfig > @@ -34,6 +34,7 @@ config PCIEAER > config PCIEAER_INJECT > tristate "PCI Express error injection support" > depends on PCIEAER > + select GENERIC_IRQ_INJECTION > help > This enables PCI Express Root Port Advanced Error Reporting > (AER) software error injector. > --- a/drivers/pci/pcie/aer_inject.c > +++ b/drivers/pci/pcie/aer_inject.c > @@ -16,7 +16,7 @@ > > #include <linux/module.h> > #include <linux/init.h> > -#include <linux/irq.h> > +#include <linux/interrupt.h> > #include <linux/miscdevice.h> > #include <linux/pci.h> > #include <linux/slab.h> > @@ -468,9 +468,7 @@ static int aer_inject(struct aer_error_i > } > pci_info(edev->port, "Injecting errors %08x/%08x into device %s\n", > einj->cor_status, einj->uncor_status, pci_name(dev)); > - local_irq_disable(); > - generic_handle_irq(edev->irq); > - local_irq_enable(); > + ret = irq_inject_interrupt(edev->irq); > } else { > pci_err(rpdev, "AER device not found\n"); > ret = -ENODEV; > -- Sathyanarayanan Kuppuswamy Linux kernel developer
Hi, On 3/6/20 5:03 AM, Thomas Gleixner wrote: > Error injection mechanisms need a half ways safe way to inject interrupts as > invoking generic_handle_irq() or the actual device interrupt handler > directly from e.g. a debugfs write is not guaranteed to be safe. > > On x86 generic_handle_irq() is unsafe due to the hardware trainwreck which > is the base of x86 interrupt delivery and affinity management. > > Move the irq debugfs injection code into a separate function which can be > used by error injection code as well. > > The implementation prevents at least that state is corrupted, but it cannot > close a very tiny race window on x86 which might result in a stale and not > serviced device interrupt under very unlikely circumstances. > > This is explicitly for debugging and testing and not for production use or > abuse in random driver code. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> Tested-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> > --- > include/linux/interrupt.h | 2 + > kernel/irq/Kconfig | 5 ++++ > kernel/irq/chip.c | 2 - > kernel/irq/debugfs.c | 34 ----------------------------- > kernel/irq/internals.h | 2 - > kernel/irq/resend.c | 53 ++++++++++++++++++++++++++++++++++++++++++++-- > 6 files changed, 61 insertions(+), 37 deletions(-) > > --- a/include/linux/interrupt.h > +++ b/include/linux/interrupt.h > @@ -248,6 +248,8 @@ extern void enable_percpu_nmi(unsigned i > extern int prepare_percpu_nmi(unsigned int irq); > extern void teardown_percpu_nmi(unsigned int irq); > > +extern int irq_inject_interrupt(unsigned int irq); > + > /* The following three functions are for the core kernel use only. */ > extern void suspend_device_irqs(void); > extern void resume_device_irqs(void); > --- a/kernel/irq/Kconfig > +++ b/kernel/irq/Kconfig > @@ -43,6 +43,10 @@ config GENERIC_IRQ_MIGRATION > config AUTO_IRQ_AFFINITY > bool > > +# Interrupt injection mechanism > +config GENERIC_IRQ_INJECTION > + bool > + > # Tasklet based software resend for pending interrupts on enable_irq() > config HARDIRQS_SW_RESEND > bool > @@ -127,6 +131,7 @@ config SPARSE_IRQ > config GENERIC_IRQ_DEBUGFS > bool "Expose irq internals in debugfs" > depends on DEBUG_FS > + select GENERIC_IRQ_INJECTION > default n > ---help--- > > --- a/kernel/irq/chip.c > +++ b/kernel/irq/chip.c > @@ -278,7 +278,7 @@ int irq_startup(struct irq_desc *desc, b > } > } > if (resend) > - check_irq_resend(desc); > + check_irq_resend(desc, false); > > return ret; > } > --- a/kernel/irq/debugfs.c > +++ b/kernel/irq/debugfs.c > @@ -190,39 +190,7 @@ static ssize_t irq_debug_write(struct fi > return -EFAULT; > > if (!strncmp(buf, "trigger", size)) { > - unsigned long flags; > - int err; > - > - /* Try the HW interface first */ > - err = irq_set_irqchip_state(irq_desc_get_irq(desc), > - IRQCHIP_STATE_PENDING, true); > - if (!err) > - return count; > - > - /* > - * Otherwise, try to inject via the resend interface, > - * which may or may not succeed. > - */ > - chip_bus_lock(desc); > - raw_spin_lock_irqsave(&desc->lock, flags); > - > - /* > - * Don't allow injection when the interrupt is: > - * - Level or NMI type > - * - not activated > - * - replaying already > - */ > - if (irq_settings_is_level(desc) || > - !irqd_is_activated(&desc->irq_data) || > - (desc->istate & (IRQS_NMI | IRQS_REPLAY)) { > - err = -EINVAL; > - } else { > - desc->istate |= IRQS_PENDING; > - err = check_irq_resend(desc); > - } > - > - raw_spin_unlock_irqrestore(&desc->lock, flags); > - chip_bus_sync_unlock(desc); > + int err = irq_inject_interrupt(irq_desc_get_irq(desc)); > > return err ? err : count; > } > --- a/kernel/irq/internals.h > +++ b/kernel/irq/internals.h > @@ -108,7 +108,7 @@ irqreturn_t handle_irq_event_percpu(stru > irqreturn_t handle_irq_event(struct irq_desc *desc); > > /* Resending of interrupts :*/ > -int check_irq_resend(struct irq_desc *desc); > +int check_irq_resend(struct irq_desc *desc, bool inject); > bool irq_wait_for_poll(struct irq_desc *desc); > void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); > > --- a/kernel/irq/resend.c > +++ b/kernel/irq/resend.c > @@ -91,7 +91,7 @@ static int irq_sw_resend(struct irq_desc > * > * Is called with interrupts disabled and desc->lock held. > */ > -int check_irq_resend(struct irq_desc *desc) > +int check_irq_resend(struct irq_desc *desc, bool inject) > { > int err = 0; > > @@ -108,7 +108,7 @@ int check_irq_resend(struct irq_desc *de > if (desc->istate & IRQS_REPLAY) > return -EBUSY; > > - if (!(desc->istate & IRQS_PENDING)) > + if (!(desc->istate & IRQS_PENDING) && !inject) > return 0; > > desc->istate &= ~IRQS_PENDING; > @@ -122,3 +122,52 @@ int check_irq_resend(struct irq_desc *de > desc->istate |= IRQS_REPLAY; > return err; > } > + > +#ifdef CONFIG_GENERIC_IRQ_INJECTION > +/** > + * irq_inject_interrupt - Inject an interrupt for testing/error injection > + * @irq: The interrupt number > + * > + * This function must only be used for debug and testing purposes! > + * > + * Especially on x86 this can cause a premature completion of an interrupt > + * affinity change causing the interrupt line to become stale. Very > + * unlikely, but possible. > + * > + * The injection can fail for various reasons: > + * - Interrupt is not activated > + * - Interrupt is NMI type or currently replaying > + * - Interrupt is level type > + * - Interrupt does not support hardware retrigger and software resend is > + * either not enabled or not possible for the interrupt. > + */ > +int irq_inject_interrupt(unsigned int irq) > +{ > + struct irq_desc *desc; > + unsigned long flags; > + int err; > + > + /* Try the state injection hardware interface first */ > + if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) > + return 0; > + > + /* That failed, try via the resend mechanism */ > + desc = irq_get_desc_buslock(irq, &flags, 0); > + if (!desc) > + return -EINVAL; > + > + /* > + * Only try to inject when the interrupt is: > + * - not NMI type > + * - activated > + */ > + if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data)) > + err = -EINVAL; > + else > + err = check_irq_resend(desc, true); > + > + irq_put_desc_busunlock(desc, flags); > + return err; > +} > +EXPORT_SYMBOL_GPL(irq_inject_interrupt); > +#endif > -- Sathyanarayanan Kuppuswamy Linux kernel developer
On 3/6/20 10:32 AM, Kuppuswamy Sathyanarayanan wrote: > > On 3/6/20 5:03 AM, Thomas Gleixner wrote: >> The AER error injection mechanism just blindly abuses >> generic_handle_irq() >> which is really not meant for consumption by random drivers. The >> include of >> linux/irq.h should have been a red flag in the first place. Driver code, >> unless implementing interrupt chips or low level hypervisor >> functionality >> has absolutely no business with that. >> >> Invoking generic_handle_irq() from non interrupt handling context can >> have >> nasty side effects at least on x86 due to the hardware trainwreck which >> makes interrupt affinity changes a fragile beast. Sathyanarayanan >> triggered >> a NULL pointer dereference in the low level APIC code that way. While >> the >> particular pointer could be checked this would only paper over the issue >> because there are other ways to trigger warnings or silently corrupt >> state. >> >> Invoke the new irq_inject_interrupt() mechanism, which has the necessary >> sanity checks in place and injects the interrupt via the irq_retrigger() >> mechanism, which is at least halfways safe vs. the fragile x86 affinity >> change mechanics. >> >> It's safe on x86 as it does not corrupt state, but it still can cause a >> premature completion of an interrupt affinity change causing the >> interrupt >> line to become stale. Very unlikely, but possible. >> >> For regular operations this is a non issue as AER error injection is >> meant >> for debugging and testing and not for usage on production systems. >> People >> using this should better know what they are doing. > It looks good to me. > > Reviewed-by: Kuppuswamy Sathyanarayanan > <sathyanarayanan.kuppuswamy@linux.intel.com> > Tested-by: Kuppuswamy Sathyanarayanan > <sathyanarayanan.kuppuswamy@linux.intel.com> >> >> Fixes: 390e2db82480 ("PCI/AER: Abstract AER interrupt handling") This patch is merged in v4.20 kernel. So this fix could be a candidate for stable fix. >> Reported-by: sathyanarayanan.kuppuswamy@linux.intel.com >> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> >> --- >> drivers/pci/pcie/Kconfig | 1 + >> drivers/pci/pcie/aer_inject.c | 6 ++---- >> 2 files changed, 3 insertions(+), 4 deletions(-) >> >> --- a/drivers/pci/pcie/Kconfig >> +++ b/drivers/pci/pcie/Kconfig >> @@ -34,6 +34,7 @@ config PCIEAER >> config PCIEAER_INJECT >> tristate "PCI Express error injection support" >> depends on PCIEAER >> + select GENERIC_IRQ_INJECTION >> help >> This enables PCI Express Root Port Advanced Error Reporting >> (AER) software error injector. >> --- a/drivers/pci/pcie/aer_inject.c >> +++ b/drivers/pci/pcie/aer_inject.c >> @@ -16,7 +16,7 @@ >> #include <linux/module.h> >> #include <linux/init.h> >> -#include <linux/irq.h> >> +#include <linux/interrupt.h> >> #include <linux/miscdevice.h> >> #include <linux/pci.h> >> #include <linux/slab.h> >> @@ -468,9 +468,7 @@ static int aer_inject(struct aer_error_i >> } >> pci_info(edev->port, "Injecting errors %08x/%08x into >> device %s\n", >> einj->cor_status, einj->uncor_status, pci_name(dev)); >> - local_irq_disable(); >> - generic_handle_irq(edev->irq); >> - local_irq_enable(); >> + ret = irq_inject_interrupt(edev->irq); >> } else { >> pci_err(rpdev, "AER device not found\n"); >> ret = -ENODEV; >> -- Sathyanarayanan Kuppuswamy Linux kernel developer