linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH] interrupt: discover and disable very frequent interrupts
@ 2022-09-30  8:31 张鑫城
  0 siblings, 0 replies; 11+ messages in thread
From: 张鑫城 @ 2022-09-30  8:31 UTC (permalink / raw)
  To: hdegoede, tglx
  Cc: linux-kernel, maz, oleksandr, bigeasy, mark.rutland, michael

Hi,

Thank you very much for your valuable suggestions, I have modified the patch as follows:

Subject: [PATCH] interrupt: discover and disable very frequent interrupts

In some cases, a peripheral's interrupt will be triggered frequently,
which will keep the CPU processing the interrupt and eventually cause
the RCU to report rcu_sched self-detected stall on the CPU.

[  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
[  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004
softirq=9993/9993 fqs=97428
[  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
[  838.151516] Task dump for CPU 0:
[  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a

Signed-off-by: zhangxincheng <zhangxincheng@uniontech.com>
Change-Id: I9c92146f2772eae383c16c8c10de028b91e07150
---
 include/linux/irqdesc.h |  2 ++
 kernel/irq/spurious.c   | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 1cd4e36890fb..a3bd521c3557 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -102,6 +102,8 @@ struct irq_desc {
 	int			parent_irq;
 	struct module		*owner;
 	const char		*name;
+	u32 gap_count;
+	u64 gap_time;
 } ____cacheline_internodealigned_in_smp;
 
 #ifdef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..75bd0088446a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -188,19 +188,21 @@ static inline int bad_action_ret(irqreturn_t action_ret)
  *
  * (The other 100-of-100,000 interrupts may have been a correctly
  *  functioning device sharing an IRQ with the failing one)
+ *
+ * Some bad hardware will trigger interrupts very frequently, which will
+ * cause the CPU to process hardware interrupts all the time. So when
+ * we find this out, the interrupt should be disabled.
  */
-static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, const char *msg)
 {
 	unsigned int irq = irq_desc_get_irq(desc);
 	struct irqaction *action;
 	unsigned long flags;
 
 	if (bad_action_ret(action_ret)) {
-		printk(KERN_ERR "irq event %d: bogus return value %x\n",
-				irq, action_ret);
+		printk(msg, irq, action_ret);
 	} else {
-		printk(KERN_ERR "irq %d: nobody cared (try booting with "
-				"the \"irqpoll\" option)\n", irq);
+		printk(msg, irq);
 	}
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
@@ -228,7 +230,7 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
 
 	if (count > 0) {
 		count--;
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq event %d: bogus return value %x\n");
 	}
 }
 
@@ -282,6 +284,25 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		return;
 	}
 
+	if((desc->gap_count & 0xffff0000) == 0)
+		desc->gap_time = get_jiffies_64();
+
+	desc->gap_count ++;
+
+	if((desc->gap_count & 0x0000ffff) >= 2000) {
+		if((get_jiffies_64() - desc->gap_time) < HZ) {
+			desc->gap_count += 0x00010000;
+			desc->gap_count &= 0xffff0000;
+		} else {
+			desc->gap_count = 0;
+		}
+
+		if((desc->gap_count >> 16) > 30) {
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too frequently\n");
+			irq_disable(desc);
+		}
+	}
+
 	/*
 	 * We cannot call note_interrupt from the threaded handler
 	 * because we need to look at the compound of all handlers
@@ -416,7 +437,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		/*
 		 * The interrupt is stuck
 		 */
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: nobody cared (try booting"
+				"with the \"irqpoll\" option)\n");
 		/*
 		 * Now kill the IRQ
 		 */
--
2.20.1


Regards,

Zhang Xincheng

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-10-17 11:21             ` Thomas Gleixner
@ 2022-10-18  1:05               ` Zhang Xincheng
  0 siblings, 0 replies; 11+ messages in thread
From: Zhang Xincheng @ 2022-10-18  1:05 UTC (permalink / raw)
  To: tglx
  Cc: bigeasy, hdegoede, linux-kernel, mark.rutland, maz, michael,
	oleksandr, zhangxincheng

> > 
> > +config FREQUENT_IRQ_DEBUG
> > + bool "Support for finding and reporting frequent interrupt"
> > + default n
> > + help
> > +
> 
> Pointless newline.
> 

Thank you very much for your advice, I get it.

> > +   This is a mechanism to detect and report that interrupts
> > +   are triggered too frequently.
> > +
> > +config COUNT_PER_SECOND
> > + int "Interrupt limit per second"
> > + depends on FREQUENT_IRQ_DEBUG
> > + default "2000"
> > + help
> 
> 2000 interrupts per second is just a hillarious low default. Trivial to
> reach with networking. Aside of that on systems where the per CPU timer
> interrupt goes through this code, that's trivial to exceed with
> something simple like a periodic timer with a 250us interval.
> 

This limit is really too low on some high-performance PCs, and I neglected.

> > +#ifdef CONFIG_FREQUENT_IRQ_DEBUG
> > +#define COUNT_PER_SECOND_MASK 0x0000ffff
> > +#define DURATION_LIMIT_MASK 0xffff0000
> > +#define DURATION_LIMIT_COUNT 0x00010000
> > +#define DURATION_LIMIT_OFFSET 16
> > +static unsigned int count_per_second = CONFIG_COUNT_PER_SECOND;
> > +static unsigned int duration_limit = CONFIG_DURATION_LIMIT;
> > +static bool disable_frequent_irq;
> > +#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
> > +
> >  /*
> >   * We wait here for a poller to finish.
> >   *
> > @@ -189,18 +199,16 @@ static inline int bad_action_ret(irqreturn_t action_ret)
> >   * (The other 100-of-100,000 interrupts may have been a correctly
> >   *  functioning device sharing an IRQ with the failing one)
> >   */
> > -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
> > +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, const char *msg)
> >  {
> >  unsigned int irq = irq_desc_get_irq(desc);
> >  struct irqaction *action;
> >  unsigned long flags;
> > 
> >  if (bad_action_ret(action_ret)) {
> > - printk(KERN_ERR "irq event %d: bogus return value %x\n",
> > - irq, action_ret);
> > + printk(msg, irq, action_ret);
> 
> This wants to be pr_err() and that change needs to be split out into a
> seperate patch if at all.
> 

This is a good suggestion, I get it.

> > +#ifdef CONFIG_FREQUENT_IRQ_DEBUG
> > +/*
> > + * Some bad hardware will trigger interrupts very frequently, which will
> > + * cause the CPU to process hardware interrupts all the time. We found
> > + * and reported it, and disabling it is optional.
> > + */
> > +void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
> 
> static, no?
> 

Yes, indeed it should be static.

> > +{
> > + if (desc->have_reported)
> > + return;
> > +
> > + if ((desc->gap_count & DURATION_LIMIT_MASK) == 0)
> 
> What's the point of this mask dance here? Use seperate variables. This
> is unreadable and overoptimized for no value.
> 

This mask is probably not really needed.

> Also why is a simple count per second not sufficient? Why do you need
> the extra duration limit?
> 

The extra duration limit is is increased to ignore transient conditions.

> > + desc->gap_time = get_jiffies_64();
> 
> Why does this need 64bit jiffies? 32bit are plenty enough.
> 

Yes,32bit are plenty enough.

> > +
> > + desc->gap_count++;
> > +
> > + if ((desc->gap_count & COUNT_PER_SECOND_MASK) >= count_per_second) {
> > + if ((get_jiffies_64() - desc->gap_time) < HZ) {
> > + desc->gap_count += DURATION_LIMIT_COUNT;
> > + desc->gap_count &= DURATION_LIMIT_MASK;
> > + } else {
> > + desc->gap_count = 0;
> > + }
> > +
> > + if ((desc->gap_count >> DURATION_LIMIT_OFFSET) >= duration_limit) {
> > + __report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too "
> > + "frequently\n");
> > + desc->have_reported = true;
> > + if (disable_frequent_irq)
> > + irq_disable(desc);
> 
> How is this rate limiting? This is simply disabling the interrupt.
> 

This does not limit the frequency of an interrupt, it just reports an
interrupt that is triggered very frequently and may not be normal.
As for how the administrator will deal with this in the future, it is
up to the administrator.

> 
> So again if your limit is too narrow you might simply disable the wrong
> interrupt and render the machine useless.
> 
> So if enabled in Kconfig it must be default off and you need a command
> line parameter to turn it on, but TBH I'm less than convinced that this
> is actually useful for general purpose debugging in it's current form
> simply because it is hard to get the limit right.
> 

At present, this mechanism needs to artificially give a suitable limit,
which is really not very easy to use. But it's really hard to automatically
get a suitable limit for different machines, and I need to reconsider.


Thanks,

        Zhang Xincheng

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-10-09 10:02           ` Zhang Xincheng
@ 2022-10-17 11:21             ` Thomas Gleixner
  2022-10-18  1:05               ` Zhang Xincheng
  0 siblings, 1 reply; 11+ messages in thread
From: Thomas Gleixner @ 2022-10-17 11:21 UTC (permalink / raw)
  To: Zhang Xincheng, maz
  Cc: linux-kernel, oleksandr, Hans de Goede, bigeasy, mark.rutland, michael

On Sun, Oct 09 2022 at 18:02, Zhang Xincheng wrote:
>  
> +config FREQUENT_IRQ_DEBUG
> +	bool "Support for finding and reporting frequent interrupt"
> +	default n
> +	help
> +

Pointless newline.

> +	  This is a mechanism to detect and report that interrupts
> +	  are triggered too frequently.
> +
> +config COUNT_PER_SECOND
> +	int "Interrupt limit per second"
> +	depends on FREQUENT_IRQ_DEBUG
> +	default "2000"
> +	help

2000 interrupts per second is just a hillarious low default. Trivial to
reach with networking. Aside of that on systems where the per CPU timer
interrupt goes through this code, that's trivial to exceed with
something simple like a periodic timer with a 250us interval.

> +#ifdef CONFIG_FREQUENT_IRQ_DEBUG
> +#define COUNT_PER_SECOND_MASK	0x0000ffff
> +#define DURATION_LIMIT_MASK		0xffff0000
> +#define DURATION_LIMIT_COUNT	0x00010000
> +#define DURATION_LIMIT_OFFSET	16
> +static unsigned int count_per_second = CONFIG_COUNT_PER_SECOND;
> +static unsigned int duration_limit = CONFIG_DURATION_LIMIT;
> +static bool disable_frequent_irq;
> +#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
> +
>  /*
>   * We wait here for a poller to finish.
>   *
> @@ -189,18 +199,16 @@ static inline int bad_action_ret(irqreturn_t action_ret)
>   * (The other 100-of-100,000 interrupts may have been a correctly
>   *  functioning device sharing an IRQ with the failing one)
>   */
> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, const char *msg)
>  {
>  	unsigned int irq = irq_desc_get_irq(desc);
>  	struct irqaction *action;
>  	unsigned long flags;
>  
>  	if (bad_action_ret(action_ret)) {
> -		printk(KERN_ERR "irq event %d: bogus return value %x\n",
> -				irq, action_ret);
> +		printk(msg, irq, action_ret);

This wants to be pr_err() and that change needs to be split out into a
seperate patch if at all.

> +#ifdef CONFIG_FREQUENT_IRQ_DEBUG
> +/*
> + * Some bad hardware will trigger interrupts very frequently, which will
> + * cause the CPU to process hardware interrupts all the time. We found
> + * and reported it, and disabling it is optional.
> + */
> +void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)

static, no?

> +{
> +	if (desc->have_reported)
> +		return;
> +
> +	if ((desc->gap_count & DURATION_LIMIT_MASK) == 0)

What's the point of this mask dance here? Use seperate variables. This
is unreadable and overoptimized for no value.

Also why is a simple count per second not sufficient? Why do you need
the extra duration limit?

> +		desc->gap_time = get_jiffies_64();

Why does this need 64bit jiffies? 32bit are plenty enough.

> +
> +	desc->gap_count++;
> +
> +	if ((desc->gap_count & COUNT_PER_SECOND_MASK) >= count_per_second) {
> +		if ((get_jiffies_64() - desc->gap_time) < HZ) {
> +			desc->gap_count += DURATION_LIMIT_COUNT;
> +			desc->gap_count &= DURATION_LIMIT_MASK;
> +		} else {
> +			desc->gap_count = 0;
> +		}
> +
> +		if ((desc->gap_count >> DURATION_LIMIT_OFFSET) >= duration_limit) {
> +			__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too "
> +					"frequently\n");
> +			desc->have_reported = true;
> +			if (disable_frequent_irq)
> +				irq_disable(desc);

How is this rate limiting? This is simply disabling the interrupt.

So again if your limit is too narrow you might simply disable the wrong
interrupt and render the machine useless.

So if enabled in Kconfig it must be default off and you need a command
line parameter to turn it on, but TBH I'm less than convinced that this
is actually useful for general purpose debugging in it's current form
simply because it is hard to get the limit right.

Thanks,

        tglx




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-10-09  2:13         ` Marc Zyngier
@ 2022-10-09 10:02           ` Zhang Xincheng
  2022-10-17 11:21             ` Thomas Gleixner
  0 siblings, 1 reply; 11+ messages in thread
From: Zhang Xincheng @ 2022-10-09 10:02 UTC (permalink / raw)
  To: maz
  Cc: tglx, linux-kernel, oleksandr, Hans de Goede, bigeasy,
	mark.rutland, michael

> I don't think this is much better. You hardcode values that only make
> sense on your HW, and for nobody else. And what can the user do with
> this message? Nothing at all. The message itself only contributes to
> problem. As it is, this patch is only a nuisance.

Indeed, this patch can only report a possible problem.

> As I said before, this would be much better as a rate-limiter, with
> configurable limits, and behind a debug option.

This is a good suggestion, I have modified my patch as follows:

From 681a864ead6521560a25d089b96b8360bf987f5f Mon Sep 17 00:00:00 2001
From: zhangxincheng <zhangxincheng@uniontech.com>
Date: Thu, 29 Sep 2022 17:22:25 +0800
Subject: [PATCH] interrupt: debug for discovering frequent interrupts

In some cases, a peripheral's interrupt will be triggered frequently,
which will keep the CPU processing the interrupt and eventually cause
the RCU to report rcu_sched self-detected stall on the CPU.

[  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
[  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004
softirq=9993/9993 fqs=97428
[  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
[  838.151516] Task dump for CPU 0:
[  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a

Signed-off-by: Zhang Xincheng <zhangxincheng@uniontech.com>
---
 include/linux/irqdesc.h |   5 ++
 kernel/irq/Kconfig      |  25 +++++++++
 kernel/irq/spurious.c   | 121 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 1cd4e36890fb..f82b138c68d6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -102,6 +102,11 @@ struct irq_desc {
 	int			parent_irq;
 	struct module		*owner;
 	const char		*name;
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+	bool have_reported;
+	u32 gap_count;
+	u64 gap_time;
+#endif
 } ____cacheline_internodealigned_in_smp;
 
 #ifdef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..ae5c6e45265a 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -137,6 +137,31 @@ config GENERIC_IRQ_DEBUGFS
 
 	  If you don't know what to do here, say N.
 
+config FREQUENT_IRQ_DEBUG
+	bool "Support for finding and reporting frequent interrupt"
+	default n
+	help
+
+	  This is a mechanism to detect and report that interrupts
+	  are triggered too frequently.
+
+config COUNT_PER_SECOND
+	int "Interrupt limit per second"
+	depends on FREQUENT_IRQ_DEBUG
+	default "2000"
+	help
+
+	  This is the limit on the number of interrupts triggered per second.
+	  (Max 0xffff)
+config DURATION_LIMIT
+	int "Duration limit"
+	depends on FREQUENT_IRQ_DEBUG
+	default "30"
+	help
+
+	  The number of interruptions per second exceeds the duration limit of
+	  the limit. (Max 0xffff)
+
 endmenu
 
 config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..a8f606567b21 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -22,6 +22,16 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
 static int irq_poll_cpu;
 static atomic_t irq_poll_active;
 
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+#define COUNT_PER_SECOND_MASK	0x0000ffff
+#define DURATION_LIMIT_MASK		0xffff0000
+#define DURATION_LIMIT_COUNT	0x00010000
+#define DURATION_LIMIT_OFFSET	16
+static unsigned int count_per_second = CONFIG_COUNT_PER_SECOND;
+static unsigned int duration_limit = CONFIG_DURATION_LIMIT;
+static bool disable_frequent_irq;
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
 /*
  * We wait here for a poller to finish.
  *
@@ -189,18 +199,16 @@ static inline int bad_action_ret(irqreturn_t action_ret)
  * (The other 100-of-100,000 interrupts may have been a correctly
  *  functioning device sharing an IRQ with the failing one)
  */
-static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, const char *msg)
 {
 	unsigned int irq = irq_desc_get_irq(desc);
 	struct irqaction *action;
 	unsigned long flags;
 
 	if (bad_action_ret(action_ret)) {
-		printk(KERN_ERR "irq event %d: bogus return value %x\n",
-				irq, action_ret);
+		printk(msg, irq, action_ret);
 	} else {
-		printk(KERN_ERR "irq %d: nobody cared (try booting with "
-				"the \"irqpoll\" option)\n", irq);
+		printk(msg, irq);
 	}
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
@@ -228,7 +236,7 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
 
 	if (count > 0) {
 		count--;
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq event %d: bogus return value %x\n");
 	}
 }
 
@@ -269,6 +277,46 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
 
 #define SPURIOUS_DEFERRED	0x80000000
 
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+/*
+ * Some bad hardware will trigger interrupts very frequently, which will
+ * cause the CPU to process hardware interrupts all the time. We found
+ * and reported it, and disabling it is optional.
+ */
+void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+	if (desc->have_reported)
+		return;
+
+	if ((desc->gap_count & DURATION_LIMIT_MASK) == 0)
+		desc->gap_time = get_jiffies_64();
+
+	desc->gap_count++;
+
+	if ((desc->gap_count & COUNT_PER_SECOND_MASK) >= count_per_second) {
+		if ((get_jiffies_64() - desc->gap_time) < HZ) {
+			desc->gap_count += DURATION_LIMIT_COUNT;
+			desc->gap_count &= DURATION_LIMIT_MASK;
+		} else {
+			desc->gap_count = 0;
+		}
+
+		if ((desc->gap_count >> DURATION_LIMIT_OFFSET) >= duration_limit) {
+			__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too "
+					"frequently\n");
+			desc->have_reported = true;
+			if (disable_frequent_irq)
+				irq_disable(desc);
+		}
+	}
+}
+
+#else
+void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+}
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
 void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 {
 	unsigned int irq;
@@ -282,6 +330,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		return;
 	}
 
+	report_frequent_irq(desc, action_ret);
+
 	/*
 	 * We cannot call note_interrupt from the threaded handler
 	 * because we need to look at the compound of all handlers
@@ -416,7 +466,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		/*
 		 * The interrupt is stuck
 		 */
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: nobody cared (try "
+				"bootingwith the \"irqpoll\" option)\n");
 		/*
 		 * Now kill the IRQ
 		 */
@@ -476,3 +527,59 @@ static int __init irqpoll_setup(char *str)
 }
 
 __setup("irqpoll", irqpoll_setup);
+
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+int __init count_per_second_setup(char *str)
+{
+	int ret;
+
+	ret = kstrtouint(str, 10, &count_per_second);
+	if (ret)
+		return 0;
+
+	printk(KERN_INFO "Interrupt limit per second: %u\n", count_per_second);
+
+	return 1;
+}
+
+__setup("count_per_second=", count_per_second_setup);
+module_param(count_per_second, uint, 0644);
+MODULE_PARM_DESC(count_per_second, "Interrupt limit per second. (Max 0xffff)");
+
+int __init duration_limit_setup(char *str)
+{
+	int ret;
+
+	ret = kstrtouint(str, 10, &duration_limit);
+	if (ret)
+		return 0;
+
+	printk(KERN_INFO "Duration limit: %u\n", duration_limit);
+
+	return 1;
+}
+
+__setup("duration_limit=", duration_limit_setup);
+module_param(duration_limit, uint, 0644);
+MODULE_PARM_DESC(duration_limit, "The number of interruptions per second exceeds the duration limit of the limit. (Max 0xffff)");
+
+int __init disable_frequent_irq_setup(char *str)
+{
+	int ret;
+
+	ret = kstrtobool(str, &disable_frequent_irq);
+	if (ret)
+		return 0;
+
+	if (disable_frequent_irq)
+		printk(KERN_INFO "Disable frequent irq'\n");
+	else
+		printk(KERN_INFO "Don't disable frequent irq'\n");
+
+	return 1;
+}
+
+__setup("disable_frequent_irq=", disable_frequent_irq_setup);
+module_param(disable_frequent_irq, bool, 0644);
+MODULE_PARM_DESC(noirqdebug, "Disable frequent irq when true");
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
-- 
2.20.1


Thanks,

             Zhang Xincheng

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-10-09  1:31       ` Zhang Xincheng
@ 2022-10-09  2:13         ` Marc Zyngier
  2022-10-09 10:02           ` Zhang Xincheng
  0 siblings, 1 reply; 11+ messages in thread
From: Marc Zyngier @ 2022-10-09  2:13 UTC (permalink / raw)
  To: Zhang Xincheng
  Cc: tglx, linux-kernel, oleksandr, Hans de Goede, bigeasy,
	mark.rutland, michael

On Sun, 09 Oct 2022 02:31:36 +0100,
"=?utf-8?B?WmhhbmcgWGluY2hlbmc=?=" <zhangxincheng@uniontech.com> wrote:
> 
> > Again: what makes you think that it is better to kill the interrupt
> > than suffering a RCU stall? Yes, that's a lot of interrupts. But
> > killing it and risking the whole system isn't an acceptable outcome.
> 
> It's really not good to kill interrupts directly.

I'm glad you finally agree, (202210081220.9da0a329-yujie.liu@intel.com
has a good example of a perfectly working machine that your patch
kills for no reason).

> Perhaps a better way is
> to report it and let the system administrator decide what to do with it.
> 
> + if((desc->gap_count & 0xffff0000) == 0)
> + desc->gap_time = get_jiffies_64();
> +
> + desc->gap_count ++;
> +
> + if((desc->gap_count & 0x0000ffff) >= 2000) {
> + if((get_jiffies_64() - desc->gap_time) < HZ) {
> + desc->gap_count += 0x00010000;
> + desc->gap_count &= 0xffff0000;
> + } else {
> + desc->gap_count = 0;
> + }
> +
> + if((desc->gap_count >> 16) > 30) {
> + __report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too frequently\n");
> + }
> + }
> +

I don't think this is much better. You hardcode values that only make
sense on your HW, and for nobody else. And what can the user do with
this message? Nothing at all. The message itself only contributes to
problem. As it is, this patch is only a nuisance.

As I said before, this would be much better as a rate-limiter, with
configurable limits, and behind a debug option.

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-09-30 10:37     ` Marc Zyngier
@ 2022-10-09  1:31       ` Zhang Xincheng
  2022-10-09  2:13         ` Marc Zyngier
  0 siblings, 1 reply; 11+ messages in thread
From: Zhang Xincheng @ 2022-10-09  1:31 UTC (permalink / raw)
  To: maz
  Cc: tglx, linux-kernel, oleksandr, Hans de Goede, bigeasy,
	mark.rutland, michael

> Again: what makes you think that it is better to kill the interrupt
> than suffering a RCU stall? Yes, that's a lot of interrupts. But
> killing it and risking the whole system isn't an acceptable outcome.

It's really not good to kill interrupts directly. Perhaps a better way is
to report it and let the system administrator decide what to do with it.

+ if((desc->gap_count & 0xffff0000) == 0)
+ desc->gap_time = get_jiffies_64();
+
+ desc->gap_count ++;
+
+ if((desc->gap_count & 0x0000ffff) >= 2000) {
+ if((get_jiffies_64() - desc->gap_time) < HZ) {
+ desc->gap_count += 0x00010000;
+ desc->gap_count &= 0xffff0000;
+ } else {
+ desc->gap_count = 0;
+ }
+
+ if((desc->gap_count >> 16) > 30) {
+ __report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too frequently\n");
+ }
+ }
+


Thanks,

             Zhang Xincheng

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-09-30  9:57   ` Zhang Xincheng
@ 2022-09-30 10:37     ` Marc Zyngier
  2022-10-09  1:31       ` Zhang Xincheng
  0 siblings, 1 reply; 11+ messages in thread
From: Marc Zyngier @ 2022-09-30 10:37 UTC (permalink / raw)
  To: Zhang Xincheng
  Cc: tglx, linux-kernel, oleksandr, Hans de Goede, bigeasy,
	mark.rutland, michael

On Fri, 30 Sep 2022 10:57:17 +0100,
"=?utf-8?B?WmhhbmcgWGluY2hlbmc=?=" <zhangxincheng@uniontech.com> wrote:
> 
> > Irrespective of the patch itself, I would really like to understand
> > why you consider that it is a better course of action to kill a device
> > (and potentially the whole machine) than to let the storm eventually
> > calm down? A frequent interrupt is not necessarily the sign of
> > something going wrong. It is the sign of a busy system. I prefer my
> > systems busy rather than dead.
> 
> Because I found that some peripherals will send interrupts to the
> CPU very frequently in some cases, and the interrupts will be
> handled correctly, which will cause the CPU to do nothing but handle
> the interrupts. At the same time, the RCU system will report the
> following logs:
> 
> [  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
> [  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004 softirq=9993/9993 fqs=97428 
> [  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
> [  838.151516] Task dump for CPU 0:
> [  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a
> [  838.161764] Call trace:
> [  838.164198]  dump_backtrace+0x0/0x190
> [  838.167846]  show_stack+0x14/0x20
> [  838.171148]  sched_show_task+0x134/0x160
> [  838.175057]  dump_cpu_task+0x40/0x4c
> [  838.178618]  rcu_dump_cpu_stacks+0xc4/0x108
> [  838.182788]  rcu_check_callbacks+0x6e4/0x898
> [  838.187044]  update_process_times+0x2c/0x88
> [  838.191214]  tick_sched_handle.isra.5+0x3c/0x50
> [  838.195730]  tick_sched_timer+0x48/0x98
> [  838.199552]  __hrtimer_run_queues+0xec/0x2f8
> [  838.203808]  hrtimer_interrupt+0x10c/0x298
> [  838.207891]  arch_timer_handler_phys+0x2c/0x38
> [  838.212321]  handle_percpu_devid_irq+0x88/0x228
> [  838.216837]  generic_handle_irq+0x2c/0x40
> [  838.220833]  __handle_domain_irq+0x60/0xb8
> [  838.224915]  gic_handle_irq+0x7c/0x178
> [  838.228650]  el1_irq+0xb0/0x140
> [  838.231778]  __do_softirq+0x84/0x2e8
> [  838.235340]  irq_exit+0x9c/0xb8
> [  838.238468]  __handle_domain_irq+0x64/0xb8
> [  838.242550]  gic_handle_irq+0x7c/0x178
> [  838.246285]  el1_irq+0xb0/0x140
> [  838.249413]  resume_irqs+0xfc/0x148
> [  838.252888]  resume_device_irqs+0x10/0x18
> [  838.256883]  dpm_resume_noirq+0x10/0x20
> [  838.260706]  suspend_devices_and_enter+0x170/0x788
> [  838.265483]  pm_suspend+0x41c/0x4cc
> [  838.268958]  state_store+0xbc/0x160
> [  838.272433]  kobj_attr_store+0x14/0x28
> [  838.276168]  sysfs_kf_write+0x40/0x50
> [  838.279817]  kernfs_fop_write+0xcc/0x1e0
> [  838.283726]  __vfs_write+0x18/0x140
> [  838.287201]  vfs_write+0xa4/0x1b0
> [  838.290503]  ksys_write+0x4c/0xb8
> [  838.293804]  __arm64_sys_write+0x18/0x20
> [  838.297713]  el0_svc_common+0x90/0x178
> [  838.301449]  el0_svc_handler+0x9c/0xa8
> [  838.305184]  el0_svc+0x8/0xc
> 
> The log is from the process of waking up a sleeping machine, 
> I left the machine in this state for a night and it successfully woke up,
>  and then I saw from /proc/interrupts that a GPIO interrupt triggered 
> more than 13 billion times.
> 
> 29: 1368200001  0  0  0  0  0  0  0     phytium_gpio6   Edge ACPI:Event

Again: what makes you think that it is better to kill the interrupt
than suffering a RCU stall? Yes, that's a lot of interrupts. But
killing it and risking the whole system isn't an acceptable outcome.

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-09-30  9:23 ` Marc Zyngier
@ 2022-09-30  9:57   ` Zhang Xincheng
  2022-09-30 10:37     ` Marc Zyngier
  0 siblings, 1 reply; 11+ messages in thread
From: Zhang Xincheng @ 2022-09-30  9:57 UTC (permalink / raw)
  To: maz
  Cc: tglx, linux-kernel, oleksandr, Hans de Goede, bigeasy,
	mark.rutland, michael

> Irrespective of the patch itself, I would really like to understand
> why you consider that it is a better course of action to kill a device
> (and potentially the whole machine) than to let the storm eventually
> calm down? A frequent interrupt is not necessarily the sign of
> something going wrong. It is the sign of a busy system. I prefer my
> systems busy rather than dead.

Because I found that some peripherals will send interrupts to the CPU very frequently 
in some cases, and the interrupts will be handled correctly, which will cause the CPU 
to do nothing but handle the interrupts. At the same time, the RCU system will report 
the following logs:

[  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
[  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004 softirq=9993/9993 fqs=97428 
[  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
[  838.151516] Task dump for CPU 0:
[  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a
[  838.161764] Call trace:
[  838.164198]  dump_backtrace+0x0/0x190
[  838.167846]  show_stack+0x14/0x20
[  838.171148]  sched_show_task+0x134/0x160
[  838.175057]  dump_cpu_task+0x40/0x4c
[  838.178618]  rcu_dump_cpu_stacks+0xc4/0x108
[  838.182788]  rcu_check_callbacks+0x6e4/0x898
[  838.187044]  update_process_times+0x2c/0x88
[  838.191214]  tick_sched_handle.isra.5+0x3c/0x50
[  838.195730]  tick_sched_timer+0x48/0x98
[  838.199552]  __hrtimer_run_queues+0xec/0x2f8
[  838.203808]  hrtimer_interrupt+0x10c/0x298
[  838.207891]  arch_timer_handler_phys+0x2c/0x38
[  838.212321]  handle_percpu_devid_irq+0x88/0x228
[  838.216837]  generic_handle_irq+0x2c/0x40
[  838.220833]  __handle_domain_irq+0x60/0xb8
[  838.224915]  gic_handle_irq+0x7c/0x178
[  838.228650]  el1_irq+0xb0/0x140
[  838.231778]  __do_softirq+0x84/0x2e8
[  838.235340]  irq_exit+0x9c/0xb8
[  838.238468]  __handle_domain_irq+0x64/0xb8
[  838.242550]  gic_handle_irq+0x7c/0x178
[  838.246285]  el1_irq+0xb0/0x140
[  838.249413]  resume_irqs+0xfc/0x148
[  838.252888]  resume_device_irqs+0x10/0x18
[  838.256883]  dpm_resume_noirq+0x10/0x20
[  838.260706]  suspend_devices_and_enter+0x170/0x788
[  838.265483]  pm_suspend+0x41c/0x4cc
[  838.268958]  state_store+0xbc/0x160
[  838.272433]  kobj_attr_store+0x14/0x28
[  838.276168]  sysfs_kf_write+0x40/0x50
[  838.279817]  kernfs_fop_write+0xcc/0x1e0
[  838.283726]  __vfs_write+0x18/0x140
[  838.287201]  vfs_write+0xa4/0x1b0
[  838.290503]  ksys_write+0x4c/0xb8
[  838.293804]  __arm64_sys_write+0x18/0x20
[  838.297713]  el0_svc_common+0x90/0x178
[  838.301449]  el0_svc_handler+0x9c/0xa8
[  838.305184]  el0_svc+0x8/0xc

The log is from the process of waking up a sleeping machine, 
I left the machine in this state for a night and it successfully woke up,
 and then I saw from /proc/interrupts that a GPIO interrupt triggered 
more than 13 billion times.

29: 1368200001  0  0  0  0  0  0  0     phytium_gpio6   Edge ACPI:Event

> Something like this should be limited to a debug feature. It would
> also be a lot more useful if it was built as an interrupt *limiting*
> feature, rather then killing the interrupt forever (which is IMHO a
> ludicrous thing to do).

It's a good idea to have it as a debugging feature.


Thanks,

             Zhang Xincheng

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-09-30  6:40 Zhang Xincheng
  2022-09-30  7:05 ` Hans de Goede
@ 2022-09-30  9:23 ` Marc Zyngier
  2022-09-30  9:57   ` Zhang Xincheng
  1 sibling, 1 reply; 11+ messages in thread
From: Marc Zyngier @ 2022-09-30  9:23 UTC (permalink / raw)
  To: Zhang Xincheng
  Cc: tglx, linux-kernel, oleksandr, hdegoede, bigeasy, mark.rutland, michael

On Fri, 30 Sep 2022 07:40:42 +0100,
Zhang Xincheng <zhangxincheng@uniontech.com> wrote:
> 
> From: zhangxincheng <zhangxincheng@uniontech.com>
> 
> In some cases, a peripheral's interrupt will be triggered frequently,
> which will keep the CPU processing the interrupt and eventually cause
> the RCU to report rcu_sched self-detected stall on the CPU.
> 
> [  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
> [  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004
> softirq=9993/9993 fqs=97428
> [  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
> [  838.151516] Task dump for CPU 0:
> [  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a
> 
> Signed-off-by: zhangxincheng <zhangxincheng@uniontech.com>
> Change-Id: I9c92146f2772eae383c16c8c10de028b91e07150
> Signed-off-by: zhangxincheng <zhangxincheng@uniontech.com>

Irrespective of the patch itself, I would really like to understand
why you consider that it is a better course of action to kill a device
(and potentially the whole machine) than to let the storm eventually
calm down? A frequent interrupt is not necessarily the sign of
something going wrong. It is the sign of a busy system. I prefer my
systems busy rather than dead.

Furthermore, I see no rationale here about the number of interrupt
that *you* consider as being "too many" over what period of time (it
seems to me that both parameters are firmly hardcoded).

Something like this should be limited to a debug feature. It would
also be a lot more useful if it was built as an interrupt *limiting*
feature, rather then killing the interrupt forever (which is IMHO a
ludicrous thing to do).

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] interrupt: discover and disable very frequent interrupts
  2022-09-30  6:40 Zhang Xincheng
@ 2022-09-30  7:05 ` Hans de Goede
  2022-09-30  9:23 ` Marc Zyngier
  1 sibling, 0 replies; 11+ messages in thread
From: Hans de Goede @ 2022-09-30  7:05 UTC (permalink / raw)
  To: Zhang Xincheng, tglx
  Cc: linux-kernel, maz, oleksandr, bigeasy, mark.rutland, michael

Hi,

On 9/30/22 08:40, Zhang Xincheng wrote:
> From: zhangxincheng <zhangxincheng@uniontech.com>
> 
> In some cases, a peripheral's interrupt will be triggered frequently,
> which will keep the CPU processing the interrupt and eventually cause
> the RCU to report rcu_sched self-detected stall on the CPU.
> 
> [  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
> [  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004
> softirq=9993/9993 fqs=97428
> [  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
> [  838.151516] Task dump for CPU 0:
> [  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a
> 
> Signed-off-by: zhangxincheng <zhangxincheng@uniontech.com>
> Change-Id: I9c92146f2772eae383c16c8c10de028b91e07150
> Signed-off-by: zhangxincheng <zhangxincheng@uniontech.com>
> ---
>  include/linux/irqdesc.h |  2 ++
>  kernel/irq/spurious.c   | 52 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 54 insertions(+)
> 
> diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
> index 1cd4e36890fb..a3bd521c3557 100644
> --- a/include/linux/irqdesc.h
> +++ b/include/linux/irqdesc.h
> @@ -102,6 +102,8 @@ struct irq_desc {
>  	int			parent_irq;
>  	struct module		*owner;
>  	const char		*name;
> +	u32 gap_count;
> +	u64 gap_time;
>  } ____cacheline_internodealigned_in_smp;
>  
>  #ifdef CONFIG_SPARSE_IRQ
> diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
> index 02b2daf07441..b7162a10d92c 100644
> --- a/kernel/irq/spurious.c
> +++ b/kernel/irq/spurious.c
> @@ -222,6 +222,38 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
>  	raw_spin_unlock_irqrestore(&desc->lock, flags);
>  }
>  
> +/*
> + * Some bad hardware will trigger interrupts very frequently, which will
> + * cause the CPU to process hardware interrupts all the time. So when
> + * we find this out, the interrupt should be disabled.
> + */
> +static void __report_so_frequently_irq(struct irq_desc *desc, irqreturn_t action_ret)
> +{
> +	unsigned int irq = irq_desc_get_irq(desc);
> +	struct irqaction *action;
> +	unsigned long flags;
> +
> +	printk(KERN_ERR "irq %d: triggered too frequently\n",irq);
> +	dump_stack();
> +	printk(KERN_ERR "handlers:\n");
> +
> +	/*
> +	 * We need to take desc->lock here. note_interrupt() is called
> +	 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
> +	 * with something else removing an action. It's ok to take
> +	 * desc->lock here. See synchronize_irq().
> +	 */
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	for_each_action_of_desc(desc, action) {
> +		printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
> +		if (action->thread_fn)
> +			printk(KERN_CONT " threaded [<%p>] %pf",
> +					action->thread_fn, action->thread_fn);
> +		printk(KERN_CONT "\n");
> +	}
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> +}
> +

This is basically a copy of __report_bad_irq() please instead just give
__report_bad_irq() a "const char *msg" argument and drop this copy.

Note __report_bad_irq() is currently called twice, once from
report_bad_irq() and in that case "if (bad_action_ret(action_ret))" is
always true, so inside report_bad_irq() you will now want to use:

	__report_bad_irq(desc, action_ret, KERN_ERR "irq event %d: bogus return value %x\n");

And then there is a second call in note_interrupt() in which case
"if (bad_action_ret(action_ret))" is always false so there you
will want to use:

	__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: nobody cared (try booting with the \"irqpoll\" option)\n");

And then in __report_bad_irq() itself replace the if with 2 printk() calls with:

	if (bad_action_ret(action_ret))
		printk(msg, irq, action_ret);
	else
		printk(msg, irq);

Note the if is still necessary so that if any of this gets inlined
(I sure hope not ...) that the argument count then is correct and
we don't get the compiler warning about the number of arguments
not matching the format-string.

And then replace your __report_so_frequently_irq() call with:

	__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too frequently\n");

	


>  static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
>  {
>  	static int count = 100;
> @@ -273,6 +305,26 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
>  {
>  	unsigned int irq;
>  
> +	if((desc->gap_count & 0xffff0000) == 0)
> +		desc->gap_time = get_jiffies_64();
> +
> +	desc->gap_count ++;
> +
> +	if((desc->gap_count & 0x0000ffff) >= 2000) {
> +		if((get_jiffies_64() - desc->gap_time) < HZ) {
> +			desc->gap_count += 0x00010000;
> +			desc->gap_count &= 0xffff0000;
> +		} else {
> +			desc->gap_count = 0;
> +		}
> +
> +		if((desc->gap_count >> 16) > 30) {
> +			__report_so_frequently_irq(desc, action_ret);
> +			irq_disable(desc);
> +		}
> +	}
> +
> +

I believe this should be moved to below this block:

        if (bad_action_ret(action_ret)) {
                report_bad_irq(desc, action_ret);
                return;
        }

Since we don't want this check when polling and we want to keep
using / prefer the existing handling for bad interrupt return
values.

>  	if (desc->istate & IRQS_POLL_INPROGRESS ||
>  	    irq_settings_is_polled(desc))
>  		return;



Please note I'm in no way an expert on the interrupt subsystem, so 
I have no idea if this patch is a good idea in general. I just noticed
the code duplication which is why I did a partial review of this patch.

Regards,

Hans


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] interrupt: discover and disable very frequent interrupts
@ 2022-09-30  6:40 Zhang Xincheng
  2022-09-30  7:05 ` Hans de Goede
  2022-09-30  9:23 ` Marc Zyngier
  0 siblings, 2 replies; 11+ messages in thread
From: Zhang Xincheng @ 2022-09-30  6:40 UTC (permalink / raw)
  To: tglx
  Cc: linux-kernel, maz, oleksandr, hdegoede, bigeasy, mark.rutland,
	michael, zhangxincheng

From: zhangxincheng <zhangxincheng@uniontech.com>

In some cases, a peripheral's interrupt will be triggered frequently,
which will keep the CPU processing the interrupt and eventually cause
the RCU to report rcu_sched self-detected stall on the CPU.

[  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
[  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004
softirq=9993/9993 fqs=97428
[  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
[  838.151516] Task dump for CPU 0:
[  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a

Signed-off-by: zhangxincheng <zhangxincheng@uniontech.com>
Change-Id: I9c92146f2772eae383c16c8c10de028b91e07150
Signed-off-by: zhangxincheng <zhangxincheng@uniontech.com>
---
 include/linux/irqdesc.h |  2 ++
 kernel/irq/spurious.c   | 52 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 1cd4e36890fb..a3bd521c3557 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -102,6 +102,8 @@ struct irq_desc {
 	int			parent_irq;
 	struct module		*owner;
 	const char		*name;
+	u32 gap_count;
+	u64 gap_time;
 } ____cacheline_internodealigned_in_smp;
 
 #ifdef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..b7162a10d92c 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -222,6 +222,38 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
+/*
+ * Some bad hardware will trigger interrupts very frequently, which will
+ * cause the CPU to process hardware interrupts all the time. So when
+ * we find this out, the interrupt should be disabled.
+ */
+static void __report_so_frequently_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+	unsigned int irq = irq_desc_get_irq(desc);
+	struct irqaction *action;
+	unsigned long flags;
+
+	printk(KERN_ERR "irq %d: triggered too frequently\n",irq);
+	dump_stack();
+	printk(KERN_ERR "handlers:\n");
+
+	/*
+	 * We need to take desc->lock here. note_interrupt() is called
+	 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
+	 * with something else removing an action. It's ok to take
+	 * desc->lock here. See synchronize_irq().
+	 */
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	for_each_action_of_desc(desc, action) {
+		printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+		if (action->thread_fn)
+			printk(KERN_CONT " threaded [<%p>] %pf",
+					action->thread_fn, action->thread_fn);
+		printk(KERN_CONT "\n");
+	}
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
 static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
 {
 	static int count = 100;
@@ -273,6 +305,26 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 {
 	unsigned int irq;
 
+	if((desc->gap_count & 0xffff0000) == 0)
+		desc->gap_time = get_jiffies_64();
+
+	desc->gap_count ++;
+
+	if((desc->gap_count & 0x0000ffff) >= 2000) {
+		if((get_jiffies_64() - desc->gap_time) < HZ) {
+			desc->gap_count += 0x00010000;
+			desc->gap_count &= 0xffff0000;
+		} else {
+			desc->gap_count = 0;
+		}
+
+		if((desc->gap_count >> 16) > 30) {
+			__report_so_frequently_irq(desc, action_ret);
+			irq_disable(desc);
+		}
+	}
+
+
 	if (desc->istate & IRQS_POLL_INPROGRESS ||
 	    irq_settings_is_polled(desc))
 		return;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2022-10-18  1:07 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-30  8:31 [PATCH] interrupt: discover and disable very frequent interrupts 张鑫城
  -- strict thread matches above, loose matches on Subject: below --
2022-09-30  6:40 Zhang Xincheng
2022-09-30  7:05 ` Hans de Goede
2022-09-30  9:23 ` Marc Zyngier
2022-09-30  9:57   ` Zhang Xincheng
2022-09-30 10:37     ` Marc Zyngier
2022-10-09  1:31       ` Zhang Xincheng
2022-10-09  2:13         ` Marc Zyngier
2022-10-09 10:02           ` Zhang Xincheng
2022-10-17 11:21             ` Thomas Gleixner
2022-10-18  1:05               ` Zhang Xincheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).