linux-kernel.vger.kernel.org archive mirror
From: Ming Lei <ming.lei@redhat.com>
To: Thomas Gleixner <tglx@linutronix.de>, Jens Axboe <axboe@kernel.dk>
Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	Ming Lei <ming.lei@redhat.com>, Long Li <longli@microsoft.com>,
	Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Christoph Hellwig <hch@lst.de>,
	Keith Busch <keith.busch@intel.com>,
	Sagi Grimberg <sagi@grimberg.me>,
	John Garry <john.garry@huawei.com>,
	Hannes Reinecke <hare@suse.com>
Subject: [RFC PATCH 2/3] softirq: implement interrupt flood detection
Date: Wed, 18 Dec 2019 15:19:41 +0800	[thread overview]
Message-ID: <20191218071942.22336-3-ming.lei@redhat.com> (raw)
In-Reply-To: <20191218071942.22336-1-ming.lei@redhat.com>

For some high performance IO devices, interrupts may arrive very
frequently, and IO requests are often completed in irq context to
minimize IO latency. On some devices (SCSI or NVMe), IO requests can be
submitted concurrently from multiple CPU cores, while completion of
these IOs is done on a single CPU core. Whether it is ever faster
storage, multiple storage devices attached to the same HBA, or multiple
HBAs in the same system, IOs from multiple storage devices may saturate
that single CPU core just handling their interrupts.

An interrupt flood can then be triggered, the scheduler gets stuck, and
a CPU lockup follows.

The interrupt flood information is very useful for the following use
cases:

1) avoid CPU soft lockup

After an interrupt flood is detected, the block layer may switch to
completing IO in process context, so that CPU lockup is avoided, and
switch back to interrupt context once the interrupt flood is gone.

2) partition IO completion into interrupt and process contexts effectively

When IO completion is handled in interrupt context, IO latency is good,
but there is a risk of locking up the CPU. When IO completion is moved
to process context via NAPI or a threaded interrupt handler, CPU lockup
can be avoided. So people try to move part of the IO completion work
into process context to avoid CPU lockup, such as Keith's work:

https://lore.kernel.org/linux-nvme/20191209175622.1964-1-kbusch@kernel.org/T/#t

However, it is hard to partition IO completion between the two contexts:
if too little work is moved to process context, the risk of locking up
the CPU cannot be eliminated; if too much work is moved to process
context, extra IO latency is introduced and IO performance is hurt.

Interrupt flood information can be one useful hint for partitioning IO
completion work between the two contexts effectively; a usage sketch
follows below.
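For illustration only, here is a minimal sketch of how a completion path
might consume this hint. Only irq_is_flood() is introduced by this
patch; the my_dev_*() names and the deferral helper are hypothetical
stand-ins for whatever the driver (or blk-mq, see patch 3/3) provides:

	#include <linux/blk-mq.h>
	#include <linux/hardirq.h>

	static void my_dev_complete_rq(struct my_dev_queue *q,
				       struct request *rq)
	{
		/*
		 * While this CPU is flooded by interrupts, push the
		 * completion work to process context; otherwise
		 * complete inline for the best latency.
		 */
		if (irq_is_flood())
			my_dev_defer_completion(q, rq);	/* hypothetical helper */
		else
			blk_mq_complete_request(rq);
	}

The point is that the decision is made per interrupt and per CPU, so
completions only fall back to process context while the flood lasts.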

Implement a simple and efficient per-CPU interrupt flood detection
mechanism. It uses the average interrupt interval on each CPU to decide
whether an interrupt flood has been triggered. To save the cost of
reading the clock from hardware, a two-stage detection is applied:

- in the 1st stage, the clock is read from the scheduler runqueue clock,
which is a software clock, so the cost is quite low

- once a small enough interrupt interval is observed in the 1st stage,
switch to the 2nd stage detection, since it is then reasonable to assume
that the scheduler has been stuck

- the 2nd stage confirms whether the interrupt flood really happens, by
reading the hardware clock after the scheduler is stuck
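
To make the EWMA arithmetic concrete, here is a stripped-down sketch of
the per-interrupt update used in the patch below (the real code also
handles the stage switch and the adaptive threshold; the 1/512 weight is
the stage-1 value):

	/* avg <- avg - avg/2^bits + (now - prev)/2^bits */
	static inline void ewma_update(s64 *avg, u64 now, u64 prev,
				       int weight_bits)
	{
		*avg += (s64)((now - prev) >> weight_bits) -
			(*avg >> weight_bits);
		if (*avg < 0)
			*avg = 0;
	}

With weight_bits = 9 (i.e. 1/512), a single long or short interval
barely moves the average, while a sustained flood of back-to-back
interrupts drags it below the threshold within a few hundred interrupts.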

Cc: Long Li <longli@microsoft.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: John Garry <john.garry@huawei.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/base/cpu.c      |  23 ++++++
 include/linux/hardirq.h |   2 +
 kernel/softirq.c        | 161 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 186 insertions(+)

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 6265871a4af2..34c253cde5f3 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -20,6 +20,7 @@
 #include <linux/tick.h>
 #include <linux/pm_qos.h>
 #include <linux/sched/isolation.h>
+#include <linux/hardirq.h>
 
 #include "base.h"
 
@@ -183,10 +184,31 @@ static struct attribute_group crash_note_cpu_attr_group = {
 };
 #endif
 
+static ssize_t show_irq_interval(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	ssize_t rc;
+	u64 avg = irq_get_avg_interval(cpu->dev.id);
+
+	rc = sprintf(buf, "%d: %d\n",  (int)(avg & 1), (int)(avg >> 1));
+	return rc;
+}
+
+static DEVICE_ATTR(irq_interval, 0400, show_irq_interval, NULL);
+static struct attribute *irq_interval_cpu_attrs[] = {
+	&dev_attr_irq_interval.attr,
+	NULL
+};
+static struct attribute_group irq_interval_cpu_attr_group = {
+	.attrs = irq_interval_cpu_attrs,
+};
+
 static const struct attribute_group *common_cpu_attr_groups[] = {
 #ifdef CONFIG_KEXEC
 	&crash_note_cpu_attr_group,
 #endif
+	&irq_interval_cpu_attr_group,
 	NULL
 };
 
@@ -194,6 +216,7 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
 #ifdef CONFIG_KEXEC
 	&crash_note_cpu_attr_group,
 #endif
+	&irq_interval_cpu_attr_group,
 	NULL
 };
 
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index da0af631ded5..a0db287bb574 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -8,6 +8,8 @@
 #include <linux/vtime.h>
 #include <asm/hardirq.h>
 
+extern u64 irq_get_avg_interval(int);
+extern bool irq_is_flood(void);
 
 extern void synchronize_irq(unsigned int irq);
 extern bool synchronize_hardirq(unsigned int irq);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0427a86743a4..f6e434ac4183 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -25,6 +25,8 @@
 #include <linux/smpboot.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
@@ -52,6 +54,26 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 #endif
 
+#define IRQ_INTERVAL_STAGE1_WEIGHT_BITS		ilog2(512)
+#define IRQ_INTERVAL_STAGE2_WEIGHT_BITS		ilog2(128)
+
+#define IRQ_INTERVAL_THRESHOLD_UNIT_NS	1000
+
+#define IRQ_INTERVAL_MIN_THRESHOLD_NS	IRQ_INTERVAL_THRESHOLD_UNIT_NS
+#define IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS  4000000000
+
+struct irq_interval {
+	u64                     clock;
+	int			avg;
+	int			std_threshold:31;
+	int			stage:1;
+
+	u64			stage_start_clock;
+	unsigned		stage1_time;
+	unsigned		stage2_time;
+};
+DEFINE_PER_CPU(struct irq_interval, avg_irq_interval);
+
 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
 
 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
@@ -339,6 +361,140 @@ asmlinkage __visible void do_softirq(void)
 	local_irq_restore(flags);
 }
 
+static inline void irq_interval_update_avg(struct irq_interval *inter,
+		u64 now, int weight_bits)
+{
+	inter->avg = inter->avg - ((inter->avg) >> weight_bits) +
+		((now - inter->clock) >> weight_bits);
+	if (unlikely(inter->avg < 0))
+		inter->avg = 0;
+}
+
+/*
+ * Keep the ratio of stage2 time to stage1 time between 1/8 and 1/2. If it
+ * is out of that range, adjust .std_threshold to maintain the ratio.
+ */
+static inline void irq_interval_update_threshold(struct irq_interval *inter)
+{
+	if (inter->stage2_time * 2 > inter->stage1_time)
+		inter->std_threshold -= IRQ_INTERVAL_THRESHOLD_UNIT_NS;
+	if (inter->stage2_time * 8 < inter->stage1_time)
+		inter->std_threshold += IRQ_INTERVAL_THRESHOLD_UNIT_NS;
+
+	if (inter->std_threshold <= 0)
+		inter->std_threshold = IRQ_INTERVAL_THRESHOLD_UNIT_NS;
+
+	if (inter->std_threshold >= 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS)
+		inter->std_threshold = 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS;
+}
+
+/*
+ * If we stay in stage1 for too long with the minimum threshold and a low
+ * enough average interrupt interval, there is a risk of locking up the CPU.
+ */
+static bool irq_interval_cpu_lockup_risk(struct irq_interval *inter, u64 now)
+{
+	if (inter->avg > inter->std_threshold)
+		return false;
+
+	if (inter->std_threshold != IRQ_INTERVAL_MIN_THRESHOLD_NS)
+		return false;
+
+	if (now - inter->stage_start_clock <=
+			IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS)
+		return false;
+	return true;
+}
+
+/*
+ * Update the average interrupt interval with an Exponential Weighted Moving
+ * Average (EWMA), and implement two-stage interrupt flood detection.
+ *
+ * By default the interrupt interval is measured with the scheduler's
+ * runqueue software clock, which is cheap to read. When the interval drops
+ * close to zero, it is reasonable to conclude that the scheduler's activity
+ * on this CPU has stopped because of an interrupt flood. Then switch to
+ * the 2nd stage detection, in which the clock is read from hardware, so the
+ * detection result is more reliable.
+ */
+static void irq_interval_update(void)
+{
+	struct irq_interval *inter = raw_cpu_ptr(&avg_irq_interval);
+	u64 now;
+
+	if (likely(!inter->stage)) {
+		now = sched_local_rq_clock();
+		irq_interval_update_avg(inter, now,
+				IRQ_INTERVAL_STAGE1_WEIGHT_BITS);
+
+		if (unlikely(inter->avg < inter->std_threshold / 2 ||
+				irq_interval_cpu_lockup_risk(inter, now))) {
+			inter->stage = 1;
+			now = sched_clock_cpu(smp_processor_id());
+			inter->stage1_time = now - inter->stage_start_clock;
+			inter->stage_start_clock = now;
+
+			/*
+			 * reset as the mean of the min and the max value of
+			 * stage2's threshold
+			 */
+			inter->avg = inter->std_threshold +
+				(inter->std_threshold >> 2);
+		}
+	} else {
+		now = sched_clock_cpu(smp_processor_id());
+
+		irq_interval_update_avg(inter, now,
+				IRQ_INTERVAL_STAGE2_WEIGHT_BITS);
+
+		if (inter->avg > inter->std_threshold * 2) {
+			inter->stage = 0;
+			inter->stage2_time = now - inter->stage_start_clock;
+			inter->stage_start_clock = now;
+
+			irq_interval_update_threshold(inter);
+		}
+	}
+}
+
+static void irq_interval_update_clock(void)
+{
+	struct irq_interval *inter = raw_cpu_ptr(&avg_irq_interval);
+
+	if (likely(!inter->stage))
+		inter->clock = sched_local_rq_clock();
+	else
+		inter->clock = sched_clock_cpu(smp_processor_id());
+}
+
+u64 irq_get_avg_interval(int cpu)
+{
+	struct irq_interval *inter = per_cpu_ptr(&avg_irq_interval, cpu);
+
+	WARN_ON_ONCE(inter->stage > 1);
+	return (inter->avg << 1) | inter->stage;
+}
+EXPORT_SYMBOL_GPL(irq_get_avg_interval);
+
+bool irq_is_flood(void)
+{
+	struct irq_interval *inter = raw_cpu_ptr(&avg_irq_interval);
+
+	return inter->stage && inter->avg < inter->std_threshold;
+}
+EXPORT_SYMBOL_GPL(irq_is_flood);
+
+static void irq_interval_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct irq_interval *inter = &per_cpu(avg_irq_interval, i);
+
+		inter->std_threshold = IRQ_INTERVAL_THRESHOLD_UNIT_NS * 2;
+	}
+}
+
 /*
  * Enter an interrupt context.
  */
@@ -356,6 +512,7 @@ void irq_enter(void)
 	}
 
 	__irq_enter();
+	irq_interval_update();
 }
 
 static inline void invoke_softirq(void)
@@ -408,6 +565,8 @@ void irq_exit(void)
 	lockdep_assert_irqs_disabled();
 #endif
 	account_irq_exit_time(current);
+	irq_interval_update_clock();
+
 	preempt_count_sub(HARDIRQ_OFFSET);
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
@@ -683,6 +842,8 @@ static __init int spawn_ksoftirqd(void)
 				  takeover_tasklets);
 	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
 
+	irq_interval_init();
+
 	return 0;
 }
 early_initcall(spawn_ksoftirqd);
-- 
2.20.1



Thread overview: 15+ messages
2019-12-18  7:19 [RFC PATCH 0/3] softirq/blk-mq: implement interrupt flood detection for avoiding cpu lockup Ming Lei
2019-12-18  7:19 ` [RFC PATCH 1/3] sched/core: add API for exporting runqueue clock Ming Lei
2019-12-18  9:51   ` Peter Zijlstra
2019-12-19  1:29     ` Ming Lei
2019-12-19  9:20       ` Peter Zijlstra
2019-12-18  7:19 ` Ming Lei [this message]
2019-12-18 10:49   ` [RFC PATCH 2/3] softirq: implement interrupt flood detection Peter Zijlstra
2019-12-18 12:29     ` Peter Zijlstra
2019-12-19  1:59     ` Ming Lei
2019-12-19  9:23       ` Peter Zijlstra
2019-12-19  9:52         ` Ming Lei
2019-12-19 10:43         ` Daniel Wagner
2019-12-31  3:48           ` Ming Lei
2020-01-02 10:28             ` Daniel Wagner
2019-12-18  7:19 ` [RFC PATCH 3/3] blk-mq: complete request in rescuer process context in case of irq flood Ming Lei
