From: Peter Zijlstra <peterz@infradead.org>
To: Ming Lei <ming.lei@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>,
	Jens Axboe <axboe@kernel.dk>,
	linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	Long Li <longli@microsoft.com>, Ingo Molnar <mingo@redhat.com>,
	Christoph Hellwig <hch@lst.de>,
	Keith Busch <keith.busch@intel.com>,
	Sagi Grimberg <sagi@grimberg.me>,
	John Garry <john.garry@huawei.com>,
	Hannes Reinecke <hare@suse.com>
Subject: Re: [RFC PATCH 2/3] softirq: implement interrupt flood detection
Date: Wed, 18 Dec 2019 11:49:41 +0100
Message-ID: <20191218104941.GR2844@hirez.programming.kicks-ass.net>
In-Reply-To: <20191218071942.22336-3-ming.lei@redhat.com>

On Wed, Dec 18, 2019 at 03:19:41PM +0800, Ming Lei wrote:

> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 0427a86743a4..f6e434ac4183 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -25,6 +25,8 @@
>  #include <linux/smpboot.h>
>  #include <linux/tick.h>
>  #include <linux/irq.h>
> +#include <linux/sched.h>
> +#include <linux/sched/clock.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/irq.h>
> @@ -52,6 +54,26 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
>  EXPORT_PER_CPU_SYMBOL(irq_stat);
>  #endif
>  
> +#define IRQ_INTERVAL_STAGE1_WEIGHT_BITS		ilog2(512)
> +#define IRQ_INTERVAL_STAGE2_WEIGHT_BITS		ilog2(128)

That must be the most difficult way of writing 9 and 7 respectively.
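
IOW, just:

	#define IRQ_INTERVAL_STAGE1_WEIGHT_BITS		9	/* 1/512 EWMA weight */
	#define IRQ_INTERVAL_STAGE2_WEIGHT_BITS		7	/* 1/128 EWMA weight */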

> +#define IRQ_INTERVAL_THRESHOLD_UNIT_NS	1000
> +
> +#define IRQ_INTERVAL_MIN_THRESHOLD_NS	IRQ_INTERVAL_THRESHOLD_UNIT_NS
> +#define IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS  4000000000

(seriously a name with MAX_MIN in it ?!?)

That's unreadable; we have (4 * NSEC_PER_SEC) for that (if I counted the
0s correctly).
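
Something like (NSEC_PER_SEC comes from <linux/time64.h>):

	#define IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS	(4 * NSEC_PER_SEC)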

These are all a bunch of magic values; is there any justification for
them? Will they always work?

> +
> +struct irq_interval {
> +	u64                     clock;
> +	int			avg;
> +	int			std_threshold:31;

I know of a standard deviation, but what is a standard threshold?

> +	int			stage:1;

A signed single-bit field.. there are people who object to that; they
figure a field holding just a sign bit isn't very useful.
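
If it really must be a bitfield, at least make it unsigned:

	unsigned int		stage:1;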

> +
> +	u64			stage_start_clock;
> +	unsigned		stage1_time;
> +	unsigned		stage2_time;
> +};
> +DEFINE_PER_CPU(struct irq_interval, avg_irq_interval);
> +
>  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
>  
>  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
> @@ -339,6 +361,140 @@ asmlinkage __visible void do_softirq(void)
>  	local_irq_restore(flags);
>  }
>  
> +static inline void irq_interval_update_avg(struct irq_interval *inter,
> +		u64 now, int weight_bits)
> +{
> +	inter->avg = inter->avg - ((inter->avg) >> weight_bits) +
> +		((now - inter->clock) >> weight_bits);

Did you perhaps want to write something like:

	s64 sample = now - inter->clock;

	inter->avg += (sample - inter->avg) >> weight_bits;

Which is a recognisable form.

It also shows the obvious overflow when sample is large (interrupts
didn't happen for a while). You'll want to clamp @sample to some max.
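
Something like this, say (MAX_SAMPLE_NS being whatever bound you can
justify; the name is made up here):

	s64 sample = now - inter->clock;

	sample = min_t(s64, sample, MAX_SAMPLE_NS);
	inter->avg += (sample - inter->avg) >> weight_bits;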

> +	if (unlikely(inter->avg < 0))
> +		inter->avg = 0;

And since inter->avg must never be <0, wth are you using a signed
bitfield? This generates shit code. Use an on-stack temporary if
anything:

	int avg = inter->avg;

	avg += (sample - avg) >> bits;
	if (avg < 0)
		avg = 0;

	inter->avg = avg;

and presto! no signed bitfields required.

> +}
> +
> +/*
> + * Keep the ratio of stage2 time to stage1 time between 1/2 and 1/8. If
> + * it is out of the range, adjust .std_threshold for maintaining the ratio.

it is either @std_threshold or @irq_interval::std_threshold

> + */
> +static inline void irq_interval_update_threshold(struct irq_interval *inter)
> +{
> +	if (inter->stage2_time * 2 > inter->stage1_time)
> +		inter->std_threshold -= IRQ_INTERVAL_THRESHOLD_UNIT_NS;
> +	if (inter->stage2_time * 8 < inter->stage1_time)
> +		inter->std_threshold += IRQ_INTERVAL_THRESHOLD_UNIT_NS;

I suppose that will eventually converge.

> +	if (inter->std_threshold <= 0)
> +		inter->std_threshold = IRQ_INTERVAL_THRESHOLD_UNIT_NS;

I think you'll find you actually meant to write:

	if (inter->std_threshold < IRQ_INTERVAL_THRESHOLD_UNIT_NS)


> +	if (inter->std_threshold >= 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS)
> +		inter->std_threshold = 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS;

We actually have a macro for this:

	inter->std_threshold = clamp(inter->std_threshold,
				     IRQ_INTERVAL_THRESHOLD_UNIT_NS,
				     32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS);

> +}
> +
> +/*
> + * If we stay at stage1 for too long with minimized threshold and low enough
> + * interrupt average interval, there may have risk to lock up CPU.

It's not locked up...

> + */
> +static bool irq_interval_cpu_lockup_risk(struct irq_interval *inter, u64 now)
> +{
> +	if (inter->avg > inter->std_threshold)
> +		return false;
> +
> +	if (inter->std_threshold != IRQ_INTERVAL_MIN_THRESHOLD_NS)
> +		return false;
> +
> +	if (now - inter->stage_start_clock <=
> +			IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS)
> +		return false;
> +	return true;
> +}
> +
> +/*
> + * Update average interrupt interval with the Exponential Weighted Moving
> + * Average(EWMA), and implement two-stage interrupt flood detection.
> + *
> + * Use scheduler's runqueue software clock at default for figuring
> + * interrupt interval for saving cost. When the interval becomes zero,
> + * it is reasonable to conclude scheduler's activity on this CPU has been
> + * stopped because of interrupt flood. Then switch to the 2nd stage
> + * detection in which clock is read from hardware, and the detection
> + * result can be more reliable.
> + */
> +static void irq_interval_update(void)
> +{
> +	struct irq_interval *inter = raw_cpu_ptr(&avg_irq_interval);

raw_cpu_ptr() is wrong; this wants to be this_cpu_ptr(), which (with
CONFIG_DEBUG_PREEMPT) also validates that the usage context is sane.

> +	u64 now;
> +
> +	if (likely(!inter->stage)) {
> +		now = sched_local_rq_clock();
> +		irq_interval_update_avg(inter, now,
> +				IRQ_INTERVAL_STAGE1_WEIGHT_BITS);
> +
> +		if (unlikely(inter->avg < inter->std_threshold / 2 ||
> +				irq_interval_cpu_lockup_risk(inter, now))) {
> +			inter->stage = 1;
> +			now = sched_clock_cpu(smp_processor_id());
> +			inter->stage1_time = now - inter->stage_start_clock;
> +			inter->stage_start_clock = now;
> +
> +			/*
> +			 * reset as the mean of the min and the max value of
> +			 * stage2's threshold
> +			 */
> +			inter->avg = inter->std_threshold +
> +				(inter->std_threshold >> 2);
> +		}
> +	} else {
> +		now = sched_clock_cpu(smp_processor_id());
> +
> +		irq_interval_update_avg(inter, now,
> +				IRQ_INTERVAL_STAGE2_WEIGHT_BITS);
> +
> +		if (inter->avg > inter->std_threshold * 2) {
> +			inter->stage = 0;
> +			inter->stage2_time = now - inter->stage_start_clock;
> +			inter->stage_start_clock = now;
> +
> +			irq_interval_update_threshold(inter);
> +		}
> +	}
> +}

AFAICT the only reason for much of this complexity is so that you can
use this sched_local_rq_clock() thing, right? Once that reaches a
threshold, you go use the more accurate sched_clock_cpu() and once that
tickles the threshold you call it golden and raise hell.

So pray tell, why did you not integrate this with IRQ_TIME_ACCOUNTING ?
That already takes a timestamp and does most of what you need.

> @@ -356,6 +512,7 @@ void irq_enter(void)
>  	}
>  
>  	__irq_enter();
> +	irq_interval_update();
>  }

Arggh.. you're going to make every single interrupt take at least 2
extra cache misses for this gunk?!?

And it lumps all interrupts together in a single heap, and doesn't do
any of the otherwise useful things we've been wanting IRQ timings for :/


_If_ you want to do something like this, do it like the below. That only
adds a few instruction to irq_exit() and only touches a cacheline that's
already touched.

It computes both the avg duration and the avg inter-arrival-time of
hardirqs. Things get critical when:

	inter-arrival-avg < 2*duration-avg

or something like that.
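
Roughly, as a sketch against the fields added below (the helper name is
made up):

	static bool irq_flood_detected(void)
	{
		struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);

		return irqtime->irq_inter_arrival_avg <
		       2 * irqtime->irq_duration_avg;
	}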

---
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d43318a489f2..6f5ef70b5a1d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -50,7 +50,7 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
 void irqtime_account_irq(struct task_struct *curr)
 {
 	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
-	s64 delta;
+	s64 delta, iat;
 	int cpu;
 
 	if (!sched_clock_irqtime)
@@ -58,7 +58,6 @@ void irqtime_account_irq(struct task_struct *curr)
 
 	cpu = smp_processor_id();
 	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
-	irqtime->irq_start_time += delta;
 
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
@@ -66,10 +65,21 @@ void irqtime_account_irq(struct task_struct *curr)
 	 * in that case, so as not to confuse scheduler with a special task
 	 * that do not consume any time, but still wants to run.
 	 */
-	if (hardirq_count())
+	if (hardirq_count()) {
 		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
-	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+
+		/* this is irq_exit(), delta is the duration of the hardirq */
+		irqtime->irq_duration_avg += (delta - irqtime->irq_duration_avg) >> 7;
+
+		/* compute the inter arrival time using the previous arrival_time */
+		iat = irqtime->irq_start_time - irqtime->irq_arrival_time;
+		irqtime->irq_arrival_time += iat;
+		irqtime->irq_inter_arrival_avg += (iat - irqtime->irq_inter_arrival_avg) >> 7;
+
+	} else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+
+	irqtime->irq_start_time += delta;
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 280a3c735935..cab07e5a6c11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2236,6 +2236,9 @@ struct irqtime {
 	u64			total;
 	u64			tick_delta;
 	u64			irq_start_time;
+	u64			irq_duration_avg;
+	u64			irq_arrival_time;
+	u64			irq_inter_arrival_avg;
 	struct u64_stats_sync	sync;
 };
 
