From: Dmitry Safonov <dima@arista.com>
To: linux-kernel@vger.kernel.org
Cc: Dmitry Safonov <dima@arista.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	David Miller <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Frederic Weisbecker <fweisbec@gmail.com>,
	Hannes Frederic Sowa <hannes@stressinduktion.org>,
	Ingo Molnar <mingo@kernel.org>,
	"Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Mauro Carvalho Chehab <mchehab@s-opensource.com>,
	Mike Galbraith <efault@gmx.de>, Paolo Abeni <pabeni@redhat.com>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Radu Rendec <rrendec@arista.com>, Rik van Riel <riel@redhat.com>,
	Stanislaw Gruszka <sgruszka@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Wanpeng Li <wanpeng.li@hotmail.com>
Subject: [RFC 6/6] softirq/sched: Account si cpu time to ksoftirqd(s)
Date: Thu, 18 Jan 2018 16:12:38 +0000
Message-ID: <20180118161238.13792-7-dima@arista.com>
In-Reply-To: <20180118161238.13792-1-dima@arista.com>

Warning: not merge-ready in any sense

Under CONFIG_FAIR_SOFTIRQ_SCHEDULE, each sched tick accounts the cpu
time spent processing softirqs to the ksoftirqd of the softirq's
group, updating ksoftirqd->se.sum_exec_runtime and recalculating
ksoftirqd->se.vruntime accordingly.

Use CFS's vruntime to decide whether pending softirqs should be
served right away or deferred to ksoftirqd; the balance can be tuned
through the ksoftirqd threads' nice values.
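
The resulting serve-or-defer rule, distilled from softirqs_to_serve()
in the diff below (serve_inline() is an illustrative name; the signed
subtraction keeps the comparison correct across u64 wraparound, as
vruntime comparisons in CFS must be):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Serve a group's pending softirqs inline only when current has
	 * accrued at least as much vruntime as that group's ksoftirqd;
	 * otherwise it is fairer to wake the thread and defer to it.
	 */
	static int serve_inline(uint64_t curr_vr, uint64_t ksoftirqd_vr)
	{
		return (int64_t)(curr_vr - ksoftirqd_vr) >= 0;
	}

	int main(void)
	{
		printf("%d\n", serve_inline(2000, 1000));	/* 1: serve */
		printf("%d\n", serve_inline(1000, 2000));	/* 0: defer */
		/* still correct after vruntime wraps around */
		printf("%d\n", serve_inline(5, UINT64_MAX - 5));	/* 1 */
		return 0;
	}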

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 include/linux/interrupt.h |  1 +
 kernel/sched/fair.c       | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h      | 19 +++++++++++++++++++
 kernel/softirq.c          | 45 +++++++++++++++++++++++++++++++++++++--------
 4 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 17e1a04445fa..a0b5c24c088a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -512,6 +512,7 @@ extern struct task_struct *__percpu **ksoftirqd;
 extern unsigned nr_softirq_groups;
 
 extern bool servicing_softirq(unsigned nr);
+extern unsigned group_softirqs(unsigned nr);
 static inline bool current_is_ksoftirqd(void)
 {
 	unsigned i;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..d0105739551f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -813,6 +813,42 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 }
 #endif /* CONFIG_SMP */
 
+static void update_ksoftirqd(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+	int rq_cpu = cpu_of(rq_of(cfs_rq));
+	u64 si_times[NR_SOFTIRQS], delta[NR_SOFTIRQS];
+	unsigned i;
+
+	if (unlikely(!ksoftirqd))
+		return;
+
+	softirq_time_read(rq_cpu, si_times);
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		delta[i] = si_times[i] - cfs_rq->prev_si_time[i];
+		cfs_rq->prev_si_time[i] = si_times[i];
+		if (unlikely((s64)delta[i] < 0))
+			delta[i] = 0;
+	}
+
+	for (i = 0; i < nr_softirq_groups; i++) {
+		unsigned j, softirq = 0, group_mask = group_softirqs(i);
+		struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
+		u64 sum_delta = 0;
+
+		while ((j = ffs(group_mask))) {
+			softirq += j;
+			group_mask >>= j;
+			sum_delta += delta[softirq - 1];
+		}
+
+		tsk->se.sum_exec_runtime += sum_delta;
+		tsk->se.vruntime += calc_delta_fair(sum_delta, &tsk->se);
+	}
+#endif
+}
+
 /*
  * Update the current task's runtime statistics.
  */
@@ -822,6 +858,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	u64 now = rq_clock_task(rq_of(cfs_rq));
 	u64 delta_exec;
 
+	update_ksoftirqd(cfs_rq);
+
 	if (unlikely(!curr))
 		return;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14e154c86dc5..e95d8d4f9146 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -487,6 +487,10 @@ struct cfs_rq {
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+	u64 prev_si_time[NR_SOFTIRQS];
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
 	u64 runtime_expires;
@@ -2081,6 +2085,21 @@ static inline u64 irq_time_read(int cpu)
 }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+static inline void softirq_time_read(int cpu, u64 si_times[NR_SOFTIRQS])
+{
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+	unsigned int seq, i;
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		do {
+			seq = __u64_stats_fetch_begin(&irqtime->sync);
+			si_times[i] = irqtime->total_si[i];
+		} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+	}
+#endif
+}
+
 #ifdef CONFIG_CPU_FREQ
 DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 516e31d3d5b4..a123bafa11c2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -82,6 +82,11 @@ bool servicing_softirq(unsigned nr)
 	return false;
 }
 
+unsigned group_softirqs(unsigned nr)
+{
+	return group_to_softirqs[nr];
+}
+
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
  * but we also don't want to introduce a worst case 1/HZ latency
@@ -112,15 +117,10 @@ static void wakeup_softirqd(u32 softirq_mask)
  * If ksoftirqd is scheduled, we do not want to process pending softirqs
  * right now. Let ksoftirqd handle this at its own rate, to get fairness.
  */
-static bool ksoftirqd_running(void)
+static bool ksoftirqd_running(__u32 pending)
 {
-	/* We rely that there are pending softirqs */
-	__u32 pending = local_softirq_pending();
 	unsigned i;
 
-	if (!ksoftirqd)
-		return false;
-
 	for (i = 0; i < nr_softirq_groups && pending; i++) {
 		/* Interrupts are disabled: no need to stop preemption */
 		struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
@@ -137,6 +137,33 @@ static bool ksoftirqd_running(void)
 	return !pending;
 }
 
+static __u32 softirqs_to_serve(__u32 pending)
+{
+	unsigned i;
+	__u32 unserve = pending;
+
+	if (!ksoftirqd || !current || is_idle_task(current))
+		return pending;
+
+	if (!IS_ENABLED(CONFIG_FAIR_SOFTIRQ_SCHEDULE))
+		return ksoftirqd_running(pending) ? 0 : pending;
+
+	for (i = 0; i < nr_softirq_groups && unserve; i++) {
+		/* Interrupts are disabled: no need to stop preemption */
+		struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
+
+		if (tsk && (s64)(current->se.vruntime - tsk->se.vruntime) < 0) {
+			if (tsk->state != TASK_RUNNING)
+				wake_up_process(tsk);
+			continue;
+		}
+
+		unserve &= ~group_to_softirqs[i];
+	}
+
+	return pending & ~unserve;
+}
+
 /*
  * preempt_count and SOFTIRQ_OFFSET usage:
  * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
@@ -385,7 +412,8 @@ asmlinkage __visible void do_softirq(void)
 
 	local_irq_save(flags);
 
-	if (!ksoftirqd_running())
+	pending = softirqs_to_serve(pending);
+	if (pending)
 		do_softirq_own_stack(pending);
 
 	local_irq_restore(flags);
@@ -414,7 +442,8 @@ static inline void invoke_softirq(void)
 {
 	__u32 pending = local_softirq_pending();
 
-	if (!pending || !ksoftirqd_running())
+	pending = softirqs_to_serve(pending);
+	if (!pending)
 		return;
 
 	if (!force_irqthreads) {
-- 
2.13.6
