* [RFC PATCH 0/5] isolation: 1Hz residual tick offloading
@ 2017-12-19  3:23 Frederic Weisbecker
  2017-12-19  3:23 ` [PATCH 1/5] sched: Move tick code to a separate file Frederic Weisbecker
                   ` (4 more replies)
  0 siblings, 5 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19  3:23 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

Finally! I have been meaning to do this for years but kept knocking
against prerequisites, mostly about making scheduler_tick() resilient
against the absence of ticks. Now it seems that current->sched_class->task_tick()
is the last piece of it.

This patchset adds a flag to the isolcpus boot option to offload the
residual 1Hz tick.

For quick testing, say on CPUs 1-7:

	"isolcpus=nohz_offload,domain,1-7"

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
	timers/0z

HEAD: a9366794579a4ff71ec3546b9983536a669fbfb9

Thanks,
	Frederic
---

Frederic Weisbecker (5):
      sched: Move tick code to a separate file
      sched: Rename init_rq_hrtick to hrtick_rq_init
      sched/isolation: Add scheduler tick offloading interface
      sched/isolation: Residual 1Hz scheduler tick offload
      sched/isolation: Document "nohz_offload" flag


 Documentation/admin-guide/kernel-parameters.txt |   7 +-
 include/linux/sched/isolation.h                 |   3 +-
 kernel/sched/Makefile                           |   2 +-
 kernel/sched/core.c                             | 186 +-----------------
 kernel/sched/isolation.c                        |  10 +
 kernel/sched/sched.h                            |  13 +-
 kernel/sched/tick.c                             | 250 ++++++++++++++++++++++++
 7 files changed, 284 insertions(+), 187 deletions(-)


* [PATCH 1/5] sched: Move tick code to a separate file
  2017-12-19  3:23 [RFC PATCH 0/5] isolation: 1Hz residual tick offloading Frederic Weisbecker
@ 2017-12-19  3:23 ` Frederic Weisbecker
  2017-12-19  9:08   ` Peter Zijlstra
  2017-12-19  3:23 ` [PATCH 2/5] sched: Rename init_rq_hrtick to hrtick_rq_init Frederic Weisbecker
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19  3:23 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

Let's debloat core.c some more. Also, we are going to expand the tick
code even further to introduce scheduler tick offloading.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/Makefile |   2 +-
 kernel/sched/core.c   | 182 --------------------------------------------------
 kernel/sched/sched.h  |   7 +-
 kernel/sched/tick.c   | 177 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 184 insertions(+), 184 deletions(-)
 create mode 100644 kernel/sched/tick.c

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4f..dd0b01e 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,7 +16,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o loadavg.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o tick.o
 obj-y += idle_task.o fair.o rt.o deadline.o
 obj-y += wait.o wait_bit.o swait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 644fa2e..06af4fa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -233,129 +233,6 @@ void update_rq_clock(struct rq *rq)
 	update_rq_clock_task(rq, delta);
 }
 
-
-#ifdef CONFIG_SCHED_HRTICK
-/*
- * Use HR-timers to deliver accurate preemption points.
- */
-
-static void hrtick_clear(struct rq *rq)
-{
-	if (hrtimer_active(&rq->hrtick_timer))
-		hrtimer_cancel(&rq->hrtick_timer);
-}
-
-/*
- * High-resolution timer tick.
- * Runs from hardirq context with interrupts disabled.
- */
-static enum hrtimer_restart hrtick(struct hrtimer *timer)
-{
-	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
-	struct rq_flags rf;
-
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-	rq_lock(rq, &rf);
-	update_rq_clock(rq);
-	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
-	rq_unlock(rq, &rf);
-
-	return HRTIMER_NORESTART;
-}
-
-#ifdef CONFIG_SMP
-
-static void __hrtick_restart(struct rq *rq)
-{
-	struct hrtimer *timer = &rq->hrtick_timer;
-
-	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
-}
-
-/*
- * called from hardirq (IPI) context
- */
-static void __hrtick_start(void *arg)
-{
-	struct rq *rq = arg;
-	struct rq_flags rf;
-
-	rq_lock(rq, &rf);
-	__hrtick_restart(rq);
-	rq->hrtick_csd_pending = 0;
-	rq_unlock(rq, &rf);
-}
-
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-void hrtick_start(struct rq *rq, u64 delay)
-{
-	struct hrtimer *timer = &rq->hrtick_timer;
-	ktime_t time;
-	s64 delta;
-
-	/*
-	 * Don't schedule slices shorter than 10000ns, that just
-	 * doesn't make sense and can cause timer DoS.
-	 */
-	delta = max_t(s64, delay, 10000LL);
-	time = ktime_add_ns(timer->base->get_time(), delta);
-
-	hrtimer_set_expires(timer, time);
-
-	if (rq == this_rq()) {
-		__hrtick_restart(rq);
-	} else if (!rq->hrtick_csd_pending) {
-		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
-		rq->hrtick_csd_pending = 1;
-	}
-}
-
-#else
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-void hrtick_start(struct rq *rq, u64 delay)
-{
-	/*
-	 * Don't schedule slices shorter than 10000ns, that just
-	 * doesn't make sense. Rely on vruntime for fairness.
-	 */
-	delay = max_t(u64, delay, 10000LL);
-	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
-		      HRTIMER_MODE_REL_PINNED);
-}
-#endif /* CONFIG_SMP */
-
-static void init_rq_hrtick(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-	rq->hrtick_csd_pending = 0;
-
-	rq->hrtick_csd.flags = 0;
-	rq->hrtick_csd.func = __hrtick_start;
-	rq->hrtick_csd.info = rq;
-#endif
-
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	rq->hrtick_timer.function = hrtick;
-}
-#else	/* CONFIG_SCHED_HRTICK */
-static inline void hrtick_clear(struct rq *rq)
-{
-}
-
-static inline void init_rq_hrtick(struct rq *rq)
-{
-}
-#endif	/* CONFIG_SCHED_HRTICK */
-
 /*
  * cmpxchg based fetch_or, macro so it works for different integer types
  */
@@ -3005,65 +2882,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
- */
-void scheduler_tick(void)
-{
-	int cpu = smp_processor_id();
-	struct rq *rq = cpu_rq(cpu);
-	struct task_struct *curr = rq->curr;
-	struct rq_flags rf;
-
-	sched_clock_tick();
-
-	rq_lock(rq, &rf);
-
-	update_rq_clock(rq);
-	curr->sched_class->task_tick(rq, curr, 0);
-	cpu_load_update_active(rq);
-	calc_global_load_tick(rq);
-
-	rq_unlock(rq, &rf);
-
-	perf_event_task_tick();
-
-#ifdef CONFIG_SMP
-	rq->idle_balance = idle_cpu(cpu);
-	trigger_load_balance(rq);
-#endif
-	rq_last_tick_reset(rq);
-}
-
-#ifdef CONFIG_NO_HZ_FULL
-/**
- * scheduler_tick_max_deferment
- *
- * Keep at least one tick per second when a single
- * active task is running because the scheduler doesn't
- * yet completely support full dynticks environment.
- *
- * This makes sure that uptime, CFS vruntime, load
- * balancing, etc... continue to move forward, even
- * with a very low granularity.
- *
- * Return: Maximum deferment in nanoseconds.
- */
-u64 scheduler_tick_max_deferment(void)
-{
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
-
-	next = rq->last_sched_tick + HZ;
-
-	if (time_before_eq(next, now))
-		return 0;
-
-	return jiffies_to_nsecs(next - now);
-}
-#endif
-
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..43f065e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1676,7 +1676,9 @@ static inline int hrtick_enabled(struct rq *rq)
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
 
-void hrtick_start(struct rq *rq, u64 delay);
+extern void hrtick_start(struct rq *rq, u64 delay);
+extern void hrtick_clear(struct rq *rq);
+extern void init_rq_hrtick(struct rq *rq);
 
 #else
 
@@ -1685,6 +1687,9 @@ static inline int hrtick_enabled(struct rq *rq)
 	return 0;
 }
 
+static inline void hrtick_clear(struct rq *rq) { }
+static inline void init_rq_hrtick(struct rq *rq) { }
+
 #endif /* CONFIG_SCHED_HRTICK */
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/tick.c b/kernel/sched/tick.c
new file mode 100644
index 0000000..bcc6d7d
--- /dev/null
+++ b/kernel/sched/tick.c
@@ -0,0 +1,177 @@
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/perf_event.h>
+#include "sched.h"
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(void)
+{
+	int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *curr = rq->curr;
+	struct rq_flags rf;
+
+	sched_clock_tick();
+
+	rq_lock(rq, &rf);
+
+	update_rq_clock(rq);
+	curr->sched_class->task_tick(rq, curr, 0);
+	cpu_load_update_active(rq);
+	calc_global_load_tick(rq);
+
+	rq_unlock(rq, &rf);
+
+	perf_event_task_tick();
+
+#ifdef CONFIG_SMP
+	rq->idle_balance = idle_cpu(cpu);
+	trigger_load_balance(rq);
+#endif
+	rq_last_tick_reset(rq);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+	struct rq *rq = this_rq();
+	unsigned long next, now = READ_ONCE(jiffies);
+
+	next = rq->last_sched_tick + HZ;
+
+	if (time_before_eq(next, now))
+		return 0;
+
+	return jiffies_to_nsecs(next - now);
+}
+#endif
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ */
+
+void hrtick_clear(struct rq *rq)
+{
+	if (hrtimer_active(&rq->hrtick_timer))
+		hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+	struct rq_flags rf;
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	rq_lock(rq, &rf);
+	update_rq_clock(rq);
+	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+	rq_unlock(rq, &rf);
+
+	return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_SMP
+
+static void __hrtick_restart(struct rq *rq)
+{
+	struct hrtimer *timer = &rq->hrtick_timer;
+
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+}
+
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
+{
+	struct rq *rq = arg;
+	struct rq_flags rf;
+
+	rq_lock(rq, &rf);
+	__hrtick_restart(rq);
+	rq->hrtick_csd_pending = 0;
+	rq_unlock(rq, &rf);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+	struct hrtimer *timer = &rq->hrtick_timer;
+	ktime_t time;
+	s64 delta;
+
+	/*
+	 * Don't schedule slices shorter than 10000ns, that just
+	 * doesn't make sense and can cause timer DoS.
+	 */
+	delta = max_t(s64, delay, 10000LL);
+	time = ktime_add_ns(timer->base->get_time(), delta);
+
+	hrtimer_set_expires(timer, time);
+
+	if (rq == this_rq()) {
+		__hrtick_restart(rq);
+	} else if (!rq->hrtick_csd_pending) {
+		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
+		rq->hrtick_csd_pending = 1;
+	}
+}
+
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+	/*
+	 * Don't schedule slices shorter than 10000ns, that just
+	 * doesn't make sense. Rely on vruntime for fairness.
+	 */
+	delay = max_t(u64, delay, 10000LL);
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+		      HRTIMER_MODE_REL_PINNED);
+}
+#endif /* CONFIG_SMP */
+
+void init_rq_hrtick(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	rq->hrtick_csd_pending = 0;
+
+	rq->hrtick_csd.flags = 0;
+	rq->hrtick_csd.func = __hrtick_start;
+	rq->hrtick_csd.info = rq;
+#endif
+
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+}
+#endif	/* CONFIG_SCHED_HRTICK */
-- 
2.7.4


* [PATCH 2/5] sched: Rename init_rq_hrtick to hrtick_rq_init
  2017-12-19  3:23 [RFC PATCH 0/5] isolation: 1Hz residual tick offloading Frederic Weisbecker
  2017-12-19  3:23 ` [PATCH 1/5] sched: Move tick code to a separate file Frederic Weisbecker
@ 2017-12-19  3:23 ` Frederic Weisbecker
  2017-12-19  3:23 ` [PATCH 3/5] sched/isolation: Add scheduler tick offloading interface Frederic Weisbecker
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19  3:23 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

Do that rename in order to normalize the hrtick namespace.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 2 +-
 kernel/sched/sched.h | 4 ++--
 kernel/sched/tick.c  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 06af4fa..b6f74c8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5773,7 +5773,7 @@ void __init sched_init(void)
 		rq->last_sched_tick = 0;
 #endif
 #endif /* CONFIG_SMP */
-		init_rq_hrtick(rq);
+		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43f065e..16eef0c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1678,7 +1678,7 @@ static inline int hrtick_enabled(struct rq *rq)
 
 extern void hrtick_start(struct rq *rq, u64 delay);
 extern void hrtick_clear(struct rq *rq);
-extern void init_rq_hrtick(struct rq *rq);
+extern void hrtick_rq_init(struct rq *rq);
 
 #else
 
@@ -1688,7 +1688,7 @@ static inline int hrtick_enabled(struct rq *rq)
 }
 
 static inline void hrtick_clear(struct rq *rq) { }
-static inline void init_rq_hrtick(struct rq *rq) { }
+static inline void hrtick_rq_init(struct rq *rq) { }
 
 #endif /* CONFIG_SCHED_HRTICK */
 
diff --git a/kernel/sched/tick.c b/kernel/sched/tick.c
index bcc6d7d..5eabfe3 100644
--- a/kernel/sched/tick.c
+++ b/kernel/sched/tick.c
@@ -161,7 +161,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 }
 #endif /* CONFIG_SMP */
 
-void init_rq_hrtick(struct rq *rq)
+void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
 	rq->hrtick_csd_pending = 0;
-- 
2.7.4


* [PATCH 3/5] sched/isolation: Add scheduler tick offloading interface
  2017-12-19  3:23 [RFC PATCH 0/5] isolation: 1Hz residual tick offloading Frederic Weisbecker
  2017-12-19  3:23 ` [PATCH 1/5] sched: Move tick code to a separate file Frederic Weisbecker
  2017-12-19  3:23 ` [PATCH 2/5] sched: Rename init_rq_hrtick to hrtick_rq_init Frederic Weisbecker
@ 2017-12-19  3:23 ` Frederic Weisbecker
  2017-12-19  3:23 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
  2017-12-19  3:23 ` [PATCH 5/5] sched/isolation: Document "nohz_offload" flag Frederic Weisbecker
  4 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19  3:23 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

Add the boot option that will allow us to offload the 1Hz scheduler tick
to the housekeeping CPU.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 3 ++-
 kernel/sched/isolation.c        | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d849431..c831855 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -11,7 +11,8 @@ enum hk_flags {
 	HK_FLAG_MISC		= (1 << 2),
 	HK_FLAG_SCHED		= (1 << 3),
 	HK_FLAG_TICK		= (1 << 4),
-	HK_FLAG_DOMAIN		= (1 << 5),
+	HK_FLAG_TICK_SCHED	= (1 << 5),
+	HK_FLAG_DOMAIN		= (1 << 6),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436..264ddcd 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -136,6 +136,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
 			continue;
 		}
 
+		if (!strncmp(str, "nohz_offload,", 13)) {
+			str += 13;
+			flags |= HK_FLAG_TICK | HK_FLAG_TICK_SCHED;
+			continue;
+		}
+
 		if (!strncmp(str, "domain,", 7)) {
 			str += 7;
 			flags |= HK_FLAG_DOMAIN;
-- 
2.7.4


* [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19  3:23 [RFC PATCH 0/5] isolation: 1Hz residual tick offloading Frederic Weisbecker
                   ` (2 preceding siblings ...)
  2017-12-19  3:23 ` [PATCH 3/5] sched/isolation: Add scheduler tick offloading interface Frederic Weisbecker
@ 2017-12-19  3:23 ` Frederic Weisbecker
  2017-12-19  9:19   ` Peter Zijlstra
  2017-12-19 16:03   ` Christopher Lameter
  2017-12-19  3:23 ` [PATCH 5/5] sched/isolation: Document "nohz_offload" flag Frederic Weisbecker
  4 siblings, 2 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19  3:23 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
keep the scheduler stats alive. However this residual tick is a burden
for Real-Time tasks that can't stand any interruption at all.

Adding the boot parameter "isolcpus=nohz_offload" will now outsource
these scheduler ticks to the global workqueue so that a housekeeping CPU
handles that tick remotely.

Note it's still up to the user to affine the global workqueues to the
housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
domains isolation.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      |  2 ++
 kernel/sched/isolation.c |  4 +++
 kernel/sched/sched.h     |  6 ++++
 kernel/sched/tick.c      | 79 ++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b6f74c8..f50ba18 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5531,6 +5531,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
+	sched_tick_start(cpu);
 	return 0;
 }
 
@@ -5542,6 +5543,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
+	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 264ddcd..c5e7e90a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/ctype.h>
+#include "sched.h"
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +61,9 @@ void __init housekeeping_init(void)
 
 	static_branch_enable(&housekeeping_overriden);
 
+	if (housekeeping_flags & HK_FLAG_TICK_SCHED)
+		sched_tick_offload_init();
+
 	/* We need at least one CPU to handle housekeeping work */
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 16eef0c..57821c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1587,6 +1587,9 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
+extern void sched_tick_start(int cpu);
+extern void sched_tick_stop(int cpu);
+extern int __init sched_tick_offload_init(void);
 
 /*
  * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1611,6 +1614,9 @@ static inline void sched_update_tick_dependency(struct rq *rq)
 		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
 }
 #else
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
+static inline int sched_tick_offload_init(void) { return 0; }
 static inline void sched_update_tick_dependency(struct rq *rq) { }
 #endif
 
diff --git a/kernel/sched/tick.c b/kernel/sched/tick.c
index 5eabfe3..fc31f9e 100644
--- a/kernel/sched/tick.c
+++ b/kernel/sched/tick.c
@@ -1,5 +1,6 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/isolation.h>
 #include <linux/perf_event.h>
 #include "sched.h"
 
@@ -50,9 +51,14 @@ void scheduler_tick(void)
  */
 u64 scheduler_tick_max_deferment(void)
 {
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
+	struct rq *rq;
+	unsigned long next, now;
 
+	if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
+		return ktime_to_ns(KTIME_MAX);
+
+	rq = this_rq();
+	now = READ_ONCE(jiffies);
 	next = rq->last_sched_tick + HZ;
 
 	if (time_before_eq(next, now))
@@ -60,7 +66,74 @@ u64 scheduler_tick_max_deferment(void)
 
 	return jiffies_to_nsecs(next - now);
 }
-#endif
+
+struct tick_work {
+	int			cpu;
+	struct delayed_work	work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct tick_work *twork = container_of(dwork, struct tick_work, work);
+	struct rq *rq = cpu_rq(twork->cpu);
+	struct rq_flags rf;
+
+	rq_lock_irq(rq, &rf);
+	update_rq_clock(rq);
+	rq->curr->sched_class->task_tick(rq, rq->curr, 0);
+	rq_unlock_irq(rq, &rf);
+
+	queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+void sched_tick_start(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	twork->cpu = cpu;
+	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+
+	return;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void sched_tick_stop(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	cancel_delayed_work_sync(&twork->work);
+
+	return;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+	tick_work_cpu = alloc_percpu(struct tick_work);
+	if (!tick_work_cpu) {
+		pr_err("Can't allocate remote tick struct\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_NO_HZ_FULL */
 
 #ifdef CONFIG_SCHED_HRTICK
 /*
-- 
2.7.4


* [PATCH 5/5] sched/isolation: Document "nohz_offload" flag
  2017-12-19  3:23 [RFC PATCH 0/5] isolation: 1Hz residual tick offloading Frederic Weisbecker
                   ` (3 preceding siblings ...)
  2017-12-19  3:23 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
@ 2017-12-19  3:23 ` Frederic Weisbecker
  4 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19  3:23 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

Document the interface to offload the 1Hz scheduler tick in full
dynticks mode. Also improve the comment about the existing "nohz" flag
in order to differentiate its behaviour.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1683107..fcc5fd9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1745,7 +1745,12 @@
 			specified in the flag list (default: domain):
 
 			nohz
-			  Disable the tick when a single task runs.
+			  Disable the tick when a single task runs. A residual 1Hz
+			  tick remains to maintain scheduler stats alive.
+			nohz_offload
+			  Like nohz but the residual 1Hz tick is offloaded to
+			  housekeeping CPUs, leaving the CPU free of any tick if
+			  nothing else requests it.
 			domain
 			  Isolate from the general SMP balancing and scheduling
 			  algorithms. Note that performing domain isolation this way
-- 
2.7.4


* Re: [PATCH 1/5] sched: Move tick code to a separate file
  2017-12-19  3:23 ` [PATCH 1/5] sched: Move tick code to a separate file Frederic Weisbecker
@ 2017-12-19  9:08   ` Peter Zijlstra
  2017-12-19 16:33     ` Frederic Weisbecker
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2017-12-19  9:08 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Chris Metcalf, Thomas Gleixner, Luiz Capitulino,
	Christoph Lameter, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, Dec 19, 2017 at 04:23:54AM +0100, Frederic Weisbecker wrote:
> Let's debloat core.c some more. Also, we are going to expand the tick
> code even further to introduce scheduler tick offloading.

Not a fan, this is a pretty artificial split and just makes me curse
more for not finding code.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19  3:23 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
@ 2017-12-19  9:19   ` Peter Zijlstra
  2017-12-19 14:34     ` Luiz Capitulino
                       ` (2 more replies)
  2017-12-19 16:03   ` Christopher Lameter
  1 sibling, 3 replies; 25+ messages in thread
From: Peter Zijlstra @ 2017-12-19  9:19 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Chris Metcalf, Thomas Gleixner, Luiz Capitulino,
	Christoph Lameter, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, Dec 19, 2017 at 04:23:57AM +0100, Frederic Weisbecker wrote:
> When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
> keep the scheduler stats alive. However this residual tick is a burden
> for Real-Time tasks that can't stand any interruption at all.

I'm not sure that is accurate. RT doesn't necessarily have anything much
to do with this. The tick is by definition very deterministic and thus
should not be a problem.

> Adding the boot parameter "isolcpus=nohz_offload" will now outsource
> these scheduler ticks to the global workqueue so that a housekeeping CPU
> handles that tick remotely.

The global workqueue sounds horrific; surely you want at least one such
housekeeping CPU per node or something ?

> Note it's still up to the user to affine the global workqueues to the
> housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
> domains isolation.

Not sure I understand what this means... from what I can tell you're
using an unbound workqueue, there's no way to split the ticks up to node
local CPUs.

> +static void sched_tick_remote(struct work_struct *work)
> +{
> +	struct delayed_work *dwork = to_delayed_work(work);
> +	struct tick_work *twork = container_of(dwork, struct tick_work, work);
> +	struct rq *rq = cpu_rq(twork->cpu);
> +	struct rq_flags rf;
> +
> +	rq_lock_irq(rq, &rf);
> +	update_rq_clock(rq);
> +	rq->curr->sched_class->task_tick(rq, rq->curr, 0);
> +	rq_unlock_irq(rq, &rf);
> +
> +	queue_delayed_work(system_unbound_wq, dwork, HZ);
> +}
> +
> +void sched_tick_start(int cpu)
> +{
> +	struct tick_work *twork;
> +
> +	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
> +		return;
> +
> +	WARN_ON_ONCE(!tick_work_cpu);
> +
> +	twork = per_cpu_ptr(tick_work_cpu, cpu);
> +	twork->cpu = cpu;
> +	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
> +	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
> +
> +	return;
> +}
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +void sched_tick_stop(int cpu)
> +{
> +	struct tick_work *twork;
> +
> +	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
> +		return;
> +
> +	WARN_ON_ONCE(!tick_work_cpu);
> +
> +	twork = per_cpu_ptr(tick_work_cpu, cpu);
> +	cancel_delayed_work_sync(&twork->work);
> +
> +	return;
> +}
> +#endif /* CONFIG_HOTPLUG_CPU */

This seems daft in that you _always_ run this remote tick, even when the
CPU in question is not in nohz (full) mode.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19  9:19   ` Peter Zijlstra
@ 2017-12-19 14:34     ` Luiz Capitulino
  2017-12-19 16:01     ` Christopher Lameter
  2017-12-19 16:26     ` Frederic Weisbecker
  2 siblings, 0 replies; 25+ messages in thread
From: Luiz Capitulino @ 2017-12-19 14:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, LKML, Chris Metcalf, Thomas Gleixner,
	Christoph Lameter, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, 19 Dec 2017 10:19:11 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Dec 19, 2017 at 04:23:57AM +0100, Frederic Weisbecker wrote:
> > When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
> > keep the scheduler stats alive. However this residual tick is a burden
> > for Real-Time tasks that can't stand any interruption at all.
> 
> I'm not sure that is accurate. RT doesn't necessarily have anything much
> to do with this. The tick is by definition very deterministic and thus
> should not be a problem.

What Frederic says is certainly true for HPC. But even for RT I'd say
this is a very nice feature, since it may reduce maximum latency.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19  9:19   ` Peter Zijlstra
  2017-12-19 14:34     ` Luiz Capitulino
@ 2017-12-19 16:01     ` Christopher Lameter
  2017-12-19 16:04       ` Peter Zijlstra
  2017-12-19 16:26     ` Frederic Weisbecker
  2 siblings, 1 reply; 25+ messages in thread
From: Christopher Lameter @ 2017-12-19 16:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, LKML, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, 19 Dec 2017, Peter Zijlstra wrote:

> On Tue, Dec 19, 2017 at 04:23:57AM +0100, Frederic Weisbecker wrote:
> > When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
> > keep the scheduler stats alive. However this residual tick is a burden
> > for Real-Time tasks that can't stand any interruption at all.
>
> I'm not sure that is accurate. RT doesn't necessarily have anything much
> to do with this. The tick is by definition very deterministic and thus
> should not be a problem.

Depends what one means by RT. Certainly if you want bare metal performance
then the tick is a problem. There are numerous loads sensitive to being
denied the CPU by the scheduler tick for a few microseconds. MPI and software
that rendezvous frequently in HPC are one example. Here at my workplace we
regularly monitor these effects and would like a system that runs as clean
as possible.

> > Adding the boot parameter "isolcpus=nohz_offload" will now outsource
> > these scheduler ticks to the global workqueue so that a housekeeping CPU
> > handles that tick remotely.
>
> The global workqueue sounds horrific; surely you want at least one such
> housekeeping CPU per node or something ?

Well, that determines how many processors you can free from OS noise.
Having one workqueue per node sacrifices one more core to the operating
system, but it would increase OS performance. So maybe we can make that
configurable?


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19  3:23 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
  2017-12-19  9:19   ` Peter Zijlstra
@ 2017-12-19 16:03   ` Christopher Lameter
  2017-12-19 16:32     ` Frederic Weisbecker
  1 sibling, 1 reply; 25+ messages in thread
From: Christopher Lameter @ 2017-12-19 16:03 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Peter Zijlstra, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, 19 Dec 2017, Frederic Weisbecker wrote:

> Adding the boot parameter "isolcpus=nohz_offload" will now outsource
> these scheduler ticks to the global workqueue so that a housekeeping CPU
> handles that tick remotely.

The vmstat processing requires per-cpu area access. How does that work if
the code is running on a remote processor?


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19 16:01     ` Christopher Lameter
@ 2017-12-19 16:04       ` Peter Zijlstra
  2017-12-19 16:38         ` Christopher Lameter
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2017-12-19 16:04 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Frederic Weisbecker, LKML, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, Dec 19, 2017 at 10:01:46AM -0600, Christopher Lameter wrote:
> On Tue, 19 Dec 2017, Peter Zijlstra wrote:
> 
> > On Tue, Dec 19, 2017 at 04:23:57AM +0100, Frederic Weisbecker wrote:
> > > When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
> > > keep the scheduler stats alive. However this residual tick is a burden
> > > for Real-Time tasks that can't stand any interruption at all.
> >
> > I'm not sure that is accurate. RT doesn't necessarily have anything much
> > to do with this. The tick is by definition very deterministic and thus
> > should not be a problem.
> 
> Depends what one means by RT. 

Real Time computing as per the literature. Any other definition is
wrong and confusing.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19  9:19   ` Peter Zijlstra
  2017-12-19 14:34     ` Luiz Capitulino
  2017-12-19 16:01     ` Christopher Lameter
@ 2017-12-19 16:26     ` Frederic Weisbecker
  2 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19 16:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Chris Metcalf, Thomas Gleixner, Luiz Capitulino,
	Christoph Lameter, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

2017-12-19 10:19 UTC+01:00, Peter Zijlstra <peterz@infradead.org>:
> On Tue, Dec 19, 2017 at 04:23:57AM +0100, Frederic Weisbecker wrote:
>> When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
>> keep the scheduler stats alive. However this residual tick is a burden
>> for Real-Time tasks that can't stand any interruption at all.
>
> I'm not sure that is accurate. RT doesn't necessarily have anything much
> to do with this. The tick is by definition very deterministic and thus
> should not be a problem.

I see, the term Real-Time can indeed be misleading here. I'd rather
use "bare metal", as per Christoph's suggestion.

>
>> Adding the boot parameter "isolcpus=nohz_offload" will now outsource
>> these scheduler ticks to the global workqueue so that a housekeeping CPU
>> handles that tick remotely.
>
> The global workqueue sounds horrific; surely you want at least one such
> housekeeping CPU per node or something ?

I guess it depends on how many CPUs we can afford to sacrifice to
housekeeping. Surely the more CPUs we isolate, the more CPUs we want
doing housekeeping, preferably per node. IIRC, system_unbound_wq
queues a work item to a thread running on the enqueuer's node when
possible, but I need to check that. If that's the case, then it's up
to the user to leave one CPU out of isolcpus on each node, and the
work items should get queued and requeued to those per-node
housekeepers automatically.
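
For instance, on a hypothetical two-node box with CPUs 0-7 on node 0
and CPUs 8-15 on node 1, keeping CPU 0 and CPU 8 as per-node
housekeepers would be something like:

	isolcpus=nohz_offload,domain,1-7,9-15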

>
>> +static void sched_tick_remote(struct work_struct *work)
>> +{
>> +	struct delayed_work *dwork = to_delayed_work(work);
>> +	struct tick_work *twork = container_of(dwork, struct tick_work, work);
>> +	struct rq *rq = cpu_rq(twork->cpu);
>> +	struct rq_flags rf;
>> +
>> +	rq_lock_irq(rq, &rf);
>> +	update_rq_clock(rq);
>> +	rq->curr->sched_class->task_tick(rq, rq->curr, 0);
>> +	rq_unlock_irq(rq, &rf);
>> +
>> +	queue_delayed_work(system_unbound_wq, dwork, HZ);
>> +}
>> +
>> +void sched_tick_start(int cpu)
>> +{
>> +	struct tick_work *twork;
>> +
>> +	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
>> +		return;
>> +
>> +	WARN_ON_ONCE(!tick_work_cpu);
>> +
>> +	twork = per_cpu_ptr(tick_work_cpu, cpu);
>> +	twork->cpu = cpu;
>> +	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
>> +	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
>> +
>> +	return;
>> +}
>> +
>> +#ifdef CONFIG_HOTPLUG_CPU
>> +void sched_tick_stop(int cpu)
>> +{
>> +	struct tick_work *twork;
>> +
>> +	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
>> +		return;
>> +
>> +	WARN_ON_ONCE(!tick_work_cpu);
>> +
>> +	twork = per_cpu_ptr(tick_work_cpu, cpu);
>> +	cancel_delayed_work_sync(&twork->work);
>> +
>> +	return;
>> +}
>> +#endif /* CONFIG_HOTPLUG_CPU */
>
> This seems daft in that you _always_ run this remote tick, even when the
> CPU in question is not in nohz (full) mode.

Yeah, that's very basic. I think I should add a check to verify that
the CPU has actually stopped its tick and is not idle. This
will be racy but it shouldn't matter much.
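
Something along these lines perhaps, as a rough sketch inside
sched_tick_remote() (assuming a helper such as
tick_nohz_tick_stopped_cpu() to query the remote CPU's tick state):

	int cpu = twork->cpu;

	/* Racy check, but missing a tick or running a spurious one is no big deal */
	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
		rq_lock_irq(rq, &rf);
		update_rq_clock(rq);
		rq->curr->sched_class->task_tick(rq, rq->curr, 0);
		rq_unlock_irq(rq, &rf);
	}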

Thanks.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19 16:03   ` Christopher Lameter
@ 2017-12-19 16:32     ` Frederic Weisbecker
  2017-12-19 17:23       ` Christopher Lameter
  0 siblings, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19 16:32 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: LKML, Peter Zijlstra, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

2017-12-19 17:03 UTC+01:00, Christopher Lameter <cl@linux.com>:
> On Tue, 19 Dec 2017, Frederic Weisbecker wrote:
>
>> Adding the boot parameter "isolcpus=nohz_offload" will now outsource
>> these scheduler ticks to the global workqueue so that a housekeeping CPU
>> handles that tick remotely.
>
> The vmstat processing requires per-cpu area access. How does that work if
> the code is running on a remote processor?

It seems that current->sched_class->task_tick() is ok with this, as it
uses per-runqueue or per-task data, and both are passed as
arguments.
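
For reference, the hook only takes the runqueue and the task as
explicit parameters; its prototype is roughly (as assumed from the
struct sched_class definition in kernel/sched/sched.h):

	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);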


* Re: [PATCH 1/5] sched: Move tick code to a separate file
  2017-12-19  9:08   ` Peter Zijlstra
@ 2017-12-19 16:33     ` Frederic Weisbecker
  0 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-19 16:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Chris Metcalf, Thomas Gleixner, Luiz Capitulino,
	Christoph Lameter, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

2017-12-19 10:08 UTC+01:00, Peter Zijlstra <peterz@infradead.org>:
> On Tue, Dec 19, 2017 at 04:23:54AM +0100, Frederic Weisbecker wrote:
>> Let's debloat core.c some more. Also, we are going to expand the tick
>> code even further to introduce scheduler tick offloading.
>
> Not a fan, this is a pretty artificial split and just makes me curse
> more for not finding code.

As you prefer, I'll keep things in core.c

Thanks.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19 16:04       ` Peter Zijlstra
@ 2017-12-19 16:38         ` Christopher Lameter
  2017-12-19 16:49           ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Christopher Lameter @ 2017-12-19 16:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, LKML, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, 19 Dec 2017, Peter Zijlstra wrote:

> > Depends what one means by RT.
>
> Real Time computing as per the literature. Any other definition is
> wrong and confusing.

That is an understanding of language rooted in the positivism of the early
20th century, which intended to assign a single and clear meaning to
each word (Bertrand Russell, for example). Contemporarily, the meanings of
words are determined by the use cases for those words in communities and
in particular by the function of these words (see Wittgenstein).

And the term RT has been heavily abused by marketing folks to mean any
number of things so people can use RT to refer to a variety of things. So
please always clarify what you mean exactly.

You mean RT in CS academic literature from the 1990s I guess? Mostly
useless for real workloads...

Frederic is probably using the misleading characterization of
"realtime" as spread by the marketing teams of leading distros, which
suggests low latency and less OS noise.

But then in business in general "realtime processing" is a term used
as opposed to "offline" processing. A system that reacts in real time is a
system that continually reacts to inputs without offline processing
of data. Which can be minutes.

So lets better avoid that term entirely.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19 16:38         ` Christopher Lameter
@ 2017-12-19 16:49           ` Peter Zijlstra
  2017-12-19 17:26             ` Christopher Lameter
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2017-12-19 16:49 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Frederic Weisbecker, LKML, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, Dec 19, 2017 at 10:38:39AM -0600, Christopher Lameter wrote:
> And the term RT has been heavily abused by marketing folks to mean any
> number of things so people can use RT to refer to a variety of things. So
> please always clarify what you mean exactly.

Do not _ever_ listen to marketing... Also your argument is circular, you
cannot state clearly what is meant if every word means something else to
others.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19 16:32     ` Frederic Weisbecker
@ 2017-12-19 17:23       ` Christopher Lameter
  0 siblings, 0 replies; 25+ messages in thread
From: Christopher Lameter @ 2017-12-19 17:23 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Peter Zijlstra, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, 19 Dec 2017, Frederic Weisbecker wrote:

> > The vmstat processing requires per-cpu area access. How does that work if
> > the code is running on a remote processor?
>
> It seems that current->sched_class->task_tick() is ok with this, as it
> uses per-runqueue or per-task data, and both are passed as
> arguments.

So I guess no vmstat processing is required? vmstat does not use per task
data but per cpu data.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-19 16:49           ` Peter Zijlstra
@ 2017-12-19 17:26             ` Christopher Lameter
  0 siblings, 0 replies; 25+ messages in thread
From: Christopher Lameter @ 2017-12-19 17:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, LKML, Chris Metcalf, Thomas Gleixner,
	Luiz Capitulino, Paul E . McKenney, Ingo Molnar, Wanpeng Li,
	Mike Galbraith, Rik van Riel

On Tue, 19 Dec 2017, Peter Zijlstra wrote:

> On Tue, Dec 19, 2017 at 10:38:39AM -0600, Christopher Lameter wrote:
> > And the term RT has been heavily abused by marketing folks to mean any
> > number of things so people can use RT to refer to a variety of things. So
> > please always clarify what you mean exactly.
>
> Do not _ever_ listen to marketing... Also your argument is circular, you
> cannot state clearly what is meant if every word means something else to
> others.

As Goedel has shown, every logical system must ultimately be circular. The circle has
to be widened enough to include terms whose meaning is agreed upon by all
involved parties so that a common understanding can be reached.

Marketing speak is valid in a certain business context and has meanings
that are often offensive to us since they do not map clearly to
our engineering concepts and the strict definitions that we like.
Nevertheless, lots of users use the terms in the marketing sense and
engineers get sucked into other meanings of words.


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2018-01-16 15:57     ` Frederic Weisbecker
@ 2018-01-16 16:53       ` Luiz Capitulino
  0 siblings, 0 replies; 25+ messages in thread
From: Luiz Capitulino @ 2018-01-16 16:53 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Ingo Molnar, LKML, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Christoph Lameter, Paul E . McKenney,
	Wanpeng Li, Mike Galbraith, Rik van Riel

On Tue, 16 Jan 2018 16:57:45 +0100
Frederic Weisbecker <frederic@kernel.org> wrote:

> On Fri, Jan 12, 2018 at 02:22:58PM -0500, Luiz Capitulino wrote:
> > On Thu,  4 Jan 2018 05:25:36 +0100
> > Frederic Weisbecker <frederic@kernel.org> wrote:
> >   
> > > When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
> > > keep the scheduler stats alive. However this residual tick is a burden
> > > for bare metal tasks that can't stand any interruption at all, or want
> > > to minimize them.
> > > 
> > > Adding the boot parameter "isolcpus=nohz_offload" will now outsource
> > > these scheduler ticks to the global workqueue so that a housekeeping CPU
> > > handles that tick remotely.
> > > 
> > > Note it's still up to the user to affine the global workqueues to the
> > > housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
> > > domains isolation.
> > > 
> > > Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> > > Cc: Chris Metcalf <cmetcalf@mellanox.com>
> > > Cc: Christoph Lameter <cl@linux.com>
> > > Cc: Luiz Capitulino <lcapitulino@redhat.com>
> > > Cc: Mike Galbraith <efault@gmx.de>
> > > Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Rik van Riel <riel@redhat.com>
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Wanpeng Li <kernellwp@gmail.com>
> > > Cc: Ingo Molnar <mingo@kernel.org>
> > > ---
> > >  kernel/sched/core.c      | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
> > >  kernel/sched/isolation.c |  4 +++
> > >  kernel/sched/sched.h     |  2 ++
> > >  3 files changed, 91 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index d72d0e9..b964890 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -3052,9 +3052,14 @@ void scheduler_tick(void)
> > >   */
> > >  u64 scheduler_tick_max_deferment(void)
> > >  {
> > > -	struct rq *rq = this_rq();
> > > -	unsigned long next, now = READ_ONCE(jiffies);
> > > +	struct rq *rq;
> > > +	unsigned long next, now;
> > >  
> > > +	if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
> > > +		return ktime_to_ns(KTIME_MAX);
> > > +
> > > +	rq = this_rq();
> > > +	now = READ_ONCE(jiffies);
> > >  	next = rq->last_sched_tick + HZ;
> > >  
> > >  	if (time_before_eq(next, now))
> > > @@ -3062,7 +3067,82 @@ u64 scheduler_tick_max_deferment(void)
> > >  
> > >  	return jiffies_to_nsecs(next - now);
> > >  }
> > > -#endif
> > > +
> > > +struct tick_work {
> > > +	int			cpu;
> > > +	struct delayed_work	work;
> > > +};
> > > +
> > > +static struct tick_work __percpu *tick_work_cpu;
> > > +
> > > +static void sched_tick_remote(struct work_struct *work)
> > > +{
> > > +	struct delayed_work *dwork = to_delayed_work(work);
> > > +	struct tick_work *twork = container_of(dwork, struct tick_work, work);
> > > +	int cpu = twork->cpu;
> > > +	struct rq *rq = cpu_rq(cpu);
> > > +	struct rq_flags rf;
> > > +
> > > +	/*
> > > +	 * Handle the tick only if it appears the remote CPU is running
> > > +	 * in full dynticks mode. The check is racy by nature, but
> > > +	 * missing a tick or having one too much is no big deal.
> > > +	 */
> > > +	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
> > > +		rq_lock_irq(rq, &rf);
> > > +		update_rq_clock(rq);
> > > +		rq->curr->sched_class->task_tick(rq, rq->curr, 0);
> > > +		rq_unlock_irq(rq, &rf);
> > > +	}  
> > 
> > OK, so this executes task_tick() remotely. What about account_process_tick()?
> > Don't we need it as well?  
> 
> Nope, tasks in nohz_full mode have their special accounting that doesn't
> rely on the tick.

OK, excellent.

> > In particular, when I run a hog application on a nohz_full core configured
> > with tick offload, I can see in top that the CPU usage goes from 100%
> > to idle for a few seconds every couple of seconds. Could this be related?
> > 
> > Also, in my testing I'm sometimes seeing the tick. Sometimes at 10 or
> > 20 seconds interval. Is this expected? I'll dig deeper next week.  
> 
> That's expected, see the changelog: the offload is not affine by default.
> You need to either also isolate the domains:
> 
>     isolcpus=nohz_offload,domain
> 
> or tweak the workqueue cpumask through:
> 
>     /sys/devices/virtual/workqueue/cpumask

Yeah, I already do that. Later today or tomorrow I'll debug this to
see if the problem is in my setup or not.

> 
> Thanks.
> 


* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2018-01-12 19:22   ` Luiz Capitulino
@ 2018-01-16 15:57     ` Frederic Weisbecker
  2018-01-16 16:53       ` Luiz Capitulino
  0 siblings, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2018-01-16 15:57 UTC (permalink / raw)
  To: Luiz Capitulino
  Cc: Ingo Molnar, LKML, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Christoph Lameter, Paul E . McKenney,
	Wanpeng Li, Mike Galbraith, Rik van Riel

On Fri, Jan 12, 2018 at 02:22:58PM -0500, Luiz Capitulino wrote:
> On Thu,  4 Jan 2018 05:25:36 +0100
> Frederic Weisbecker <frederic@kernel.org> wrote:
> 
> > When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
> > keep the scheduler stats alive. However this residual tick is a burden
> > for bare metal tasks that can't stand any interruption at all, or want
> > to minimize them.
> > 
> > Adding the boot parameter "isolcpus=nohz_offload" will now outsource
> > these scheduler ticks to the global workqueue so that a housekeeping CPU
> > handles that tick remotely.
> > 
> > Note it's still up to the user to affine the global workqueues to the
> > housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
> > domains isolation.
> > 
> > Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> > Cc: Chris Metcalf <cmetcalf@mellanox.com>
> > Cc: Christoph Lameter <cl@linux.com>
> > Cc: Luiz Capitulino <lcapitulino@redhat.com>
> > Cc: Mike Galbraith <efault@gmx.de>
> > Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Rik van Riel <riel@redhat.com>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Wanpeng Li <kernellwp@gmail.com>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > ---
> >  kernel/sched/core.c      | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
> >  kernel/sched/isolation.c |  4 +++
> >  kernel/sched/sched.h     |  2 ++
> >  3 files changed, 91 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index d72d0e9..b964890 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3052,9 +3052,14 @@ void scheduler_tick(void)
> >   */
> >  u64 scheduler_tick_max_deferment(void)
> >  {
> > -	struct rq *rq = this_rq();
> > -	unsigned long next, now = READ_ONCE(jiffies);
> > +	struct rq *rq;
> > +	unsigned long next, now;
> >  
> > +	if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
> > +		return ktime_to_ns(KTIME_MAX);
> > +
> > +	rq = this_rq();
> > +	now = READ_ONCE(jiffies);
> >  	next = rq->last_sched_tick + HZ;
> >  
> >  	if (time_before_eq(next, now))
> > @@ -3062,7 +3067,82 @@ u64 scheduler_tick_max_deferment(void)
> >  
> >  	return jiffies_to_nsecs(next - now);
> >  }
> > -#endif
> > +
> > +struct tick_work {
> > +	int			cpu;
> > +	struct delayed_work	work;
> > +};
> > +
> > +static struct tick_work __percpu *tick_work_cpu;
> > +
> > +static void sched_tick_remote(struct work_struct *work)
> > +{
> > +	struct delayed_work *dwork = to_delayed_work(work);
> > +	struct tick_work *twork = container_of(dwork, struct tick_work, work);
> > +	int cpu = twork->cpu;
> > +	struct rq *rq = cpu_rq(cpu);
> > +	struct rq_flags rf;
> > +
> > +	/*
> > +	 * Handle the tick only if it appears the remote CPU is running
> > +	 * in full dynticks mode. The check is racy by nature, but
> > +	 * missing a tick or having one too much is no big deal.
> > +	 */
> > +	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
> > +		rq_lock_irq(rq, &rf);
> > +		update_rq_clock(rq);
> > +		rq->curr->sched_class->task_tick(rq, rq->curr, 0);
> > +		rq_unlock_irq(rq, &rf);
> > +	}
> 
> OK, so this executes task_tick() remotely. What about account_process_tick()?
> Don't we need it as well?

Nope, tasks in nohz_full mode have their own special accounting (vtime, via
CONFIG_VIRT_CPU_ACCOUNTING_GEN) that doesn't rely on the tick.
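
Roughly speaking, the tick based accounting path bails out when vtime
accounting is active on the CPU. A simplified sketch from memory of
kernel/sched/cputime.c (helper names approximate, not the exact upstream
code):

	void account_process_tick(struct task_struct *p, int user_tick)
	{
		/*
		 * nohz_full CPUs account cputime at kernel/user/guest
		 * transitions (vtime) instead of sampling it from the
		 * tick, so there is nothing left to do here.
		 */
		if (vtime_accounting_cpu_enabled())
			return;

		/* ... tick based user/system/steal time sampling ... */
	}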

> 
> In particular, when I run a hog application on a nohz_full core configured
> with tick offload, I can see in top that the CPU usage goes from 100%
> to idle for a few seconds every couple of seconds. Could this be related?
> 
> Also, in my testing I'm sometimes seeing the tick. Sometimes at 10 or
> 20 second intervals. Is this expected? I'll dig deeper next week.

That's expected, see the changelog: the offloaded tick work is not affined
to the housekeeping CPUs by default.
You need to either also isolate the domains:

    isolcpus=nohz_offload,domain

or tweak the workqueue cpumask through:

    /sys/devices/virtual/workqueue/cpumask
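
For example (hypothetical mask, assuming CPU 0 is your only housekeeping
CPU; the file takes a hex cpumask, so adjust it to your topology):

    # keep unbound workqueues, and thus the offloaded tick work, on CPU 0
    echo 1 > /sys/devices/virtual/workqueue/cpumask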

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2018-01-04  4:25 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
@ 2018-01-12 19:22   ` Luiz Capitulino
  2018-01-16 15:57     ` Frederic Weisbecker
  0 siblings, 1 reply; 25+ messages in thread
From: Luiz Capitulino @ 2018-01-12 19:22 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Ingo Molnar, LKML, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Christoph Lameter, Paul E . McKenney,
	Wanpeng Li, Mike Galbraith, Rik van Riel

On Thu,  4 Jan 2018 05:25:36 +0100
Frederic Weisbecker <frederic@kernel.org> wrote:

> When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
> keep the scheduler stats alive. However this residual tick is a burden
> for bare metal tasks that can't stand any interruption at all, or want
> to minimize them.
> 
> Adding the boot parameter "isolcpus=nohz_offload" will now outsource
> these scheduler ticks to the global workqueue so that a housekeeping CPU
> handles that tick remotely.
> 
> Note it's still up to the user to affine the global workqueues to the
> housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
> domains isolation.
> 
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> Cc: Chris Metcalf <cmetcalf@mellanox.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Luiz Capitulino <lcapitulino@redhat.com>
> Cc: Mike Galbraith <efault@gmx.de>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Wanpeng Li <kernellwp@gmail.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> ---
>  kernel/sched/core.c      | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/sched/isolation.c |  4 +++
>  kernel/sched/sched.h     |  2 ++
>  3 files changed, 91 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d72d0e9..b964890 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3052,9 +3052,14 @@ void scheduler_tick(void)
>   */
>  u64 scheduler_tick_max_deferment(void)
>  {
> -	struct rq *rq = this_rq();
> -	unsigned long next, now = READ_ONCE(jiffies);
> +	struct rq *rq;
> +	unsigned long next, now;
>  
> +	if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
> +		return ktime_to_ns(KTIME_MAX);
> +
> +	rq = this_rq();
> +	now = READ_ONCE(jiffies);
>  	next = rq->last_sched_tick + HZ;
>  
>  	if (time_before_eq(next, now))
> @@ -3062,7 +3067,82 @@ u64 scheduler_tick_max_deferment(void)
>  
>  	return jiffies_to_nsecs(next - now);
>  }
> -#endif
> +
> +struct tick_work {
> +	int			cpu;
> +	struct delayed_work	work;
> +};
> +
> +static struct tick_work __percpu *tick_work_cpu;
> +
> +static void sched_tick_remote(struct work_struct *work)
> +{
> +	struct delayed_work *dwork = to_delayed_work(work);
> +	struct tick_work *twork = container_of(dwork, struct tick_work, work);
> +	int cpu = twork->cpu;
> +	struct rq *rq = cpu_rq(cpu);
> +	struct rq_flags rf;
> +
> +	/*
> +	 * Handle the tick only if it appears the remote CPU is running
> +	 * in full dynticks mode. The check is racy by nature, but
> +	 * missing a tick or having one too much is no big deal.
> +	 */
> +	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
> +		rq_lock_irq(rq, &rf);
> +		update_rq_clock(rq);
> +		rq->curr->sched_class->task_tick(rq, rq->curr, 0);
> +		rq_unlock_irq(rq, &rf);
> +	}

OK, so this executes task_tick() remotely. What about account_process_tick()?
Don't we need it as well?

In particular, when I run a hog application on a nohz_full core configured
with tick offload, I can see in top that the CPU usage goes from 100%
to idle for a few seconds every couple of seconds. Could this be related?

Also, in my testing I'm sometimes seeing the tick. Sometimes at 10 or
20 second intervals. Is this expected? I'll dig deeper next week.

> +
> +	queue_delayed_work(system_unbound_wq, dwork, HZ);
> +}
> +
> +static void sched_tick_start(int cpu)
> +{
> +	struct tick_work *twork;
> +
> +	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
> +		return;
> +
> +	WARN_ON_ONCE(!tick_work_cpu);
> +
> +	twork = per_cpu_ptr(tick_work_cpu, cpu);
> +	twork->cpu = cpu;
> +	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
> +	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
> +}
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +static void sched_tick_stop(int cpu)
> +{
> +	struct tick_work *twork;
> +
> +	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
> +		return;
> +
> +	WARN_ON_ONCE(!tick_work_cpu);
> +
> +	twork = per_cpu_ptr(tick_work_cpu, cpu);
> +	cancel_delayed_work_sync(&twork->work);
> +}
> +#endif /* CONFIG_HOTPLUG_CPU */
> +
> +int __init sched_tick_offload_init(void)
> +{
> +	tick_work_cpu = alloc_percpu(struct tick_work);
> +	if (!tick_work_cpu) {
> +		pr_err("Can't allocate remote tick struct\n");
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
> +#else
> +static void sched_tick_start(int cpu) { }
> +static void sched_tick_stop(int cpu) { }
> +#endif /* CONFIG_NO_HZ_FULL */
>  
>  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
>  				defined(CONFIG_PREEMPT_TRACER))
> @@ -5713,6 +5793,7 @@ int sched_cpu_starting(unsigned int cpu)
>  {
>  	set_cpu_rq_start_time(cpu);
>  	sched_rq_cpu_starting(cpu);
> +	sched_tick_start(cpu);
>  	return 0;
>  }
>  
> @@ -5724,6 +5805,7 @@ int sched_cpu_dying(unsigned int cpu)
>  
>  	/* Handle pending wakeups and then migrate everything off */
>  	sched_ttwu_pending();
> +	sched_tick_stop(cpu);
>  
>  	rq_lock_irqsave(rq, &rf);
>  	if (rq->rd) {
> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> index 264ddcd..c5e7e90a 100644
> --- a/kernel/sched/isolation.c
> +++ b/kernel/sched/isolation.c
> @@ -12,6 +12,7 @@
>  #include <linux/kernel.h>
>  #include <linux/static_key.h>
>  #include <linux/ctype.h>
> +#include "sched.h"
>  
>  DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
>  EXPORT_SYMBOL_GPL(housekeeping_overriden);
> @@ -60,6 +61,9 @@ void __init housekeeping_init(void)
>  
>  	static_branch_enable(&housekeeping_overriden);
>  
> +	if (housekeeping_flags & HK_FLAG_TICK_SCHED)
> +		sched_tick_offload_init();
> +
>  	/* We need at least one CPU to handle housekeeping work */
>  	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
>  }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b19552a2..5a3b82c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1587,6 +1587,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
>  
>  #ifdef CONFIG_NO_HZ_FULL
>  extern bool sched_can_stop_tick(struct rq *rq);
> +extern int __init sched_tick_offload_init(void);
>  
>  /*
>   * Tick may be needed by tasks in the runqueue depending on their policy and
> @@ -1611,6 +1612,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
>  		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
>  }
>  #else
> +static inline int sched_tick_offload_init(void) { return 0; }
>  static inline void sched_update_tick_dependency(struct rq *rq) { }
>  #endif
>  

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2018-01-04  4:25 [GIT PULL] isolation: 1Hz residual tick offloading v3 Frederic Weisbecker
@ 2018-01-04  4:25 ` Frederic Weisbecker
  2018-01-12 19:22   ` Luiz Capitulino
  0 siblings, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2018-01-04  4:25 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: LKML, Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Wanpeng Li, Mike Galbraith, Rik van Riel

When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
keep the scheduler stats alive. However this residual tick is a burden
for bare metal tasks that can't stand any interruption at all, or want
to minimize them.

Adding the boot parameter "isolcpus=nohz_offload" will now outsource
these scheduler ticks to the global workqueue so that a housekeeping CPU
handles that tick remotely.

Note it's still up to the user to affine the global workqueues to the
housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
domains isolation.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/isolation.c |  4 +++
 kernel/sched/sched.h     |  2 ++
 3 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d72d0e9..b964890 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3052,9 +3052,14 @@ void scheduler_tick(void)
  */
 u64 scheduler_tick_max_deferment(void)
 {
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
+	struct rq *rq;
+	unsigned long next, now;
 
+	if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
+		return ktime_to_ns(KTIME_MAX);
+
+	rq = this_rq();
+	now = READ_ONCE(jiffies);
 	next = rq->last_sched_tick + HZ;
 
 	if (time_before_eq(next, now))
@@ -3062,7 +3067,82 @@ u64 scheduler_tick_max_deferment(void)
 
 	return jiffies_to_nsecs(next - now);
 }
-#endif
+
+struct tick_work {
+	int			cpu;
+	struct delayed_work	work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct tick_work *twork = container_of(dwork, struct tick_work, work);
+	int cpu = twork->cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	/*
+	 * Handle the tick only if it appears the remote CPU is running
+	 * in full dynticks mode. The check is racy by nature, but
+	 * missing a tick or having one too much is no big deal.
+	 */
+	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+		rq_lock_irq(rq, &rf);
+		update_rq_clock(rq);
+		rq->curr->sched_class->task_tick(rq, rq->curr, 0);
+		rq_unlock_irq(rq, &rf);
+	}
+
+	queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	twork->cpu = cpu;
+	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	cancel_delayed_work_sync(&twork->work);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+	tick_work_cpu = alloc_percpu(struct tick_work);
+	if (!tick_work_cpu) {
+		pr_err("Can't allocate remote tick struct\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+#else
+static void sched_tick_start(int cpu) { }
+static void sched_tick_stop(int cpu) { }
+#endif /* CONFIG_NO_HZ_FULL */
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
@@ -5713,6 +5793,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
+	sched_tick_start(cpu);
 	return 0;
 }
 
@@ -5724,6 +5805,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
+	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 264ddcd..c5e7e90a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/ctype.h>
+#include "sched.h"
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +61,9 @@ void __init housekeeping_init(void)
 
 	static_branch_enable(&housekeeping_overriden);
 
+	if (housekeeping_flags & HK_FLAG_TICK_SCHED)
+		sched_tick_offload_init();
+
 	/* We need at least one CPU to handle housekeeping work */
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..5a3b82c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1587,6 +1587,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
 
 /*
  * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1611,6 +1612,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
 		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
 }
 #else
+static inline int sched_tick_offload_init(void) { return 0; }
 static inline void sched_update_tick_dependency(struct rq *rq) { }
 #endif
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-30  3:55 [PATCH 0/5] isolation: 1Hz residual tick offloading v3 Frederic Weisbecker
@ 2017-12-30  3:55 ` Frederic Weisbecker
  0 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-30  3:55 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
keep the scheduler stats alive. However this residual tick is a burden
for bare metal tasks that can't stand any interruption at all, or want
to minimize them.

Adding the boot parameter "isolcpus=nohz_offload" will now outsource
these scheduler ticks to the global workqueue so that a housekeeping CPU
handles that tick remotely.

Note it's still up to the user to affine the global workqueues to the
housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
domains isolation.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/isolation.c |  4 +++
 kernel/sched/sched.h     |  2 ++
 3 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d72d0e9..b964890 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3052,9 +3052,14 @@ void scheduler_tick(void)
  */
 u64 scheduler_tick_max_deferment(void)
 {
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
+	struct rq *rq;
+	unsigned long next, now;
 
+	if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
+		return ktime_to_ns(KTIME_MAX);
+
+	rq = this_rq();
+	now = READ_ONCE(jiffies);
 	next = rq->last_sched_tick + HZ;
 
 	if (time_before_eq(next, now))
@@ -3062,7 +3067,82 @@ u64 scheduler_tick_max_deferment(void)
 
 	return jiffies_to_nsecs(next - now);
 }
-#endif
+
+struct tick_work {
+	int			cpu;
+	struct delayed_work	work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct tick_work *twork = container_of(dwork, struct tick_work, work);
+	int cpu = twork->cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	/*
+	 * Handle the tick only if it appears the remote CPU is running
+	 * in full dynticks mode. The check is racy by nature, but
+	 * missing a tick or having one too much is no big deal.
+	 */
+	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+		rq_lock_irq(rq, &rf);
+		update_rq_clock(rq);
+		rq->curr->sched_class->task_tick(rq, rq->curr, 0);
+		rq_unlock_irq(rq, &rf);
+	}
+
+	queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	twork->cpu = cpu;
+	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	cancel_delayed_work_sync(&twork->work);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+	tick_work_cpu = alloc_percpu(struct tick_work);
+	if (!tick_work_cpu) {
+		pr_err("Can't allocate remote tick struct\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+#else
+static void sched_tick_start(int cpu) { }
+static void sched_tick_stop(int cpu) { }
+#endif /* CONFIG_NO_HZ_FULL */
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
@@ -5713,6 +5793,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
+	sched_tick_start(cpu);
 	return 0;
 }
 
@@ -5724,6 +5805,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
+	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 264ddcd..c5e7e90a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/ctype.h>
+#include "sched.h"
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +61,9 @@ void __init housekeeping_init(void)
 
 	static_branch_enable(&housekeeping_overriden);
 
+	if (housekeeping_flags & HK_FLAG_TICK_SCHED)
+		sched_tick_offload_init();
+
 	/* We need at least one CPU to handle housekeeping work */
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..5a3b82c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1587,6 +1587,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
 
 /*
  * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1611,6 +1612,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
 		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
 }
 #else
+static inline int sched_tick_offload_init(void) { return 0; }
 static inline void sched_update_tick_dependency(struct rq *rq) { }
 #endif
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
  2017-12-21 17:14 [PATCH 0/5] isolation: 1Hz residual tick offloading v2 Frederic Weisbecker
@ 2017-12-21 17:14 ` Frederic Weisbecker
  0 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2017-12-21 17:14 UTC (permalink / raw)
  To: LKML
  Cc: Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
	Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
	Paul E . McKenney, Ingo Molnar, Wanpeng Li, Mike Galbraith,
	Rik van Riel

When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
keep the scheduler stats alive. However this residual tick is a burden
for bare metal tasks that can't stand any interruption at all, or want
to minimize them.

Adding the boot parameter "isolcpus=nohz_offload" will now outsource
these scheduler ticks to the global workqueue so that a housekeeping CPU
handles that tick remotely.

Note it's still up to the user to affine the global workqueues to the
housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
domains isolation.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      | 89 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/isolation.c |  4 +++
 kernel/sched/sched.h     |  2 ++
 3 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d72d0e9..a12008c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3052,9 +3052,14 @@ void scheduler_tick(void)
  */
 u64 scheduler_tick_max_deferment(void)
 {
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
+	struct rq *rq;
+	unsigned long next, now;
 
+	if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
+		return ktime_to_ns(KTIME_MAX);
+
+	rq = this_rq();
+	now = READ_ONCE(jiffies);
 	next = rq->last_sched_tick + HZ;
 
 	if (time_before_eq(next, now))
@@ -3062,7 +3067,83 @@ u64 scheduler_tick_max_deferment(void)
 
 	return jiffies_to_nsecs(next - now);
 }
-#endif
+
+struct tick_work {
+	int			cpu;
+	struct delayed_work	work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct tick_work *twork = container_of(dwork, struct tick_work, work);
+	int cpu = twork->cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	/*
+	 * Handle the tick only if it appears the remote CPU is running
+	 * in full dynticks mode. The check is racy by nature, but
+	 * missing a tick or having one too much is no big deal.
+	 */
+	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+		rq_lock_irq(rq, &rf);
+		update_rq_clock(rq);
+		rq->curr->sched_class->task_tick(rq, rq->curr, 0);
+		rq_unlock_irq(rq, &rf);
+	}
+
+	queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	twork->cpu = cpu;
+	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+
+	return;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	cancel_delayed_work_sync(&twork->work);
+
+	return;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+	tick_work_cpu = alloc_percpu(struct tick_work);
+	if (!tick_work_cpu) {
+		pr_err("Can't allocate remote tick struct\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+#endif /* CONFIG_NO_HZ_FULL */
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
@@ -5713,6 +5794,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
+	sched_tick_start(cpu);
 	return 0;
 }
 
@@ -5724,6 +5806,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
+	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 264ddcd..c5e7e90a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/ctype.h>
+#include "sched.h"
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +61,9 @@ void __init housekeeping_init(void)
 
 	static_branch_enable(&housekeeping_overriden);
 
+	if (housekeeping_flags & HK_FLAG_TICK_SCHED)
+		sched_tick_offload_init();
+
 	/* We need at least one CPU to handle housekeeping work */
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..5a3b82c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1587,6 +1587,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
 
 /*
  * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1611,6 +1612,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
 		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
 }
 #else
+static inline int sched_tick_offload_init(void) { return 0; }
 static inline void sched_update_tick_dependency(struct rq *rq) { }
 #endif
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2018-01-16 16:54 UTC | newest]

Thread overview: 25+ messages
2017-12-19  3:23 [RFC PATCH 0/5] isolation: 1Hz residual tick offloading Frederic Weisbecker
2017-12-19  3:23 ` [PATCH 1/5] sched: Move tick code to a separate file Frederic Weisbecker
2017-12-19  9:08   ` Peter Zijlstra
2017-12-19 16:33     ` Frederic Weisbecker
2017-12-19  3:23 ` [PATCH 2/5] sched: Rename init_rq_hrtick to hrtick_rq_init Frederic Weisbecker
2017-12-19  3:23 ` [PATCH 3/5] sched/isolation: Add scheduler tick offloading interface Frederic Weisbecker
2017-12-19  3:23 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
2017-12-19  9:19   ` Peter Zijlstra
2017-12-19 14:34     ` Luiz Capitulino
2017-12-19 16:01     ` Christopher Lameter
2017-12-19 16:04       ` Peter Zijlstra
2017-12-19 16:38         ` Christopher Lameter
2017-12-19 16:49           ` Peter Zijlstra
2017-12-19 17:26             ` Christopher Lameter
2017-12-19 16:26     ` Frederic Weisbecker
2017-12-19 16:03   ` Christopher Lameter
2017-12-19 16:32     ` Frederic Weisbecker
2017-12-19 17:23       ` Christopher Lameter
2017-12-19  3:23 ` [PATCH 5/5] sched/isolation: Document "nohz_offload" flag Frederic Weisbecker
2017-12-21 17:14 [PATCH 0/5] isolation: 1Hz residual tick offloading v2 Frederic Weisbecker
2017-12-21 17:14 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
2017-12-30  3:55 [PATCH 0/5] isolation: 1Hz residual tick offloading v3 Frederic Weisbecker
2017-12-30  3:55 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
2018-01-04  4:25 [GIT PULL] isolation: 1Hz residual tick offloading v3 Frederic Weisbecker
2018-01-04  4:25 ` [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload Frederic Weisbecker
2018-01-12 19:22   ` Luiz Capitulino
2018-01-16 15:57     ` Frederic Weisbecker
2018-01-16 16:53       ` Luiz Capitulino
