All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH Resend] Expose do_timer CPU via sysctl to userspace as R/W
@ 2014-02-19 12:32 henrik
  2014-02-19 16:42 ` Thomas Gleixner
  0 siblings, 1 reply; 4+ messages in thread
From: henrik @ 2014-02-19 12:32 UTC (permalink / raw)
  To: linux-kernel; +Cc: Henrik Austad, Thomas Gleixner

From: Henrik Austad <haustad@cisco.com>

Looks like this got dropped by vger a few days ago, resending.

This allows everybody in a system to read which core is currently
running do_timer() as well as letting root change this.

A few things to keep in mind

- This is intended as a debug-feature, except for testing whether or not
  the target CPU is present or not, the logic will not care -one- bit if
  moving the do_timer to that particular CPU is a good idea.
- It introduces more ifdeffery in the kernel.

On the positive side, it allows you to quickly change where the timer
runs, which is nice if you have interrupts flooding the core where the
timer is updated.

It is exposed via sysctl as

     /proc/sys/kernel/time_tick_cpu
     kernel:time_tick_cpu

Enable/disable via CONFIG_EXPOSE_TICK_CPU, disabling this will not lead
to an increased image size.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Henrik Austad <haustad@cisco.com>
---
 include/linux/clocksource.h |   15 +++++++++++++++
 kernel/sysctl.c             |   13 +++++++++++++
 kernel/time/Kconfig         |   16 ++++++++++++++++
 kernel/time/tick-common.c   |   25 +++++++++++++++++++++++++
 kernel/time/tick-internal.h |    5 +++++
 kernel/time/timekeeping.c   |   42 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 116 insertions(+)

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 67301a4..b61c893 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -18,6 +18,10 @@
 #include <asm/div64.h>
 #include <asm/io.h>
 
+#ifdef CONFIG_EXPOSE_TICK_CPU
+#include <linux/sysctl.h>
+#endif
+
 /* clocksource cycle base type */
 typedef u64 cycle_t;
 struct clocksource;
@@ -359,3 +363,14 @@ static inline void clocksource_of_init(void) {}
 #endif
 
 #endif /* _LINUX_CLOCKSOURCE_H */
+#ifdef CONFIG_EXPOSE_TICK_CPU
+/*
+ * expose the CPU that handles the timer-tick and allow userspace to change it.
+ */
+extern int sysctl_tick_do_timer_cpu;
+extern int timekeeping_expose_timer_cpu(struct ctl_table *table,
+					int write,
+					void __user *buffer,
+					size_t *lenp,
+					loff_t *ppos);
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1..c6613f5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,10 @@
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
 
+#ifdef CONFIG_EXPOSE_TICK_CPU
+#include <linux/clocksource.h>
+#endif
+
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 
@@ -424,6 +428,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+#ifdef CONFIG_EXPOSE_TICK_CPU
+	{
+		.procname  = "timer_tick_cpu",
+		.data      = &sysctl_tick_do_timer_cpu,
+		.maxlen	   = sizeof(int),
+		.mode	   = 0644,
+		.proc_handler	= timekeeping_expose_timer_cpu,
+	},
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 3ce6e8c..45460de 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -202,3 +202,19 @@ config HIGH_RES_TIMERS
 
 endmenu
 endif
+
+config EXPOSE_TICK_CPU
+         bool "Expose which CPU is responsible for timer-tick"
+	 default False
+	 help
+	   This option exposes which CPU is responsible from the
+	   periodic timer-tick. It also alows userspace to change the
+	   CPU, letting userspace move the peridic update away from a
+	   CPU.
+
+	   Note: if the CPU goes offline, the underlying machinery will
+	   move the handler away, and once the CPU is brought back
+	   online, there is absolutely *no* guarantee that the handling
+	   will be moved back to this CPU.
+
+
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 20b2fe3..54dbd7f 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -347,6 +347,31 @@ void tick_handover_do_timer(int *cpup)
 	}
 }
 
+#ifdef CONFIG_EXPOSE_TICK_CPU
+/* part of the expose-tick sysctl interface for r/w which core to run
+ * the do_timer update */
+int tick_expose_cpu(void)
+{
+	return tick_do_timer_cpu;
+}
+int tick_set_cpu(int new_cpu)
+{
+	int old_cpu, current_cpu;
+	if (!cpu_present(new_cpu)) {
+		pr_err("Illegal value for tick_do_timer_cpu, requested CPU (%d) not present\n",
+			new_cpu);
+		return -EINVAL;
+	}
+
+	current_cpu = tick_do_timer_cpu;
+	if ((old_cpu = cmpxchg(&tick_do_timer_cpu, tick_do_timer_cpu, new_cpu)) != current_cpu) {
+		pr_err("Error updating timer-cpu!\n");
+		return -EBADE;
+	}
+	return 0;
+}
+#endif	/* CONFIG_EXPOSE_TICK_CPU */
+
 /*
  * Shutdown an event device on a given cpu:
  *
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 8329669..6c960d6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -154,5 +154,10 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 
 #endif
 
+#ifdef CONFIG_EXPOSE_TICK_CPU
+extern int tick_expose_cpu(void);
+extern int tick_set_cpu(int new_cpu);
+#endif
+
 extern void do_timer(unsigned long ticks);
 extern void update_wall_time(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0aa4ce8..09cebda 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -27,6 +27,10 @@
 #include "ntp_internal.h"
 #include "timekeeping_internal.h"
 
+#ifdef CONFIG_EXPOSE_TICK_CPU
+int sysctl_tick_do_timer_cpu;
+#endif
+
 #define TK_CLEAR_NTP		(1 << 0)
 #define TK_MIRROR		(1 << 1)
 #define TK_CLOCK_WAS_SET	(1 << 2)
@@ -1583,6 +1587,7 @@ void do_timer(unsigned long ticks)
 	calc_global_load(ticks);
 }
 
+
 /**
  * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
  *    and sleep offsets.
@@ -1739,3 +1744,40 @@ void xtime_update(unsigned long ticks)
 	write_sequnlock(&jiffies_lock);
 	update_wall_time();
 }
+
+#ifdef CONFIG_EXPOSE_TICK_CPU
+/*
+ *sysctl interface for exposing timer tick CPU
+ *
+ * Note that setting the timer-cpu will not be persistent accross
+ * sleep/resumes or other events that end up changing the do_timer_cpu.
+ */
+int timekeeping_expose_timer_cpu(struct ctl_table *table,
+				 int write,
+				 void __user *buffer,
+				 size_t *lenp,
+				 loff_t *ppos)
+{
+	static DEFINE_MUTEX(expose_mutex);
+	int ret = -EINVAL;
+
+	mutex_lock(&expose_mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret || !write) {
+		goto out;
+	}
+
+	/*
+	 * tick_set_cpu will do rudimentary check of value.
+	 *
+	 * We could update tick_do_timer_cpu here, but better leave this
+	 * to tick-common.c
+	 */
+	ret = tick_set_cpu(sysctl_tick_do_timer_cpu);
+out:
+	/* blind update/reset for sysctl value */
+	sysctl_tick_do_timer_cpu = tick_expose_cpu();
+	mutex_unlock(&expose_mutex);
+	return ret;
+}
+#endif	/* CONFIG_EXPOSE_TICK_CPU */
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH Resend] Expose do_timer CPU via sysctl to userspace as R/W
  2014-02-19 12:32 [PATCH Resend] Expose do_timer CPU via sysctl to userspace as R/W henrik
@ 2014-02-19 16:42 ` Thomas Gleixner
  2014-02-19 19:43   ` Henrik Austad
  0 siblings, 1 reply; 4+ messages in thread
From: Thomas Gleixner @ 2014-02-19 16:42 UTC (permalink / raw)
  To: henrik
  Cc: LKML, Henrik Austad, Peter Zijlstra, Frederic Weisbecker, John Stultz

On Wed, 19 Feb 2014, henrik@austad.us wrote:
> From: Henrik Austad <haustad@cisco.com>
> 
> Looks like this got dropped by vger a few days ago, resending.
> 
> This allows everybody in a system to read which core is currently
> running do_timer() as well as letting root change this.
> 
> A few things to keep in mind
> 
> - This is intended as a debug-feature, except for testing whether or not
>   the target CPU is present or not, the logic will not care -one- bit if
>   moving the do_timer to that particular CPU is a good idea.

Well at least it must check that the cpu is online and neither in
nohz idle nor nohz full mode.

And we probably want to have that in sysfs with two files:

    timekeeping/current_cpu
    timekeeping/forced_cpu

The latter contains -1 when the system boots and if you write to it,
the duty gets hard assigned to that core, which in turn makes it
blocked from NOHZ idle and NOHZ full modes. If the core goes offline,
then the value must got back to -1. Writing -1 to it undoes the hard
assignment.

> - It introduces more ifdeffery in the kernel.

There is no reason to do that.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH Resend] Expose do_timer CPU via sysctl to userspace as R/W
  2014-02-19 16:42 ` Thomas Gleixner
@ 2014-02-19 19:43   ` Henrik Austad
  2014-02-19 19:50     ` Peter Zijlstra
  0 siblings, 1 reply; 4+ messages in thread
From: Henrik Austad @ 2014-02-19 19:43 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Henrik Austad, Peter Zijlstra, Frederic Weisbecker, John Stultz

On Wed, Feb 19, 2014 at 05:42:07PM +0100, Thomas Gleixner wrote:
> On Wed, 19 Feb 2014, henrik@austad.us wrote:
> > From: Henrik Austad <haustad@cisco.com>
> > 
> > Looks like this got dropped by vger a few days ago, resending.
> > 
> > This allows everybody in a system to read which core is currently
> > running do_timer() as well as letting root change this.
> > 
> > A few things to keep in mind
> > 
> > - This is intended as a debug-feature, except for testing whether or not
> >   the target CPU is present or not, the logic will not care -one- bit if
> >   moving the do_timer to that particular CPU is a good idea.
> 
> Well at least it must check that the cpu is online and neither in
> nohz idle nor nohz full mode.

The online-part should be taken care of by the cpu_present(), no?

But the nohz idle/nohz full should be checked, I agree.

> And we probably want to have that in sysfs with two files:
> 
>     timekeeping/current_cpu
>     timekeeping/forced_cpu

Fair enough, I can add that

> The latter contains -1 when the system boots and if you write to it,
> the duty gets hard assigned to that core, which in turn makes it
> blocked from NOHZ idle and NOHZ full modes. If the core goes offline,
> then the value must got back to -1. Writing -1 to it undoes the hard
> assignment.

Yeah, sounds like a good approach.

> > - It introduces more ifdeffery in the kernel.
> 
> There is no reason to do that.

probably not, as you can see, I wrapped everything in 
CONFIG_EXPOSE_TICK_CPU and added an option in Kconfig. Just drop all of 
that?

Thanks for the feedback! :)

-- 
Henrik Austad

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH Resend] Expose do_timer CPU via sysctl to userspace as R/W
  2014-02-19 19:43   ` Henrik Austad
@ 2014-02-19 19:50     ` Peter Zijlstra
  0 siblings, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2014-02-19 19:50 UTC (permalink / raw)
  To: Henrik Austad
  Cc: Thomas Gleixner, LKML, Henrik Austad, Frederic Weisbecker, John Stultz

On Wed, Feb 19, 2014 at 08:43:55PM +0100, Henrik Austad wrote:
> The online-part should be taken care of by the cpu_present(), no?

No, cpu_present() only means there is a physical CPU present. It might
be offline.

See the comment in include/linux/cpumask.h

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2014-02-19 19:50 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-02-19 12:32 [PATCH Resend] Expose do_timer CPU via sysctl to userspace as R/W henrik
2014-02-19 16:42 ` Thomas Gleixner
2014-02-19 19:43   ` Henrik Austad
2014-02-19 19:50     ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.