All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-02-20  7:49 Dmitry Antipov
  2012-04-05  0:10   ` Andrew Morton
  0 siblings, 1 reply; 8+ messages in thread
From: Dmitry Antipov @ 2012-02-20  7:49 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: linux-kernel, linaro-dev, patches, Dmitry Antipov

This patch proposes a system-wide sysctl-aware default for the
high-resolution timer slack value, which may be changed from 0
to HRTIMER_MAX_SLACK nanoseconds. Default system-wide and per-task
values are HRTIMER_DEFAULT_SLACK. Per-task value isn't inherited
across fork(); instead, newborn task uses system-wide value by
default, and newborn thread uses its group leader's value.

Signed-off-by: Dmitry Antipov <dmitry.antipov@linaro.org>
---
 Documentation/sysctl/kernel.txt |    8 ++++++++
 include/linux/hrtimer.h         |   11 +++++++++++
 include/linux/init_task.h       |    2 +-
 include/linux/sched.h           |   11 ++++++++---
 kernel/fork.c                   |    9 +++++++--
 kernel/futex.c                  |    4 ++--
 kernel/hrtimer.c                |   10 +++++++---
 kernel/sys.c                    |    9 ++++-----
 kernel/sysctl.c                 |   10 ++++++++++
 9 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d78841..83b63ed 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -606,6 +606,14 @@ can be ORed together:
 
 ==============================================================
 
+timer_slack:
+
+This value can be used to query and set the default slack for
+high-resolution timers, in nanoseconds. The default value is 50
+microseconds, and can be changed from 0 nanoseconds to 1 millisecond.
+
+==============================================================
+
 unknown_nmi_panic:
 
 The value in this file affects behavior of handling NMI. When the
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fd0dc30..77169b7 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -24,6 +24,16 @@
 #include <linux/timer.h>
 #include <linux/timerqueue.h>
 
+/*
+ * Default system-wide and per-task hrtimer slack, in nanoseconds.
+ */
+#define HRTIMER_DEFAULT_SLACK 50000
+
+/*
+ * Reasonable limit for hrtimer slack.
+ */
+#define HRTIMER_MAX_SLACK 1000000
+
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
 
@@ -323,6 +333,7 @@ extern ktime_t ktime_get_monotonic_offset(void);
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 
+extern int default_timer_slack_ns;
 
 /* Exported timer functions: */
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9c66b1a..b29be0d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -178,7 +178,7 @@ extern struct cred init_cred;
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
-	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.timer_slack_ns = HRTIMER_DEFAULT_SLACK,			\
 	.pids = {							\
 		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
 		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9b13f79..811e034 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1551,11 +1551,11 @@ struct task_struct {
 	struct latency_record latency_record[LT_SAVECOUNT];
 #endif
 	/*
-	 * time slack values; these are used to round up poll() and
-	 * select() etc timeout values. These are in nanoseconds.
+	 * High-resolution timer slack value, in nanoseconds.
+	 * Used to round up poll()/select(), nanosleep, futex
+	 * waiting, etc. timeout values of non-realtime tasks.
 	 */
 	unsigned long timer_slack_ns;
-	unsigned long default_timer_slack_ns;
 
 	struct list_head	*scm_work_list;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2631,6 +2631,11 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+static inline unsigned long task_timer_slack(struct task_struct *tsk)
+{
+	return rt_task(tsk) ? 0 : tsk->timer_slack_ns;
+}
+
 /*
  * Thread group CPU time accounting.
  */
diff --git a/kernel/fork.c b/kernel/fork.c
index b77fd55..6aaff93 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1164,8 +1164,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #if defined(SPLIT_RSS_COUNTING)
 	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
 #endif
-
-	p->default_timer_slack_ns = current->timer_slack_ns;
+	/* 
+	 * New thread inherits the slack from the group
+	 * leader. New process uses system-default slack.
+	 */
+	p->timer_slack_ns = (clone_flags & CLONE_THREAD) ?
+		current->group_leader->timer_slack_ns :
+		default_timer_slack_ns;
 
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be2..a0d302d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1887,7 +1887,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
-					     current->timer_slack_ns);
+					     task_timer_slack(current));
 	}
 
 retry:
@@ -2281,7 +2281,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
-					     current->timer_slack_ns);
+					     task_timer_slack(current));
 	}
 
 	/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf5..0c56fec 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -51,6 +51,12 @@
 #include <trace/events/timer.h>
 
 /*
+ * Default hrtimer slack value, in nanoseconds. May be changed in
+ * [0..HRTIMER_MAX_SLACK] range through kernel.timer_slack sysctl.
+ */
+__read_mostly int default_timer_slack_ns = HRTIMER_DEFAULT_SLACK;
+
+/*
  * The timer bases:
  *
  * There are more clockids then hrtimer bases. Thus, we index
@@ -1564,9 +1570,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	int ret = 0;
 	unsigned long slack;
 
-	slack = current->timer_slack_ns;
-	if (rt_task(current))
-		slack = 0;
+	slack = task_timer_slack(current);
 
 	hrtimer_init_on_stack(&t.timer, clockid, mode);
 	hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
diff --git a/kernel/sys.c b/kernel/sys.c
index 4070153..ac32846 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/key.h>
 #include <linux/times.h>
+#include <linux/hrtimer.h>
 #include <linux/posix-timers.h>
 #include <linux/security.h>
 #include <linux/dcookies.h>
@@ -1917,12 +1918,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			error = current->timer_slack_ns;
 			break;
 		case PR_SET_TIMERSLACK:
-			if (arg2 <= 0)
-				current->timer_slack_ns =
-					current->default_timer_slack_ns;
-			else
+			if (arg2 <= HRTIMER_MAX_SLACK)
 				current->timer_slack_ns = arg2;
-			error = 0;
+			else
+				error = -EINVAL;
 			break;
 		case PR_MCE_KILL:
 			if (arg4 | arg5)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f25..2cd42c6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,6 +136,7 @@ static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
+static const int slack_max = HRTIMER_MAX_SLACK;
 
 #ifdef CONFIG_INOTIFY_USER
 #include <linux/inotify.h>
@@ -1004,6 +1005,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+	{
+		.procname	= "timer_slack",
+		.data		= &default_timer_slack_ns,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &slack_max,
+	},
 	{ }
 };
 
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-04-05  0:10   ` Andrew Morton
  0 siblings, 0 replies; 8+ messages in thread
From: Andrew Morton @ 2012-04-05  0:10 UTC (permalink / raw)
  To: Dmitry Antipov
  Cc: Thomas Gleixner, linux-kernel, linaro-dev, patches, linux-man

On Mon, 20 Feb 2012 11:49:32 +0400
Dmitry Antipov <dmitry.antipov@linaro.org> wrote:

> This patch proposes a system-wide sysctl-aware default for the
> high-resolution timer slack value, which may be changed from 0
> to HRTIMER_MAX_SLACK nanoseconds. Default system-wide and per-task
> values are HRTIMER_DEFAULT_SLACK. Per-task value isn't inherited
> across fork(); instead, newborn task uses system-wide value by
> default, and newborn thread uses it's group leader value.

Well..  there are some back-incompatibilities here. 
prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to
whatever-we-inherited-at-fork, but that has been removed.  What are the
implications of this, and did we need to do it?

If we do make changes in this area then the prctl manpage should be
updated, please.  And if
http://www.spinics.net/lists/linux-man/msg01149.html represents the
current state of that manpage then it should be updated anyway - that
entry doesn't say anything about the (arg2 <= 0) case.



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-04-05  0:10   ` Andrew Morton
  0 siblings, 0 replies; 8+ messages in thread
From: Andrew Morton @ 2012-04-05  0:10 UTC (permalink / raw)
  To: Dmitry Antipov
  Cc: Thomas Gleixner, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linaro-dev-cunTk1MwBs8s++Sfvej+rw,
	patches-QSEj5FYQhm4dnm+yROfE0A, linux-man-u79uwXL29TY76Z2rM5mHXA

On Mon, 20 Feb 2012 11:49:32 +0400
Dmitry Antipov <dmitry.antipov-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org> wrote:

> This patch proposes a system-wide sysctl-aware default for the
> high-resolution timer slack value, which may be changed from 0
> to HRTIMER_MAX_SLACK nanoseconds. Default system-wide and per-task
> values are HRTIMER_DEFAULT_SLACK. Per-task value isn't inherited
> across fork(); instead, newborn task uses system-wide value by
> default, and newborn thread uses it's group leader value.

Well..  there are some back-incompatibilities here. 
prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to
whatever-we-inherited-at-fork, but that has been removed.  What are the
implications of this, and did we need to do it?

If we do make changes in this area then the prctl manpage should be
updated, please.  And if
http://www.spinics.net/lists/linux-man/msg01149.html represents the
current state of that manpage then it should be updated anyway - that
entry doesn't say anything about the (arg2 <= 0) case.


--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-04-06  9:14     ` Dmitry Antipov
  0 siblings, 0 replies; 8+ messages in thread
From: Dmitry Antipov @ 2012-04-06  9:14 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Thomas Gleixner, linux-kernel, linaro-dev, patches, linux-man

On 04/05/2012 04:10 AM, Andrew Morton wrote:

> Well..  there are some back-incompatibilities here.
> prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to
> whatever-we-inherited-at-fork, but that has been removed.  What are the
> implications of this, and did we need to do it?

It seems you're looking at the previous version of this patch
(http://lkml.org/lkml/2012/2/20/55). Latest proposal is
http://lwn.net/Articles/484162/, which defines PR_SET_TIMERSLACK
action as:
...
case PR_SET_TIMERSLACK:
         if (arg2 <= 0)
                 current->timer_slack_ns =
                         default_timer_slack_ns;
         else if (arg2 <= HRTIMER_MAX_SLACK)
                 current->timer_slack_ns = arg2;
         else
                 error = -EINVAL;
         break;
...

> If we do make changes in this area then the prctl manpage should be
> updated, please.  And if
> http://www.spinics.net/lists/linux-man/msg01149.html represents the
> current state of that manpage then it should be updated anyway - that
> entry doesn't say anything about the (arg2<= 0) case.

I sent a patch for man pages too, it should be one of the recent posts
at http://www.spinics.net/lists/linux-man/index.html.

Dmitry



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-04-06  9:14     ` Dmitry Antipov
  0 siblings, 0 replies; 8+ messages in thread
From: Dmitry Antipov @ 2012-04-06  9:14 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Thomas Gleixner, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linaro-dev-cunTk1MwBs8s++Sfvej+rw,
	patches-QSEj5FYQhm4dnm+yROfE0A, linux-man-u79uwXL29TY76Z2rM5mHXA

On 04/05/2012 04:10 AM, Andrew Morton wrote:

> Well..  there are some back-incompatibilities here.
> prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to
> whatever-we-inherited-at-fork, but that has been removed.  What are the
> implications of this, and did we need to do it?

It seems you're looking at the previous version of this patch
(http://lkml.org/lkml/2012/2/20/55). Latest proposal is
http://lwn.net/Articles/484162/, which defines PR_SET_TIMERSLACK
action as:
...
case PR_SET_TIMERSLACK:
         if (arg2 <= 0)
                 current->timer_slack_ns =
                         default_timer_slack_ns;
         else if (arg2 <= HRTIMER_MAX_SLACK)
                 current->timer_slack_ns = arg2;
         else
                 error = -EINVAL;
         break;
...

> If we do make changes in this area then the prctl manpage should be
> updated, please.  And if
> http://www.spinics.net/lists/linux-man/msg01149.html represents the
> current state of that manpage then it should be updated anyway - that
> entry doesn't say anything about the (arg2<= 0) case.

I sent a patch for man pages too, it should be one of the recent posts
at http://www.spinics.net/lists/linux-man/index.html.

Dmitry


--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-04-24 22:06       ` Michael Kerrisk
  0 siblings, 0 replies; 8+ messages in thread
From: Michael Kerrisk @ 2012-04-24 22:06 UTC (permalink / raw)
  To: Dmitry Antipov
  Cc: Andrew Morton, Thomas Gleixner, linux-kernel, linaro-dev,
	patches, linux-man

Dmitry,

On Fri, Apr 6, 2012 at 9:14 PM, Dmitry Antipov
<dmitry.antipov@linaro.org> wrote:
> On 04/05/2012 04:10 AM, Andrew Morton wrote:
>
>> Well..  there are some back-incompatibilities here.
>> prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to
>> whatever-we-inherited-at-fork, but that has been removed.  What are the
>> implications of this, and did we need to do it?
>
>
> It seems you're looking at the previous version of this patch
> (http://lkml.org/lkml/2012/2/20/55). Latest proposal is
> http://lwn.net/Articles/484162/, which defines PR_SET_TIMERSLACK
> action as:
> ...
> case PR_SET_TIMERSLACK:
>        if (arg2 <= 0)
>                current->timer_slack_ns =
>                        default_timer_slack_ns;
>        else if (arg2 <= HRTIMER_MAX_SLACK)
>                current->timer_slack_ns = arg2;
>        else
>                error = -EINVAL;
>        break;
> ...
>
>
>> If we do make changes in this area then the prctl manpage should be
>> updated, please.  And if
>> http://www.spinics.net/lists/linux-man/msg01149.html represents the
>> current state of that manpage then it should be updated anyway - that
>> entry doesn't say anything about the (arg2<= 0) case.
>
>
> I sent a patch for man pages too, it should be one of the recent posts
> at http://www.spinics.net/lists/linux-man/index.html.

Your response didn't actually address Andrew's point. Your patch
changes user-visible semantics that have been in place since kernel
2.6.28. Specifically:

* The meaning of prctl(PR_SET_TIMERSLACK, n) changes,
  for the n<0 case (formerly, this reverted the timer slack
  to the per-process "default"; with the proposed patch, it
  reverts the timer slack to a system-wide default).
* The semantics of setting the timer slack of a new thread
  have changed.

Perhaps these changes are warranted/necessary, but they *are* ABI
changes, and so should be carefully explained and well justified.

Thanks,

Michael

PS As background to the discussion, here's the current draft of some
text I plan to add to prctl(2) that explains the current semantics,
which would change with Dmitry's patch:

prctl(2):
       PR_SET_TIMERSLACK (since Linux 2.6.28)
              Set  the  timer slack for the calling thread to the value in
              arg2.  The timer slack is a value, expressed in nanoseconds,
              that  is  used  by the kernel to group timer expirations for
              this thread that are close to one another; as a consequence,
              timer expirations for this thread may be up to the specified
              number of nanoseconds late (but will  never  expire  early).
              Grouping timer expirations can help reduce system power con‐
              sumption by minimizing CPU wake-ups.

              The timer expirations affected by timer slack are those  set
              by  select(2), pselect(2), poll(2), ppoll(2), epoll_wait(2),
              epoll_pwait(2),   clock_nanosleep(2),   nanosleep(2),    and
              futex(2)  (and  thus  the  library functions implemented via
              futexes: pthread_cond_timedwait(3),  pthread_rwlock_timedrd‐
              lock(3), pthread_rwlock_timedwrlock(3), and sem_wait(3)).

              Each  thread  has  two  associated  timer  slack  values:  a
              "default" value, and a "current" value.  The "current" value
              is the one that governs grouping of timer expirations.  When
              a new thread is created, the two timer slack values are made
              the  same  as  the  "current"  value of the creating thread.
              Thereafter, a thread can adjust its timer  slack  value  via
              PR_SET_TIMERSLACK:  if  arg2  is  greater than zero, then it
              specifies a new value for the "current" timer slack for  the
              calling  thread; if arg2 is less than or equal to zero, then
              the "current" timer slack is set  to  the  "default"  value.
              The  timer  slack value of init (PID 1), the ancestor of all
              threads, is 50,000 nanoseconds (50 microseconds).

fork(2):
       *  The "default" timer slack of the child is set to  the  value  of
          the  "current"  timer slack of the parent.  (See the description
          of PR_SET_TIMERSLACK on prctl(2).)

-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-04-24 22:06       ` Michael Kerrisk
  0 siblings, 0 replies; 8+ messages in thread
From: Michael Kerrisk @ 2012-04-24 22:06 UTC (permalink / raw)
  To: Dmitry Antipov
  Cc: Andrew Morton, Thomas Gleixner,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linaro-dev-cunTk1MwBs8s++Sfvej+rw,
	patches-QSEj5FYQhm4dnm+yROfE0A, linux-man-u79uwXL29TY76Z2rM5mHXA

Dmitry,

On Fri, Apr 6, 2012 at 9:14 PM, Dmitry Antipov
<dmitry.antipov-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org> wrote:
> On 04/05/2012 04:10 AM, Andrew Morton wrote:
>
>> Well..  there are some back-incompatibilities here.
>> prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to
>> whatever-we-inherited-at-fork, but that has been removed.  What are the
>> implications of this, and did we need to do it?
>
>
> It seems you're looking at the previous version of this patch
> (http://lkml.org/lkml/2012/2/20/55). Latest proposal is
> http://lwn.net/Articles/484162/, which defines PR_SET_TIMERSLACK
> action as:
> ...
> case PR_SET_TIMERSLACK:
>        if (arg2 <= 0)
>                current->timer_slack_ns =
>                        default_timer_slack_ns;
>        else if (arg2 <= HRTIMER_MAX_SLACK)
>                current->timer_slack_ns = arg2;
>        else
>                error = -EINVAL;
>        break;
> ...
>
>
>> If we do make changes in this area then the prctl manpage should be
>> updated, please.  And if
>> http://www.spinics.net/lists/linux-man/msg01149.html represents the
>> current state of that manpage then it should be updated anyway - that
>> entry doesn't say anything about the (arg2<= 0) case.
>
>
> I sent a patch for man pages too, it should be one of the recent posts
> at http://www.spinics.net/lists/linux-man/index.html.

Your response didn't actually address Andrew's point. Your patch
changes user-visible semantics that have been in place since kernel
2.6.28. Specifically:

* The meaning of prctl(PR_SET_TIMERSLACK, n) changes,
  for the n<0 case (formerly, this reverted the timer slack
  to the per-process "default"; with the proposed patch, it
  reverts the timer slack to a system-wide default).
* The semantics of setting the timer slack of a new thread
  have changed.

Perhaps these changes are warranted/necessary, but they *are* ABI
changes, and so should be carefully explained and well justified.

Thanks,

Michael

PS As background to the discussion, here's the current draft of some
text I plan to add to prctl(2) that explains the current semantics,
which would change with Dmitry's patch:

prctl(2):
       PR_SET_TIMERSLACK (since Linux 2.6.28)
              Set  the  timer slack for the calling thread to the value in
              arg2.  The timer slack is a value, expressed in nanoseconds,
              that  is  used  by the kernel to group timer expirations for
              this thread that are close to one another; as a consequence,
              timer expirations for this thread may be up to the specified
              number of nanoseconds late (but will  never  expire  early).
              Grouping timer expirations can help reduce system power con‐
              sumption by minimizing CPU wake-ups.

              The timer expirations affected by timer slack are those  set
              by  select(2), pselect(2), poll(2), ppoll(2), epoll_wait(2),
              epoll_pwait(2),   clock_nanosleep(2),   nanosleep(2),    and
              futex(2)  (and  thus  the  library functions implemented via
              futexes: pthread_cond_timedwait(3),  pthread_rwlock_timedrd‐
              lock(3), pthread_rwlock_timedwrlock(3), and sem_wait(3)).

              Each  thread  has  two  associated  timer  slack  values:  a
              "default" value, and a "current" value.  The "current" value
              is the one that governs grouping of timer expirations.  When
              a new thread is created, the two timer slack values are made
              the  same  as  the  "current"  value of the creating thread.
              Thereafter, a thread can adjust its timer  slack  value  via
              PR_SET_TIMERSLACK:  if  arg2  is  greater than zero, then it
              specifies a new value for the "current" timer slack for  the
              calling  thread; if arg2 is less than or equal to zero, then
              the "current" timer slack is set  to  the  "default"  value.
              The  timer  slack value of init (PID 1), the ancestor of all
              threads, is 50,000 nanoseconds (50 microseconds).

fork(2):
       *  The "default" timer slack of the child is set to  the  value  of
          the  "current"  timer slack of the parent.  (See the description
          of PR_SET_TIMERSLACK on prctl(2).)

-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks
@ 2012-02-28  8:46 Dmitry Antipov
  0 siblings, 0 replies; 8+ messages in thread
From: Dmitry Antipov @ 2012-02-28  8:46 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Ingo Molnar, Peter Zijlstra, John Stultz, linux-kernel,
	linaro-dev, patches, Dmitry Antipov

This patch proposes a system-wide sysctl-aware default for the
high-resolution timer slack value, which may be changed from 0
to HRTIMER_MAX_SLACK nanoseconds. Default system-wide and per-task
values are HRTIMER_DEFAULT_SLACK. Per-task value isn't inherited
across fork(); instead, newborn task uses system-wide value by
default, and newborn thread uses its group leader's value.

Signed-off-by: Dmitry Antipov <dmitry.antipov@linaro.org>
---
 Documentation/sysctl/kernel.txt |    8 ++++++++
 include/linux/hrtimer.h         |   11 +++++++++++
 include/linux/init_task.h       |    2 +-
 include/linux/sched.h           |   11 ++++++++---
 kernel/fork.c                   |    9 +++++++--
 kernel/futex.c                  |    4 ++--
 kernel/hrtimer.c                |   10 +++++++---
 kernel/sys.c                    |    8 +++++---
 kernel/sysctl.c                 |   10 ++++++++++
 9 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d78841..83b63ed 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -606,6 +606,14 @@ can be ORed together:
 
 ==============================================================
 
+timer_slack:
+
+This value can be used to query and set the default slack for
+high-resolution timers, in nanoseconds. The default value is 50
+microseconds, and can be changed from 0 nanoseconds to 1 millisecond.
+
+==============================================================
+
 unknown_nmi_panic:
 
 The value in this file affects behavior of handling NMI. When the
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fd0dc30..b9da137 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -24,6 +24,16 @@
 #include <linux/timer.h>
 #include <linux/timerqueue.h>
 
+/*
+ * Default system-wide and per-task hrtimer slack, in nanoseconds.
+ */
+#define HRTIMER_DEFAULT_SLACK 50000
+
+/*
+ * Reasonable limit for hrtimer slack, in nanoseconds.
+ */
+#define HRTIMER_MAX_SLACK 1000000
+
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
 
@@ -323,6 +333,7 @@ extern ktime_t ktime_get_monotonic_offset(void);
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 
+extern int default_timer_slack_ns;
 
 /* Exported timer functions: */
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9c66b1a..b29be0d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -178,7 +178,7 @@ extern struct cred init_cred;
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
-	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.timer_slack_ns = HRTIMER_DEFAULT_SLACK,			\
 	.pids = {							\
 		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
 		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d379a6..aa0a806 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1551,11 +1551,11 @@ struct task_struct {
 	struct latency_record latency_record[LT_SAVECOUNT];
 #endif
 	/*
-	 * time slack values; these are used to round up poll() and
-	 * select() etc timeout values. These are in nanoseconds.
+	 * High-resolution timer slack value, in nanoseconds.
+	 * Used to round up poll()/select(), nanosleep, futex
+	 * waiting, etc. timeout values of non-realtime tasks.
 	 */
 	unsigned long timer_slack_ns;
-	unsigned long default_timer_slack_ns;
 
 	struct list_head	*scm_work_list;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2628,6 +2628,11 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+static inline unsigned long task_timer_slack(struct task_struct *tsk)
+{
+	return rt_task(tsk) ? 0 : tsk->timer_slack_ns;
+}
+
 /*
  * Thread group CPU time accounting.
  */
diff --git a/kernel/fork.c b/kernel/fork.c
index e2cd3e2..0f9a983 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1167,8 +1167,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #if defined(SPLIT_RSS_COUNTING)
 	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
 #endif
-
-	p->default_timer_slack_ns = current->timer_slack_ns;
+	/* 
+	 * New thread inherits the slack from the group
+	 * leader. New process uses system-default slack.
+	 */
+	p->timer_slack_ns = (clone_flags & CLONE_THREAD) ?
+		current->group_leader->timer_slack_ns :
+		default_timer_slack_ns;
 
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be2..a0d302d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1887,7 +1887,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
-					     current->timer_slack_ns);
+					     task_timer_slack(current));
 	}
 
 retry:
@@ -2281,7 +2281,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
-					     current->timer_slack_ns);
+					     task_timer_slack(current));
 	}
 
 	/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf5..0c56fec 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -51,6 +51,12 @@
 #include <trace/events/timer.h>
 
 /*
+ * Default hrtimer slack value, in nanoseconds. May be changed in
+ * [0..HRTIMER_MAX_SLACK] range through kernel.timer_slack sysctl.
+ */
+__read_mostly int default_timer_slack_ns = HRTIMER_DEFAULT_SLACK;
+
+/*
  * The timer bases:
  *
  * There are more clockids then hrtimer bases. Thus, we index
@@ -1564,9 +1570,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	int ret = 0;
 	unsigned long slack;
 
-	slack = current->timer_slack_ns;
-	if (rt_task(current))
-		slack = 0;
+	slack = task_timer_slack(current);
 
 	hrtimer_init_on_stack(&t.timer, clockid, mode);
 	hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
diff --git a/kernel/sys.c b/kernel/sys.c
index 4070153..e976540 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/key.h>
 #include <linux/times.h>
+#include <linux/hrtimer.h>
 #include <linux/posix-timers.h>
 #include <linux/security.h>
 #include <linux/dcookies.h>
@@ -1919,10 +1920,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_TIMERSLACK:
 			if (arg2 <= 0)
 				current->timer_slack_ns =
-					current->default_timer_slack_ns;
-			else
+					default_timer_slack_ns;
+			else if (arg2 <= HRTIMER_MAX_SLACK)
 				current->timer_slack_ns = arg2;
-			error = 0;
+			else
+				error = -EINVAL;
 			break;
 		case PR_MCE_KILL:
 			if (arg4 | arg5)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f25..2cd42c6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,6 +136,7 @@ static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
+static const int slack_max = HRTIMER_MAX_SLACK;
 
 #ifdef CONFIG_INOTIFY_USER
 #include <linux/inotify.h>
@@ -1004,6 +1005,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+	{
+		.procname	= "timer_slack",
+		.data		= &default_timer_slack_ns,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &slack_max,
+	},
 	{ }
 };
 
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2012-04-24 22:06 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-02-20  7:49 [RFC PATCH] hrtimers: system-wide and per-task hrtimer slacks Dmitry Antipov
2012-04-05  0:10 ` Andrew Morton
2012-04-05  0:10   ` Andrew Morton
2012-04-06  9:14   ` Dmitry Antipov
2012-04-06  9:14     ` Dmitry Antipov
2012-04-24 22:06     ` Michael Kerrisk
2012-04-24 22:06       ` Michael Kerrisk
2012-02-28  8:46 Dmitry Antipov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.