linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] Imprecise timers.
@ 2008-07-22  3:02 David Woodhouse
  2008-07-22  3:05 ` [RFC] schedule_timeout_range() David Woodhouse
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: David Woodhouse @ 2008-07-22  3:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: Thomas Gleixner, Ingo Molnar, arjan

Many users of timers don't really care too much about exactly when their
timer fires -- and waking a CPU to satisfy such a timer is a waste of
power. This patch implements a 'range' timer which will fire at a 'convenient'
moment within given constraints.

It's implemented by a deferrable timer at the beginning of the range,
which will run some time later when the CPU happens to be awake. And a
non-deferrable timer at the hard deadline, to ensure it really does
happen by then.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/timer.h |  101 ++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/timer.c        |   18 +++++---
 2 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index d4ba792..163137c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -23,22 +23,60 @@ struct timer_list {
 #endif
 };
 
+/* This can probably be optimised somehow, but for now we do it the
+   simple way: two timers, one deferrable and one for the deadline. */
+struct range_timer {
+	struct timer_list early;
+	struct timer_list deadline;
+	void (*function)(unsigned long);
+	unsigned long data;
+};
+
 extern struct tvec_base boot_tvec_bases;
+/*
+ * Note that all tvec_bases are 2 byte aligned and lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB for
+ * the new flag to indicate whether the timer is deferrable
+ */
+#define TBASE_DEFERRABLE_FLAG		(0x1)
 
-#define TIMER_INITIALIZER(_function, _expires, _data) {		\
+#define __TIMER_INITIALIZER(_function, _expires, _data, _base) {\
 		.entry = { .prev = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = &boot_tvec_bases,			\
+		.base = (struct tvec_base *)(_base),		\
 	}
+#define TIMER_INITIALIZER(_function, _expires, _data)		\
+	__TIMER_INITIALIZER(_function, _expires, _data, &boot_tvec_bases)
+
+#define TIMER_INITIALIZER_DEFERRABLE(_function, _expires, _data)	\
+	__TIMER_INITIALIZER(_function, _expires, _data,			\
+		(unsigned long)&boot_tvec_bases + TBASE_DEFERRABLE_FLAG)
+
+void range_timer_func(unsigned long timer);
 
 #define DEFINE_TIMER(_name, _function, _expires, _data)		\
 	struct timer_list _name =				\
 		TIMER_INITIALIZER(_function, _expires, _data)
 
+#define DEFINE_RANGE_TIMER(_name, _function, _expires, _deadline, _data)\
+	struct range_timer _name = {					\
+		.early = TIMER_INITIALIZER_DEFERRABLE(range_timer_func,	\
+			(_expires), (unsigned long)&(_name)),		\
+		.deadline = TIMER_INITIALIZER(range_timer_func,		\
+			(_deadline), (unsigned long)(&(_name))),	\
+		.function = (_function),				\
+		.data = (_data),					\
+	}
+
 void init_timer(struct timer_list *timer);
 void init_timer_deferrable(struct timer_list *timer);
+static inline void init_range_timer(struct range_timer *timer)
+{
+	init_timer_deferrable(&timer->early);
+	init_timer(&timer->deadline);
+}
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void init_timer_on_stack(struct timer_list *timer);
@@ -51,6 +89,19 @@ static inline void init_timer_on_stack(struct timer_list *timer)
 }
 #endif
 
+static inline void setup_range_timer(struct range_timer *timer,
+				     void (*function)(unsigned long),
+				     unsigned long data)
+{
+	timer->early.function = range_timer_func;
+	timer->early.data = (unsigned long)timer;
+	timer->deadline.function = range_timer_func;
+	timer->deadline.data = (unsigned long)timer;
+	timer->function = function;
+	timer->data = data;
+
+	init_range_timer(timer);
+}
 static inline void setup_timer(struct timer_list * timer,
 				void (*function)(unsigned long),
 				unsigned long data)
@@ -83,12 +134,54 @@ static inline int timer_pending(const struct timer_list * timer)
 {
 	return timer->entry.next != NULL;
 }
+static inline int range_timer_pending(const struct range_timer *timer)
+{
+	return timer_pending(&timer->early) || timer_pending(&timer->deadline);
+}
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
 extern int __mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 
+static inline void add_range_timer_on(struct range_timer *timer, int cpu)
+{
+	add_timer_on(&timer->early, cpu);
+	add_timer_on(&timer->deadline, cpu);
+}
+static inline int del_range_timer(struct range_timer *timer)
+{
+	return del_timer(&timer->early) | del_timer(&timer->deadline);
+}
+static inline int __mod_range_timer(struct range_timer *timer,
+				    unsigned long expires,
+				    unsigned long deadline)
+{
+	int ret;
+	WARN_ON(deadline < expires);
+	/* We need them on the same CPU */
+	preempt_disable();
+	ret = __mod_timer(&timer->early, expires);
+	ret |= __mod_timer(&timer->deadline, deadline);
+	preempt_enable();
+
+	return ret;
+}
+
+static inline int mod_range_timer(struct range_timer *timer,
+				  unsigned long expires,
+				  unsigned long deadline)
+{
+	int ret;
+	WARN_ON(deadline < expires);
+	/* We need them on the same CPU */
+	preempt_disable();
+	ret = mod_timer(&timer->early, expires);
+	ret |= mod_timer(&timer->deadline, deadline);
+	preempt_enable();
+
+	return ret;
+}
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
@@ -174,6 +267,10 @@ static inline void add_timer(struct timer_list *timer)
 # define del_timer_sync(t)		del_timer(t)
 #endif
 
+static inline int del_range_timer_sync(struct range_timer *timer)
+{
+	return del_timer_sync(&timer->early) | del_timer_sync(&timer->deadline);
+}
 #define del_singleshot_timer_sync(t) del_timer_sync(t)
 
 extern void init_timers(void);
diff --git a/kernel/timer.c b/kernel/timer.c
index 03bc7f1..e114f08 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -81,13 +81,6 @@ struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 
-/*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB for
- * the new flag to indicate whether the timer is deferrable
- */
-#define TBASE_DEFERRABLE_FLAG		(0x1)
-
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
@@ -1525,3 +1518,14 @@ unsigned long msleep_interruptible(unsigned int msecs)
 }
 
 EXPORT_SYMBOL(msleep_interruptible);
+
+void range_timer_func(unsigned long t)
+{
+	struct range_timer *timer = (void *)t;
+
+	del_timer(&timer->early);
+	del_timer(&timer->deadline);
+
+	timer->function(timer->data);
+}
+EXPORT_SYMBOL_GPL(range_timer_func);
-- 
1.5.5.1


-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [RFC] schedule_timeout_range()
  2008-07-22  3:02 [RFC] Imprecise timers David Woodhouse
@ 2008-07-22  3:05 ` David Woodhouse
  2008-07-22  3:56   ` Nick Piggin
  2008-07-22  7:19 ` [RFC] Imprecise timers Rene Herman
  2008-07-29  0:36 ` Pallipadi, Venkatesh
  2 siblings, 1 reply; 22+ messages in thread
From: David Woodhouse @ 2008-07-22  3:05 UTC (permalink / raw)
  To: linux-kernel; +Cc: Thomas Gleixner, Ingo Molnar, arjan

Along the same lines as the previous patch, this provides
schedule_timeout_range() for when the precise moment of wakeup doesn't
matter (and isn't worth wasting power on), but any time the CPU happens
to be awake within a given range of time is fine.

Implement schedule_timeout() using it, and likewise for the _killable,
_interruptible and _uninterruptible variants.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/sched.h |   16 +++++--
 kernel/timer.c        |  127 +++++++++++++++++++++++++++++++------------------
 2 files changed, 93 insertions(+), 50 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1941d8b..5e9f5a9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -325,12 +325,20 @@ extern char __sched_text_start[], __sched_text_end[];
 extern int in_sched_functions(unsigned long addr);
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
-extern signed long schedule_timeout(signed long timeout);
-extern signed long schedule_timeout_interruptible(signed long timeout);
-extern signed long schedule_timeout_killable(signed long timeout);
-extern signed long schedule_timeout_uninterruptible(signed long timeout);
+extern long schedule_timeout_range(long timeout, long deadline);
+extern long schedule_timeout_range_interruptible(long timeout, long deadline);
+extern long schedule_timeout_range_killable(long timeout, long deadline);
+extern long schedule_timeout_range_uninterruptible(long timeout, long deadline);
 asmlinkage void schedule(void);
 
+#define schedule_timeout(_t) schedule_timeout_range((_t), (_t))
+#define schedule_timeout_interruptible(_t) \
+	schedule_timeout_range_interruptible((_t), (_t))
+#define schedule_timeout_killable(_t) \
+	schedule_timeout_range_killable((_t), (_t))
+#define schedule_timeout_uninterruptible(_t) \
+	schedule_timeout_range_uninterruptible((_t), (_t))
+
 struct nsproxy;
 struct user_namespace;
 
diff --git a/kernel/timer.c b/kernel/timer.c
index e114f08..dd43c34 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1144,11 +1144,12 @@ static void process_timeout(unsigned long __data)
 }
 
 /**
- * schedule_timeout - sleep until timeout
- * @timeout: timeout value in jiffies
+ * schedule_timeout_range - sleep until timeout
+ * @timeout: timeout value in jiffies, deferrable
+ * @deadline: hard deadline for timeout
  *
- * Make the current task sleep until @timeout jiffies have
- * elapsed. The routine will return immediately unless
+ * Make the current task sleep for a length of time between @timeout
+ * and @deadline jiffies. The routine will return immediately unless
  * the current task state has been set (see set_current_state()).
  *
  * You can set the task state as follows -
@@ -1169,81 +1170,115 @@ static void process_timeout(unsigned long __data)
  *
  * In all cases the return value is guaranteed to be non-negative.
  */
-signed long __sched schedule_timeout(signed long timeout)
+signed long __sched schedule_timeout_range(signed long timeout,
+					   signed long deadline)
 {
 	struct timer_list timer;
+	struct timer_list timer2;
 	unsigned long expire;
+	unsigned long expire2;
 
-	switch (timeout)
-	{
-	case MAX_SCHEDULE_TIMEOUT:
-		/*
-		 * These two special cases are useful to be comfortable
-		 * in the caller. Nothing more. We could take
-		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
-		 * but I' d like to return a valid offset (>=0) to allow
-		 * the caller to do everything it want with the retval.
-		 */
+	/*
+	 * This special case is useful to be comfortable
+	 * in the caller. Nothing more. We could take
+	 * MAX_SCHEDULE_TIMEOUT from one of the negative value
+	 * but I'd like to return a valid offset (>=0) to allow
+	 * the caller to do everything it want with the retval.
+	 */
+	if (timeout == MAX_SCHEDULE_TIMEOUT &&
+	    deadline == MAX_SCHEDULE_TIMEOUT) {
 		schedule();
 		goto out;
-	default:
-		/*
-		 * Another bit of PARANOID. Note that the retval will be
-		 * 0 since no piece of kernel is supposed to do a check
-		 * for a negative retval of schedule_timeout() (since it
-		 * should never happens anyway). You just have the printk()
-		 * that will tell you if something is gone wrong and where.
-		 */
-		if (timeout < 0) {
-			printk(KERN_ERR "schedule_timeout: wrong timeout "
-				"value %lx\n", timeout);
-			dump_stack();
-			current->state = TASK_RUNNING;
-			goto out;
-		}
+	}
+       /*
+	* Another bit of PARANOIA. Note that the retval will be
+	* 0 since no piece of kernel is supposed to do a check
+	* for a negative retval of schedule_timeout() (since it
+	* should never happens anyway). You just have the printk()
+	* that will tell you if something is gone wrong and where.
+	*/
+	if (unlikely(timeout < 0)) {
+		printk(KERN_ERR "schedule_timeout: wrong timeout "
+		       "value %lx\n", timeout);
+		dump_stack();
+		current->state = TASK_RUNNING;
+		goto out;
+	}
+	if (unlikely(deadline < 0)) {
+		printk(KERN_ERR "schedule_timeout: wrong deadline "
+		       "value %lx\n", deadline);
+		dump_stack();
+		current->state = TASK_RUNNING;
+		goto out;
+	}
+	if (unlikely(timeout > deadline)) {
+		printk(KERN_ERR "schedule_timeout: deadline %lx earlier "
+		       "than initial timeout %lx\n", deadline, timeout);
+		timeout = deadline;
 	}
 
 	expire = timeout + jiffies;
+	expire2 = expire + deadline - timeout;
+
+	/* Don't bother to set up the deferrable timer if the deadline
+	   is at the same time */
+	if (timeout != deadline) {
+		setup_timer_on_stack(&timer, process_timeout,
+				     (unsigned long)current);
+		timer_set_deferrable(&timer);
+		__mod_timer(&timer, expire);
+	}
+	/* And don't bother with the deadline if it's infinite */
+ 	if (deadline != MAX_SCHEDULE_TIMEOUT) {
+		setup_timer_on_stack(&timer2, process_timeout,
+				     (unsigned long)current);
+		__mod_timer(&timer2, expire2);
+	}
 
-	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire);
 	schedule();
-	del_singleshot_timer_sync(&timer);
-
-	/* Remove the timer from the object tracker */
-	destroy_timer_on_stack(&timer);
+	if (timeout != deadline) {
+		del_singleshot_timer_sync(&timer);
+		destroy_timer_on_stack(&timer);
+	}
+	if (deadline != MAX_SCHEDULE_TIMEOUT) {
+		del_singleshot_timer_sync(&timer2);
+		destroy_timer_on_stack(&timer2);
+	}
 
 	timeout = expire - jiffies;
 
  out:
 	return timeout < 0 ? 0 : timeout;
 }
-EXPORT_SYMBOL(schedule_timeout);
+EXPORT_SYMBOL(schedule_timeout_range);
 
 /*
  * We can use __set_current_state() here because schedule_timeout() calls
  * schedule() unconditionally.
  */
-signed long __sched schedule_timeout_interruptible(signed long timeout)
+signed long __sched schedule_timeout_range_interruptible(signed long timeout,
+							 signed long deadline)
 {
 	__set_current_state(TASK_INTERRUPTIBLE);
-	return schedule_timeout(timeout);
+	return schedule_timeout_range(timeout, deadline);
 }
-EXPORT_SYMBOL(schedule_timeout_interruptible);
+EXPORT_SYMBOL(schedule_timeout_range_interruptible);
 
-signed long __sched schedule_timeout_killable(signed long timeout)
+signed long __sched schedule_timeout_range_killable(signed long timeout,
+						    signed long deadline)
 {
 	__set_current_state(TASK_KILLABLE);
-	return schedule_timeout(timeout);
+	return schedule_timeout_range(timeout, deadline);
 }
-EXPORT_SYMBOL(schedule_timeout_killable);
+EXPORT_SYMBOL(schedule_timeout_range_killable);
 
-signed long __sched schedule_timeout_uninterruptible(signed long timeout)
+signed long __sched schedule_timeout_range_uninterruptible(signed long timeout,
+							   signed long deadline)
 {
 	__set_current_state(TASK_UNINTERRUPTIBLE);
-	return schedule_timeout(timeout);
+	return schedule_timeout_range(timeout, deadline);
 }
-EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+EXPORT_SYMBOL(schedule_timeout_range_uninterruptible);
 
 /* Thread ID - the internal kernel "pid" */
 asmlinkage long sys_gettid(void)
-- 
1.5.5.1



-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  3:05 ` [RFC] schedule_timeout_range() David Woodhouse
@ 2008-07-22  3:56   ` Nick Piggin
  2008-07-22  4:12     ` David Woodhouse
  2008-07-22  4:33     ` Arjan van de Ven
  0 siblings, 2 replies; 22+ messages in thread
From: Nick Piggin @ 2008-07-22  3:56 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Tuesday 22 July 2008 13:05, David Woodhouse wrote:
> Along the same lines as the previous patch, this provides
> schedule_timeout_range() for when the precise moment of wakeup doesn't
> matter (and isn't worth wasting power on), but any time the CPU happens
> to be awake within a given range of time is fine.
>
> Implement schedule_timeout() using it, and likewise for the _killable,
> _interruptible and _uninterruptible variants.

Rather than specific "deadline" values (which we can't guarantee anyway),
or vague "can defer" values, I would prefer just a small selection of
maybe orders of magnitude flags, maybe SECONDS, MILLISECONDS, MICROSECONDS
which gives an amount of delay the kernel might add to the timer.

If you prefer the deadline parameter, don't call it a hard deadline
which is misleading.

Otherwise, seems like a pretty good idea.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  3:56   ` Nick Piggin
@ 2008-07-22  4:12     ` David Woodhouse
  2008-07-22  4:26       ` Arjan van de Ven
  2008-07-22  4:33       ` Nick Piggin
  2008-07-22  4:33     ` Arjan van de Ven
  1 sibling, 2 replies; 22+ messages in thread
From: David Woodhouse @ 2008-07-22  4:12 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Tue, 2008-07-22 at 13:56 +1000, Nick Piggin wrote:
> Rather than specific "deadline" values (which we can't guarantee anyway),
> or vague "can defer" values,

We already _have_ those vague 'can defer' timers. They'll get run the
next time the CPU happens to be awake after they expire.

>  I would prefer just a small selection of maybe orders of magnitude
> flags, maybe SECONDS, MILLISECONDS, MICROSECONDS which gives an amount
> of delay the kernel might add to the timer.

As far as I can tell, any implementation of that ends up being converted
into what we have at the moment -- a deferrable timer which gets run
some time after it expires, and a timer which would actually _wake_ a
sleeping CPU. You have to create a value for that final timer anyway, so
why not just let the in-kernel caller provide it?

There's no point in trying to coalesce the 'final' timeouts; if just one
of them wakes the CPU and we're in the range for any other 'range
timers', those others will happen immediately anyway.

We did ponder the idea of a per-process setting which affects userspace
delays like nanosleep/poll/select, and introduces a variable extra delay
if the CPU is actually sleeping. So we can reduce the number of CPU
wakeup events for those userspace apps which aren't timing-sensitive.

We were also thinking of extending nanosleep/ppoll/pselect also to take
a 'range', for those cases where the process-wide setting needs to be
overridden. The prctl is a simple solution which doesn't involve
modifying large chunks of userspace to use new system calls, but it's
not a panacea -- in some places, an app might _want_ a prompt wakeup.

For kernel timers, though, I think it's better to let the caller set a
non-deferrable timer at a specific time. Although you're right that
'deadline' is probably a bad name for it. 

How about 'start' and 'end'? Or 'early' and 'late'? I really don't care
too much what it's called.

-- 
dwmw2


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  4:12     ` David Woodhouse
@ 2008-07-22  4:26       ` Arjan van de Ven
  2008-07-22  4:34         ` David Woodhouse
  2008-07-22  4:33       ` Nick Piggin
  1 sibling, 1 reply; 22+ messages in thread
From: Arjan van de Ven @ 2008-07-22  4:26 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Nick Piggin, linux-kernel, Thomas Gleixner, Ingo Molnar

On Tue, 22 Jul 2008 00:12:02 -0400
David Woodhouse <dwmw2@infradead.org> wrote:

> How about 'start' and 'end'? Or 'early' and 'late'? I really don't
> care too much what it's called.

or "start" and "delta" ?


-- 
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  3:56   ` Nick Piggin
  2008-07-22  4:12     ` David Woodhouse
@ 2008-07-22  4:33     ` Arjan van de Ven
  1 sibling, 0 replies; 22+ messages in thread
From: Arjan van de Ven @ 2008-07-22  4:33 UTC (permalink / raw)
  To: Nick Piggin; +Cc: David Woodhouse, linux-kernel, Thomas Gleixner, Ingo Molnar

On Tue, 22 Jul 2008 13:56:29 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> Rather than specific "deadline" values (which we can't guarantee
> anyway), or vague "can defer" values, I would prefer just a small
> selection of maybe orders of magnitude flags, maybe SECONDS,
> MILLISECONDS, MICROSECONDS which gives an amount of delay the kernel
> might add to the timer.

the problem is that many of these are "I'd like a timeout between 30
and ..eh 40 seconds" (think scsi timeouts or networkcard timeouts)

it's not just specific slack units.

> 
-- 
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  4:12     ` David Woodhouse
  2008-07-22  4:26       ` Arjan van de Ven
@ 2008-07-22  4:33       ` Nick Piggin
  2008-07-22  4:45         ` David Woodhouse
  1 sibling, 1 reply; 22+ messages in thread
From: Nick Piggin @ 2008-07-22  4:33 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Tuesday 22 July 2008 14:12, David Woodhouse wrote:
> On Tue, 2008-07-22 at 13:56 +1000, Nick Piggin wrote:
> > Rather than specific "deadline" values (which we can't guarantee anyway),
> > or vague "can defer" values,
>
> We already _have_ those vague 'can defer' timers. They'll get run the
> next time the CPU happens to be awake after they expire.

Right, but that may be too vague to be really useful. OK, not exactly:
as with anything, if we really need an exact response, we have to wait
with interrupts disabled etc. However I don't think it would hurt to
get away from the all or nothing approach with future APIs that are added
(eventually the old ones could just be implemented over the new).


> >  I would prefer just a small selection of maybe orders of magnitude
> > flags, maybe SECONDS, MILLISECONDS, MICROSECONDS which gives an amount
> > of delay the kernel might add to the timer.
>
> As far as I can tell, any implementation of that ends up being converted
> into what we have at the moment -- a deferrable timer which gets run
> some time after it expires, and a timer which would actually _wake_ a
> sleeping CPU. You have to create a value for that final timer anyway, so
> why not just let the in-kernel caller provide it?

That is a fair point.


> There's no point in trying to coalesce the 'final' timeouts; if just one
> of them wakes the CPU and we're in the range for any other 'range
> timers', those others will happen immediately anyway.

Sure.


> We did ponder the idea of a per-process setting which affects userspace
> delays like nanosleep/poll/select, and introduces a variable extra delay
> if the CPU is actually sleeping. So we can reduce the number of CPU
> wakeup events for those userspace apps which aren't timing-sensitive.

Not such a bad idea. Maybe also something to think about adding explicitly
to future syscalls (if not a complete new parameter for delay time, then
at least a flag or two or different variants for different amounts of
accuracy). I guess select/poll is pretty widely used though, so there will
be some good gains just from a per-process setting.


> We were also thinking of extending nanosleep/ppoll/pselect also to take
> a 'range', for those cases where the process-wide setting needs to be
> overridden. The prctl is a simple solution which doesn't involve
> modifying large chunks of userspace to use new system calls, but it's
> not a panacea -- in some places, an app might _want_ a prompt wakeup.
>
> For kernel timers, though, I think it's better to let the caller set a
> non-deferrable timer at a specific time. Although you're right that
> 'deadline' is probably a bad name for it.
>
> How about 'start' and 'end'? Or 'early' and 'late'? I really don't care
> too much what it's called.

Well I think 'timeout' is fine for the "at least this much time", that's
well understood and used. As for the slop... slop? deferrable? Hmm,
precision might come pretty close to the engineering definition, no?

The only thing I dislike about explicit times is that when a driver or
someone doesn't _really_ know how much to specify. Do you say 10s, 100s?
It shouldn't be arbitrary, but we should have a few constants I think.

Some upper bound would be nice, which basically would not have to ever
fire by itself unless there is some CPU activity (so you don't have to
set two timers as a bonus). After that, I wonder, perhaps some "maximum
power savings value but not completely deferred"? Say give it a max of
30s? Or perhaps even that is not future-proof enough if we one day want
to suspend most of the system between external IOs?

if 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  4:26       ` Arjan van de Ven
@ 2008-07-22  4:34         ` David Woodhouse
  0 siblings, 0 replies; 22+ messages in thread
From: David Woodhouse @ 2008-07-22  4:34 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Nick Piggin, linux-kernel, Thomas Gleixner, Ingo Molnar

On Mon, 2008-07-21 at 21:26 -0700, Arjan van de Ven wrote:
> On Tue, 22 Jul 2008 00:12:02 -0400
> David Woodhouse <dwmw2@infradead.org> wrote:
> 
> > How about 'start' and 'end'? Or 'early' and 'late'? I really don't
> > care too much what it's called.
> 
> or "start" and "delta" ?

Yeah, that works -- it certainly makes a lot of sense for the
_userspace_ API to look like that. We could do the same for the
in-kernel one too...

-- 
dwmw2


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  4:33       ` Nick Piggin
@ 2008-07-22  4:45         ` David Woodhouse
  2008-07-22  4:50           ` Nick Piggin
  0 siblings, 1 reply; 22+ messages in thread
From: David Woodhouse @ 2008-07-22  4:45 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Tue, 2008-07-22 at 14:33 +1000, Nick Piggin wrote:
> The only thing I dislike about explicit times is that when a driver or
> someone doesn't _really_ know how much to specify. Do you say 10s, 100s?

This is true, but they certainly have a _better_ idea than we do. If the
individual callers can't even come up with an answer, how are we ever
going to come up with a generic policy that does the right thing? 

I really don't think that applying this kind of policy in generic code
is useful -- I'd like the callers to provide numbers even if they _do_
pull it out of their wossname.

The number they provide is the _maximum_ amount of time they should be
prepared to wait (let's assume for a moment that they stayed sober and
remembered Linux isn't a real-time kernel, so all guarantees are taken
with a pinch of salt. Let's not get bogged down in nomenclature).

In practice, they'll almost always get called before that maximum time
expires -- that's the whole _point_, of course. But we can't _invent_
that maximum in generic code; that's really up to the caller.

> Some upper bound would be nice, which basically would not have to ever
> fire by itself unless there is some CPU activity (so you don't have to
> set two timers as a bonus). After that, I wonder, perhaps some "maximum
> power savings value but not completely deferred"? Say give it a max of
> 30s? Or perhaps even that is not future-proof enough if we one day want
> to suspend most of the system between external IOs?

I _really_ don't think we want to go there. Let the caller set a maximum
amount of time they're prepared to wait, and that's it.

-- 
dwmw2


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  4:45         ` David Woodhouse
@ 2008-07-22  4:50           ` Nick Piggin
  2008-07-22  4:58             ` David Woodhouse
  0 siblings, 1 reply; 22+ messages in thread
From: Nick Piggin @ 2008-07-22  4:50 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Tuesday 22 July 2008 14:45, David Woodhouse wrote:
> On Tue, 2008-07-22 at 14:33 +1000, Nick Piggin wrote:
> > The only thing I dislike about explicit times is that when a driver or
> > someone doesn't _really_ know how much to specify. Do you say 10s, 100s?
>
> This is true, but they certainly have a _better_ idea than we do. If the
> individual callers can't even come up with an answer, how are we ever
> going to come up with a generic policy that does the right thing?

OK, how about still having a never-until-machine-is-already-awake?


> I really don't think that applying this kind of policy in generic code
> is useful -- I'd like the callers to provide numbers even if they _do_
> pull it out of their wossname.
>
> The number they provide is the _maximum_ amount of time they should be
> prepared to wait (let's assume for a moment that they stayed sober and
> remembered Linux isn't a real-time kernel, so all guarantees are taken
> with a pinch of salt. Let's not get bogged down in nomenclature).

Well, I think it is still wise to avoid words like deadline, hard,
and timeout in the same sentence ;)


> In practice, they'll almost always get called before that maximum time
> expires -- that's the whole _point_, of course. But we can't _invent_
> that maximum in generic code; that's really up to the caller.

Not a maximum, but just an "I don't know... a lot?" define. But yeah
I guess there aren't too many good reasons for that.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  4:50           ` Nick Piggin
@ 2008-07-22  4:58             ` David Woodhouse
  2008-07-22  5:35               ` Jan Engelhardt
  0 siblings, 1 reply; 22+ messages in thread
From: David Woodhouse @ 2008-07-22  4:58 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Tue, 2008-07-22 at 14:50 +1000, Nick Piggin wrote:
> On Tuesday 22 July 2008 14:45, David Woodhouse wrote:
> > On Tue, 2008-07-22 at 14:33 +1000, Nick Piggin wrote:
> > > The only thing I dislike about explicit times is that when a driver or
> > > someone doesn't _really_ know how much to specify. Do you say 10s, 100s?
> >
> > This is true, but they certainly have a _better_ idea than we do. If the
> > individual callers can't even come up with an answer, how are we ever
> > going to come up with a generic policy that does the right thing?
> 
> OK, how about still having a never-until-machine-is-already-awake?

For timers we have that already -- it's called a deferrable timer. All
I've done to create the 'range timer' is couple that with a normal
timer, to implement the 'some time between X and Y' behaviour in a
fashion which is simple for people to use.

I did add the 'never-until-machine-is-already-awake' behaviour you
request to schedule_timeout_range() -- you get it by setting 'timeout'
to your intended minimum time, and setting the other argument (which was
called 'deadline' in my original patch) to MAX_SCHEDULE_TIMEOUT.

> > I really don't think that applying this kind of policy in generic code
> > is useful -- I'd like the callers to provide numbers even if they _do_
> > pull it out of their wossname.
> >
> > The number they provide is the _maximum_ amount of time they should be
> > prepared to wait (let's assume for a moment that they stayed sober and
> > remembered Linux isn't a real-time kernel, so all guarantees are taken
> > with a pinch of salt. Let's not get bogged down in nomenclature).
> 
> Well, I think it is still wise to avoid words like deadline, hard,
> and timeout in the same sentence ;)

Probably true :)

> > In practice, they'll almost always get called before that maximum time
> > expires -- that's the whole _point_, of course. But we can't _invent_
> > that maximum in generic code; that's really up to the caller.
> 
> Not a maximum, but just an "I don't know... a lot?" define. But yeah
> I guess there aren't too many good reasons for that.

I'd really like to avoid it. It puts the responsibility for coming up
with a number a _long_ way from where it should be, in the individual
caller.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] schedule_timeout_range()
  2008-07-22  4:58             ` David Woodhouse
@ 2008-07-22  5:35               ` Jan Engelhardt
  0 siblings, 0 replies; 22+ messages in thread
From: Jan Engelhardt @ 2008-07-22  5:35 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Nick Piggin, linux-kernel, Thomas Gleixner, Ingo Molnar, arjan


On Tuesday 2008-07-22 06:58, David Woodhouse wrote:
>
>> > In practice, they'll almost always get called before that maximum time
>> > expires -- that's the whole _point_, of course. But we can't _invent_
>> > that maximum in generic code; that's really up to the caller.
>> 
>> Not a maximum, but just an "I don't know... a lot?" define. But yeah
>> I guess there aren't too many good reasons for that.
>
>I'd really like to avoid it. It puts the responsibility for coming up
>with a number a _long_ way from where it should be, in the individual
>caller.

Wait for drivers to make use of the range timer, hear their requirements
out, then can make a better-informed decision about the preciseness
of "a lot[?]". Maybe it turns out that drivers only ever need a range
like r={20msec, Infinity} because, say, a drive's status just remains
available anytime after 20msec until (finally) polled.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-07-22  3:02 [RFC] Imprecise timers David Woodhouse
  2008-07-22  3:05 ` [RFC] schedule_timeout_range() David Woodhouse
@ 2008-07-22  7:19 ` Rene Herman
  2008-07-22 12:54   ` Arjan van de Ven
  2008-07-29  0:36 ` Pallipadi, Venkatesh
  2 siblings, 1 reply; 22+ messages in thread
From: Rene Herman @ 2008-07-22  7:19 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On 22-07-08 05:02, David Woodhouse wrote:

> Many users of timers don't really care too much about exactly when
> their timer fires -- and waking a CPU to satisfy such a timer is a
> waste of power. This patch implements a 'range' timer which will fire
> at a 'convenient' moment within given constraints.
> 
> It's implemented by a deferrable timer at the beginning of the range,
> which will run some time later when the CPU happens to be awake. And
> a non-deferrable timer at the hard deadline, to ensure it really does
> happen by then.

Are there actually users for this (not just in theory)? The deferrable 
timer sort of sounds like all I'd ever want if I, as you say, wouldn't 
really care...

Rene.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-07-22  7:19 ` [RFC] Imprecise timers Rene Herman
@ 2008-07-22 12:54   ` Arjan van de Ven
  2008-07-22 14:04     ` Rene Herman
  0 siblings, 1 reply; 22+ messages in thread
From: Arjan van de Ven @ 2008-07-22 12:54 UTC (permalink / raw)
  To: Rene Herman
  Cc: David Woodhouse, linux-kernel, Thomas Gleixner, Ingo Molnar,
	Stephen Hemminger

On Tue, 22 Jul 2008 09:19:02 +0200
Rene Herman <rene.herman@keyaccess.nl> wrote:

> On 22-07-08 05:02, David Woodhouse wrote:
> 
> > Many users of timers don't really care too much about exactly when
> > their timer fires -- and waking a CPU to satisfy such a timer is a
> > waste of power. This patch implements a 'range' timer which will
> > fire at a 'convenient' moment within given constraints.
> > 
> > It's implemented by a deferrable timer at the beginning of the
> > range, which will run some time later when the CPU happens to be
> > awake. And a non-deferrable timer at the hard deadline, to ensure
> > it really does happen by then.
> 
> Are there actually users for this (not just in theory)? The
> deferrable timer sort of sounds like all I'd ever want if I, as you
> say, wouldn't really care...

There are a few, mostly around hardware timeouts. For example, Stephen wants
it for his drivers.

EXT3 journal flushing is another one where we can easily say 
"between 4 and 7 seconds" rather than "exactly at 5"


-- 
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-07-22 12:54   ` Arjan van de Ven
@ 2008-07-22 14:04     ` Rene Herman
  0 siblings, 0 replies; 22+ messages in thread
From: Rene Herman @ 2008-07-22 14:04 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: David Woodhouse, linux-kernel, Thomas Gleixner, Ingo Molnar,
	Stephen Hemminger

On 22-07-08 14:54, Arjan van de Ven wrote:

> On Tue, 22 Jul 2008 09:19:02 +0200
> Rene Herman <rene.herman@keyaccess.nl> wrote:
> 
>> On 22-07-08 05:02, David Woodhouse wrote:
>>
>>> Many users of timers don't really care too much about exactly when
>>> their timer fires -- and waking a CPU to satisfy such a timer is a
>>> waste of power. This patch implements a 'range' timer which will
>>> fire at a 'convenient' moment within given constraints.
>>>
>>> It's implemented by a deferrable timer at the beginning of the
>>> range, which will run some time later when the CPU happens to be
>>> awake. And a non-deferrable timer at the hard deadline, to ensure
>>> it really does happen by then.
>> Are there actually users for this (not just in theory)? The
>> deferrable timer sort of sounds like all I'd ever want if I, as you
>> say, wouldn't really care...
> 
> there's a few; mostly around hardware timeout..For example Stephen want
> it for his drivers.

Hardware I've dealt with is (almost? can't remember anything else)
exclusively minimal delays and as such this thing seemed like perhaps a
bit over-apisized...

> EXT3 journal flushing is another one where we can easily say 
> "between 4 and 7 seconds" rather than "exactly at 5"

This is a nice-ish example though. It might be considered necessary to make 
the current commit delay when set explicitly be the non-deferrable upper 
bound but almost none do I guess.

Rene.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [RFC] Imprecise timers.
  2008-07-22  3:02 [RFC] Imprecise timers David Woodhouse
  2008-07-22  3:05 ` [RFC] schedule_timeout_range() David Woodhouse
  2008-07-22  7:19 ` [RFC] Imprecise timers Rene Herman
@ 2008-07-29  0:36 ` Pallipadi, Venkatesh
  2008-08-09 12:54   ` Pavel Machek
  2 siblings, 1 reply; 22+ messages in thread
From: Pallipadi, Venkatesh @ 2008-07-29  0:36 UTC (permalink / raw)
  To: David Woodhouse, linux-kernel; +Cc: Thomas Gleixner, Ingo Molnar, arjan



>-----Original Message-----
>From: linux-kernel-owner@vger.kernel.org
>[mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of David
>Woodhouse
>Sent: Monday, July 21, 2008 8:03 PM
>To: linux-kernel@vger.kernel.org
>Cc: Thomas Gleixner; Ingo Molnar; arjan@infradead.org
>Subject: [RFC] Imprecise timers.
>
>Many users of timers don't really care too much about exactly
>when their
>timer fires -- and waking a CPU to satisfy such a timer is a waste of
>power. This patch implements a 'range' timer which will fire
>at a 'convenient'
>moment within given constraints.
>
>It's implemented by a deferrable timer at the beginning of the range,
>which will run some time later when the CPU happens to be awake. And a
>non-deferrable timer at the hard deadline, to ensure it really does
>happen by then.
>

One concern I have is drivers using range_timers thinking that they need
some upper bound, while all they need is a simple deferrable timer. With that
we will have multiple timers waking up the CPU all the time (say, on
different CPUs) problem again. Even without the timers waking up all
the time problem, we have extra overhead of one or more spinlocks
just to delete one extra unused timer. I don't know how, but
we will have to restrict this somehow to only code that really needs this
kind of deadline.

Regarding implementation, I am thinking mod_range_timer should use
ret &= mod_timer(). That is, if one of the timers has already fired and
the other one is pending mod_range_timer return value should say it has
already fired. Right?

Also, do we need both mod_range_timer and __mod_range_timer?
Original mod_timer version talks about optimization for networking code.
I think we can do with only one of the two here...

Thanks,
Venki


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-07-29  0:36 ` Pallipadi, Venkatesh
@ 2008-08-09 12:54   ` Pavel Machek
  2008-08-11 17:35     ` Venki Pallipadi
  0 siblings, 1 reply; 22+ messages in thread
From: Pavel Machek @ 2008-08-09 12:54 UTC (permalink / raw)
  To: Pallipadi, Venkatesh
  Cc: David Woodhouse, linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Mon 2008-07-28 17:36:57, Pallipadi, Venkatesh wrote:
> 
> 
> >-----Original Message-----
> >From: linux-kernel-owner@vger.kernel.org
> >[mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of David
> >Woodhouse
> >Sent: Monday, July 21, 2008 8:03 PM
> >To: linux-kernel@vger.kernel.org
> >Cc: Thomas Gleixner; Ingo Molnar; arjan@infradead.org
> >Subject: [RFC] Imprecise timers.
> >
> >Many users of timers don't really care too much about exactly
> >when their
> >timer fires -- and waking a CPU to satisfy such a timer is a waste of
> >power. This patch implements a 'range' timer which will fire
> >at a 'convenient'
> >moment within given constraints.
> >
> >It's implemented by a deferrable timer at the beginning of the range,
> >which will run some time later when the CPU happens to be awake. And a
> >non-deferrable timer at the hard deadline, to ensure it really does
> >happen by then.
> >
> 
> One concern I have is drivers using range_timers thinking that they need
> some upper bound, while all they need is a simple deferrable timer. With that
> we will have multiple timers waking up the CPU all the time (say, on
> different CPUs) problem again. Even without the timers waking up all

I don't get it. Who has timers that can be deferred forever? At that
point they may simply not set the timer at all, right?

							Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-08-09 12:54   ` Pavel Machek
@ 2008-08-11 17:35     ` Venki Pallipadi
  2008-08-12 12:00       ` Pavel Machek
  0 siblings, 1 reply; 22+ messages in thread
From: Venki Pallipadi @ 2008-08-11 17:35 UTC (permalink / raw)
  To: Pavel Machek
  Cc: David Woodhouse, linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

On Sat, Aug 09, 2008 at 05:54:47AM -0700, Pavel Machek wrote:
> On Mon 2008-07-28 17:36:57, Pallipadi, Venkatesh wrote:
> >
> >
> > >-----Original Message-----
> > >From: linux-kernel-owner@vger.kernel.org
> > >[mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of David
> > >Woodhouse
> > >Sent: Monday, July 21, 2008 8:03 PM
> > >To: linux-kernel@vger.kernel.org
> > >Cc: Thomas Gleixner; Ingo Molnar; arjan@infradead.org
> > >Subject: [RFC] Imprecise timers.
> > >
> > >Many users of timers don't really care too much about exactly
> > >when their
> > >timer fires -- and waking a CPU to satisfy such a timer is a waste of
> > >power. This patch implements a 'range' timer which will fire
> > >at a 'convenient'
> > >moment within given constraints.
> > >
> > >It's implemented by a deferrable timer at the beginning of the range,
> > >which will run some time later when the CPU happens to be awake. And a
> > >non-deferrable timer at the hard deadline, to ensure it really does
> > >happen by then.
> > >
> >
> > One concern I have is drivers using range_timers thinking that they need
> > some upper bound, while all they need is a simple deferrable timer. With that
> > we will have multiple timers waking up the CPU all the time (say, on
> > different CPUs) problem again. Even without the timers waking up all
> 
> I don't get it. Who has timers that can be deferred forever? At that
> point they may simply not set the timer at all, right?
> 

I don't think I said drivers have or need timers that can be deferred forever.

My point is, is it worth the overhead of setting and deleting additional timer,
just because drivers think that they need to use this new interface,
need to set an upper bound and come up with random upper bounds.
Apart from the overhead of setup and teardown we will somewhat negate the
benefits of deferrable timers as the upper bound hard timers can fire at
different times waking up the CPUs frequently.

I understand that some drivers need this kind of upper limit. I am not sure
whether all drivers need it and if not, how can we restrict drivers from using
this when they don't really need it.

Thanks,
Venki



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-08-11 17:35     ` Venki Pallipadi
@ 2008-08-12 12:00       ` Pavel Machek
  2008-08-12 18:11         ` Venki Pallipadi
  0 siblings, 1 reply; 22+ messages in thread
From: Pavel Machek @ 2008-08-12 12:00 UTC (permalink / raw)
  To: Venki Pallipadi
  Cc: David Woodhouse, linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

Hi!

> > > >which will run some time later when the CPU happens to be awake. And a
> > > >non-deferrable timer at the hard deadline, to ensure it really does
> > > >happen by then.
> > > >
> > >
> > > One concern I have is drivers using range_timers thinking that they need
> > > some upper bound, while all they need is a simple deferrable timer. With that
> > > we will have multiple timers waking up the CPU all the time (say, on
> > > different CPUs) problem again. Even without the timers waking up all
> > 
> > I don't get it. Who has timers that can be deferred forever? At that
> > point they may simply not set the timer at all, right?
> > 
> 
> I don't think I said drivers have or need timers that can be deferred forever.
> 
> My point is, is it worth the overhead of setting and deleting additional timer,
> just because drivers think that they need to use this new interface,
> need to set a upper bound and come up with random upper bounds.
> Apart from the overhead of setup and teardown we will somewhat negate the
> benefits of deferrable timers as the upper bound hard timers can fire at
> different times waking up the CPUs frequently.

> I understand that some drivers need this kind of upper limit. I am not sure
> whether all drivers need it and if not, how can we restrict drivers from using
> this when they don't really need it.

Do you have example of driver that does NOT need upper limit?

Like... lets take ATA.

submit_command()
if command is not back in ~5 seconds, it probably timed out.

So you set soft limit to 5 seconds, and hard limit to 10. You
definitely want user to know something is wrong after 10 seconds,
right?

								Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-08-12 12:00       ` Pavel Machek
@ 2008-08-12 18:11         ` Venki Pallipadi
  2008-08-12 21:55           ` Alan Cox
  2008-08-12 21:58           ` Pavel Machek
  0 siblings, 2 replies; 22+ messages in thread
From: Venki Pallipadi @ 2008-08-12 18:11 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Pallipadi, Venkatesh, David Woodhouse, linux-kernel,
	Thomas Gleixner, Ingo Molnar, arjan

On Tue, Aug 12, 2008 at 05:00:31AM -0700, Pavel Machek wrote:
> Hi!
> 
> > > > >which will run some time later when the CPU happens to be awake. And a
> > > > >non-deferrable timer at the hard deadline, to ensure it really does
> > > > >happen by then.
> > > > >
> > > >
> > > > One concern I have is drivers using range_timers thinking that they need
> > > > some upper bound, while all they need is a simple deferrable timer. With that
> > > > we will have multiple timers waking up the CPU all the time (say, on
> > > > different CPUs) problem again. Even without the timers waking up all
> > >
> > > I don't get it. Who has timers that can be deferred forever? At that
> > > point they may simply not set the timer at all, right?
> > >
> >
> > I don't think I said drivers have or need timers that can be deferred forever.
> >
> > My point is, is it worth the overhead of setting and deleting additional timer,
> > just because drivers think that they need to use this new interface,
> > need to set a upper bound and come up with random upper bounds.
> > Apart from the overhead of setup and teardown we will somewhat negate the
> > benefits of deferrable timers as the upper bound hard timers can fire at
> > different times waking up the CPUs frequently.
> 
> > I understand that some drivers need this kind of upper limit. I am not sure
> > whether all drivers need it and if not, how can we restrict drivers from using
> > this when they don't really need it.
> 
> Do you have example of driver that does NOT need upper limit?
> 
> Like... lets take ATA.
> 
> submit_command()
> if command is not back in ~5 seconds, it probably timed out.
> 
> So you set soft limit to 5 seconds, and hard limit to 10. You
> definitely want user to know something is wrong after 10 seconds,
> right?
> 

I would say this will be the wrong usage of deadline, at least with the
two-timer, no-round-off implementation.

For this example, it will probably be better to use round_jiffies to round the
timer to second level and make it go in sync with all other round timers on
this CPU, rather than setting two timers with hard timer not in sync with
other timers on the CPU.

- if there are other timers (rounded or otherwise) on this CPU that go off
between 5-10 seconds, we are paying a penalty for setting up and removing one
extra timer with no obvious benefit.
- if there are no other timers on this CPU that go off between 5-10 seconds,
the timer after 10 seconds will not be in sync with other potential round timers
on this CPU that may go off between 10-11 seconds, causing one extra wakeup
and again the overhead of setting up and removing one extra timer.

Instead, a rounded timer after 5 seconds still ensures at least 1s idle time on
the CPU without the overhead.

There can be many users who really don't care about the deadline or can happily
use rounded timers or deferrable timers. Say, garbage collectors, cache_reap
and friends, ondemand governor, vga cursor blinking come to mind.

Thanks,
Venki



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-08-12 18:11         ` Venki Pallipadi
@ 2008-08-12 21:55           ` Alan Cox
  2008-08-12 21:58           ` Pavel Machek
  1 sibling, 0 replies; 22+ messages in thread
From: Alan Cox @ 2008-08-12 21:55 UTC (permalink / raw)
  To: Venki Pallipadi
  Cc: Pavel Machek, Pallipadi, Venkatesh, David Woodhouse,
	linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

> Instead a rounded timer after 5 seconds still ensures atleast 1s idle time on
> the CPU without the overhead.
> 
> There can be many users who really don't care about the deadline or can happliy
> use rounded timers or deferrable timers. Say garbage collectors, cache_reap
> and friends, ondemand governor, vga cursor blinking comes to my mind.

The cursor really really needs to be steady to within a few frames or you
will notice it. Not using blinking cursors would of course be even better
(or sneaking the blink into the graphics hardware ;))

Your API also needs to use some kind of internal time concept that can be
shared between virtual machines because the end product of this done
right has to be to push timer scheduling at the basic level into the
hypervisor because it needs to merge timer events across guests. Anything
less and you get screwed the moment you load 50 guests on your box.

The simplest I can see to do it would be to make init_timer set a timer
field for tolerance to zero. Existing stuff would just work and new
drivers can set timer.tolerance = foo;

powertop can then be used to hit the key wakeup causes in the existing code

Alan

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC] Imprecise timers.
  2008-08-12 18:11         ` Venki Pallipadi
  2008-08-12 21:55           ` Alan Cox
@ 2008-08-12 21:58           ` Pavel Machek
  1 sibling, 0 replies; 22+ messages in thread
From: Pavel Machek @ 2008-08-12 21:58 UTC (permalink / raw)
  To: Venki Pallipadi
  Cc: David Woodhouse, linux-kernel, Thomas Gleixner, Ingo Molnar, arjan

Hi!

> Instead a rounded timer after 5 seconds still ensures atleast 1s idle time on
> the CPU without the overhead.
> 
> There can be many users who really don't care about the deadline or can happliy
> use rounded timers or deferrable timers. Say garbage collectors, cache_reap
> and friends, ondemand governor, vga cursor blinking comes to my mind.

Ok, maybe.

But don't try to play with vga cursor blinking; any irregularity there
will be _very_ visible.

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2008-08-12 22:13 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-22  3:02 [RFC] Imprecise timers David Woodhouse
2008-07-22  3:05 ` [RFC] schedule_timeout_range() David Woodhouse
2008-07-22  3:56   ` Nick Piggin
2008-07-22  4:12     ` David Woodhouse
2008-07-22  4:26       ` Arjan van de Ven
2008-07-22  4:34         ` David Woodhouse
2008-07-22  4:33       ` Nick Piggin
2008-07-22  4:45         ` David Woodhouse
2008-07-22  4:50           ` Nick Piggin
2008-07-22  4:58             ` David Woodhouse
2008-07-22  5:35               ` Jan Engelhardt
2008-07-22  4:33     ` Arjan van de Ven
2008-07-22  7:19 ` [RFC] Imprecise timers Rene Herman
2008-07-22 12:54   ` Arjan van de Ven
2008-07-22 14:04     ` Rene Herman
2008-07-29  0:36 ` Pallipadi, Venkatesh
2008-08-09 12:54   ` Pavel Machek
2008-08-11 17:35     ` Venki Pallipadi
2008-08-12 12:00       ` Pavel Machek
2008-08-12 18:11         ` Venki Pallipadi
2008-08-12 21:55           ` Alan Cox
2008-08-12 21:58           ` Pavel Machek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).