linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API
@ 2002-09-26  4:07 Andrew Morton
  2002-09-26  4:24 ` David S. Miller
  0 siblings, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2002-09-26  4:07 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: lkml



It's worth a whopping 2% on specweb on an 8-way.  Which is faintly
surprising because __wake_up and other wait/wakeup functions are not
apparent in the specweb profiles which I've seen.



The main objective of this is to reduce the CPU cost of the wait/wakeup
operation.  When a task is woken up, its waitqueue is removed from the
waitqueue_head by the waker (ie: immediately), rather than by the woken
process.

This means that a subsequent wakeup does not need to revisit the
just-woken task.  It also means that the just-woken task does not need
to take the waitqueue_head's lock, which may well reside in another
CPU's cache.

I have no decent measurements on the effect of this change - possibly a
20-30% drop in __wake_up cost in Badari's 40-dds-to-40-disks test (it
was the most expensive function), but it's inconclusive.  And no
quantitative testing of which I am aware has been performed by
networking people.

The API is very simple to use (Linus thought it up):

my_func(wait_queue_head_t *wqh)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	if (!some_test)
		schedule();
	finish_wait(wqh, &wait);
}

or:

	DEFINE_WAIT(wait);

	while (!some_test_1) {
		prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
		if (!some_test_2)
			schedule();
		...
	}
	finish_wait(wqh, &wait);

You need to bear in mind that once prepare_to_wait has been performed,
your task could be removed from the waitqueue_head and placed into
TASK_RUNNING at any time.  You don't know whether or not you're still
on the waitqueue_head.

Running prepare_to_wait() when you're already on the waitqueue_head is
fine - it will do the right thing.

Running finish_wait() when you're actually not on the waitqueue_head is
fine.

Running finish_wait() when you've _never_ been on the waitqueue_head is
fine, as long as the DEFINE_WAIT() macro was used to initialise the
waitqueue.

You don't need to fiddle with current->state.  prepare_to_wait() and
finish_wait() will do that.  finish_wait() will always return in state
TASK_RUNNING.

There are plenty of usage examples in vm-wakeups.patch and
tcp-wakeups.patch.




 include/linux/wait.h |   26 ++++++++++++++++++++++++++
 kernel/fork.c        |   46 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/ksyms.c       |    4 ++++
 3 files changed, 76 insertions(+)

--- 2.5.38/include/linux/wait.h~prepare_to_wait	Wed Sep 25 20:15:20 2002
+++ 2.5.38-akpm/include/linux/wait.h	Wed Sep 25 20:15:20 2002
@@ -119,6 +119,32 @@ static inline void __remove_wait_queue(w
 		_raced;						\
 	})
 
+/*
+ * Waitqueue's which are removed from the waitqueue_head at wakeup time
+ */
+void FASTCALL(prepare_to_wait(wait_queue_head_t *q,
+				wait_queue_t *wait, int state));
+void FASTCALL(prepare_to_wait_exclusive(wait_queue_head_t *q,
+				wait_queue_t *wait, int state));
+void FASTCALL(finish_wait(wait_queue_head_t *q, wait_queue_t *wait));
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync);
+
+#define DEFINE_WAIT(name)						\
+	wait_queue_t name = {						\
+		.task		= current,				\
+		.func		= autoremove_wake_function,		\
+		.task_list	= {	.next = &name.task_list,	\
+					.prev = &name.task_list,	\
+				},					\
+	}
+
+#define init_wait(wait)							\
+	do {								\
+		wait->task = current;					\
+		wait->func = autoremove_wake_function;			\
+		INIT_LIST_HEAD(&wait->task_list);			\
+	} while (0)
+	
 #endif /* __KERNEL__ */
 
 #endif
--- 2.5.38/kernel/fork.c~prepare_to_wait	Wed Sep 25 20:15:20 2002
+++ 2.5.38-akpm/kernel/fork.c	Wed Sep 25 20:15:20 2002
@@ -103,6 +103,52 @@ void remove_wait_queue(wait_queue_head_t
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 
+void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+	unsigned long flags;
+
+	__set_current_state(state);
+	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&q->lock, flags);
+	if (list_empty(&wait->task_list))
+		__add_wait_queue(q, wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+
+void
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+	unsigned long flags;
+
+	__set_current_state(state);
+	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&q->lock, flags);
+	if (list_empty(&wait->task_list))
+		__add_wait_queue_tail(q, wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+{
+	unsigned long flags;
+
+	__set_current_state(TASK_RUNNING);
+	if (!list_empty(&wait->task_list)) {
+		spin_lock_irqsave(&q->lock, flags);
+		list_del_init(&wait->task_list);
+		spin_unlock_irqrestore(&q->lock, flags);
+	}
+}
+
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync)
+{
+	int ret = default_wake_function(wait, mode, sync);
+
+	if (ret)
+		list_del_init(&wait->task_list);
+	return ret;
+}
+
 void __init fork_init(unsigned long mempages)
 {
 	/* create a slab on which task_structs can be allocated */
--- 2.5.38/kernel/ksyms.c~prepare_to_wait	Wed Sep 25 20:15:20 2002
+++ 2.5.38-akpm/kernel/ksyms.c	Wed Sep 25 20:15:20 2002
@@ -400,6 +400,10 @@ EXPORT_SYMBOL(irq_stat);
 EXPORT_SYMBOL(add_wait_queue);
 EXPORT_SYMBOL(add_wait_queue_exclusive);
 EXPORT_SYMBOL(remove_wait_queue);
+EXPORT_SYMBOL(prepare_to_wait);
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+EXPORT_SYMBOL(finish_wait);
+EXPORT_SYMBOL(autoremove_wake_function);
 
 /* completion handling */
 EXPORT_SYMBOL(wait_for_completion);

.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API
  2002-09-26  4:07 [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API Andrew Morton
@ 2002-09-26  4:24 ` David S. Miller
  2002-09-26  4:37   ` Linus Torvalds
  0 siblings, 1 reply; 7+ messages in thread
From: David S. Miller @ 2002-09-26  4:24 UTC (permalink / raw)
  To: akpm; +Cc: torvalds, linux-kernel

   From: Andrew Morton <akpm@digeo.com>
   Date: Wed, 25 Sep 2002 21:07:47 -0700
   
   The main objective of this is to reduce the CPU cost of the wait/wakeup
   operation.  When a task is woken up, its waitqueue is removed from the
   waitqueue_head by the waker (ie: immediately), rather than by the woken
   process.

I don't want to say that your changes cannot be made to work,
but it's been one of my understandings all these years that
the fact that the task itself controls its presence on the
wait queue is what allows many races to be handled properly and
cleanly.

For example, the ordering of the test and add/remove from
the wait queue is pretty important.

It probably doesn't matter when there is a higher level of locking
done around the sleep/wakeup (TCP sockets are one good example)
and if that is what this is aimed at, great.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API
  2002-09-26  4:37   ` Linus Torvalds
@ 2002-09-26  4:34     ` David S. Miller
  2002-09-26  5:12       ` Linus Torvalds
  0 siblings, 1 reply; 7+ messages in thread
From: David S. Miller @ 2002-09-26  4:34 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, linux-kernel

   From: Linus Torvalds <torvalds@transmeta.com>
   Date: Wed, 25 Sep 2002 21:37:23 -0700 (PDT)
   
   On Wed, 25 Sep 2002, David S. Miller wrote:
   > For example, the ordering of the test and add/remove from
   > the wait queue is pretty important.
   
   The test and add yes. Remove no, since remove is always done after
   we know we're waking up.

Ok, so if the condition retest fails at wakeup (someone got to the
event before us), it's ok because we add ourselves back to the wait
queue first, mark ourselves as sleeping, _then_ retest.

Right?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API
  2002-09-26  4:24 ` David S. Miller
@ 2002-09-26  4:37   ` Linus Torvalds
  2002-09-26  4:34     ` David S. Miller
  0 siblings, 1 reply; 7+ messages in thread
From: Linus Torvalds @ 2002-09-26  4:37 UTC (permalink / raw)
  To: David S. Miller; +Cc: akpm, linux-kernel


On Wed, 25 Sep 2002, David S. Miller wrote:
> 
> I don't want to say that your changes cannot be made to work,
> but it's been one of my understandings all these years that
> the fact that the task itself controls its presence on the
> wait queue is what allows many races to be handled properly and
> cleanly.

No, the important part is that the process adds itself and marks itself as
sleeping _before_ doing the test. The "marks itself as sleeping" part is 
the really important one.

The "removes itself" was/is really just a matter of being able to handle 
loops more efficiently (which is probably a case of optimizing for the 
wrong thing, since the common case is to wait for just _one_ event, 
especially since we made the herd behaviour go away with the exclusive 
stuff).

The "removes itself" thing was also something I thought was cleaner (have 
the same entity do both add and remove), but I certainly buy into the CPU 
lock bouncing arguments against it, so..

> For example, the ordering of the test and add/remove from
> the wait queue is pretty important.

The test and add yes. Remove no, since remove is always done after we know 
we're waking up.

		Linus


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API
  2002-09-26  4:34     ` David S. Miller
@ 2002-09-26  5:12       ` Linus Torvalds
  2002-09-26  5:24         ` Andrew Morton
  0 siblings, 1 reply; 7+ messages in thread
From: Linus Torvalds @ 2002-09-26  5:12 UTC (permalink / raw)
  To: David S. Miller; +Cc: akpm, linux-kernel


On Wed, 25 Sep 2002, David S. Miller wrote:
> 
> Ok, so if the condition retest fails at wakeup (someone got to the
> event before us), it's ok because we add ourselves back to the wait
> queue first, mark ourselves as sleeping, _then_ retest.

Right. The looping case (if somebody else was first) is slowed down
marginally, but the common case is sped up and needs one less time through
the waitqueue lock.

		Linus


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API
  2002-09-26  5:12       ` Linus Torvalds
@ 2002-09-26  5:24         ` Andrew Morton
  2002-09-26 14:41           ` Linus Torvalds
  0 siblings, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2002-09-26  5:24 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: David S. Miller, linux-kernel

Linus Torvalds wrote:
> 
> On Wed, 25 Sep 2002, David S. Miller wrote:
> >
> > Ok, so if the condition retest fails at wakeup (someone got to the
> > event before us), it's ok because we add ourselves back to the wait
> > queue first, mark ourselves as sleeping, _then_ retest.
> 
> Right. The looping case (if somebody else was first) is slowed down
> marginally, but the common case is sped up and needs one less time through
> the waitqueue lock.
> 

Most of the gain I saw in Badari's profiles (dd to 60 disks) was
in fact in __wake_up.  60 tasks parked on a waitqueue, waiting
for memory to come clean, wakeups being delivered to them faster
than they can wake up and get off the queue.

Yeah, my code is bust ;)  The heavy __wake_up cost in there seems
to be specific to the profusion chipset, which is two quads joined
by wet string, but the principle still applies.

I expect a decent win would come from using this technique in
select/poll, but that code relies on the remains-on-the-waitqueue
semantics, and would need some fiddling.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API
  2002-09-26  5:24         ` Andrew Morton
@ 2002-09-26 14:41           ` Linus Torvalds
  0 siblings, 0 replies; 7+ messages in thread
From: Linus Torvalds @ 2002-09-26 14:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: David S. Miller, linux-kernel


On Wed, 25 Sep 2002, Andrew Morton wrote:
> 
> I expect a decent win would come from using this technique in
> select/poll, but that code relies on the remains-on-the-waitqueue
> semantics, and would need some fiddling.

Actually, I think that select/poll is exactly the kind of thing that would 
_not_ improve noticeably, since usually it's only one (out of possibly 
hundreds) of wait-queues that gets woken up. 

So even if that one were to be made faster, the _real_ cost when it comes
to the wait-queues will be all the other 99 waitqueues that select/poll
has to remove itself from the old way anyway.

			Linus


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2002-09-26 14:35 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-09-26  4:07 [patch 1/4] prepare_to_wait/finish_wait sleep/wakeup API Andrew Morton
2002-09-26  4:24 ` David S. Miller
2002-09-26  4:37   ` Linus Torvalds
2002-09-26  4:34     ` David S. Miller
2002-09-26  5:12       ` Linus Torvalds
2002-09-26  5:24         ` Andrew Morton
2002-09-26 14:41           ` Linus Torvalds

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).