* [PATCH] rwsem: steal writing sem for better performance
@ 2013-02-05 13:11 Alex Shi
  2013-02-05 14:58 ` Ingo Molnar
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Alex Shi @ 2013-02-05 13:11 UTC (permalink / raw)
  To: mingo, torvalds; +Cc: paul.gortmaker, alex.shi, linux-kernel

Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to
an rwsem") changed the anon_vma mutex to an rwsem, which caused aim7
fork_test performance to drop by 50%. Yuanhan Liu did a good analysis
and found that the regression is caused by the strict, sequential
FIFO write-ownership of rwsems. Ingo suggested stealing the sem write
lock from the front task in the waiting queue
(https://lkml.org/lkml/2013/1/29/84). Hence this patch.

In this patch, write-lock stealing is only allowed when the first
waiter is also a writer. With that change the performance is fully
recovered.

Reported-by: lkp@linux.intel.com
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 lib/rwsem.c | 75 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 8337e1b..88ba8e5 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -2,6 +2,8 @@
  *
  * Written by David Howells (dhowells@redhat.com).
  * Derived from arch/i386/kernel/semaphore.c
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
@@ -60,7 +62,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
 	struct list_head *next;
-	signed long oldcount, woken, loop, adjustment;
+	signed long woken, loop, adjustment;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
@@ -72,30 +74,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 		 */
 		goto out;
 
-	/* There's a writer at the front of the queue - try to grant it the
-	 * write lock.  However, we only wake this writer if we can transition
-	 * the active part of the count from 0 -> 1
-	 */
-	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
-	if (waiter->list.next == &sem->wait_list)
-		adjustment -= RWSEM_WAITING_BIAS;
-
- try_again_write:
-	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
-	if (oldcount & RWSEM_ACTIVE_MASK)
-		/* Someone grabbed the sem already */
-		goto undo_write;
-
-	/* We must be careful not to touch 'waiter' after we set ->task = NULL.
-	 * It is an allocated on the waiter's stack and may become invalid at
-	 * any time after that point (due to a wakeup from another source).
-	 */
-	list_del(&waiter->list);
-	tsk = waiter->task;
-	smp_mb();
-	waiter->task = NULL;
-	wake_up_process(tsk);
-	put_task_struct(tsk);
+	/* Wake up the writing waiter and let the task grab the sem */
+	wake_up_process(waiter->task);
 	goto out;
 
  readers_only:
@@ -157,12 +137,40 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 
  out:
 	return sem;
+}
+
+/* Try to get the write sem, caller holds sem->wait_lock */
+static int try_get_writer_sem(struct rw_semaphore *sem,
+					struct rwsem_waiter *waiter)
+{
+	struct rwsem_waiter *fwaiter;
+	long oldcount, adjustment;
 
-	/* undo the change to the active count, but check for a transition
-	 * 1->0 */
- undo_write:
+	/* only steal when first waiter is writing */
+	fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+	if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE))
+		return 0;
+
+	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
+	/* only one waiter in queue */
+	if (fwaiter == waiter && waiter->list.next == &sem->wait_list)
+		adjustment -= RWSEM_WAITING_BIAS;
+
+try_again_write:
+	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+	if (!(oldcount & RWSEM_ACTIVE_MASK)) {
+		/* no active lock */
+		struct task_struct *tsk = waiter->task;
+
+		list_del(&waiter->list);
+		smp_mb();
+		put_task_struct(tsk);
+		tsk->state = TASK_RUNNING;
+		return 1;
+	}
+	/* someone grabbed the sem already */
 	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
-		goto out;
+		return 0;
 	goto try_again_write;
 }
 
@@ -210,6 +218,15 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 	for (;;) {
 		if (!waiter.task)
 			break;
+
+		raw_spin_lock_irq(&sem->wait_lock);
+		/* try to get the writer sem, may steal from the head writer */
+		if (flags == RWSEM_WAITING_FOR_WRITE)
+			if (try_get_writer_sem(sem, &waiter)) {
+				raw_spin_unlock_irq(&sem->wait_lock);
+				return sem;
+			}
+		raw_spin_unlock_irq(&sem->wait_lock);
 		schedule();
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 	}
-- 
1.7.12
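
To make the "0 -> 1" active-count transition concrete, here is a
minimal user-space sketch of the count arithmetic behind
try_get_writer_sem(). It is an illustration only: the constants assume
the classic 32-bit rwsem layout, rwsem_atomic_update() is modeled by a
plain non-atomic add, and the retry loop after a racing back-out is
omitted.

#include <stdio.h>

#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_ACTIVE_MASK	0x0000ffffL
#define RWSEM_WAITING_BIAS	(-0x00010000L)
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

static long count;			/* stands in for sem->count */

/* Models rwsem_atomic_update(): add delta, return the new count. */
static long atomic_update(long delta)
{
	return count += delta;
}

/*
 * Models the count logic of try_get_writer_sem(); only_waiter is
 * non-zero when the stealing task is the sole entry in the wait queue.
 */
static int try_steal(int only_waiter)
{
	long adjustment = RWSEM_ACTIVE_WRITE_BIAS;
	long oldcount;

	if (only_waiter)		/* wait queue goes empty on success */
		adjustment -= RWSEM_WAITING_BIAS;

	oldcount = atomic_update(adjustment) - adjustment;
	if (!(oldcount & RWSEM_ACTIVE_MASK))
		return 1;		/* active count was 0: lock stolen */

	atomic_update(-adjustment);	/* someone was active: back out */
	return 0;
}

int main(void)
{
	count = RWSEM_WAITING_BIAS;	/* free lock, one queued writer */
	printf("steal on free lock: %d\n", try_steal(1));	/* 1 */

	count = RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS;	/* active reader */
	printf("steal vs. reader:   %d\n", try_steal(0));	/* 0 */
	return 0;
}

Built with "gcc -Wall steal.c" (the file name is arbitrary), the first
attempt succeeds and the second backs off, matching the 0 -> 1
transition rule the patch inherits from __rwsem_do_wake().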



* Re: [PATCH] rwsem: steal writing sem for better performance
  2013-02-05 13:11 [PATCH] rwsem: steal writing sem for better performance Alex Shi
@ 2013-02-05 14:58 ` Ingo Molnar
  2013-02-06  0:07   ` Alex Shi
  2013-02-06 13:52 ` [tip:core/urgent] rwsem: Implement writer lock-stealing for better scalability tip-bot for Alex Shi
  2013-02-22 12:25 ` [tip:core/locking] " tip-bot for Alex Shi
  2 siblings, 1 reply; 8+ messages in thread
From: Ingo Molnar @ 2013-02-05 14:58 UTC (permalink / raw)
  To: Alex Shi; +Cc: torvalds, paul.gortmaker, linux-kernel


* Alex Shi <alex.shi@intel.com> wrote:

> Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to
> an rwsem") changed the anon_vma mutex to an rwsem, which caused aim7
> fork_test performance to drop by 50%. Yuanhan Liu did a good analysis
> and found that the regression is caused by the strict, sequential
> FIFO write-ownership of rwsems. Ingo suggested stealing the sem write
> lock from the front task in the waiting queue
> (https://lkml.org/lkml/2013/1/29/84). Hence this patch.
> 
> In this patch, write-lock stealing is only allowed when the first
> waiter is also a writer. With that change the performance is fully
> recovered.

All of the aim7 fork_test performance regression is recovered?

Thanks,

	Ingo


* Re: [PATCH] rwsem: steal writing sem for better performance
  2013-02-05 14:58 ` Ingo Molnar
@ 2013-02-06  0:07   ` Alex Shi
  2013-02-06 11:28     ` Ingo Molnar
  0 siblings, 1 reply; 8+ messages in thread
From: Alex Shi @ 2013-02-06  0:07 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: torvalds, paul.gortmaker, linux-kernel

On 02/05/2013 10:58 PM, Ingo Molnar wrote:
> 
> * Alex Shi <alex.shi@intel.com> wrote:
> 
>> Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to
>> an rwsem") changed the anon_vma mutex to an rwsem, which caused aim7
>> fork_test performance to drop by 50%. Yuanhan Liu did a good analysis
>> and found that the regression is caused by the strict, sequential
>> FIFO write-ownership of rwsems. Ingo suggested stealing the sem write
>> lock from the front task in the waiting queue
>> (https://lkml.org/lkml/2013/1/29/84). Hence this patch.
>>
>> In this patch, write-lock stealing is only allowed when the first
>> waiter is also a writer. With that change the performance is fully
>> recovered.
> 
> All of the aim7 fork_test performance regression is recovered?

Sure. 100% recovered on my NHM EP, NHM EX, SNB EP 2S and 4S machines.
> 
> Thanks,
> 
> 	Ingo
> 


-- 
Thanks Alex


* Re: [PATCH] rwsem: steal writing sem for better performance
  2013-02-06  0:07   ` Alex Shi
@ 2013-02-06 11:28     ` Ingo Molnar
  2013-02-06 21:36       ` Linus Torvalds
  0 siblings, 1 reply; 8+ messages in thread
From: Ingo Molnar @ 2013-02-06 11:28 UTC (permalink / raw)
  To: Alex Shi, Linus Torvalds, Andrew Morton
  Cc: torvalds, paul.gortmaker, linux-kernel


* Alex Shi <alex.shi@intel.com> wrote:

> On 02/05/2013 10:58 PM, Ingo Molnar wrote:
> > 
> > * Alex Shi <alex.shi@intel.com> wrote:
> > 
> >> Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex
> >> to an rwsem") changed the anon_vma mutex to an rwsem, which caused
> >> aim7 fork_test performance to drop by 50%. Yuanhan Liu did a good
> >> analysis and found that the regression is caused by the strict,
> >> sequential FIFO write-ownership of rwsems. Ingo suggested stealing
> >> the sem write lock from the front task in the waiting queue
> >> (https://lkml.org/lkml/2013/1/29/84). Hence this patch.
> >>
> >> In this patch, write-lock stealing is only allowed when the first
> >> waiter is also a writer. With that change the performance is fully
> >> recovered.
> > 
> > All of the aim7 fork_test performance regression is recovered?
> 
> Sure. 100% recovered on my NHM EP, NHM EX, SNB EP 2S and 4S 
> machines.

That's totally awesome!

Linus, Andrew, what is your thinking about the patch and about 
the timing of the patch?

It looks correct and straightforward to me, but rwsem races 
tended to be tricky to find and fix in the past...

We are in -rc7 territory which is a bit late for fundamental 
locking changes.

OTOH this is a regression fix, and not a small one.

I'll queue it up optimistically in an urgent branch for the time
being, to see what happens with a few days of testing ... If there
are any problems with it then it's clearly ineligible.

Thanks,

	Ingo


* [tip:core/urgent] rwsem: Implement writer lock-stealing for better scalability
  2013-02-05 13:11 [PATCH] rwsem: steal writing sem for better performance Alex Shi
  2013-02-05 14:58 ` Ingo Molnar
@ 2013-02-06 13:52 ` tip-bot for Alex Shi
  2013-02-22 12:25 ` [tip:core/locking] " tip-bot for Alex Shi
  2 siblings, 0 replies; 8+ messages in thread
From: tip-bot for Alex Shi @ 2013-02-06 13:52 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, anton, hpa, mingo, arjan, a.p.zijlstra, torvalds,
	alex.shi, yuanhan.liu, dhowells, akpm, tglx, walken

Commit-ID:  3a15e0e0cdda5b401d0a36dd7e83406cd1ce0724
Gitweb:     http://git.kernel.org/tip/3a15e0e0cdda5b401d0a36dd7e83406cd1ce0724
Author:     Alex Shi <alex.shi@intel.com>
AuthorDate: Tue, 5 Feb 2013 21:11:55 +0800
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 6 Feb 2013 12:41:43 +0100

rwsem: Implement writer lock-stealing for better scalability

Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex
to an rwsem") changed struct anon_vma::mutex to an rwsem, which
caused aim7 fork_test performance to drop by 50%.

Yuanhan Liu did the following excellent analysis:

    https://lkml.org/lkml/2013/1/29/84

and found that the regression is caused by strict, serialized,
FIFO sequential write-ownership of rwsems. Ingo suggested
implementing opportunistic lock-stealing for the front writer
task in the waitqueue.

Yuanhan Liu implemented lock-stealing for spinlock-rwsems,
which indeed recovered much of the regression - confirming
the analysis that the main factor in the regression was the
FIFO writer-fairness of rwsems.

In this patch we allow lock-stealing to happen when the first
waiter is also a writer. With that change in place the
aim7 fork_test performance is fully recovered on my
Intel NHM EP, NHM EX, SNB EP 2S and 4S test-machines.

Reported-by: lkp@linux.intel.com
Reported-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Alex Shi <alex.shi@intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: paul.gortmaker@windriver.com
Link: https://lkml.org/lkml/2013/1/29/84
Link: http://lkml.kernel.org/r/1360069915-31619-1-git-send-email-alex.shi@intel.com
[ Small stylistic fixes, updated changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 lib/rwsem.c | 75 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 8337e1b..ad5e0df 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -2,6 +2,8 @@
  *
  * Written by David Howells (dhowells@redhat.com).
  * Derived from arch/i386/kernel/semaphore.c
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
@@ -60,7 +62,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
 	struct list_head *next;
-	signed long oldcount, woken, loop, adjustment;
+	signed long woken, loop, adjustment;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
@@ -72,30 +74,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 		 */
 		goto out;
 
-	/* There's a writer at the front of the queue - try to grant it the
-	 * write lock.  However, we only wake this writer if we can transition
-	 * the active part of the count from 0 -> 1
-	 */
-	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
-	if (waiter->list.next == &sem->wait_list)
-		adjustment -= RWSEM_WAITING_BIAS;
-
- try_again_write:
-	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
-	if (oldcount & RWSEM_ACTIVE_MASK)
-		/* Someone grabbed the sem already */
-		goto undo_write;
-
-	/* We must be careful not to touch 'waiter' after we set ->task = NULL.
-	 * It is an allocated on the waiter's stack and may become invalid at
-	 * any time after that point (due to a wakeup from another source).
-	 */
-	list_del(&waiter->list);
-	tsk = waiter->task;
-	smp_mb();
-	waiter->task = NULL;
-	wake_up_process(tsk);
-	put_task_struct(tsk);
+	/* Wake up the writing waiter and let the task grab the sem: */
+	wake_up_process(waiter->task);
 	goto out;
 
  readers_only:
@@ -157,12 +137,40 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 
  out:
 	return sem;
+}
+
+/* Try to get write sem, caller holds sem->wait_lock: */
+static int try_get_writer_sem(struct rw_semaphore *sem,
+					struct rwsem_waiter *waiter)
+{
+	struct rwsem_waiter *fwaiter;
+	long oldcount, adjustment;
 
-	/* undo the change to the active count, but check for a transition
-	 * 1->0 */
- undo_write:
+	/* only steal when first waiter is writing */
+	fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+	if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE))
+		return 0;
+
+	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
+	/* Only one waiter in the queue: */
+	if (fwaiter == waiter && waiter->list.next == &sem->wait_list)
+		adjustment -= RWSEM_WAITING_BIAS;
+
+try_again_write:
+	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+	if (!(oldcount & RWSEM_ACTIVE_MASK)) {
+		/* No active lock: */
+		struct task_struct *tsk = waiter->task;
+
+		list_del(&waiter->list);
+		smp_mb();
+		put_task_struct(tsk);
+		tsk->state = TASK_RUNNING;
+		return 1;
+	}
+	/* someone grabbed the sem already */
 	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
-		goto out;
+		return 0;
 	goto try_again_write;
 }
 
@@ -210,6 +218,15 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 	for (;;) {
 		if (!waiter.task)
 			break;
+
+		raw_spin_lock_irq(&sem->wait_lock);
+		/* Try to get the writer sem, may steal from the head writer: */
+		if (flags == RWSEM_WAITING_FOR_WRITE)
+			if (try_get_writer_sem(sem, &waiter)) {
+				raw_spin_unlock_irq(&sem->wait_lock);
+				return sem;
+			}
+		raw_spin_unlock_irq(&sem->wait_lock);
 		schedule();
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 	}
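
The access pattern behind these numbers is worth keeping in mind:
fork_test essentially makes every task take the same anon_vma lock as
a writer, so the benchmark is pure writer-vs-writer contention with no
read-side sharing. A user-space analogy of that pattern (illustration
only: pthread rwlocks standing in for kernel rwsems, with arbitrary
thread and iteration counts; build with "gcc -Wall -pthread"):

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4
#define ITERS	 100000

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static long nsections;

/*
 * Each thread mimics a forking task: take the shared lock as a
 * writer, do a tiny critical section, release, repeat.
 */
static void *forker(void *unused)
{
	(void)unused;
	for (int i = 0; i < ITERS; i++) {
		pthread_rwlock_wrlock(&lock);
		nsections++;		/* stand-in for the anon_vma update */
		pthread_rwlock_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, forker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	printf("%ld write-locked sections\n", nsections);
	return 0;
}

The runtime of such a loop is dominated by how quickly write ownership
is handed from one thread to the next, which is exactly the handoff
path that writer lock-stealing shortens for kernel rwsems.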


* Re: [PATCH] rwsem: steal writing sem for better performance
  2013-02-06 11:28     ` Ingo Molnar
@ 2013-02-06 21:36       ` Linus Torvalds
  2013-02-06 21:47         ` Ingo Molnar
  0 siblings, 1 reply; 8+ messages in thread
From: Linus Torvalds @ 2013-02-06 21:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alex Shi, Andrew Morton, Paul Gortmaker, Linux Kernel Mailing List

On Wed, Feb 6, 2013 at 10:28 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> Linus, Andrew, what is your thinking about the patch and about
> the timing of the patch?

Not for 3.8. Queue it for 3.9, with possibly a stable tag with a big
comment "apply after much testing".

              Linus


* Re: [PATCH] rwsem: steal writing sem for better performance
  2013-02-06 21:36       ` Linus Torvalds
@ 2013-02-06 21:47         ` Ingo Molnar
  0 siblings, 0 replies; 8+ messages in thread
From: Ingo Molnar @ 2013-02-06 21:47 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alex Shi, Andrew Morton, Paul Gortmaker, Linux Kernel Mailing List


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Feb 6, 2013 at 10:28 PM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > Linus, Andrew, what is your thinking about the patch and about
> > the timing of the patch?
> 
> Not for 3.8. Queue it for 3.9, with possibly a stable tag with 
> a big comment "apply after much testing".

Ok, will do!

Thanks,

	Ingo


* [tip:core/locking] rwsem: Implement writer lock-stealing for better scalability
  2013-02-05 13:11 [PATCH] rwsem: steal writing sem for better performance Alex Shi
  2013-02-05 14:58 ` Ingo Molnar
  2013-02-06 13:52 ` [tip:core/urgent] rwsem: Implement writer lock-stealing for better scalability tip-bot for Alex Shi
@ 2013-02-22 12:25 ` tip-bot for Alex Shi
  2 siblings, 0 replies; 8+ messages in thread
From: tip-bot for Alex Shi @ 2013-02-22 12:25 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, anton, hpa, mingo, arjan, a.p.zijlstra, torvalds,
	alex.shi, yuanhan.liu, dhowells, akpm, tglx, walken

Commit-ID:  ce6711f3d196f09ca0ed29a24dfad42d83912b20
Gitweb:     http://git.kernel.org/tip/ce6711f3d196f09ca0ed29a24dfad42d83912b20
Author:     Alex Shi <alex.shi@intel.com>
AuthorDate: Tue, 5 Feb 2013 21:11:55 +0800
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 19 Feb 2013 08:42:43 +0100

rwsem: Implement writer lock-stealing for better scalability

Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex
to an rwsem") changed struct anon_vma::mutex to an rwsem, which
caused aim7 fork_test performance to drop by 50%.

Yuanhan Liu did the following excellent analysis:

    https://lkml.org/lkml/2013/1/29/84

and found that the regression is caused by strict, serialized,
FIFO sequential write-ownership of rwsems. Ingo suggested
implementing opportunistic lock-stealing for the front writer
task in the waitqueue.

Yuanhan Liu implemented lock-stealing for spinlock-rwsems,
which indeed recovered much of the regression - confirming
the analysis that the main factor in the regression was the
FIFO writer-fairness of rwsems.

In this patch we allow lock-stealing to happen when the first
waiter is also a writer. With that change in place the
aim7 fork_test performance is fully recovered on my
Intel NHM EP, NHM EX, SNB EP 2S and 4S test-machines.

Reported-by: lkp@linux.intel.com
Reported-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Alex Shi <alex.shi@intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: paul.gortmaker@windriver.com
Link: https://lkml.org/lkml/2013/1/29/84
Link: http://lkml.kernel.org/r/1360069915-31619-1-git-send-email-alex.shi@intel.com
[ Small stylistic fixes, updated changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 lib/rwsem.c | 75 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 8337e1b..ad5e0df 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -2,6 +2,8 @@
  *
  * Written by David Howells (dhowells@redhat.com).
  * Derived from arch/i386/kernel/semaphore.c
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
@@ -60,7 +62,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
 	struct list_head *next;
-	signed long oldcount, woken, loop, adjustment;
+	signed long woken, loop, adjustment;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
@@ -72,30 +74,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 		 */
 		goto out;
 
-	/* There's a writer at the front of the queue - try to grant it the
-	 * write lock.  However, we only wake this writer if we can transition
-	 * the active part of the count from 0 -> 1
-	 */
-	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
-	if (waiter->list.next == &sem->wait_list)
-		adjustment -= RWSEM_WAITING_BIAS;
-
- try_again_write:
-	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
-	if (oldcount & RWSEM_ACTIVE_MASK)
-		/* Someone grabbed the sem already */
-		goto undo_write;
-
-	/* We must be careful not to touch 'waiter' after we set ->task = NULL.
-	 * It is an allocated on the waiter's stack and may become invalid at
-	 * any time after that point (due to a wakeup from another source).
-	 */
-	list_del(&waiter->list);
-	tsk = waiter->task;
-	smp_mb();
-	waiter->task = NULL;
-	wake_up_process(tsk);
-	put_task_struct(tsk);
+	/* Wake up the writing waiter and let the task grab the sem: */
+	wake_up_process(waiter->task);
 	goto out;
 
  readers_only:
@@ -157,12 +137,40 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 
  out:
 	return sem;
+}
+
+/* Try to get write sem, caller holds sem->wait_lock: */
+static int try_get_writer_sem(struct rw_semaphore *sem,
+					struct rwsem_waiter *waiter)
+{
+	struct rwsem_waiter *fwaiter;
+	long oldcount, adjustment;
 
-	/* undo the change to the active count, but check for a transition
-	 * 1->0 */
- undo_write:
+	/* only steal when first waiter is writing */
+	fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+	if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE))
+		return 0;
+
+	adjustment = RWSEM_ACTIVE_WRITE_BIAS;
+	/* Only one waiter in the queue: */
+	if (fwaiter == waiter && waiter->list.next == &sem->wait_list)
+		adjustment -= RWSEM_WAITING_BIAS;
+
+try_again_write:
+	oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+	if (!(oldcount & RWSEM_ACTIVE_MASK)) {
+		/* No active lock: */
+		struct task_struct *tsk = waiter->task;
+
+		list_del(&waiter->list);
+		smp_mb();
+		put_task_struct(tsk);
+		tsk->state = TASK_RUNNING;
+		return 1;
+	}
+	/* someone grabbed the sem already */
 	if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
-		goto out;
+		return 0;
 	goto try_again_write;
 }
 
@@ -210,6 +218,15 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 	for (;;) {
 		if (!waiter.task)
 			break;
+
+		raw_spin_lock_irq(&sem->wait_lock);
+		/* Try to get the writer sem, may steal from the head writer: */
+		if (flags == RWSEM_WAITING_FOR_WRITE)
+			if (try_get_writer_sem(sem, &waiter)) {
+				raw_spin_unlock_irq(&sem->wait_lock);
+				return sem;
+			}
+		raw_spin_unlock_irq(&sem->wait_lock);
 		schedule();
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 	}

