* [PATCH] md/raid5: reduce lock contention in read_one_chunk()
@ 2021-06-03 13:54 gal.ofri
  2021-06-03 23:31 ` NeilBrown
  2021-06-04  7:43 ` Guoqing Jiang
  0 siblings, 2 replies; 8+ messages in thread
From: gal.ofri @ 2021-06-03 13:54 UTC (permalink / raw)
  To: linux-raid; +Cc: Gal Ofri, Song Liu, Neil Brown

From: Gal Ofri <gal.ofri@storing.io>

There is lock contention on device_lock in read_one_chunk().
device_lock is taken to sync conf->active_aligned_reads and conf->quiesce.
read_one_chunk() takes the lock, then waits for quiesce=0 (resumed) before
incrementing active_aligned_reads.
raid5_quiesce() takes the lock, sets quiesce=2 (in-progress), then waits
for active_aligned_reads to be zero before setting quiesce=1 (suspended).

Introduce a new rwlock for read_one_chunk() and raid5_quiesce().
device_lock is not needed to protect concurrent access to
active_aligned_reads (it is already atomic), so replace it with a read
lock in read_one_chunk(). To keep conf->active_aligned_reads in sync
with conf->quiesce, take the write lock in raid5_quiesce(). This way we
still drain active io before quiescing, and prevent new io from being
activated while quiescent.

raid5_quiesce() uses un/lock_all_device_hash_locks_irq() for locking.
device_lock cannot be removed from there, and the new rwlock is taken
there as well, so rename un/lock_all_device_hash_locks_irq() to
un/lock_all_quiesce_locks_irq().
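
For illustration only (not part of the patch), here is a minimal
user-space sketch of the pattern above; pthread_rwlock_t, C11 atomics
and a yield loop stand in for the kernel's rwlock_t, atomic_t and the
wait_for_quiescent wait queue:

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static pthread_rwlock_t aligned_reads_lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int active_aligned_reads;
static int quiesce;	/* 0 = running, 2 = draining, 1 = quiesced;
			 * only read/written under aligned_reads_lock here */

/* roughly what read_one_chunk() does before issuing an aligned read */
static void start_aligned_read(void)
{
	pthread_rwlock_rdlock(&aligned_reads_lock);
	while (quiesce != 0) {
		/* the kernel sleeps on wait_for_quiescent; we just yield */
		pthread_rwlock_unlock(&aligned_reads_lock);
		sched_yield();
		pthread_rwlock_rdlock(&aligned_reads_lock);
	}
	atomic_fetch_add(&active_aligned_reads, 1);
	pthread_rwlock_unlock(&aligned_reads_lock);
}

/* called from the endio path when the aligned read completes */
static void end_aligned_read(void)
{
	atomic_fetch_sub(&active_aligned_reads, 1);
	/* the kernel also wakes wait_for_quiescent here */
}

/* roughly what raid5_quiesce(mddev, 1) does */
static void quiesce_array(void)
{
	pthread_rwlock_wrlock(&aligned_reads_lock);
	quiesce = 2;		/* block activation of new aligned reads */
	while (atomic_load(&active_aligned_reads) != 0) {
		/* drop the lock while draining, as wait_event_cmd() does */
		pthread_rwlock_unlock(&aligned_reads_lock);
		sched_yield();
		pthread_rwlock_wrlock(&aligned_reads_lock);
	}
	quiesce = 1;		/* fully quiesced */
	pthread_rwlock_unlock(&aligned_reads_lock);
}

Readers serialize only against the quiescer, not against each other,
which is why the read lock removes the contention seen with device_lock.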

My setups:
1. 8 local nvme drives (each up to 250k iops).
2. 8 ram disks (brd).

Each setup runs raid6 (6+2) with group_thread_cnt=8 and 1024 io threads on a
96 cpu-core (48 per socket) system. I recorded iops and the cpu time spent on
this contention with rand-read-4k, and bw with sequential-read-128k.
Note: in most cases the cpu is still busy, but due to "new" bottlenecks.

nvme:
              | iops           | cpu  | bw
-----------------------------------------------
without patch | 1.6M           | ~50% | 5.5GB/s
with patch    | 2M (throttled) | <10% | 5.5GB/s

ram (brd):
              | iops           | cpu  | bw
-----------------------------------------------
without patch | 2M             | ~80% | 24GB/s
with patch    | 3.9M           | <10% | 50GB/s

CC: Song Liu <song@kernel.org>
CC: Neil Brown <neilb@suse.de>
Signed-off-by: Gal Ofri <gal.ofri@storing.io>
---
* Should I split the patch into two commits (the function rename and
the rest of the change)?
* Note: I tried a simple spinlock rather than an rwlock, but the
contention remains in that case.
---
 drivers/md/raid5.c | 31 ++++++++++++++++++-------------
 drivers/md/raid5.h |  1 +
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7d4ff8a5c55e..afc32350a3f8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -90,18 +90,20 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 	spin_unlock_irq(conf->hash_locks + hash);
 }
 
-static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+static inline void lock_all_quiesce_locks_irq(struct r5conf *conf)
 {
 	int i;
 	spin_lock_irq(conf->hash_locks);
 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 	spin_lock(&conf->device_lock);
+	write_lock(&conf->aligned_reads_lock);
 }
 
-static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+static inline void unlock_all_quiesce_locks_irq(struct r5conf *conf)
 {
 	int i;
+	write_unlock(&conf->aligned_reads_lock);
 	spin_unlock(&conf->device_lock);
 	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
 		spin_unlock(conf->hash_locks + i);
@@ -5443,11 +5445,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	/* No reshape active, so we can trust rdev->data_offset */
 	align_bio->bi_iter.bi_sector += rdev->data_offset;
 
-	spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-			    conf->device_lock);
+	/* Ensure that active_aligned_reads and quiesce are synced */
+	read_lock_irq(&conf->aligned_reads_lock);
+	wait_event_cmd(conf->wait_for_quiescent, conf->quiesce == 0,
+			read_unlock_irq(&conf->aligned_reads_lock),
+			read_lock_irq(&conf->aligned_reads_lock));
 	atomic_inc(&conf->active_aligned_reads);
-	spin_unlock_irq(&conf->device_lock);
+	read_unlock_irq(&conf->aligned_reads_lock);
 
 	if (mddev->gendisk)
 		trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -7198,6 +7202,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	} else
 		goto abort;
 	spin_lock_init(&conf->device_lock);
+	rwlock_init(&conf->aligned_reads_lock);
 	seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
 	mutex_init(&conf->cache_size_mutex);
 	init_waitqueue_head(&conf->wait_for_quiescent);
@@ -7255,7 +7260,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 	/* We init hash_locks[0] separately to that it can be used
 	 * as the reference lock in the spin_lock_nest_lock() call
-	 * in lock_all_device_hash_locks_irq in order to convince
+	 * in lock_all_quiesce_locks_irq in order to convince
 	 * lockdep that we know what we are doing.
 	 */
 	spin_lock_init(conf->hash_locks);
@@ -8329,7 +8334,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 
 	if (quiesce) {
 		/* stop all writes */
-		lock_all_device_hash_locks_irq(conf);
+		lock_all_quiesce_locks_irq(conf);
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
@@ -8338,19 +8343,19 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
-				    unlock_all_device_hash_locks_irq(conf),
-				    lock_all_device_hash_locks_irq(conf));
+				    unlock_all_quiesce_locks_irq(conf),
+				    lock_all_quiesce_locks_irq(conf));
 		conf->quiesce = 1;
-		unlock_all_device_hash_locks_irq(conf);
+		unlock_all_quiesce_locks_irq(conf);
 		/* allow reshape to continue */
 		wake_up(&conf->wait_for_overlap);
 	} else {
 		/* re-enable writes */
-		lock_all_device_hash_locks_irq(conf);
+		lock_all_quiesce_locks_irq(conf);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_quiescent);
 		wake_up(&conf->wait_for_overlap);
-		unlock_all_device_hash_locks_irq(conf);
+		unlock_all_quiesce_locks_irq(conf);
 	}
 	log_quiesce(conf, quiesce);
 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 5c05acf20e1f..16ccd9e64e6a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -610,6 +610,7 @@ struct r5conf {
 	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	atomic_t		active_aligned_reads;
+	rwlock_t		aligned_reads_lock; /* protect active_aligned_reads from quiesce */
 	atomic_t		pending_full_writes; /* full write backlog */
 	int			bypass_count; /* bypassed prereads */
 	int			bypass_threshold; /* preread nice */
-- 
2.25.1



* Re: [PATCH] md/raid5: reduce lock contention in read_one_chunk()
  2021-06-03 13:54 [PATCH] md/raid5: reduce lock contention in read_one_chunk() gal.ofri
@ 2021-06-03 23:31 ` NeilBrown
  2021-06-04  8:42   ` Gal Ofri
  2021-06-04  7:43 ` Guoqing Jiang
  1 sibling, 1 reply; 8+ messages in thread
From: NeilBrown @ 2021-06-03 23:31 UTC (permalink / raw)
  To: gal.ofri; +Cc: linux-raid, Gal Ofri, Song Liu

On Thu, 03 Jun 2021, gal.ofri@storing.io wrote:
> * Note: I tried a simple spinlock rather than an rwlock, but the
> contention remains in that case.

This surprises me.  I only expect rwlocks to be a benefit when the
readlock is held for significantly longer than the time it takes to get
an uncontended lock, and I don't think that is happening here.
However, you have done the measurements (thanks for that!) and I cannot
argue with numbers.

However, it does suggest that the lock is still heavily contended.  I
wonder if we can do better.  Can we make the common case lockless?
e.g. change the read_one_chunk code to something like
  
  bool did_inc = false;
  if (!conf->quiesce) {
         atomic_inc(&conf->active_aligned_reads);
         did_inc = true;
  }
  if (smp_load_acquire(&conf->quiesce)) {
            if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
                 wake_up(&conf->wait_for_quiescent);
            /* old code goes here */
  }

and probably change the "conf->quiesce = 2" in raid5_quiesce() to
    smp_store_release(&conf->quiesce, 2);
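
For reference, a rough user-space analogue of that acquire/release
pairing is sketched below; C11 atomics stand in for smp_load_acquire(),
smp_store_release() and atomic_t, so this only illustrates the intended
ordering, not the exact kernel code:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int quiesce;		/* set to 2, then 1, while quiescing */
static atomic_int active_aligned_reads;

static void read_one_chunk_fast_path(void)
{
	bool did_inc = false;

	if (atomic_load_explicit(&quiesce, memory_order_relaxed) == 0) {
		atomic_fetch_add(&active_aligned_reads, 1);
		did_inc = true;
	}
	/* acquire pairs with the release store in quiesce_start() */
	if (atomic_load_explicit(&quiesce, memory_order_acquire) != 0) {
		/* undo the optimistic activation, then take the slow path */
		if (did_inc &&
		    atomic_fetch_sub(&active_aligned_reads, 1) == 1) {
			/* last active read gone: wake up the quiescer */
		}
		/* old locked code goes here */
	}
}

static void quiesce_start(void)
{
	/* release pairs with the acquire load in the fast path */
	atomic_store_explicit(&quiesce, 2, memory_order_release);
	/* then wait for active_aligned_reads to drain, as before */
}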


Could you try that and report results?  Thanks.

NeilBrown


* Re: [PATCH] md/raid5: reduce lock contention in read_one_chunk()
  2021-06-03 13:54 [PATCH] md/raid5: reduce lock contention in read_one_chunk() gal.ofri
  2021-06-03 23:31 ` NeilBrown
@ 2021-06-04  7:43 ` Guoqing Jiang
  2021-06-04  8:53   ` Gal Ofri
  1 sibling, 1 reply; 8+ messages in thread
From: Guoqing Jiang @ 2021-06-04  7:43 UTC (permalink / raw)
  To: gal.ofri, linux-raid; +Cc: Song Liu, Neil Brown



On 6/3/21 9:54 PM, gal.ofri@storing.io wrote:
> Each setup runs raid6 (6+2) with group_thread_cnt=8 and 1024 io threads on a
> 96 cpu-core (48 per socket) system.

Just curious why '8' is chosen for group_thread_cnt. IIUC, a group maps to
one NUMA node, and it is better to set it to match the number of cores in
one NUMA node if better performance is expected.

Thanks,
Guoqing


* Re: [PATCH] md/raid5: reduce lock contention in read_one_chunk()
  2021-06-03 23:31 ` NeilBrown
@ 2021-06-04  8:42   ` Gal Ofri
  2021-06-04  9:19     ` Gal Ofri
  0 siblings, 1 reply; 8+ messages in thread
From: Gal Ofri @ 2021-06-04  8:42 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Song Liu

On Fri, 04 Jun 2021 09:31:04 +1000
"NeilBrown" <neilb@suse.de> wrote:
Hey Neil,
thank you for your feedback!

> This surprises me.  I only expect rwlocks to be a benefit when the
> readlock is held for significantly longer than the time it takes to get
> an uncontended lock, and I don't think that is happening here.
> However, you have done the measurements (thanks for that!) and I cannot
> argue with numbers.
To be honest, I expected a spinlock to perform as well as an rwlock in this case too,
but the experiment proves otherwise (with a spinlock, lock contention and iops are almost identical to before the patch).
Maybe it's because wait_event_lock_irq() is in the critical section?

> Could you try that and report results?  Thanks.
I patched the code as per your suggestion and it performs even better:
up to 4.2M iops with zero contention on that lock (~55GB/s throughput).
Since the lock only impacts quiesce now, I thought it'd be better to use a spinlock after all.
Please let me know if you think otherwise.

I'll run all tests again, rephrase where needed and resubmit.

Thanks,
Gal


* Re: [PATCH] md/raid5: reduce lock contention in read_one_chunk()
  2021-06-04  7:43 ` Guoqing Jiang
@ 2021-06-04  8:53   ` Gal Ofri
  0 siblings, 0 replies; 8+ messages in thread
From: Gal Ofri @ 2021-06-04  8:53 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, Song Liu, Neil Brown

On Fri, 4 Jun 2021 15:43:48 +0800
Guoqing Jiang <jgq516@gmail.com> wrote:

> Just curious why '8' is chosen for group_thread_cnt. IIUC, a group maps to
> one NUMA node, and it is better to set it to match the number of cores in
> one NUMA node if better performance is expected.
Since the worker threads have really low impact on 100%-read workloads,
group_thread_cnt has no effect here.  It performs just as well with
group_thread_cnt set to 0, 8 or 96.  Thinking about it, I should probably
omit that detail from the commit message.

btw, in write workloads the worker threads actually do contend on
device_lock, so choosing any number >8 often results in degraded
performance in my tests.  I have a p.o.c. branch that reduces this
contention by replacing device_lock with hash locks, but it's a big,
risky patch, so I'd need to break it down first.

Thanks, Gal


* Re: [PATCH] md/raid5: reduce lock contention in read_one_chunk()
  2021-06-04  8:42   ` Gal Ofri
@ 2021-06-04  9:19     ` Gal Ofri
  2021-06-05  0:30       ` NeilBrown
  0 siblings, 1 reply; 8+ messages in thread
From: Gal Ofri @ 2021-06-04  9:19 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Song Liu

Just a second thought

> Since the lock only impacts quiesce now, I thought it'd be better to use a spinlock after all.
> Please let me know if you think otherwise.
Since the slow path is only taken during quiesce, I can remove aligned_reads_lock altogether and use device_lock as before.
What do you think?


* Re: [PATCH] md/raid5: reduce lock contention in read_one_chunk()
  2021-06-04  9:19     ` Gal Ofri
@ 2021-06-05  0:30       ` NeilBrown
  2021-06-06 15:10         ` Gal Ofri
  0 siblings, 1 reply; 8+ messages in thread
From: NeilBrown @ 2021-06-05  0:30 UTC (permalink / raw)
  To: Gal Ofri; +Cc: linux-raid, Song Liu

On Fri, 04 Jun 2021, Gal Ofri wrote:
> Just a second thought
> 
> > Since the lock only impacts quiesce now, I thought it'd be better to use a spinlock after all.
> > Please let me know if you think otherwise.
> Since the slow path is only taken during quiesce, I can remove aligned_reads_lock altogether and use device_lock as before.
> What do you think?
> 
Yes, I think you should remove aligned_reads_lock.  Remove all of your
patch and just make the change I suggested.  Then, of course, review it
carefully to make sure you agree that it is correct.

Thanks
NeilBrown



* Re: [PATCH] md/raid5: reduce lock contention in read_one_chunk()
  2021-06-05  0:30       ` NeilBrown
@ 2021-06-06 15:10         ` Gal Ofri
  0 siblings, 0 replies; 8+ messages in thread
From: Gal Ofri @ 2021-06-06 15:10 UTC (permalink / raw)
  To: NeilBrown; +Cc: Song Liu

First, there's this race condition, caused by the following code:
if (conf->quiesce == 0) {
	...
}
if (smp_load_acquire(&conf->quiesce) != 0) {
	...
}

cpu-0 [raid5_quiesce()]	| cpu-1 [read_one_chunk()]
----------------------------------------------------------
conf->quiesce = 2 		| 
----------------------------------------------------------
				| if (conf->quiesce == 0)
				| /* false */		
----------------------------------------------------------
conf->quiesce = 1		|
----------------------------------------------------------
conf->quiesce = 0 		|
/* resuming from quiesce. */ 	|
----------------------------------------------------------
				| if (smp_load_acquire(&conf->quiesce) != 0)
				| /* false */
----------------------------------------------------------

So we skipped both conditions and did not increment active_aligned_reads, which
would result in an error down the road (endio/retry/next-quiesce/etc.).
It's really unlikely, but it is possible.

So I fixed it by changing:
-	if (smp_load_acquire(&conf->quiesce) != 0) {
+	if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {

Now with this fix in place, there's still the possibility that
active_aligned_reads is >0 while quiesce==1 (before read_one_chunk()
reaches the second condition).  I can't find a bug caused by that, but IMO it's
quite counter-intuitive given the wait_event(active_aligned_reads==0) in
raid5_quiesce(). Should I add a comment about that point?

I'm adding the new code below for reference, please let me know what you think
about that point before I re-submit the patch.

thanks,
Gal Ofri
--

@@ -5396,6 +5396,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
+	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
 		pr_debug("%s: non aligned\n", __func__);
@@ -5443,11 +5444,23 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	/* No reshape active, so we can trust rdev->data_offset */
 	align_bio->bi_iter.bi_sector += rdev->data_offset;
 
-	spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-			    conf->device_lock);
-	atomic_inc(&conf->active_aligned_reads);
-	spin_unlock_irq(&conf->device_lock);
+	did_inc = false;
+	if (conf->quiesce == 0) {
+		atomic_inc(&conf->active_aligned_reads);
+		did_inc = true;
+	}
+	/* need a memory barrier to detect the race with raid5_quiesce() */
+	if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
+		/* quiesce is in progress, so we need to undo io activation and wait
+		 * for it to finish */
+		if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
+			wake_up(&conf->wait_for_quiescent);
+		spin_lock_irq(&conf->device_lock);
+		wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
+				    conf->device_lock);
+		atomic_inc(&conf->active_aligned_reads);
+		spin_unlock_irq(&conf->device_lock);
+	}
 
 	if (mddev->gendisk)
 		trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -8334,7 +8347,9 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		 * active stripes can drain
 		 */
 		r5c_flush_cache(conf, INT_MAX);
-		conf->quiesce = 2;
+		/* need a memory barrier to make sure read_one_chunk() sees
+		 * quiesce started and reverts to slow (locked) path. */
+		smp_store_release(&conf->quiesce, 2);
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,



