All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] MD: fix lock contention for flush bios
@ 2018-01-24  2:43 Xiao Ni
  2018-01-24  9:02 ` Guoqing Jiang
  2018-02-17 21:22 ` Shaohua Li
  0 siblings, 2 replies; 6+ messages in thread
From: Xiao Ni @ 2018-01-24  2:43 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, ming.lei, ncroxon

There is a lock contention when there are many processes which send flush bios
to md device. eg. Create many lvs on one raid device and mkfs.xfs on each lv.

Now it can only handle flush requests sequentially. It needs to wait for mddev->flush_bio
to become NULL while holding mddev->lock.

This patch removes mddev->flush_bio and allocates one bio for each flush bio.
I did a test with command dbench -s 128 -t 4800. This is the test result:

=================Before the patch============================

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX     124271     5.128 12752.344
 Close          96922     0.645  6250.826
 Rename          5141     1.556  6747.033
 Unlink         20283     5.045 12752.294
 Qpathinfo     111872     0.012   250.069
 Qfileinfo      25684     0.004     0.293
 Qfsinfo        18535     0.007     0.646
 Sfileinfo      11763     0.014     0.571
 Find           41018     0.026     0.869
 WriteX         88616  6750.094 80740.875
 ReadX         179250     0.018  1749.302
 LockX            328     0.008     0.042
 UnlockX          328     0.004     0.023
 Flush           9650  1688.288 38998.814

Throughput 1.01668 MB/sec (sync open)  128 clients  128 procs  max_latency=80740.883 ms

========================after patch============================
 128  cleanup 600 sec
   0  cleanup 600 sec

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX      52444     1.403  1499.920
 Close          45740     0.035  1000.301
 Rename          2090     0.073     1.591
 Unlink          5032     5.855  1251.375
 Qpathinfo      44493     0.167   501.498
 Qfileinfo      16261     0.004     0.577
 Qfsinfo         5887     0.011     1.133
 Sfileinfo       6303     0.022     1.277
 Find           15321     0.034     1.041
 WriteX         62730  1197.684  3247.813
 ReadX          60379     0.013     2.408
 LockX             48     0.010     0.039
 UnlockX           48     0.004     0.006
 Flush           4370   382.070   999.735

Throughput 5.06007 MB/sec (sync open)  128 clients  128 procs  max_latency=3247.823 ms

The test was done on one raid10 array with 4 SSDs.

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
---
 drivers/md/md.c | 105 +++++++++++++++++++++++++-------------------------------
 drivers/md/md.h |  14 ++++----
 2 files changed, 54 insertions(+), 65 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e4dee0..1e562f5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -409,33 +409,61 @@ static int md_congested(void *data, int bits)
 	return mddev_congested(mddev, bits);
 }
 
-/*
- * Generic flush handling for md
- */
+static void submit_flushes(struct work_struct *ws)
+{
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *bio = fi->bio;
+
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, bio);
 
-static void md_end_flush(struct bio *bio)
+	kfree(fi);
+}
+
+
+static void md_end_flush(struct bio *flush_bio)
 {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_info *fi = flush_bio->bi_private;
+	struct bio *bio = fi->bio;
+	struct mddev *mddev = fi->mddev;
+	struct md_rdev *rdev;
 
-	rdev_dec_pending(rdev, mddev);
+	rcu_read_lock();
+		rdev_for_each_rcu(rdev, mddev)
+			rdev_dec_pending(rdev, mddev);
+	rcu_read_unlock();
 
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (bio->bi_iter.bi_size == 0)
+		/* an empty barrier - all done */
+		bio_endio(bio);
+	else {
+		INIT_WORK(&fi->flush_work, submit_flushes);
+		queue_work(md_wq, &fi->flush_work);
 	}
-	bio_put(bio);
 }
 
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct flush_info *fi;
+	struct bio *f_bio;
+
+	fi = kmalloc(sizeof(*fi), GFP_NOIO);
+	if (fi == NULL) {
+		pr_err("md: %s failed to alloc memory for flush bio\n",
+		       mdname(mddev));
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+	}
+
+	fi->bio = bio;
+	fi->mddev = mddev;
+	f_bio = &fi->flush_bio;
+	bio_init(f_bio, NULL, 0);
+	f_bio->bi_private = fi;
+	f_bio->bi_end_io = md_end_flush;
 
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -449,54 +477,16 @@ static void submit_flushes(struct work_struct *ws)
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
 			bio_set_dev(bi, rdev->bdev);
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+			bio_chain(bi, f_bio);
 			submit_bio(bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
-
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
-	}
-}
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
 
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
+	bio_endio(f_bio);
 }
 EXPORT_SYMBOL(md_flush_request);
 
@@ -555,7 +545,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7d6bcf0..16e7f03 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,13 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
+struct flush_info {
+	struct bio *bio;
+	struct bio flush_bio;
+	struct mddev *mddev;
+	struct work_struct flush_work;
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -457,13 +464,6 @@ struct mddev {
 						   * metadata and bitmap writes
 						   */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] MD: fix lock contention for flush bios
  2018-01-24  2:43 [RFC PATCH] MD: fix lock contention for flush bios Xiao Ni
@ 2018-01-24  9:02 ` Guoqing Jiang
  2018-01-24 13:41   ` Xiao Ni
  2018-02-17 21:22 ` Shaohua Li
  1 sibling, 1 reply; 6+ messages in thread
From: Guoqing Jiang @ 2018-01-24  9:02 UTC (permalink / raw)
  To: Xiao Ni, linux-raid; +Cc: shli, neilb, ming.lei, ncroxon



On 01/24/2018 10:43 AM, Xiao Ni wrote:
> There is a lock contention when there are many processes which send flush bios
> to md device. eg. Create many lvs on one raid device and mkfs.xfs on each lv.
>
> Now it just can handle flush request sequentially. It needs to wait mddev->flush_bio
> to be NULL, otherwise get mddev->lock.

With the new approach, can we still keep the synchronization across all 
devices?
I found the previous commit a2826aa92e2e ("md: support barrier requests 
on all
personalities") did want to keep synchronization.

[snip]

> Suggested-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Xiao Ni <xni@redhat.com>
> ---
>   drivers/md/md.c | 105 +++++++++++++++++++++++++-------------------------------
>   drivers/md/md.h |  14 ++++----
>   2 files changed, 54 insertions(+), 65 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4e4dee0..1e562f5 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -409,33 +409,61 @@ static int md_congested(void *data, int bits)
>   	return mddev_congested(mddev, bits);
>   }
>   
> -/*
> - * Generic flush handling for md
> - */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *bio = fi->bio;
> +
> +	bio->bi_opf &= ~REQ_PREFLUSH;
> +	md_handle_request(mddev, bio);
>   
> -static void md_end_flush(struct bio *bio)
> +	kfree(fi);
> +}
> +
> +

An extra blank line above.

> +static void md_end_flush(struct bio *flush_bio)
>   {
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> +	struct flush_info *fi = flush_bio->bi_private;
> +	struct bio *bio = fi->bio;
> +	struct mddev *mddev = fi->mddev;
> +	struct md_rdev *rdev;
>   
> -	rdev_dec_pending(rdev, mddev);
> +	rcu_read_lock();
> +		rdev_for_each_rcu(rdev, mddev)
> +			rdev_dec_pending(rdev, mddev);

Unnecessary indentation.

> +	rcu_read_unlock();
>   
> -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> +	if (bio->bi_iter.bi_size == 0)
> +		/* an empty barrier - all done */
> +		bio_endio(bio);
> +	else {
> +		INIT_WORK(&fi->flush_work, submit_flushes);
> +		queue_work(md_wq, &fi->flush_work);
>   	}
> -	bio_put(bio);
>   }
>   
> -static void md_submit_flush_data(struct work_struct *ws);
> -
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *bio)
>   {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>   	struct md_rdev *rdev;
> +	struct flush_info *fi;
> +	struct bio *f_bio;
> +
> +	fi = kmalloc(sizeof(*fi), GFP_NOIO);
> +	if (fi == NULL) {
> +		pr_err("md: %s failed to alloc memory for flush bio\n",
> +		       mdname(mddev));
> +		bio->bi_status = BLK_STS_IOERR;
> +		bio_endio(bio);

Maybe you missed "return" here.

Thanks,
Guoqing

> +	}
> +
> +	fi->bio = bio;
> +	fi->mddev = mddev;
> +	f_bio = &fi->flush_bio;
> +	bio_init(f_bio, NULL, 0);
> +	f_bio->bi_private = fi;
> +	f_bio->bi_end_io = md_end_flush;
>   
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
>   	rcu_read_lock();
>   	rdev_for_each_rcu(rdev, mddev)
>   		if (rdev->raid_disk >= 0 &&
> @@ -449,54 +477,16 @@ static void submit_flushes(struct work_struct *ws)
>   			atomic_inc(&rdev->nr_pending);
>   			rcu_read_unlock();
>   			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
>   			bio_set_dev(bi, rdev->bdev);
>   			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -			atomic_inc(&mddev->flush_pending);
> +			bio_chain(bi, f_bio);
>   			submit_bio(bi);
>   			rcu_read_lock();
>   			rdev_dec_pending(rdev, mddev);
>   		}
>   	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
> -
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
> -
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	mddev->flush_bio = NULL;
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0)
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -		md_handle_request(mddev, bio);
> -	}
> -}
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	spin_lock_irq(&mddev->lock);
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio,
> -			    mddev->lock);
> -	mddev->flush_bio = bio;
> -	spin_unlock_irq(&mddev->lock);
>   
> -	INIT_WORK(&mddev->flush_work, submit_flushes);
> -	queue_work(md_wq, &mddev->flush_work);
> +	bio_endio(f_bio);
>   }
>   EXPORT_SYMBOL(md_flush_request);
>   
> @@ -555,7 +545,6 @@ void mddev_init(struct mddev *mddev)
>   	atomic_set(&mddev->openers, 0);
>   	atomic_set(&mddev->active_io, 0);
>   	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
>   	init_waitqueue_head(&mddev->sb_wait);
>   	init_waitqueue_head(&mddev->recovery_wait);
>   	mddev->reshape_position = MaxSector;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 7d6bcf0..16e7f03 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,13 @@ enum mddev_sb_flags {
>   	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
>   };
>   
> +struct flush_info {
> +	struct bio *bio;
> +	struct bio flush_bio;
> +	struct mddev *mddev;
> +	struct work_struct flush_work;
> +};
> +
>   struct mddev {
>   	void				*private;
>   	struct md_personality		*pers;
> @@ -457,13 +464,6 @@ struct mddev {
>   						   * metadata and bitmap writes
>   						   */
>   
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> -	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	struct work_struct flush_work;
>   	struct work_struct event_work;	/* used by dm to report failure event */
>   	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>   	struct md_cluster_info		*cluster_info;


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] MD: fix lock contention for flush bios
  2018-01-24  9:02 ` Guoqing Jiang
@ 2018-01-24 13:41   ` Xiao Ni
       [not found]     ` <3b9cdab1-c6ac-c40c-4cd8-4a6ea20b5547@suse.com>
  0 siblings, 1 reply; 6+ messages in thread
From: Xiao Ni @ 2018-01-24 13:41 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb, ming lei, ncroxon



----- Original Message -----
> From: "Guoqing Jiang" <gqjiang@suse.com>
> To: "Xiao Ni" <xni@redhat.com>, linux-raid@vger.kernel.org
> Cc: shli@kernel.org, neilb@suse.com, "ming lei" <ming.lei@redhat.com>, ncroxon@redhat.com
> Sent: Wednesday, January 24, 2018 5:02:57 PM
> Subject: Re: [RFC PATCH] MD: fix lock contention for flush bios
> 
> 
> 
> On 01/24/2018 10:43 AM, Xiao Ni wrote:
> > There is a lock contention when there are many processes which send flush
> > bios
> > to md device. eg. Create many lvs on one raid device and mkfs.xfs on each
> > lv.
> >
> > Now it just can handle flush request sequentially. It needs to wait
> > mddev->flush_bio
> > to be NULL, otherwise get mddev->lock.
> 
> With the new approach, can we still keep the synchronization across all
> devices?
> I found the previous commit a2826aa92e2e ("md: support barrier requests
> on all
> personalities") did want to keep synchronization.


When one flush bio is submitted to md, it creates one bio for each rdev to send a flush request.
If it must wait for all the flush requests to return, my patch breaks the rule.

Process A submits a flush bio to md, we call this flush bio bio-a.
Process B submits a flush bio to md, we call this flush bio bio-b.

Before my patch only one process can handle a flush bio at a time. Process B waits for the return
of bio-b before proceeding. After my patch flush bios from all processes can be handled at once. Can't
we handle flush bios from different processes at the same time?

Xiao

> 
> [snip]
> 
> > Suggested-by: Ming Lei <ming.lei@redhat.com>
> > Signed-off-by: Xiao Ni <xni@redhat.com>
> > ---
> >   drivers/md/md.c | 105
> >   +++++++++++++++++++++++++-------------------------------
> >   drivers/md/md.h |  14 ++++----
> >   2 files changed, 54 insertions(+), 65 deletions(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 4e4dee0..1e562f5 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -409,33 +409,61 @@ static int md_congested(void *data, int bits)
> >   	return mddev_congested(mddev, bits);
> >   }
> >   
> > -/*
> > - * Generic flush handling for md
> > - */
> > +static void submit_flushes(struct work_struct *ws)
> > +{
> > +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> > +	struct mddev *mddev = fi->mddev;
> > +	struct bio *bio = fi->bio;
> > +
> > +	bio->bi_opf &= ~REQ_PREFLUSH;
> > +	md_handle_request(mddev, bio);
> >   
> > -static void md_end_flush(struct bio *bio)
> > +	kfree(fi);
> > +}
> > +
> > +
> 
> An extra blank line above.
> 
> > +static void md_end_flush(struct bio *flush_bio)
> >   {
> > -	struct md_rdev *rdev = bio->bi_private;
> > -	struct mddev *mddev = rdev->mddev;
> > +	struct flush_info *fi = flush_bio->bi_private;
> > +	struct bio *bio = fi->bio;
> > +	struct mddev *mddev = fi->mddev;
> > +	struct md_rdev *rdev;
> >   
> > -	rdev_dec_pending(rdev, mddev);
> > +	rcu_read_lock();
> > +		rdev_for_each_rcu(rdev, mddev)
> > +			rdev_dec_pending(rdev, mddev);
> 
> Unnecessary indentation.
> 
> > +	rcu_read_unlock();
> >   
> > -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> > -		/* The pre-request flush has finished */
> > -		queue_work(md_wq, &mddev->flush_work);
> > +	if (bio->bi_iter.bi_size == 0)
> > +		/* an empty barrier - all done */
> > +		bio_endio(bio);
> > +	else {
> > +		INIT_WORK(&fi->flush_work, submit_flushes);
> > +		queue_work(md_wq, &fi->flush_work);
> >   	}
> > -	bio_put(bio);
> >   }
> >   
> > -static void md_submit_flush_data(struct work_struct *ws);
> > -
> > -static void submit_flushes(struct work_struct *ws)
> > +void md_flush_request(struct mddev *mddev, struct bio *bio)
> >   {
> > -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> >   	struct md_rdev *rdev;
> > +	struct flush_info *fi;
> > +	struct bio *f_bio;
> > +
> > +	fi = kmalloc(sizeof(*fi), GFP_NOIO);
> > +	if (fi == NULL) {
> > +		pr_err("md: %s failed to alloc memory for flush bio\n",
> > +		       mdname(mddev));
> > +		bio->bi_status = BLK_STS_IOERR;
> > +		bio_endio(bio);
> 
> Maybe you missed "return" here.
> 
> Thanks,
> Guoqing
> 
> > +	}
> > +
> > +	fi->bio = bio;
> > +	fi->mddev = mddev;
> > +	f_bio = &fi->flush_bio;
> > +	bio_init(f_bio, NULL, 0);
> > +	f_bio->bi_private = fi;
> > +	f_bio->bi_end_io = md_end_flush;
> >   
> > -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> > -	atomic_set(&mddev->flush_pending, 1);
> >   	rcu_read_lock();
> >   	rdev_for_each_rcu(rdev, mddev)
> >   		if (rdev->raid_disk >= 0 &&
> > @@ -449,54 +477,16 @@ static void submit_flushes(struct work_struct *ws)
> >   			atomic_inc(&rdev->nr_pending);
> >   			rcu_read_unlock();
> >   			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> > -			bi->bi_end_io = md_end_flush;
> > -			bi->bi_private = rdev;
> >   			bio_set_dev(bi, rdev->bdev);
> >   			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> > -			atomic_inc(&mddev->flush_pending);
> > +			bio_chain(bi, f_bio);
> >   			submit_bio(bi);
> >   			rcu_read_lock();
> >   			rdev_dec_pending(rdev, mddev);
> >   		}
> >   	rcu_read_unlock();
> > -	if (atomic_dec_and_test(&mddev->flush_pending))
> > -		queue_work(md_wq, &mddev->flush_work);
> > -}
> > -
> > -static void md_submit_flush_data(struct work_struct *ws)
> > -{
> > -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> > -	struct bio *bio = mddev->flush_bio;
> > -
> > -	/*
> > -	 * must reset flush_bio before calling into md_handle_request to avoid a
> > -	 * deadlock, because other bios passed md_handle_request suspend check
> > -	 * could wait for this and below md_handle_request could wait for those
> > -	 * bios because of suspend check
> > -	 */
> > -	mddev->flush_bio = NULL;
> > -	wake_up(&mddev->sb_wait);
> > -
> > -	if (bio->bi_iter.bi_size == 0)
> > -		/* an empty barrier - all done */
> > -		bio_endio(bio);
> > -	else {
> > -		bio->bi_opf &= ~REQ_PREFLUSH;
> > -		md_handle_request(mddev, bio);
> > -	}
> > -}
> > -
> > -void md_flush_request(struct mddev *mddev, struct bio *bio)
> > -{
> > -	spin_lock_irq(&mddev->lock);
> > -	wait_event_lock_irq(mddev->sb_wait,
> > -			    !mddev->flush_bio,
> > -			    mddev->lock);
> > -	mddev->flush_bio = bio;
> > -	spin_unlock_irq(&mddev->lock);
> >   
> > -	INIT_WORK(&mddev->flush_work, submit_flushes);
> > -	queue_work(md_wq, &mddev->flush_work);
> > +	bio_endio(f_bio);
> >   }
> >   EXPORT_SYMBOL(md_flush_request);
> >   
> > @@ -555,7 +545,6 @@ void mddev_init(struct mddev *mddev)
> >   	atomic_set(&mddev->openers, 0);
> >   	atomic_set(&mddev->active_io, 0);
> >   	spin_lock_init(&mddev->lock);
> > -	atomic_set(&mddev->flush_pending, 0);
> >   	init_waitqueue_head(&mddev->sb_wait);
> >   	init_waitqueue_head(&mddev->recovery_wait);
> >   	mddev->reshape_position = MaxSector;
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index 7d6bcf0..16e7f03 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -252,6 +252,13 @@ enum mddev_sb_flags {
> >   	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
> >   };
> >   
> > +struct flush_info {
> > +	struct bio *bio;
> > +	struct bio flush_bio;
> > +	struct mddev *mddev;
> > +	struct work_struct flush_work;
> > +};
> > +
> >   struct mddev {
> >   	void				*private;
> >   	struct md_personality		*pers;
> > @@ -457,13 +464,6 @@ struct mddev {
> >   						   * metadata and bitmap writes
> >   						   */
> >   
> > -	/* Generic flush handling.
> > -	 * The last to finish preflush schedules a worker to submit
> > -	 * the rest of the request (without the REQ_PREFLUSH flag).
> > -	 */
> > -	struct bio *flush_bio;
> > -	atomic_t flush_pending;
> > -	struct work_struct flush_work;
> >   	struct work_struct event_work;	/* used by dm to report failure event */
> >   	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
> >   	struct md_cluster_info		*cluster_info;
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] MD: fix lock contention for flush bios
       [not found]     ` <3b9cdab1-c6ac-c40c-4cd8-4a6ea20b5547@suse.com>
@ 2018-01-26  2:07       ` Ming Lei
  2018-01-26  2:23         ` Guoqing Jiang
  0 siblings, 1 reply; 6+ messages in thread
From: Ming Lei @ 2018-01-26  2:07 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: Xiao Ni, linux-raid, shli, neilb, ncroxon

Hi Guoqing,

On Fri, Jan 26, 2018 at 09:10:11AM +0800, Guoqing Jiang wrote:
> 
> 
> On 01/24/2018 09:41 PM, Xiao Ni wrote:
> > ----- Original Message -----
> > > From: "Guoqing Jiang"<gqjiang@suse.com>
> > > To: "Xiao Ni"<xni@redhat.com>,linux-raid@vger.kernel.org
> > > Cc:shli@kernel.org,neilb@suse.com, "ming lei"<ming.lei@redhat.com>,ncroxon@redhat.com
> > > Sent: Wednesday, January 24, 2018 5:02:57 PM
> > > Subject: Re: [RFC PATCH] MD: fix lock contention for flush bios
> > > 
> > > 
> > > 
> > > On 01/24/2018 10:43 AM, Xiao Ni wrote:
> > > > There is a lock contention when there are many processes which send flush
> > > > bios
> > > > to md device. eg. Create many lvs on one raid device and mkfs.xfs on each
> > > > lv.
> > > > 
> > > > Now it just can handle flush request sequentially. It needs to wait
> > > > mddev->flush_bio
> > > > to be NULL, otherwise get mddev->lock.
> > > With the new approach, can we still keep the synchronization across all
> > > devices?
> > > I found the previous commit a2826aa92e2e ("md: support barrier requests
> > > on all
> > > personalities") did want to keep synchronization.
> > When one flush bio is sumbitted to md, it creates one bio for each rdev to send flush request.
> > If it must waits for all the flush requests to return, my patch breaks the rule.
> > 
> > Process A submits a flush bio to md, we call this flush bio bio-a.
> > Process B submits a flush bio to md, we call this flush bio bio-b.
> > 
> > Before my patch there is only one process can handle flush bio. Process B waits for the returning
> > of bio-b from Process B. After my patch it can handle the flush bios from all processes. Can't
> > we handle flush bios from different processes at the same time?
> 
> Hmm, barrier does need the synchronization though it caused the contention.

Seems this patch only touches code of handling MD flush request, not related
with barrier, could you explain a bit if it is?

> For flush/fua, I guess
> we may not need it since fs need to ensure that certain requests are
> executed in order, but I am not
> so sure about it.

From block layer's view, there isn't order in handling flush request
among different IOs, and it is just a command sent to hardware for flushing
the internal cache in drive.

The current strictly serialized style of handling MD flush request does hurt
performance, and IMO, the idea of the patch should be correct, at least from
the view of handling flush request.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] MD: fix lock contention for flush bios
  2018-01-26  2:07       ` Ming Lei
@ 2018-01-26  2:23         ` Guoqing Jiang
  0 siblings, 0 replies; 6+ messages in thread
From: Guoqing Jiang @ 2018-01-26  2:23 UTC (permalink / raw)
  To: Ming Lei; +Cc: Xiao Ni, linux-raid, shli, neilb, ncroxon



On 01/26/2018 10:07 AM, Ming Lei wrote:
> Hi Guoqing,
>
> On Fri, Jan 26, 2018 at 09:10:11AM +0800, Guoqing Jiang wrote:
>>
>> On 01/24/2018 09:41 PM, Xiao Ni wrote:
>>> ----- Original Message -----
>>>> From: "Guoqing Jiang"<gqjiang@suse.com>
>>>> To: "Xiao Ni"<xni@redhat.com>,linux-raid@vger.kernel.org
>>>> Cc:shli@kernel.org,neilb@suse.com, "ming lei"<ming.lei@redhat.com>,ncroxon@redhat.com
>>>> Sent: Wednesday, January 24, 2018 5:02:57 PM
>>>> Subject: Re: [RFC PATCH] MD: fix lock contention for flush bios
>>>>
>>>>
>>>>
>>>> On 01/24/2018 10:43 AM, Xiao Ni wrote:
>>>>> There is a lock contention when there are many processes which send flush
>>>>> bios
>>>>> to md device. eg. Create many lvs on one raid device and mkfs.xfs on each
>>>>> lv.
>>>>>
>>>>> Now it just can handle flush request sequentially. It needs to wait
>>>>> mddev->flush_bio
>>>>> to be NULL, otherwise get mddev->lock.
>>>> With the new approach, can we still keep the synchronization across all
>>>> devices?
>>>> I found the previous commit a2826aa92e2e ("md: support barrier requests
>>>> on all
>>>> personalities") did want to keep synchronization.
>>> When one flush bio is sumbitted to md, it creates one bio for each rdev to send flush request.
>>> If it must waits for all the flush requests to return, my patch breaks the rule.
>>>
>>> Process A submits a flush bio to md, we call this flush bio bio-a.
>>> Process B submits a flush bio to md, we call this flush bio bio-b.
>>>
>>> Before my patch there is only one process can handle flush bio. Process B waits for the returning
>>> of bio-b from Process B. After my patch it can handle the flush bios from all processes. Can't
>>> we handle flush bios from different processes at the same time?
>> Hmm, barrier does need the synchronization though it caused the contention.
> Seems this patch only touches code of handling MD flush request, not related
> with barrier, could you explain a bit if it is?

Hmm, I didn't notice the barrier had been converted to flush/fua long 
time ago.
And some comments inside the code need to be updated I think.

>> For flush/fua, I guess
>> we may not need it since fs need to ensure that certain requests are
>> executed in order, but I am not
>> so sure about it.
>  From block layer's view, there isn't order in handling flush request
> among different IOs, and it is just a command sent to hardware for flushing
> the internal cache in drive.
>
> The current strictly serialized style of handling MD flush request does hurt
> performance, and IMO, the idea of the patch should be correct, at least from
> the view of handling flush request.

Thanks for the explanation, no more concern from me.

Regards,
Guoqing

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] MD: fix lock contention for flush bios
  2018-01-24  2:43 [RFC PATCH] MD: fix lock contention for flush bios Xiao Ni
  2018-01-24  9:02 ` Guoqing Jiang
@ 2018-02-17 21:22 ` Shaohua Li
  1 sibling, 0 replies; 6+ messages in thread
From: Shaohua Li @ 2018-02-17 21:22 UTC (permalink / raw)
  To: Xiao Ni; +Cc: linux-raid, neilb, ming.lei, ncroxon

On Wed, Jan 24, 2018 at 10:43:54AM +0800, Xiao Ni wrote:
> There is a lock contention when there are many processes which send flush bios
> to md device. eg. Create many lvs on one raid device and mkfs.xfs on each lv.
> 
> Now it can only handle flush requests sequentially. It needs to wait for mddev->flush_bio
> to become NULL, holding mddev->lock while it waits.
> 
> This patch removes mddev->flush_bio and allocates one bio for each flush bio.
> I did a test with command dbench -s 128 -t 4800. This is the test result:
> 
> =================Before the patch============================
> 
>  Operation      Count    AvgLat    MaxLat
>  ----------------------------------------
>  NTCreateX     124271     5.128 12752.344
>  Close          96922     0.645  6250.826
>  Rename          5141     1.556  6747.033
>  Unlink         20283     5.045 12752.294
>  Qpathinfo     111872     0.012   250.069
>  Qfileinfo      25684     0.004     0.293
>  Qfsinfo        18535     0.007     0.646
>  Sfileinfo      11763     0.014     0.571
>  Find           41018     0.026     0.869
>  WriteX         88616  6750.094 80740.875
>  ReadX         179250     0.018  1749.302
>  LockX            328     0.008     0.042
>  UnlockX          328     0.004     0.023
>  Flush           9650  1688.288 38998.814
> 
> Throughput 1.01668 MB/sec (sync open)  128 clients  128 procs  max_latency=80740.883 ms
> 
> ========================after patch============================
>  128  cleanup 600 sec
>    0  cleanup 600 sec
> 
>  Operation      Count    AvgLat    MaxLat
>  ----------------------------------------
>  NTCreateX      52444     1.403  1499.920
>  Close          45740     0.035  1000.301
>  Rename          2090     0.073     1.591
>  Unlink          5032     5.855  1251.375
>  Qpathinfo      44493     0.167   501.498
>  Qfileinfo      16261     0.004     0.577
>  Qfsinfo         5887     0.011     1.133
>  Sfileinfo       6303     0.022     1.277
>  Find           15321     0.034     1.041
>  WriteX         62730  1197.684  3247.813
>  ReadX          60379     0.013     2.408
>  LockX             48     0.010     0.039
>  UnlockX           48     0.004     0.006
>  Flush           4370   382.070   999.735
> 
> Throughput 5.06007 MB/sec (sync open)  128 clients  128 procs  max_latency=3247.823 ms
> 
> The test was done on one raid10 array with 4 SSDs.
> 
> Suggested-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Xiao Ni <xni@redhat.com>
> ---
>  drivers/md/md.c | 105 +++++++++++++++++++++++++-------------------------------
>  drivers/md/md.h |  14 ++++----
>  2 files changed, 54 insertions(+), 65 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4e4dee0..1e562f5 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -409,33 +409,61 @@ static int md_congested(void *data, int bits)
>  	return mddev_congested(mddev, bits);
>  }
>  
> -/*
> - * Generic flush handling for md
> - */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *bio = fi->bio;
> +
> +	bio->bi_opf &= ~REQ_PREFLUSH;
> +	md_handle_request(mddev, bio);
>  
> -static void md_end_flush(struct bio *bio)
> +	kfree(fi);
> +}
> +
> +
> +static void md_end_flush(struct bio *flush_bio)
>  {
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> +	struct flush_info *fi = flush_bio->bi_private;
> +	struct bio *bio = fi->bio;
> +	struct mddev *mddev = fi->mddev;
> +	struct md_rdev *rdev;
>  
> -	rdev_dec_pending(rdev, mddev);
> +	rcu_read_lock();
> +		rdev_for_each_rcu(rdev, mddev)
> +			rdev_dec_pending(rdev, mddev);
> +	rcu_read_unlock();
>  
> -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> +	if (bio->bi_iter.bi_size == 0)
> +		/* an empty barrier - all done */
> +		bio_endio(bio);
> +	else {
> +		INIT_WORK(&fi->flush_work, submit_flushes);
> +		queue_work(md_wq, &fi->flush_work);
>  	}
> -	bio_put(bio);
>  }
>  
> -static void md_submit_flush_data(struct work_struct *ws);
> -
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *bio)
>  {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>  	struct md_rdev *rdev;
> +	struct flush_info *fi;
> +	struct bio *f_bio;
> +
> +	fi = kmalloc(sizeof(*fi), GFP_NOIO);
> +	if (fi == NULL) {
> +		pr_err("md: %s failed to alloc memory for flush bio\n",
> +		       mdname(mddev));
> +		bio->bi_status = BLK_STS_IOERR;
> +		bio_endio(bio);
> +	}

So we allocate a flush_info for each thread calling into flush. This is a
little concerning to me. I'm wondering if this optimization is really necessary.
In practice, flush is called very rarely.

Thanks,
Shaohua
> +	fi->bio = bio;
> +	fi->mddev = mddev;
> +	f_bio = &fi->flush_bio;
> +	bio_init(f_bio, NULL, 0);
> +	f_bio->bi_private = fi;
> +	f_bio->bi_end_io = md_end_flush;
>  
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
>  	rcu_read_lock();
>  	rdev_for_each_rcu(rdev, mddev)
>  		if (rdev->raid_disk >= 0 &&
> @@ -449,54 +477,16 @@ static void submit_flushes(struct work_struct *ws)
>  			atomic_inc(&rdev->nr_pending);
>  			rcu_read_unlock();
>  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
>  			bio_set_dev(bi, rdev->bdev);
>  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -			atomic_inc(&mddev->flush_pending);
> +			bio_chain(bi, f_bio);
>  			submit_bio(bi);
>  			rcu_read_lock();
>  			rdev_dec_pending(rdev, mddev);
>  		}
>  	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
> -
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
> -
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	mddev->flush_bio = NULL;
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0)
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -		md_handle_request(mddev, bio);
> -	}
> -}
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	spin_lock_irq(&mddev->lock);
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio,
> -			    mddev->lock);
> -	mddev->flush_bio = bio;
> -	spin_unlock_irq(&mddev->lock);
>  
> -	INIT_WORK(&mddev->flush_work, submit_flushes);
> -	queue_work(md_wq, &mddev->flush_work);
> +	bio_endio(f_bio);
>  }
>  EXPORT_SYMBOL(md_flush_request);
>  
> @@ -555,7 +545,6 @@ void mddev_init(struct mddev *mddev)
>  	atomic_set(&mddev->openers, 0);
>  	atomic_set(&mddev->active_io, 0);
>  	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
>  	init_waitqueue_head(&mddev->sb_wait);
>  	init_waitqueue_head(&mddev->recovery_wait);
>  	mddev->reshape_position = MaxSector;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 7d6bcf0..16e7f03 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,13 @@ enum mddev_sb_flags {
>  	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
>  };
>  
> +struct flush_info {
> +	struct bio *bio;
> +	struct bio flush_bio;
> +	struct mddev *mddev;
> +	struct work_struct flush_work;
> +};
> +
>  struct mddev {
>  	void				*private;
>  	struct md_personality		*pers;
> @@ -457,13 +464,6 @@ struct mddev {
>  						   * metadata and bitmap writes
>  						   */
>  
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> -	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	struct work_struct flush_work;
>  	struct work_struct event_work;	/* used by dm to report failure event */
>  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>  	struct md_cluster_info		*cluster_info;
> -- 
> 2.7.4
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2018-02-17 21:22 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-24  2:43 [RFC PATCH] MD: fix lock contention for flush bios Xiao Ni
2018-01-24  9:02 ` Guoqing Jiang
2018-01-24 13:41   ` Xiao Ni
     [not found]     ` <3b9cdab1-c6ac-c40c-4cd8-4a6ea20b5547@suse.com>
2018-01-26  2:07       ` Ming Lei
2018-01-26  2:23         ` Guoqing Jiang
2018-02-17 21:22 ` Shaohua Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.