* [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io'
@ 2023-06-19 20:48 Yu Kuai
  2023-06-19 20:48 ` [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c Yu Kuai
                   ` (8 more replies)
  0 siblings, 9 replies; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

This patchset does the following things:
 - make io accounting for all raid levels consistent, patches 1 and 3-5;
 - enable io accounting for all raid levels, patches 6-8;
 - hold 'active_io' before io starts to dispatch, and release 'active_io'
 when io is done, so that mddev_suspend() will wait for io to be done,
 patch 2.
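
As a rough sketch of the intended 'active_io' lifecycle after this series
(simplified pseudo-C, not verbatim md.c code; submit_to_member_disks() is
a stand-in for each personality's real submission path):

	/* submission path */
	md_account_bio(mddev, &bio);	/* percpu_ref_get(&mddev->active_io);
					 * *bio is replaced by a clone */
	submit_to_member_disks(bio);	/* may return before io completes */

	/* completion path, the clone's ->bi_end_io */
	md_end_clone_io(clone);		/* ends the original bio and does
					 * percpu_ref_put(&mddev->active_io) */

With this, 'active_io' is held for as long as the io is truly inflight,
which is exactly what mddev_suspend() needs to wait for.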

Yu Kuai (8):
  md: move initialization and destruction of 'io_acct_set' to md.c
  md: also clone new io if io accounting is disabled
  raid5: fix missing io accounting in raid5_align_endio()
  md/raid1: switch to use md_account_bio() for io accounting
  md/raid10: switch to use md_account_bio() for io accounting
  md/md-multipath: enable io accounting
  md/md-linear: enable io accounting
  md/md-faulty: enable io accounting

 drivers/md/md-faulty.c    |  2 +
 drivers/md/md-linear.c    |  1 +
 drivers/md/md-multipath.c |  1 +
 drivers/md/md.c           | 78 +++++++++++++++++++--------------------
 drivers/md/md.h           |  6 +--
 drivers/md/raid0.c        | 16 +-------
 drivers/md/raid1.c        | 14 +++----
 drivers/md/raid1.h        |  1 -
 drivers/md/raid10.c       | 20 +++++-----
 drivers/md/raid10.h       |  1 -
 drivers/md/raid5.c        | 70 ++++++++++-------------------------
 11 files changed, 79 insertions(+), 131 deletions(-)

-- 
2.39.2



* [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  8:35   ` Xiao Ni
  2023-06-19 20:48 ` [PATCH -next 2/8] md: also clone new io if io accounting is disabled Yu Kuai
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

'io_acct_set' is only used for raid0 and raid456, prepare to use it for
raid1 and raid10, so that io accounting from different levels can be
consistent.

By the way, follow up patches will also use this io clone mechanism to
make sure 'active_io' represents in flight io, not io that is dispatching,
so that mddev_suspend will wait for io to be done as desgined.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md.c    | 27 ++++++++++-----------------
 drivers/md/md.h    |  2 --
 drivers/md/raid0.c | 16 ++--------------
 drivers/md/raid5.c | 41 +++++++++++------------------------------
 4 files changed, 23 insertions(+), 63 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8d62f85d2ab0..42347289195a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5886,6 +5886,13 @@ int md_run(struct mddev *mddev)
 			goto exit_bio_set;
 	}
 
+	if (!bioset_initialized(&mddev->io_acct_set)) {
+		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
+				  offsetof(struct md_io_acct, bio_clone), 0);
+		if (err)
+			goto exit_sync_set;
+	}
+
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
@@ -6063,6 +6070,8 @@ int md_run(struct mddev *mddev)
 	module_put(pers->owner);
 	md_bitmap_destroy(mddev);
 abort:
+	bioset_exit(&mddev->io_acct_set);
+exit_sync_set:
 	bioset_exit(&mddev->sync_set);
 exit_bio_set:
 	bioset_exit(&mddev->bio_set);
@@ -6286,6 +6295,7 @@ static void __md_stop(struct mddev *mddev)
 	percpu_ref_exit(&mddev->active_io);
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
+	bioset_exit(&mddev->io_acct_set);
 }
 
 void md_stop(struct mddev *mddev)
@@ -8651,23 +8661,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
 
-int acct_bioset_init(struct mddev *mddev)
-{
-	int err = 0;
-
-	if (!bioset_initialized(&mddev->io_acct_set))
-		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
-			offsetof(struct md_io_acct, bio_clone), 0);
-	return err;
-}
-EXPORT_SYMBOL_GPL(acct_bioset_init);
-
-void acct_bioset_exit(struct mddev *mddev)
-{
-	bioset_exit(&mddev->io_acct_set);
-}
-EXPORT_SYMBOL_GPL(acct_bioset_exit);
-
 static void md_end_io_acct(struct bio *bio)
 {
 	struct md_io_acct *md_io_acct = bio->bi_private;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7cab9c7c45b8..11299d94b239 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -776,8 +776,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
 extern void md_finish_reshape(struct mddev *mddev);
 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 			struct bio *bio, sector_t start, sector_t size);
-int acct_bioset_init(struct mddev *mddev);
-void acct_bioset_exit(struct mddev *mddev);
 void md_account_bio(struct mddev *mddev, struct bio **bio);
 
 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f8ee9a95e25d..38d9209cada1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -365,7 +365,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
 	struct r0conf *conf = priv;
 
 	free_conf(mddev, conf);
-	acct_bioset_exit(mddev);
 }
 
 static int raid0_run(struct mddev *mddev)
@@ -380,16 +379,11 @@ static int raid0_run(struct mddev *mddev)
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
 
-	if (acct_bioset_init(mddev)) {
-		pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
-		return -ENOMEM;
-	}
-
 	/* if private is not null, we are here after takeover */
 	if (mddev->private == NULL) {
 		ret = create_strip_zones(mddev, &conf);
 		if (ret < 0)
-			goto exit_acct_set;
+			return ret;
 		mddev->private = conf;
 	}
 	conf = mddev->private;
@@ -420,15 +414,9 @@ static int raid0_run(struct mddev *mddev)
 
 	ret = md_integrity_register(mddev);
 	if (ret)
-		goto free;
+		free_conf(mddev, conf);
 
 	return ret;
-
-free:
-	free_conf(mddev, conf);
-exit_acct_set:
-	acct_bioset_exit(mddev);
-	return ret;
 }
 
 static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8bc74e16811..29cf5455d7a5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	struct md_rdev *journal_dev = NULL;
 	sector_t reshape_offset = 0;
-	int i, ret = 0;
+	int i;
 	long long min_offset_diff = 0;
 	int first = 1;
 
-	if (acct_bioset_init(mddev)) {
-		pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
+	if (mddev_init_writes_pending(mddev) < 0)
 		return -ENOMEM;
-	}
-
-	if (mddev_init_writes_pending(mddev) < 0) {
-		ret = -ENOMEM;
-		goto exit_acct_set;
-	}
 
 	if (mddev->recovery_cp != MaxSector)
 		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev)
 	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
 		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
 			  mdname(mddev));
-		ret = -EINVAL;
-		goto exit_acct_set;
+		return -EINVAL;
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev)
 		if (journal_dev) {
 			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 
 		if (mddev->new_level != mddev->level) {
 			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		old_disks = mddev->raid_disks - mddev->delta_disks;
 		/* reshape_position must be on a new-stripe boundary, and one
@@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev)
 		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
 			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		reshape_offset = here_new * chunk_sectors;
 		/* here_new is the stripe we will write to */
@@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev)
 			else if (mddev->ro == 0) {
 				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
 					mdname(mddev));
-				ret = -EINVAL;
-				goto exit_acct_set;
+				return -EINVAL;
 			}
 		} else if (mddev->reshape_backwards
 		    ? (here_new * chunk_sectors + min_offset_diff <=
@@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev)
 			/* Reading from the same stripe as writing to - bad */
 			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
 		/* OK, we should be able to continue; */
@@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev)
 	else
 		conf = mddev->private;
 
-	if (IS_ERR(conf)) {
-		ret = PTR_ERR(conf);
-		goto exit_acct_set;
-	}
+	if (IS_ERR(conf))
+		return PTR_ERR(conf);
 
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		if (!journal_dev) {
@@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev)
 	free_conf(conf);
 	mddev->private = NULL;
 	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
-	ret = -EIO;
-exit_acct_set:
-	acct_bioset_exit(mddev);
-	return ret;
+	return -EIO;
 }
 
 static void raid5_free(struct mddev *mddev, void *priv)
@@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
 	struct r5conf *conf = priv;
 
 	free_conf(conf);
-	acct_bioset_exit(mddev);
 	mddev->to_remove = &raid5_attrs_group;
 }
 
-- 
2.39.2



* [PATCH -next 2/8] md: also clone new io if io accounting is disabled
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
  2023-06-19 20:48 ` [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  8:48   ` Xiao Ni
  2023-06-19 20:48 ` [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio() Yu Kuai
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Currently, 'active_io' is grabed before make_request() is called, and
it's dropped immediately after make_request() returns. Hence 'active_io'
actually means io is dispatching, not io is inflight.

For raid0 and raid456, where io accounting is enabled, 'active_io' will
also be grabed when the bio is cloned for io accounting, and this
'active_io' is not dropped until io is done.

Always clone a new bio so that 'active_io' will mean that io is inflight;
raid1 and raid10 will switch to use this method in later patches. Once
these are implemented, the fact that 'active_io' is grabed twice for one
io can be cleaned up.

Now that bio will be cloned even if io accounting is disabled, also
rename related structure from '*_acct_*' to '*_clone_*'.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md.c    | 61 ++++++++++++++++++++++++----------------------
 drivers/md/md.h    |  4 +--
 drivers/md/raid5.c | 18 +++++++-------
 3 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 42347289195a..5ad8e7f3aebd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2314,7 +2314,7 @@ int md_integrity_register(struct mddev *mddev)
 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
 	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
 	    (mddev->level != 1 && mddev->level != 10 &&
-	     bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
+	     bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
 		/*
 		 * No need to handle the failure of bioset_integrity_create,
 		 * because the function is called by md_run() -> pers->run(),
@@ -5886,9 +5886,9 @@ int md_run(struct mddev *mddev)
 			goto exit_bio_set;
 	}
 
-	if (!bioset_initialized(&mddev->io_acct_set)) {
-		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
-				  offsetof(struct md_io_acct, bio_clone), 0);
+	if (!bioset_initialized(&mddev->io_clone_set)) {
+		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+				  offsetof(struct md_io_clone, bio_clone), 0);
 		if (err)
 			goto exit_sync_set;
 	}
@@ -6070,7 +6070,7 @@ int md_run(struct mddev *mddev)
 	module_put(pers->owner);
 	md_bitmap_destroy(mddev);
 abort:
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 exit_sync_set:
 	bioset_exit(&mddev->sync_set);
 exit_bio_set:
@@ -6295,7 +6295,7 @@ static void __md_stop(struct mddev *mddev)
 	percpu_ref_exit(&mddev->active_io);
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 }
 
 void md_stop(struct mddev *mddev)
@@ -8661,45 +8661,48 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
 
-static void md_end_io_acct(struct bio *bio)
+static void md_end_clone_io(struct bio *bio)
 {
-	struct md_io_acct *md_io_acct = bio->bi_private;
-	struct bio *orig_bio = md_io_acct->orig_bio;
-	struct mddev *mddev = md_io_acct->mddev;
+	struct md_io_clone *md_io_clone = bio->bi_private;
+	struct bio *orig_bio = md_io_clone->orig_bio;
+	struct mddev *mddev = md_io_clone->mddev;
 
 	orig_bio->bi_status = bio->bi_status;
 
-	bio_end_io_acct(orig_bio, md_io_acct->start_time);
+	if (md_io_clone->start_time)
+		bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
 	bio_put(bio);
 	bio_endio(orig_bio);
-
 	percpu_ref_put(&mddev->active_io);
 }
 
+static void md_clone_bio(struct mddev *mddev, struct bio **bio)
+{
+	struct block_device *bdev = (*bio)->bi_bdev;
+	struct md_io_clone *md_io_clone;
+	struct bio *clone =
+		bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
+
+	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+	md_io_clone->orig_bio = *bio;
+	md_io_clone->mddev = mddev;
+	if (blk_queue_io_stat(bdev->bd_disk->queue))
+		md_io_clone->start_time = bio_start_io_acct(*bio);
+
+	clone->bi_end_io = md_end_clone_io;
+	clone->bi_private = md_io_clone;
+	*bio = clone;
+}
+
 /*
  * Used by personalities that don't already clone the bio and thus can't
  * easily add the timestamp to their extended bio structure.
  */
 void md_account_bio(struct mddev *mddev, struct bio **bio)
 {
-	struct block_device *bdev = (*bio)->bi_bdev;
-	struct md_io_acct *md_io_acct;
-	struct bio *clone;
-
-	if (!blk_queue_io_stat(bdev->bd_disk->queue))
-		return;
-
 	percpu_ref_get(&mddev->active_io);
-
-	clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
-	md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
-	md_io_acct->orig_bio = *bio;
-	md_io_acct->start_time = bio_start_io_acct(*bio);
-	md_io_acct->mddev = mddev;
-
-	clone->bi_end_io = md_end_io_acct;
-	clone->bi_private = md_io_acct;
-	*bio = clone;
+	md_clone_bio(mddev, bio);
 }
 EXPORT_SYMBOL_GPL(md_account_bio);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 11299d94b239..892a598a5029 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -510,7 +510,7 @@ struct mddev {
 	struct bio_set			sync_set; /* for sync operations like
 						   * metadata and bitmap writes
 						   */
-	struct bio_set			io_acct_set; /* for raid0 and raid5 io accounting */
+	struct bio_set			io_clone_set;
 
 	/* Generic flush handling.
 	 * The last to finish preflush schedules a worker to submit
@@ -738,7 +738,7 @@ struct md_thread {
 	void			*private;
 };
 
-struct md_io_acct {
+struct md_io_clone {
 	struct mddev	*mddev;
 	struct bio	*orig_bio;
 	unsigned long	start_time;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 29cf5455d7a5..cef0b400b2ee 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
  */
 static void raid5_align_endio(struct bio *bi)
 {
-	struct md_io_acct *md_io_acct = bi->bi_private;
-	struct bio *raid_bi = md_io_acct->orig_bio;
+	struct md_io_clone *md_io_clone = bi->bi_private;
+	struct bio *raid_bi = md_io_clone->orig_bio;
 	struct mddev *mddev;
 	struct r5conf *conf;
 	struct md_rdev *rdev;
 	blk_status_t error = bi->bi_status;
-	unsigned long start_time = md_io_acct->start_time;
+	unsigned long start_time = md_io_clone->start_time;
 
 	bio_put(bi);
 
@@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
-	struct md_io_acct *md_io_acct;
+	struct md_io_clone *md_io_clone;
 	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	}
 
 	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
-				    &mddev->io_acct_set);
-	md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+				    &mddev->io_clone_set);
+	md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
 	raid_bio->bi_next = (void *)rdev;
 	if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
-		md_io_acct->start_time = bio_start_io_acct(raid_bio);
-	md_io_acct->orig_bio = raid_bio;
+		md_io_clone->start_time = bio_start_io_acct(raid_bio);
+	md_io_clone->orig_bio = raid_bio;
 
 	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = md_io_acct;
+	align_bio->bi_private = md_io_clone;
 	align_bio->bi_iter.bi_sector = sector;
 
 	/* No reshape active, so we can trust rdev->data_offset */
-- 
2.39.2



* [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio()
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
  2023-06-19 20:48 ` [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c Yu Kuai
  2023-06-19 20:48 ` [PATCH -next 2/8] md: also clone new io if io accounting is disabled Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  8:52   ` Xiao Ni
  2023-06-20  9:57   ` Paul Menzel
  2023-06-19 20:48 ` [PATCH -next 4/8] md/raid1: switch to use md_account_bio() for io accounting Yu Kuai
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Io will only be accounted as done from raid5_align_endio() if the io
succeeds, and the io inflight counter will be leaked if such io fails.

Fix this problem by switching to md_account_bio() for io accounting.
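
The leak, in short (old raid5.c paths paraphrased, not verbatim):

	/* submission in raid5_read_one_chunk(): accounting is started */
	md_io_acct->start_time = bio_start_io_acct(raid_bio);

	/* completion in raid5_align_endio(): only ended on success */
	if (!error) {
		bio_end_io_acct(raid_bi, start_time);
		bio_endio(raid_bi);
	} else {
		/* retried through the normal stripe path; bio_end_io_acct()
		 * is never called for this io, so the inflight counter
		 * leaks */
		add_bio_to_retry(raid_bi, conf);
	}

With md_account_bio(), the accounting is finished in the clone's
->bi_end_io regardless of the io status.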

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/raid5.c | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cef0b400b2ee..4cdb35e54251 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
  */
 static void raid5_align_endio(struct bio *bi)
 {
-	struct md_io_clone *md_io_clone = bi->bi_private;
-	struct bio *raid_bi = md_io_clone->orig_bio;
-	struct mddev *mddev;
-	struct r5conf *conf;
-	struct md_rdev *rdev;
+	struct bio *raid_bi = bi->bi_private;
+	struct md_rdev *rdev = (void *)raid_bi->bi_next;
+	struct mddev *mddev = rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	blk_status_t error = bi->bi_status;
-	unsigned long start_time = md_io_clone->start_time;
 
 	bio_put(bi);
-
-	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
-	mddev = rdev->mddev;
-	conf = mddev->private;
-
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error) {
-		if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
-			bio_end_io_acct(raid_bi, start_time);
 		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
@@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
-	struct md_io_clone *md_io_clone;
 	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		return 0;
 	}
 
-	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
-				    &mddev->io_clone_set);
-	md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
+	md_account_bio(mddev, &raid_bio);
 	raid_bio->bi_next = (void *)rdev;
-	if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
-		md_io_clone->start_time = bio_start_io_acct(raid_bio);
-	md_io_clone->orig_bio = raid_bio;
 
+	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
+				    &mddev->bio_set);
 	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = md_io_clone;
+	align_bio->bi_private = raid_bio;
 	align_bio->bi_iter.bi_sector = sector;
 
 	/* No reshape active, so we can trust rdev->data_offset */
-- 
2.39.2



* [PATCH -next 4/8] md/raid1: switch to use md_account_bio() for io accounting
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
                   ` (2 preceding siblings ...)
  2023-06-19 20:48 ` [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio() Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  9:07   ` Xiao Ni
  2023-06-19 20:48 ` [PATCH -next 5/8] md/raid10: " Yu Kuai
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Two problems can be fixed this way:

1) 'active_io' will represent inflight io instead of io that is
dispatching.

2) If io accounting is enabled or disabled while io is still inflight,
bio_start_io_acct() and bio_end_io_acct() are not balanced and the io
inflight counter will be leaked.
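
A minimal sketch of problem 2 (the old raid1 paths, paraphrased):

	/* submit: accounting is only started if io_stat is currently on */
	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
		r1_bio->start_time = bio_start_io_acct(bio);

	/* complete: the flag is checked again */
	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
		bio_end_io_acct(bio, r1_bio->start_time);

If io_stat is switched off between the two checks, accounting is started
but never ended and the inflight counter leaks; md_account_bio() instead
records in the clone whether accounting was started and only ends it in
that case.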

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/raid1.c | 14 ++++++--------
 drivers/md/raid1.h |  1 -
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index dd25832eb045..06fa1580501f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		bio_end_io_acct(bio, r1_bio->start_time);
 	bio_endio(bio);
 }
 
@@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	}
 
 	r1_bio->read_disk = rdisk;
-
-	if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r1_bio->start_time = bio_start_io_acct(bio);
-
+	if (!r1bio_existed) {
+		md_account_bio(mddev, &bio);
+		r1_bio->master_bio = bio;
+	}
 	read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
 				   &mddev->bio_set);
 
@@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		r1_bio->sectors = max_sectors;
 	}
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r1_bio->start_time = bio_start_io_acct(bio);
+	md_account_bio(mddev, &bio);
+	r1_bio->master_bio = bio;
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 468f189da7a0..14d4211a123a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -157,7 +157,6 @@ struct r1bio {
 	sector_t		sector;
 	int			sectors;
 	unsigned long		state;
-	unsigned long		start_time;
 	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx
-- 
2.39.2



* [PATCH -next 5/8] md/raid10: switch to use md_account_bio() for io accounting
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
                   ` (3 preceding siblings ...)
  2023-06-19 20:48 ` [PATCH -next 4/8] md/raid1: switch to use md_account_bio() for io accounting Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  9:10   ` Xiao Ni
  2023-06-19 20:48 ` [PATCH -next 6/8] md/md-multipath: enable " Yu Kuai
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Make sure that 'active_io' will represent inflight io instead of io that
is dispatching, and that io accounting from all levels will be consistent.
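
Note that raid10_read_request() grows an 'io_accounting' flag because
handle_read_error() re-enters it with the same r10_bio, which was already
accounted on first entry; the two call sites (from the diff below):

	/* __make_request(): first entry, account the io */
	raid10_read_request(mddev, bio, r10_bio, true);

	/* handle_read_error(): re-submitting an already accounted io */
	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);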

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/raid10.c | 20 +++++++++-----------
 drivers/md/raid10.h |  1 -
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 79067769e44b..69f6d7b1e600 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (r10_bio->start_time)
-		bio_end_io_acct(bio, r10_bio->start_time);
 	bio_endio(bio);
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
 }
 
 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
-				struct r10bio *r10_bio)
+				struct r10bio *r10_bio, bool io_accounting)
 {
 	struct r10conf *conf = mddev->private;
 	struct bio *read_bio;
@@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	}
 	slot = r10_bio->read_slot;
 
-	if (!r10_bio->start_time &&
-	    blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r10_bio->start_time = bio_start_io_acct(bio);
+	if (io_accounting) {
+		md_account_bio(mddev, &bio);
+		r10_bio->master_bio = bio;
+	}
 	read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
 
 	r10_bio->devs[slot].bio = read_bio;
@@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		r10_bio->master_bio = bio;
 	}
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r10_bio->start_time = bio_start_io_acct(bio);
+	md_account_bio(mddev, &bio);
+	r10_bio->master_bio = bio;
 	atomic_set(&r10_bio->remaining, 1);
 	md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
@@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
 	r10_bio->read_slot = -1;
-	r10_bio->start_time = 0;
 	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
 			conf->geo.raid_disks);
 
 	if (bio_data_dir(bio) == READ)
-		raid10_read_request(mddev, bio, r10_bio);
+		raid10_read_request(mddev, bio, r10_bio, true);
 	else
 		raid10_write_request(mddev, bio, r10_bio);
 }
@@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 
 	rdev_dec_pending(rdev, mddev);
 	r10_bio->state = 0;
-	raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
 	/*
 	 * allow_barrier after re-submit to ensure no sync io
 	 * can be issued while regular io pending.
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 63e48b11b552..2e75e88d0802 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -123,7 +123,6 @@ struct r10bio {
 	sector_t		sector;	/* virtual sector number */
 	int			sectors;
 	unsigned long		state;
-	unsigned long		start_time;
 	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx
-- 
2.39.2



* [PATCH -next 6/8] md/md-multipath: enable io accounting
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
                   ` (4 preceding siblings ...)
  2023-06-19 20:48 ` [PATCH -next 5/8] md/raid10: " Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  9:11   ` Xiao Ni
  2023-06-19 20:48 ` [PATCH -next 7/8] md/md-linear: " Yu Kuai
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Use md_account_bio() to enable io accounting, and also make sure
mddev_suspend() will wait for all io to be done.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md-multipath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 92c45be203d7..d22276870283 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
 	    && md_flush_request(mddev, bio))
 		return true;
 
+	md_account_bio(mddev, &bio);
 	mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
 
 	mp_bh->master_bio = bio;
-- 
2.39.2



* [PATCH -next 7/8] md/md-linear: enable io accounting
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
                   ` (5 preceding siblings ...)
  2023-06-19 20:48 ` [PATCH -next 6/8] md/md-multipath: enable " Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  9:12   ` Xiao Ni
  2023-06-19 20:48 ` [PATCH -next 8/8] md/md-faulty: " Yu Kuai
  2023-06-20 17:45 ` [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Song Liu
  8 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Use md_account_bio() to enable io accounting, and also make sure
mddev_suspend() will wait for all io to be done.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md-linear.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 4eb72b9dd933..71ac99646827 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 		bio = split;
 	}
 
+	md_account_bio(mddev, &bio);
 	bio_set_dev(bio, tmp_dev->rdev->bdev);
 	bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
 		start_sector + data_offset;
-- 
2.39.2



* [PATCH -next 8/8] md/md-faulty: enable io accounting
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
                   ` (6 preceding siblings ...)
  2023-06-19 20:48 ` [PATCH -next 7/8] md/md-linear: " Yu Kuai
@ 2023-06-19 20:48 ` Yu Kuai
  2023-06-20  9:13   ` Xiao Ni
  2023-06-20 17:45 ` [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Song Liu
  8 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2023-06-19 20:48 UTC (permalink / raw)
  To: xni, song; +Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Use md_account_bio() to enable io accounting, and also make sure
mddev_suspend() will wait for all io to be done.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md-faulty.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 50ad818978a4..a039e8e20f55 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
 			failit = 1;
 		}
 	}
+
+	md_account_bio(mddev, &bio);
 	if (failit) {
 		struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
 						&mddev->bio_set);
-- 
2.39.2



* Re: [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c
  2023-06-19 20:48 ` [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c Yu Kuai
@ 2023-06-20  8:35   ` Xiao Ni
  2023-06-21  2:35     ` Yu Kuai
  0 siblings, 1 reply; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  8:35 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> 'io_acct_set' is only used for raid0 and raid456, prepare to use it for
> raid1 and raid10, so that io accounting from different levels can be
> consistent.
>
> By the way, follow up patches will also use this io clone mechanism to
> make sure 'active_io' represents in flight io, not io that is dispatching,
> so that mddev_suspend will wait for io to be done as desgined.

Hi Kuai

typo error: s/desgined/designed/g

Before this patch the personalities use the ->quiesce method to wait
until all inflight ios come back. But I like this solution. It makes the
code simpler. I'm not sure if it can cause problems, because it changes
the meaning of ->active_io. I'm doing regression tests to check.
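
For reference, the wait itself is the percpu_ref draining in
mddev_suspend(), roughly (a simplified sketch, not the verbatim md.c
code):

	void mddev_suspend(struct mddev *mddev)
	{
		if (mddev->suspended++)
			return;
		/* no new io can grab active_io after this */
		percpu_ref_kill(&mddev->active_io);
		/* wait for every percpu_ref_put(), i.e. every inflight io */
		wait_event(mddev->sb_wait,
			   percpu_ref_is_zero(&mddev->active_io));
		mddev->pers->quiesce(mddev, 1);
	}

So once every personality holds active_io for the whole io lifetime,
this wait really does cover all inflight io.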

>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/md.c    | 27 ++++++++++-----------------
>  drivers/md/md.h    |  2 --
>  drivers/md/raid0.c | 16 ++--------------
>  drivers/md/raid5.c | 41 +++++++++++------------------------------
>  4 files changed, 23 insertions(+), 63 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 8d62f85d2ab0..42347289195a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5886,6 +5886,13 @@ int md_run(struct mddev *mddev)
>                         goto exit_bio_set;
>         }
>
> +       if (!bioset_initialized(&mddev->io_acct_set)) {
> +               err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
> +                                 offsetof(struct md_io_acct, bio_clone), 0);
> +               if (err)
> +                       goto exit_sync_set;
> +       }
> +
>         spin_lock(&pers_lock);
>         pers = find_pers(mddev->level, mddev->clevel);
>         if (!pers || !try_module_get(pers->owner)) {
> @@ -6063,6 +6070,8 @@ int md_run(struct mddev *mddev)
>         module_put(pers->owner);
>         md_bitmap_destroy(mddev);
>  abort:
> +       bioset_exit(&mddev->io_acct_set);
> +exit_sync_set:
>         bioset_exit(&mddev->sync_set);
>  exit_bio_set:
>         bioset_exit(&mddev->bio_set);
> @@ -6286,6 +6295,7 @@ static void __md_stop(struct mddev *mddev)
>         percpu_ref_exit(&mddev->active_io);
>         bioset_exit(&mddev->bio_set);
>         bioset_exit(&mddev->sync_set);
> +       bioset_exit(&mddev->io_acct_set);
>  }
>
>  void md_stop(struct mddev *mddev)
> @@ -8651,23 +8661,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>  }
>  EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>
> -int acct_bioset_init(struct mddev *mddev)
> -{
> -       int err = 0;
> -
> -       if (!bioset_initialized(&mddev->io_acct_set))
> -               err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
> -                       offsetof(struct md_io_acct, bio_clone), 0);
> -       return err;
> -}
> -EXPORT_SYMBOL_GPL(acct_bioset_init);
> -
> -void acct_bioset_exit(struct mddev *mddev)
> -{
> -       bioset_exit(&mddev->io_acct_set);
> -}
> -EXPORT_SYMBOL_GPL(acct_bioset_exit);
> -
>  static void md_end_io_acct(struct bio *bio)
>  {
>         struct md_io_acct *md_io_acct = bio->bi_private;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 7cab9c7c45b8..11299d94b239 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -776,8 +776,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
>  extern void md_finish_reshape(struct mddev *mddev);
>  void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>                         struct bio *bio, sector_t start, sector_t size);
> -int acct_bioset_init(struct mddev *mddev);
> -void acct_bioset_exit(struct mddev *mddev);
>  void md_account_bio(struct mddev *mddev, struct bio **bio);
>
>  extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> index f8ee9a95e25d..38d9209cada1 100644
> --- a/drivers/md/raid0.c
> +++ b/drivers/md/raid0.c
> @@ -365,7 +365,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
>         struct r0conf *conf = priv;
>
>         free_conf(mddev, conf);
> -       acct_bioset_exit(mddev);
>  }
>
>  static int raid0_run(struct mddev *mddev)
> @@ -380,16 +379,11 @@ static int raid0_run(struct mddev *mddev)
>         if (md_check_no_bitmap(mddev))
>                 return -EINVAL;
>
> -       if (acct_bioset_init(mddev)) {
> -               pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
> -               return -ENOMEM;
> -       }
> -
>         /* if private is not null, we are here after takeover */
>         if (mddev->private == NULL) {
>                 ret = create_strip_zones(mddev, &conf);
>                 if (ret < 0)
> -                       goto exit_acct_set;
> +                       return ret;
>                 mddev->private = conf;
>         }
>         conf = mddev->private;
> @@ -420,15 +414,9 @@ static int raid0_run(struct mddev *mddev)
>
>         ret = md_integrity_register(mddev);
>         if (ret)
> -               goto free;
> +               free_conf(mddev, conf);
>
>         return ret;
> -
> -free:
> -       free_conf(mddev, conf);
> -exit_acct_set:
> -       acct_bioset_exit(mddev);
> -       return ret;
>  }
>
>  static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index f8bc74e16811..29cf5455d7a5 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev)
>         struct md_rdev *rdev;
>         struct md_rdev *journal_dev = NULL;
>         sector_t reshape_offset = 0;
> -       int i, ret = 0;
> +       int i;
>         long long min_offset_diff = 0;
>         int first = 1;
>
> -       if (acct_bioset_init(mddev)) {
> -               pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
> +       if (mddev_init_writes_pending(mddev) < 0)
>                 return -ENOMEM;
> -       }
> -
> -       if (mddev_init_writes_pending(mddev) < 0) {
> -               ret = -ENOMEM;
> -               goto exit_acct_set;
> -       }
>
>         if (mddev->recovery_cp != MaxSector)
>                 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
> @@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev)
>             (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
>                 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
>                           mdname(mddev));
> -               ret = -EINVAL;
> -               goto exit_acct_set;
> +               return -EINVAL;
>         }
>
>         if (mddev->reshape_position != MaxSector) {
> @@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev)
>                 if (journal_dev) {
>                         pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
>                                 mdname(mddev));
> -                       ret = -EINVAL;
> -                       goto exit_acct_set;
> +                       return -EINVAL;
>                 }
>
>                 if (mddev->new_level != mddev->level) {
>                         pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
>                                 mdname(mddev));
> -                       ret = -EINVAL;
> -                       goto exit_acct_set;
> +                       return -EINVAL;
>                 }
>                 old_disks = mddev->raid_disks - mddev->delta_disks;
>                 /* reshape_position must be on a new-stripe boundary, and one
> @@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev)
>                 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
>                         pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
>                                 mdname(mddev));
> -                       ret = -EINVAL;
> -                       goto exit_acct_set;
> +                       return -EINVAL;
>                 }
>                 reshape_offset = here_new * chunk_sectors;
>                 /* here_new is the stripe we will write to */
> @@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev)
>                         else if (mddev->ro == 0) {
>                                 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
>                                         mdname(mddev));
> -                               ret = -EINVAL;
> -                               goto exit_acct_set;
> +                               return -EINVAL;
>                         }
>                 } else if (mddev->reshape_backwards
>                     ? (here_new * chunk_sectors + min_offset_diff <=
> @@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev)
>                         /* Reading from the same stripe as writing to - bad */
>                         pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
>                                 mdname(mddev));
> -                       ret = -EINVAL;
> -                       goto exit_acct_set;
> +                       return -EINVAL;
>                 }
>                 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
>                 /* OK, we should be able to continue; */
> @@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev)
>         else
>                 conf = mddev->private;
>
> -       if (IS_ERR(conf)) {
> -               ret = PTR_ERR(conf);
> -               goto exit_acct_set;
> -       }
> +       if (IS_ERR(conf))
> +               return PTR_ERR(conf);
>
>         if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
>                 if (!journal_dev) {
> @@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev)
>         free_conf(conf);
>         mddev->private = NULL;
>         pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
> -       ret = -EIO;
> -exit_acct_set:
> -       acct_bioset_exit(mddev);
> -       return ret;
> +       return -EIO;
>  }
>
>  static void raid5_free(struct mddev *mddev, void *priv)
> @@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
>         struct r5conf *conf = priv;
>
>         free_conf(conf);
> -       acct_bioset_exit(mddev);
>         mddev->to_remove = &raid5_attrs_group;
>  }
>
> --
> 2.39.2
>

The patch is good for me.

Reviewed-by: Xiao Ni <xni@redhat.com>



* Re: [PATCH -next 2/8] md: also clone new io if io accounting is disabled
  2023-06-19 20:48 ` [PATCH -next 2/8] md: also clone new io if io accounting is disabled Yu Kuai
@ 2023-06-20  8:48   ` Xiao Ni
  0 siblings, 0 replies; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  8:48 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:49 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Currently, 'active_io' is grabed before make_request() is called, and
> it's dropped immediately after make_request() returns. Hence 'active_io'
> actually means io is dispatching, not io is inflight.

Hi Kuai

There are three typo errors in the commit message: s/grabed/grabbed/g

>
> For raid0 and raid456, where io accounting is enabled, 'active_io' will
> also be grabed when the bio is cloned for io accounting, and this
> 'active_io' is not dropped until io is done.
>
> Always clone a new bio so that 'active_io' will mean that io is inflight;
> raid1 and raid10 will switch to use this method in later patches. Once
> these are implemented, the fact that 'active_io' is grabed twice for one
> io can be cleaned up.
>
> Now that bio will be cloned even if io accounting is disabled, also
> rename related structure from '*_acct_*' to '*_clone_*'.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/md.c    | 61 ++++++++++++++++++++++++----------------------
>  drivers/md/md.h    |  4 +--
>  drivers/md/raid5.c | 18 +++++++-------
>  3 files changed, 43 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 42347289195a..5ad8e7f3aebd 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -2314,7 +2314,7 @@ int md_integrity_register(struct mddev *mddev)
>         pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
>         if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
>             (mddev->level != 1 && mddev->level != 10 &&
> -            bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
> +            bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
>                 /*
>                  * No need to handle the failure of bioset_integrity_create,
>                  * because the function is called by md_run() -> pers->run(),
> @@ -5886,9 +5886,9 @@ int md_run(struct mddev *mddev)
>                         goto exit_bio_set;
>         }
>
> -       if (!bioset_initialized(&mddev->io_acct_set)) {
> -               err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
> -                                 offsetof(struct md_io_acct, bio_clone), 0);
> +       if (!bioset_initialized(&mddev->io_clone_set)) {
> +               err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
> +                                 offsetof(struct md_io_clone, bio_clone), 0);
>                 if (err)
>                         goto exit_sync_set;
>         }
> @@ -6070,7 +6070,7 @@ int md_run(struct mddev *mddev)
>         module_put(pers->owner);
>         md_bitmap_destroy(mddev);
>  abort:
> -       bioset_exit(&mddev->io_acct_set);
> +       bioset_exit(&mddev->io_clone_set);
>  exit_sync_set:
>         bioset_exit(&mddev->sync_set);
>  exit_bio_set:
> @@ -6295,7 +6295,7 @@ static void __md_stop(struct mddev *mddev)
>         percpu_ref_exit(&mddev->active_io);
>         bioset_exit(&mddev->bio_set);
>         bioset_exit(&mddev->sync_set);
> -       bioset_exit(&mddev->io_acct_set);
> +       bioset_exit(&mddev->io_clone_set);
>  }
>
>  void md_stop(struct mddev *mddev)
> @@ -8661,45 +8661,48 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>  }
>  EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>
> -static void md_end_io_acct(struct bio *bio)
> +static void md_end_clone_io(struct bio *bio)
>  {
> -       struct md_io_acct *md_io_acct = bio->bi_private;
> -       struct bio *orig_bio = md_io_acct->orig_bio;
> -       struct mddev *mddev = md_io_acct->mddev;
> +       struct md_io_clone *md_io_clone = bio->bi_private;
> +       struct bio *orig_bio = md_io_clone->orig_bio;
> +       struct mddev *mddev = md_io_clone->mddev;
>
>         orig_bio->bi_status = bio->bi_status;
>
> -       bio_end_io_acct(orig_bio, md_io_acct->start_time);
> +       if (md_io_clone->start_time)
> +               bio_end_io_acct(orig_bio, md_io_clone->start_time);
> +
>         bio_put(bio);
>         bio_endio(orig_bio);
> -
>         percpu_ref_put(&mddev->active_io);
>  }
>
> +static void md_clone_bio(struct mddev *mddev, struct bio **bio)
> +{
> +       struct block_device *bdev = (*bio)->bi_bdev;
> +       struct md_io_clone *md_io_clone;
> +       struct bio *clone =
> +               bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
> +
> +       md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
> +       md_io_clone->orig_bio = *bio;
> +       md_io_clone->mddev = mddev;
> +       if (blk_queue_io_stat(bdev->bd_disk->queue))
> +               md_io_clone->start_time = bio_start_io_acct(*bio);
> +
> +       clone->bi_end_io = md_end_clone_io;
> +       clone->bi_private = md_io_clone;
> +       *bio = clone;
> +}
> +
>  /*
>   * Used by personalities that don't already clone the bio and thus can't
>   * easily add the timestamp to their extended bio structure.
>   */
>  void md_account_bio(struct mddev *mddev, struct bio **bio)
>  {
> -       struct block_device *bdev = (*bio)->bi_bdev;
> -       struct md_io_acct *md_io_acct;
> -       struct bio *clone;
> -
> -       if (!blk_queue_io_stat(bdev->bd_disk->queue))
> -               return;
> -
>         percpu_ref_get(&mddev->active_io);
> -
> -       clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
> -       md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
> -       md_io_acct->orig_bio = *bio;
> -       md_io_acct->start_time = bio_start_io_acct(*bio);
> -       md_io_acct->mddev = mddev;
> -
> -       clone->bi_end_io = md_end_io_acct;
> -       clone->bi_private = md_io_acct;
> -       *bio = clone;
> +       md_clone_bio(mddev, bio);
>  }
>  EXPORT_SYMBOL_GPL(md_account_bio);
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 11299d94b239..892a598a5029 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -510,7 +510,7 @@ struct mddev {
>         struct bio_set                  sync_set; /* for sync operations like
>                                                    * metadata and bitmap writes
>                                                    */
> -       struct bio_set                  io_acct_set; /* for raid0 and raid5 io accounting */
> +       struct bio_set                  io_clone_set;
>
>         /* Generic flush handling.
>          * The last to finish preflush schedules a worker to submit
> @@ -738,7 +738,7 @@ struct md_thread {
>         void                    *private;
>  };
>
> -struct md_io_acct {
> +struct md_io_clone {
>         struct mddev    *mddev;
>         struct bio      *orig_bio;
>         unsigned long   start_time;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 29cf5455d7a5..cef0b400b2ee 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
>   */
>  static void raid5_align_endio(struct bio *bi)
>  {
> -       struct md_io_acct *md_io_acct = bi->bi_private;
> -       struct bio *raid_bi = md_io_acct->orig_bio;
> +       struct md_io_clone *md_io_clone = bi->bi_private;
> +       struct bio *raid_bi = md_io_clone->orig_bio;
>         struct mddev *mddev;
>         struct r5conf *conf;
>         struct md_rdev *rdev;
>         blk_status_t error = bi->bi_status;
> -       unsigned long start_time = md_io_acct->start_time;
> +       unsigned long start_time = md_io_clone->start_time;
>
>         bio_put(bi);
>
> @@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>         struct md_rdev *rdev;
>         sector_t sector, end_sector, first_bad;
>         int bad_sectors, dd_idx;
> -       struct md_io_acct *md_io_acct;
> +       struct md_io_clone *md_io_clone;
>         bool did_inc;
>
>         if (!in_chunk_boundary(mddev, raid_bio)) {
> @@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>         }
>
>         align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> -                                   &mddev->io_acct_set);
> -       md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
> +                                   &mddev->io_clone_set);
> +       md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
>         raid_bio->bi_next = (void *)rdev;
>         if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
> -               md_io_acct->start_time = bio_start_io_acct(raid_bio);
> -       md_io_acct->orig_bio = raid_bio;
> +               md_io_clone->start_time = bio_start_io_acct(raid_bio);
> +       md_io_clone->orig_bio = raid_bio;
>
>         align_bio->bi_end_io = raid5_align_endio;
> -       align_bio->bi_private = md_io_acct;
> +       align_bio->bi_private = md_io_clone;
>         align_bio->bi_iter.bi_sector = sector;
>
>         /* No reshape active, so we can trust rdev->data_offset */
> --
> 2.39.2
>

Reviewed-by: Xiao Ni <xni@redhat.com>



* Re: [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio()
  2023-06-19 20:48 ` [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio() Yu Kuai
@ 2023-06-20  8:52   ` Xiao Ni
  2023-06-20  9:57   ` Paul Menzel
  1 sibling, 0 replies; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  8:52 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Io will only be accounted as done from raid5_align_endio() if the io
> succeeds, and the io inflight counter will be leaked if such io fails.
>
> Fix this problem by switching to md_account_bio() for io accounting.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/raid5.c | 29 ++++++++---------------------
>  1 file changed, 8 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index cef0b400b2ee..4cdb35e54251 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
>   */
>  static void raid5_align_endio(struct bio *bi)
>  {
> -       struct md_io_clone *md_io_clone = bi->bi_private;
> -       struct bio *raid_bi = md_io_clone->orig_bio;
> -       struct mddev *mddev;
> -       struct r5conf *conf;
> -       struct md_rdev *rdev;
> +       struct bio *raid_bi = bi->bi_private;
> +       struct md_rdev *rdev = (void *)raid_bi->bi_next;
> +       struct mddev *mddev = rdev->mddev;
> +       struct r5conf *conf = mddev->private;
>         blk_status_t error = bi->bi_status;
> -       unsigned long start_time = md_io_clone->start_time;
>
>         bio_put(bi);
> -
> -       rdev = (void*)raid_bi->bi_next;
>         raid_bi->bi_next = NULL;
> -       mddev = rdev->mddev;
> -       conf = mddev->private;
> -
>         rdev_dec_pending(rdev, conf->mddev);
>
>         if (!error) {
> -               if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
> -                       bio_end_io_acct(raid_bi, start_time);
>                 bio_endio(raid_bi);
>                 if (atomic_dec_and_test(&conf->active_aligned_reads))
>                         wake_up(&conf->wait_for_quiescent);
> @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>         struct md_rdev *rdev;
>         sector_t sector, end_sector, first_bad;
>         int bad_sectors, dd_idx;
> -       struct md_io_clone *md_io_clone;
>         bool did_inc;
>
>         if (!in_chunk_boundary(mddev, raid_bio)) {
> @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>                 return 0;
>         }
>
> -       align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> -                                   &mddev->io_clone_set);
> -       md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
> +       md_account_bio(mddev, &raid_bio);
>         raid_bio->bi_next = (void *)rdev;
> -       if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
> -               md_io_clone->start_time = bio_start_io_acct(raid_bio);
> -       md_io_clone->orig_bio = raid_bio;
>
> +       align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> +                                   &mddev->bio_set);
>         align_bio->bi_end_io = raid5_align_endio;
> -       align_bio->bi_private = md_io_clone;
> +       align_bio->bi_private = raid_bio;
>         align_bio->bi_iter.bi_sector = sector;
>
>         /* No reshape active, so we can trust rdev->data_offset */
> --
> 2.39.2
>

Reviewed-by: Xiao Ni <xni@redhat.com>


^ permalink raw reply	[flat|nested] 22+ messages in thread
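
To make the leak concrete: the pre-patch completion path only closed the
accounting on the success branch. A minimal sketch of that flow, condensed
from the removed lines in the diff above (the function name is illustrative,
not verbatim kernel code):

	/* pre-patch raid5_align_endio(), simplified */
	static void old_align_endio(struct bio *bi)
	{
		struct md_io_clone *md_io_clone = bi->bi_private;
		struct bio *raid_bi = md_io_clone->orig_bio;
		struct md_rdev *rdev = (void *)raid_bi->bi_next;
		struct r5conf *conf = rdev->mddev->private;
		blk_status_t error = bi->bi_status;

		bio_put(bi);
		raid_bi->bi_next = NULL;
		rdev_dec_pending(rdev, conf->mddev);

		if (!error) {
			/* success: the in-flight counter started by
			 * bio_start_io_acct() is balanced here */
			bio_end_io_acct(raid_bi, md_io_clone->start_time);
			bio_endio(raid_bi);
			if (atomic_dec_and_test(&conf->active_aligned_reads))
				wake_up(&conf->wait_for_quiescent);
			return;
		}

		/* failure: the bio is re-queued and later completed via the
		 * stripe cache, a path that never calls bio_end_io_acct(),
		 * so the in-flight counter leaks */
		add_bio_to_retry(raid_bi, conf);
	}

Switching to md_account_bio() closes the accounting from the cloned bio's
end_io instead, so success and failure are balanced the same way.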

* Re: [PATCH -next 4/8] md/raid1: switch to use md_account_bio() for io accounting
  2023-06-19 20:48 ` [PATCH -next 4/8] md/raid1: switch to use md_account_bio() for io accounting Yu Kuai
@ 2023-06-20  9:07   ` Xiao Ni
  2023-06-20 13:38     ` Yu Kuai
  0 siblings, 1 reply; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  9:07 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:49 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Two problems can be fixed this way:
>
> 1) 'active_io' will represent inflight io instead of io that is
> dispatching.
>
> 2) If io accounting is enabled or disabled while io is still inflight,
> bio_start_io_acct() and bio_end_io_acct() are not balanced and the io
> inflight counter will be leaked.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/raid1.c | 14 ++++++--------
>  drivers/md/raid1.h |  1 -
>  2 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index dd25832eb045..06fa1580501f 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
>         if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
>                 bio->bi_status = BLK_STS_IOERR;
>
> -       if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> -               bio_end_io_acct(bio, r1_bio->start_time);
>         bio_endio(bio);
>  }
>
> @@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>         }
>
>         r1_bio->read_disk = rdisk;
> -
> -       if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> -               r1_bio->start_time = bio_start_io_acct(bio);
> -
> +       if (!r1bio_existed) {
> +               md_account_bio(mddev, &bio);
> +               r1_bio->master_bio = bio;
> +       }
>         read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
>                                    &mddev->bio_set);
>
> @@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>                 r1_bio->sectors = max_sectors;
>         }
>
> -       if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> -               r1_bio->start_time = bio_start_io_acct(bio);
> +       md_account_bio(mddev, &bio);
> +       r1_bio->master_bio = bio;
>         atomic_set(&r1_bio->remaining, 1);
>         atomic_set(&r1_bio->behind_remaining, 0);
>
> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
> index 468f189da7a0..14d4211a123a 100644
> --- a/drivers/md/raid1.h
> +++ b/drivers/md/raid1.h
> @@ -157,7 +157,6 @@ struct r1bio {
>         sector_t                sector;
>         int                     sectors;
>         unsigned long           state;
> -       unsigned long           start_time;
>         struct mddev            *mddev;
>         /*
>          * original bio going to /dev/mdx
> --
> 2.39.2
>

Hi Kuai

After this patch, raid1 will have one more memory allocation in the
I/O path. Not sure if it can affect performance. Besides this, the
patch is good for me.

Reviewed-by: Xiao Ni <xni@redhat.com>


^ permalink raw reply	[flat|nested] 22+ messages in thread
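
For readers following the conversion: md_account_bio() itself is not shown
in this thread, so here is a rough sketch of the behaviour the callers rely
on, inferred from the diffs and from 'struct md_io_acct' in patch 1 -- an
illustration under those assumptions, not the verbatim md.c implementation:

	static void md_end_io_acct(struct bio *clone)
	{
		struct md_io_acct *md_io_acct = clone->bi_private;
		struct bio *orig_bio = md_io_acct->orig_bio;

		/* propagate the status and balance the accounting that
		 * md_account_bio() started, regardless of success */
		orig_bio->bi_status = clone->bi_status;
		bio_end_io_acct(orig_bio, md_io_acct->start_time);
		bio_put(clone);
		bio_endio(orig_bio);
	}

	void md_account_bio(struct mddev *mddev, struct bio **bio)
	{
		struct block_device *bdev = (*bio)->bi_bdev;
		struct md_io_acct *md_io_acct;
		struct bio *clone;

		if (!blk_queue_io_stat(bdev->bd_disk->queue))
			return;

		clone = bio_alloc_clone(bdev, *bio, GFP_NOIO,
					&mddev->io_acct_set);
		md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
		md_io_acct->orig_bio = *bio;
		md_io_acct->start_time = bio_start_io_acct(*bio);
		clone->bi_end_io = md_end_io_acct;
		clone->bi_private = md_io_acct;

		/* the caller continues with the clone */
		*bio = clone;
	}

With this shape, 'r1_bio->master_bio = bio' after the call points at the
clone, and completing master_bio -- on success or failure -- always ends
the accounting, which is what fixes problem 2) above.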

* Re: [PATCH -next 5/8] md/raid10: switch to use md_account_bio() for io accounting
  2023-06-19 20:48 ` [PATCH -next 5/8] md/raid10: " Yu Kuai
@ 2023-06-20  9:10   ` Xiao Ni
  0 siblings, 0 replies; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  9:10 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Make sure that 'active_io' will represent inflight io instead of io that
> is dispatching, and io accounting from all levels will be consistent.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/raid10.c | 20 +++++++++-----------
>  drivers/md/raid10.h |  1 -
>  2 files changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 79067769e44b..69f6d7b1e600 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
>         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
>                 bio->bi_status = BLK_STS_IOERR;
>
> -       if (r10_bio->start_time)
> -               bio_end_io_acct(bio, r10_bio->start_time);
>         bio_endio(bio);
>         /*
>          * Wake up any possible resync thread that waits for the device
> @@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
>  }
>
>  static void raid10_read_request(struct mddev *mddev, struct bio *bio,
> -                               struct r10bio *r10_bio)
> +                               struct r10bio *r10_bio, bool io_accounting)
>  {
>         struct r10conf *conf = mddev->private;
>         struct bio *read_bio;
> @@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
>         }
>         slot = r10_bio->read_slot;
>
> -       if (!r10_bio->start_time &&
> -           blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> -               r10_bio->start_time = bio_start_io_acct(bio);
> +       if (io_accounting) {
> +               md_account_bio(mddev, &bio);
> +               r10_bio->master_bio = bio;
> +       }
>         read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
>
>         r10_bio->devs[slot].bio = read_bio;
> @@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                 r10_bio->master_bio = bio;
>         }
>
> -       if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> -               r10_bio->start_time = bio_start_io_acct(bio);
> +       md_account_bio(mddev, &bio);
> +       r10_bio->master_bio = bio;
>         atomic_set(&r10_bio->remaining, 1);
>         md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
>
> @@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
>         r10_bio->sector = bio->bi_iter.bi_sector;
>         r10_bio->state = 0;
>         r10_bio->read_slot = -1;
> -       r10_bio->start_time = 0;
>         memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
>                         conf->geo.raid_disks);
>
>         if (bio_data_dir(bio) == READ)
> -               raid10_read_request(mddev, bio, r10_bio);
> +               raid10_read_request(mddev, bio, r10_bio, true);
>         else
>                 raid10_write_request(mddev, bio, r10_bio);
>  }
> @@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
>
>         rdev_dec_pending(rdev, mddev);
>         r10_bio->state = 0;
> -       raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
> +       raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
>         /*
>          * allow_barrier after re-submit to ensure no sync io
>          * can be issued while regular io pending.
> diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
> index 63e48b11b552..2e75e88d0802 100644
> --- a/drivers/md/raid10.h
> +++ b/drivers/md/raid10.h
> @@ -123,7 +123,6 @@ struct r10bio {
>         sector_t                sector; /* virtual sector number */
>         int                     sectors;
>         unsigned long           state;
> -       unsigned long           start_time;
>         struct mddev            *mddev;
>         /*
>          * original bio going to /dev/mdx
> --
> 2.39.2
>

Reviewed-by: Xiao Ni <xni@redhat.com>


^ permalink raw reply	[flat|nested] 22+ messages in thread
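
The new 'io_accounting' parameter exists because handle_read_error() above
re-enters raid10_read_request() with a master bio that was already accounted
on first submission; starting the accounting again would never be balanced.
A compressed sketch of the intended call pattern, using the names from the
diff (illustrative only):

	/* first submission: start accounting, remember the clone */
	if (io_accounting) {
		md_account_bio(mddev, &bio);	/* may replace 'bio' */
		r10_bio->master_bio = bio;
	}

	/* read-error retry: resubmit the same, already-accounted bio */
	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);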

* Re: [PATCH -next 6/8] md/md-multipath: enable io accounting
  2023-06-19 20:48 ` [PATCH -next 6/8] md/md-multipath: enable " Yu Kuai
@ 2023-06-20  9:11   ` Xiao Ni
  0 siblings, 0 replies; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  9:11 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Use md_account_bio() to enable io accounting, and also make sure
> mddev_suspend() will wait for all io to be done.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/md-multipath.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
> index 92c45be203d7..d22276870283 100644
> --- a/drivers/md/md-multipath.c
> +++ b/drivers/md/md-multipath.c
> @@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
>             && md_flush_request(mddev, bio))
>                 return true;
>
> +       md_account_bio(mddev, &bio);
>         mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
>
>         mp_bh->master_bio = bio;
> --
> 2.39.2
>

Reviewed-by: Xiao Ni <xni@redhat.com>


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH -next 7/8] md/md-linear: enable io accounting
  2023-06-19 20:48 ` [PATCH -next 7/8] md/md-linear: " Yu Kuai
@ 2023-06-20  9:12   ` Xiao Ni
  0 siblings, 0 replies; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  9:12 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Use md_account_bio() to enable io accounting, and also make sure
> mddev_suspend() will wait for all io to be done.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/md-linear.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
> index 4eb72b9dd933..71ac99646827 100644
> --- a/drivers/md/md-linear.c
> +++ b/drivers/md/md-linear.c
> @@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
>                 bio = split;
>         }
>
> +       md_account_bio(mddev, &bio);
>         bio_set_dev(bio, tmp_dev->rdev->bdev);
>         bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
>                 start_sector + data_offset;
> --
> 2.39.2
>

Reviewed-by: Xiao Ni <xni@redhat.com>


^ permalink raw reply	[flat|nested] 22+ messages in thread
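
One detail worth noting for these one-line conversions: md_account_bio()
takes a struct bio ** because it may swap the caller's bio for the
accounting clone, so the call has to happen before the bio is re-targeted
at a member disk. A sketch of the ordering in linear_make_request(), based
on the diff above (the trailing submit is assumed, not shown in the hunk):

	md_account_bio(mddev, &bio);		/* 'bio' may now be the clone */
	bio_set_dev(bio, tmp_dev->rdev->bdev);	/* re-target the clone */
	bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
		start_sector + data_offset;
	submit_bio_noacct(bio);			/* submit the clone */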

* Re: [PATCH -next 8/8] md/md-faulty: enable io accounting
  2023-06-19 20:48 ` [PATCH -next 8/8] md/md-faulty: " Yu Kuai
@ 2023-06-20  9:13   ` Xiao Ni
  0 siblings, 0 replies; 22+ messages in thread
From: Xiao Ni @ 2023-06-20  9:13 UTC (permalink / raw)
  To: Yu Kuai; +Cc: song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Use md_account_bio() to enable io accounting, and also make sure
> mddev_suspend() will wait for all io to be done.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/md-faulty.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
> index 50ad818978a4..a039e8e20f55 100644
> --- a/drivers/md/md-faulty.c
> +++ b/drivers/md/md-faulty.c
> @@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
>                         failit = 1;
>                 }
>         }
> +
> +       md_account_bio(mddev, &bio);
>         if (failit) {
>                 struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
>                                                 &mddev->bio_set);
> --
> 2.39.2
>

Reviewed-by: Xiao Ni <xni@redhat.com>


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio()
  2023-06-19 20:48 ` [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio() Yu Kuai
  2023-06-20  8:52   ` Xiao Ni
@ 2023-06-20  9:57   ` Paul Menzel
  2023-06-21  3:22     ` Yu Kuai
  1 sibling, 1 reply; 22+ messages in thread
From: Paul Menzel @ 2023-06-20  9:57 UTC (permalink / raw)
  To: Yu Kuai; +Cc: xni, song, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

Dear Yu,


Thank you for your patch.

On 19.06.23 at 22:48, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> Io will only be accounted as done from raid5_align_endio() if the io
> succeed, and io inflight counter will be leaked if such io failed.

succeed*s* or succeed*ed*?

> Fix this problem by switching to use md_account_bio() for io accounting.

How can this be tested?

> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>   drivers/md/raid5.c | 29 ++++++++---------------------
>   1 file changed, 8 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index cef0b400b2ee..4cdb35e54251 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
>    */
>   static void raid5_align_endio(struct bio *bi)
>   {
> -	struct md_io_clone *md_io_clone = bi->bi_private;
> -	struct bio *raid_bi = md_io_clone->orig_bio;
> -	struct mddev *mddev;
> -	struct r5conf *conf;
> -	struct md_rdev *rdev;
> +	struct bio *raid_bi = bi->bi_private;
> +	struct md_rdev *rdev = (void *)raid_bi->bi_next;
> +	struct mddev *mddev = rdev->mddev;
> +	struct r5conf *conf = mddev->private;
>   	blk_status_t error = bi->bi_status;
> -	unsigned long start_time = md_io_clone->start_time;
>   
>   	bio_put(bi);
> -
> -	rdev = (void*)raid_bi->bi_next;
>   	raid_bi->bi_next = NULL;
> -	mddev = rdev->mddev;
> -	conf = mddev->private;
> -

This looks like unnecessary refactoring. No idea what the preferred 
style for the subsystem is though. If it is wanted, maybe make it a 
separate commit?

>   	rdev_dec_pending(rdev, conf->mddev);
>   
>   	if (!error) {
> -		if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
> -			bio_end_io_acct(raid_bi, start_time);
>   		bio_endio(raid_bi);
>   		if (atomic_dec_and_test(&conf->active_aligned_reads))
>   			wake_up(&conf->wait_for_quiescent);
> @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>   	struct md_rdev *rdev;
>   	sector_t sector, end_sector, first_bad;
>   	int bad_sectors, dd_idx;
> -	struct md_io_clone *md_io_clone;
>   	bool did_inc;
>   
>   	if (!in_chunk_boundary(mddev, raid_bio)) {
> @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>   		return 0;
>   	}
>   
> -	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> -				    &mddev->io_clone_set);
> -	md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
> +	md_account_bio(mddev, &raid_bio);
>   	raid_bio->bi_next = (void *)rdev;
> -	if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
> -		md_io_clone->start_time = bio_start_io_acct(raid_bio);
> -	md_io_clone->orig_bio = raid_bio;
>   
> +	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> +				    &mddev->bio_set);
>   	align_bio->bi_end_io = raid5_align_endio;
> -	align_bio->bi_private = md_io_clone;
> +	align_bio->bi_private = raid_bio;
>   	align_bio->bi_iter.bi_sector = sector;
>   
>   	/* No reshape active, so we can trust rdev->data_offset */


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH -next 4/8] md/raid1: switch to use md_account_bio() for io accounting
  2023-06-20  9:07   ` Xiao Ni
@ 2023-06-20 13:38     ` Yu Kuai
  0 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2023-06-20 13:38 UTC (permalink / raw)
  To: Xiao Ni, Yu Kuai
  Cc: song, linux-raid, linux-kernel, yi.zhang, yangerkun, yukuai (C)

Hi,

On 2023/06/20 17:07, Xiao Ni wrote:
> On Mon, Jun 19, 2023 at 8:49 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Two problems can be fixed this way:
>>
>> 1) 'active_io' will represent inflight io instead of io that is
>> dispatching.
>>
>> 2) If io accounting is enabled or disabled while io is still inflight,
>> bio_start_io_acct() and bio_end_io_acct() are not balanced and the io
>> inflight counter will be leaked.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   drivers/md/raid1.c | 14 ++++++--------
>>   drivers/md/raid1.h |  1 -
>>   2 files changed, 6 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index dd25832eb045..06fa1580501f 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
>>          if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
>>                  bio->bi_status = BLK_STS_IOERR;
>>
>> -       if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
>> -               bio_end_io_acct(bio, r1_bio->start_time);
>>          bio_endio(bio);
>>   }
>>
>> @@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>>          }
>>
>>          r1_bio->read_disk = rdisk;
>> -
>> -       if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
>> -               r1_bio->start_time = bio_start_io_acct(bio);
>> -
>> +       if (!r1bio_existed) {
>> +               md_account_bio(mddev, &bio);
>> +               r1_bio->master_bio = bio;
>> +       }
>>          read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
>>                                     &mddev->bio_set);
>>
>> @@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>                  r1_bio->sectors = max_sectors;
>>          }
>>
>> -       if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
>> -               r1_bio->start_time = bio_start_io_acct(bio);
>> +       md_account_bio(mddev, &bio);
>> +       r1_bio->master_bio = bio;
>>          atomic_set(&r1_bio->remaining, 1);
>>          atomic_set(&r1_bio->behind_remaining, 0);
>>
>> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
>> index 468f189da7a0..14d4211a123a 100644
>> --- a/drivers/md/raid1.h
>> +++ b/drivers/md/raid1.h
>> @@ -157,7 +157,6 @@ struct r1bio {
>>          sector_t                sector;
>>          int                     sectors;
>>          unsigned long           state;
>> -       unsigned long           start_time;
>>          struct mddev            *mddev;
>>          /*
>>           * original bio going to /dev/mdx
>> --
>> 2.39.2
>>
> 
> Hi Kuai
> 
> After this patch, raid1 will have one more memory allocation in the
> I/O path. Not sure if it can affect performance. Besides this, the
> patch is good for me.

Yes, I'm aware of this additional memory allocation. However, raid1 (and
similarly the other levels) already needs to allocate an r1bio and some
bios (1 for read, and copies for write), so this is not a none -> new
case, it's just an allocate 2 -> allocate 3 case.

I think performance under memory pressure is bad both with and without
this patch, and one extra bio clone latency without memory reclaim should
be fine.

Thanks,
Kuai
> 
> Reviewed-by: Xiao Ni <xni@redhat.com>
> 
> .
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io'
  2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
                   ` (7 preceding siblings ...)
  2023-06-19 20:48 ` [PATCH -next 8/8] md/md-faulty: " Yu Kuai
@ 2023-06-20 17:45 ` Song Liu
  8 siblings, 0 replies; 22+ messages in thread
From: Song Liu @ 2023-06-20 17:45 UTC (permalink / raw)
  To: Yu Kuai; +Cc: xni, linux-raid, linux-kernel, yukuai3, yi.zhang, yangerkun

On Mon, Jun 19, 2023 at 5:49 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> This patchset do following things:
>  - make io accounting for all raid levels consistent, patch 1, 3-5;
>  - enable io accounting for all raid levels, patch 6-8;
>  - hold 'active_io' before io start to dispatch, and release 'active_io'
>  when io is done, make mddev_suspend() will wait for io to be done, patch 2

This set looks good to me (pending more tests). Please respin with
feedback from folks. We need to move fast to catch the upcoming
merge window.

Thanks,
Song

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c
  2023-06-20  8:35   ` Xiao Ni
@ 2023-06-21  2:35     ` Yu Kuai
  0 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2023-06-21  2:35 UTC (permalink / raw)
  To: Xiao Ni, Yu Kuai
  Cc: song, linux-raid, linux-kernel, yi.zhang, yangerkun, yukuai (C)

Hi,

On 2023/06/20 16:35, Xiao Ni wrote:
> On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> 'io_acct_set' is only used for raid0 and raid456, prepare to use it for
>> raid1 and raid10, so that io accounting from different levels can be
>> consistent.
>>
>> By the way, follow up patches will also use this io clone mechanism to
>> make sure 'active_io' represents in flight io, not io that is dispatching,
>> so that mddev_suspend will wait for io to be done as desgined.
> 
> Hi Kuai
> 
> typo error: s/desgined/designed/g
> 
> Before this patch the personality used the ->quiesce method to wait until
> all inflight ios came back. But I like this solution. It makes the
> code simpler. Not sure if it can cause problems because it changes
> the meaning of ->active_io. I'm doing regression tests to check.

Yes, actually this is the first step of my attempt to synchronize io
and raid configuration, and I'm planning to use 'active_io' to check
whether normal io exists. The follow-up plans are:

1. add a new counter (perhaps we can reuse queue->q_usage_counter), and
grab it for sync io;
2. refactor and expand 'pers->quiesce' so that normal io and sync io
can both be quiesced;
3. call 'pers->quiesce' before raid configuration;
4. add a new helper to iterate rdevs, grabbing the new counter first;
5. a lot of cleanups.

Thanks,
Kuai
> 
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   drivers/md/md.c    | 27 ++++++++++-----------------
>>   drivers/md/md.h    |  2 --
>>   drivers/md/raid0.c | 16 ++--------------
>>   drivers/md/raid5.c | 41 +++++++++++------------------------------
>>   4 files changed, 23 insertions(+), 63 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 8d62f85d2ab0..42347289195a 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -5886,6 +5886,13 @@ int md_run(struct mddev *mddev)
>>                          goto exit_bio_set;
>>          }
>>
>> +       if (!bioset_initialized(&mddev->io_acct_set)) {
>> +               err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
>> +                                 offsetof(struct md_io_acct, bio_clone), 0);
>> +               if (err)
>> +                       goto exit_sync_set;
>> +       }
>> +
>>          spin_lock(&pers_lock);
>>          pers = find_pers(mddev->level, mddev->clevel);
>>          if (!pers || !try_module_get(pers->owner)) {
>> @@ -6063,6 +6070,8 @@ int md_run(struct mddev *mddev)
>>          module_put(pers->owner);
>>          md_bitmap_destroy(mddev);
>>   abort:
>> +       bioset_exit(&mddev->io_acct_set);
>> +exit_sync_set:
>>          bioset_exit(&mddev->sync_set);
>>   exit_bio_set:
>>          bioset_exit(&mddev->bio_set);
>> @@ -6286,6 +6295,7 @@ static void __md_stop(struct mddev *mddev)
>>          percpu_ref_exit(&mddev->active_io);
>>          bioset_exit(&mddev->bio_set);
>>          bioset_exit(&mddev->sync_set);
>> +       bioset_exit(&mddev->io_acct_set);
>>   }
>>
>>   void md_stop(struct mddev *mddev)
>> @@ -8651,23 +8661,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>>   }
>>   EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>>
>> -int acct_bioset_init(struct mddev *mddev)
>> -{
>> -       int err = 0;
>> -
>> -       if (!bioset_initialized(&mddev->io_acct_set))
>> -               err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
>> -                       offsetof(struct md_io_acct, bio_clone), 0);
>> -       return err;
>> -}
>> -EXPORT_SYMBOL_GPL(acct_bioset_init);
>> -
>> -void acct_bioset_exit(struct mddev *mddev)
>> -{
>> -       bioset_exit(&mddev->io_acct_set);
>> -}
>> -EXPORT_SYMBOL_GPL(acct_bioset_exit);
>> -
>>   static void md_end_io_acct(struct bio *bio)
>>   {
>>          struct md_io_acct *md_io_acct = bio->bi_private;
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 7cab9c7c45b8..11299d94b239 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -776,8 +776,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
>>   extern void md_finish_reshape(struct mddev *mddev);
>>   void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>>                          struct bio *bio, sector_t start, sector_t size);
>> -int acct_bioset_init(struct mddev *mddev);
>> -void acct_bioset_exit(struct mddev *mddev);
>>   void md_account_bio(struct mddev *mddev, struct bio **bio);
>>
>>   extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
>> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
>> index f8ee9a95e25d..38d9209cada1 100644
>> --- a/drivers/md/raid0.c
>> +++ b/drivers/md/raid0.c
>> @@ -365,7 +365,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
>>          struct r0conf *conf = priv;
>>
>>          free_conf(mddev, conf);
>> -       acct_bioset_exit(mddev);
>>   }
>>
>>   static int raid0_run(struct mddev *mddev)
>> @@ -380,16 +379,11 @@ static int raid0_run(struct mddev *mddev)
>>          if (md_check_no_bitmap(mddev))
>>                  return -EINVAL;
>>
>> -       if (acct_bioset_init(mddev)) {
>> -               pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
>> -               return -ENOMEM;
>> -       }
>> -
>>          /* if private is not null, we are here after takeover */
>>          if (mddev->private == NULL) {
>>                  ret = create_strip_zones(mddev, &conf);
>>                  if (ret < 0)
>> -                       goto exit_acct_set;
>> +                       return ret;
>>                  mddev->private = conf;
>>          }
>>          conf = mddev->private;
>> @@ -420,15 +414,9 @@ static int raid0_run(struct mddev *mddev)
>>
>>          ret = md_integrity_register(mddev);
>>          if (ret)
>> -               goto free;
>> +               free_conf(mddev, conf);
>>
>>          return ret;
>> -
>> -free:
>> -       free_conf(mddev, conf);
>> -exit_acct_set:
>> -       acct_bioset_exit(mddev);
>> -       return ret;
>>   }
>>
>>   static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index f8bc74e16811..29cf5455d7a5 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev)
>>          struct md_rdev *rdev;
>>          struct md_rdev *journal_dev = NULL;
>>          sector_t reshape_offset = 0;
>> -       int i, ret = 0;
>> +       int i;
>>          long long min_offset_diff = 0;
>>          int first = 1;
>>
>> -       if (acct_bioset_init(mddev)) {
>> -               pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
>> +       if (mddev_init_writes_pending(mddev) < 0)
>>                  return -ENOMEM;
>> -       }
>> -
>> -       if (mddev_init_writes_pending(mddev) < 0) {
>> -               ret = -ENOMEM;
>> -               goto exit_acct_set;
>> -       }
>>
>>          if (mddev->recovery_cp != MaxSector)
>>                  pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
>> @@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev)
>>              (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
>>                  pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
>>                            mdname(mddev));
>> -               ret = -EINVAL;
>> -               goto exit_acct_set;
>> +               return -EINVAL;
>>          }
>>
>>          if (mddev->reshape_position != MaxSector) {
>> @@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev)
>>                  if (journal_dev) {
>>                          pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
>>                                  mdname(mddev));
>> -                       ret = -EINVAL;
>> -                       goto exit_acct_set;
>> +                       return -EINVAL;
>>                  }
>>
>>                  if (mddev->new_level != mddev->level) {
>>                          pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
>>                                  mdname(mddev));
>> -                       ret = -EINVAL;
>> -                       goto exit_acct_set;
>> +                       return -EINVAL;
>>                  }
>>                  old_disks = mddev->raid_disks - mddev->delta_disks;
>>                  /* reshape_position must be on a new-stripe boundary, and one
>> @@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev)
>>                  if (sector_div(here_new, chunk_sectors * new_data_disks)) {
>>                          pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
>>                                  mdname(mddev));
>> -                       ret = -EINVAL;
>> -                       goto exit_acct_set;
>> +                       return -EINVAL;
>>                  }
>>                  reshape_offset = here_new * chunk_sectors;
>>                  /* here_new is the stripe we will write to */
>> @@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev)
>>                          else if (mddev->ro == 0) {
>>                                  pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
>>                                          mdname(mddev));
>> -                               ret = -EINVAL;
>> -                               goto exit_acct_set;
>> +                               return -EINVAL;
>>                          }
>>                  } else if (mddev->reshape_backwards
>>                      ? (here_new * chunk_sectors + min_offset_diff <=
>> @@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev)
>>                          /* Reading from the same stripe as writing to - bad */
>>                          pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
>>                                  mdname(mddev));
>> -                       ret = -EINVAL;
>> -                       goto exit_acct_set;
>> +                       return -EINVAL;
>>                  }
>>                  pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
>>                  /* OK, we should be able to continue; */
>> @@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev)
>>          else
>>                  conf = mddev->private;
>>
>> -       if (IS_ERR(conf)) {
>> -               ret = PTR_ERR(conf);
>> -               goto exit_acct_set;
>> -       }
>> +       if (IS_ERR(conf))
>> +               return PTR_ERR(conf);
>>
>>          if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
>>                  if (!journal_dev) {
>> @@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev)
>>          free_conf(conf);
>>          mddev->private = NULL;
>>          pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
>> -       ret = -EIO;
>> -exit_acct_set:
>> -       acct_bioset_exit(mddev);
>> -       return ret;
>> +       return -EIO;
>>   }
>>
>>   static void raid5_free(struct mddev *mddev, void *priv)
>> @@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
>>          struct r5conf *conf = priv;
>>
>>          free_conf(conf);
>> -       acct_bioset_exit(mddev);
>>          mddev->to_remove = &raid5_attrs_group;
>>   }
>>
>> --
>> 2.39.2
>>
> 
> The patch is good for me.
> 
> Reviewed-by: Xiao Ni <xni@redhat.com>
> 
> .
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread
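
On the '->quiesce vs active_io' point discussed above: once 'active_io' is
grabbed at submission and only released from the clone's end_io, waiting
for the array to drain becomes a plain percpu_ref drain. A rough, simplified
illustration of that idea (the real mddev_suspend() also handles nesting
and superblock updates):

	void mddev_suspend(struct mddev *mddev)
	{
		/* stop new io from entering the personality */
		percpu_ref_kill(&mddev->active_io);

		/* 'active_io' is now only dropped when an io fully
		 * completes, so this really waits for in-flight io */
		wait_event(mddev->sb_wait,
			   percpu_ref_is_zero(&mddev->active_io));

		mddev->pers->quiesce(mddev, 1);
	}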

* Re: [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio()
  2023-06-20  9:57   ` Paul Menzel
@ 2023-06-21  3:22     ` Yu Kuai
  0 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2023-06-21  3:22 UTC (permalink / raw)
  To: Paul Menzel, Yu Kuai
  Cc: xni, song, linux-raid, linux-kernel, yi.zhang, yangerkun, yukuai (C)

Hi,

On 2023/06/20 17:57, Paul Menzel wrote:
> Dear Yu,
> 
> 
> Thank you for your patch.
> 
> On 19.06.23 at 22:48, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Io will only be accounted as done from raid5_align_endio() if the io
>> succeed, and io inflight counter will be leaked if such io failed.
> 
> succeed*s* or succeed*ed*?

I'll update this.

> 
>> Fix this problem by switching to use md_account_bio() for io accounting.
> 
> How can this be tested?
> 
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   drivers/md/raid5.c | 29 ++++++++---------------------
>>   1 file changed, 8 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index cef0b400b2ee..4cdb35e54251 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
>>    */
>>   static void raid5_align_endio(struct bio *bi)
>>   {
>> -    struct md_io_clone *md_io_clone = bi->bi_private;
>> -    struct bio *raid_bi = md_io_clone->orig_bio;
>> -    struct mddev *mddev;
>> -    struct r5conf *conf;
>> -    struct md_rdev *rdev;
>> +    struct bio *raid_bi = bi->bi_private;
>> +    struct md_rdev *rdev = (void *)raid_bi->bi_next;
>> +    struct mddev *mddev = rdev->mddev;
>> +    struct r5conf *conf = mddev->private;
>>       blk_status_t error = bi->bi_status;
>> -    unsigned long start_time = md_io_clone->start_time;
>>       bio_put(bi);
>> -
>> -    rdev = (void*)raid_bi->bi_next;
>>       raid_bi->bi_next = NULL;
>> -    mddev = rdev->mddev;
>> -    conf = mddev->private;
>> -
> 
> This looks like unnecessary refactoring. No idea what the preferred 
> style for the subsystem is though. If it is wanted, maybe make it a 
> separate commit?

You mean that I initialize 'rdev' and 'mddev' at declaration?
I think the code is cleaner this way, and this is too tiny to make a
separate patch for... I will keep it for now.  😉

Thanks,
Kuai

> 
>>       rdev_dec_pending(rdev, conf->mddev);
>>       if (!error) {
>> -        if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
>> -            bio_end_io_acct(raid_bi, start_time);
>>           bio_endio(raid_bi);
>>           if (atomic_dec_and_test(&conf->active_aligned_reads))
>>               wake_up(&conf->wait_for_quiescent);
>> @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>>       struct md_rdev *rdev;
>>       sector_t sector, end_sector, first_bad;
>>       int bad_sectors, dd_idx;
>> -    struct md_io_clone *md_io_clone;
>>       bool did_inc;
>>       if (!in_chunk_boundary(mddev, raid_bio)) {
>> @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>>           return 0;
>>       }
>> -    align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
>> -                    &mddev->io_clone_set);
>> -    md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
>> +    md_account_bio(mddev, &raid_bio);
>>       raid_bio->bi_next = (void *)rdev;
>> -    if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
>> -        md_io_clone->start_time = bio_start_io_acct(raid_bio);
>> -    md_io_clone->orig_bio = raid_bio;
>> +    align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
>> +                    &mddev->bio_set);
>>       align_bio->bi_end_io = raid5_align_endio;
>> -    align_bio->bi_private = md_io_clone;
>> +    align_bio->bi_private = raid_bio;
>>       align_bio->bi_iter.bi_sector = sector;
>>       /* No reshape active, so we can trust rdev->data_offset */
> 
> 
> Kind regards,
> 
> Paul
> 
> .
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2023-06-21  3:42 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-19 20:48 [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Yu Kuai
2023-06-19 20:48 ` [PATCH -next 1/8] md: move initialization and destruction of 'io_acct_set' to md.c Yu Kuai
2023-06-20  8:35   ` Xiao Ni
2023-06-21  2:35     ` Yu Kuai
2023-06-19 20:48 ` [PATCH -next 2/8] md: also clone new io if io accounting is disabled Yu Kuai
2023-06-20  8:48   ` Xiao Ni
2023-06-19 20:48 ` [PATCH -next 3/8] raid5: fix missing io accounting in raid5_align_endio() Yu Kuai
2023-06-20  8:52   ` Xiao Ni
2023-06-20  9:57   ` Paul Menzel
2023-06-21  3:22     ` Yu Kuai
2023-06-19 20:48 ` [PATCH -next 4/8] md/raid1: switch to use md_account_bio() for io accounting Yu Kuai
2023-06-20  9:07   ` Xiao Ni
2023-06-20 13:38     ` Yu Kuai
2023-06-19 20:48 ` [PATCH -next 5/8] md/raid10: " Yu Kuai
2023-06-20  9:10   ` Xiao Ni
2023-06-19 20:48 ` [PATCH -next 6/8] md/md-multipath: enable " Yu Kuai
2023-06-20  9:11   ` Xiao Ni
2023-06-19 20:48 ` [PATCH -next 7/8] md/md-linear: " Yu Kuai
2023-06-20  9:12   ` Xiao Ni
2023-06-19 20:48 ` [PATCH -next 8/8] md/md-faulty: " Yu Kuai
2023-06-20  9:13   ` Xiao Ni
2023-06-20 17:45 ` [PATCH -next 0/8] md: fix and refactor io accounting and 'active_io' Song Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).