linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] optimizations for io accounting
@ 2022-03-17 11:26 Yu Kuai
  2022-03-17 11:26 ` [PATCH 1/3] block: don't show disk stats if io accounting is disabled Yu Kuai
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Yu Kuai @ 2022-03-17 11:26 UTC (permalink / raw)
  To: axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yukuai3, yi.zhang

Yu Kuai (3):
  block: don't show disk stats if io accounting is disabled
  block: factor out common code for part_stat_show() and
    diskstats_show()
  block: update nsecs[] in part_stat_show() and diskstats_show()

 block/bdev.c              |   2 +
 block/blk-mq.c            |  63 +++++++++++++++-
 block/blk-mq.h            |   2 +
 block/genhd.c             | 154 ++++++++++++++++++--------------------
 include/linux/blk-mq.h    |   2 +
 include/linux/blk_types.h |   5 ++
 6 files changed, 146 insertions(+), 82 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/3] block: don't show disk stats if io accounting is disabled
  2022-03-17 11:26 [PATCH 0/3] optimizations for io accounting Yu Kuai
@ 2022-03-17 11:26 ` Yu Kuai
  2022-03-17 14:06   ` Bart Van Assche
  2022-03-25  8:46   ` Christoph Hellwig
  2022-03-17 11:26 ` [PATCH 2/3] block: factor out common code for part_stat_show() and diskstats_show() Yu Kuai
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 14+ messages in thread
From: Yu Kuai @ 2022-03-17 11:26 UTC (permalink / raw)
  To: axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yukuai3, yi.zhang

If io accounting is disabled, there is no point to handle such device
in diskstats_show(), and it can be confused for users because all fields
in iostat are zero while the disk is handling io.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/genhd.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index c3b32c665aec..e5307f512185 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -937,6 +937,9 @@ ssize_t part_stat_show(struct device *dev,
 	struct disk_stats stat;
 	unsigned int inflight;
 
+	if (!blk_queue_io_stat(q))
+		return sprintf(buf, "io accounting is disabled\n");
+
 	if (queue_is_mq(q))
 		inflight = blk_mq_in_flight(q, bdev);
 	else
@@ -1207,6 +1210,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	xa_for_each(&gp->part_tbl, idx, hd) {
 		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
 			continue;
+		if (!blk_queue_io_stat(gp->queue))
+			continue;
 		if (queue_is_mq(gp->queue))
 			inflight = blk_mq_in_flight(gp->queue, hd);
 		else
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] block: factor out common code for part_stat_show() and diskstats_show()
  2022-03-17 11:26 [PATCH 0/3] optimizations for io accounting Yu Kuai
  2022-03-17 11:26 ` [PATCH 1/3] block: don't show disk stats if io accounting is disabled Yu Kuai
@ 2022-03-17 11:26 ` Yu Kuai
  2022-03-25  8:46   ` Christoph Hellwig
  2022-03-17 11:26 ` [PATCH -next 3/3] block: update nsecs[] in part_get_stat() Yu Kuai
  2022-03-25  7:29 ` [PATCH 0/3] optimizations for io accounting yukuai (C)
  3 siblings, 1 reply; 14+ messages in thread
From: Yu Kuai @ 2022-03-17 11:26 UTC (permalink / raw)
  To: axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yukuai3, yi.zhang

part_stat_show() and diskstats_show() are very similar, just factor out
common code.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/genhd.c | 130 +++++++++++++++++++-------------------------------
 1 file changed, 49 insertions(+), 81 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index e5307f512185..f2c7de2e7ca9 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -53,6 +53,30 @@ static atomic64_t diskseq;
 #define NR_EXT_DEVT		(1 << MINORBITS)
 static DEFINE_IDA(ext_devt_ida);
 
+#define part_stat_info(inflight, stat) \
+	"%lu %lu %llu %u " \
+	"%lu %lu %llu %u " \
+	"%u %u %u " \
+	"%lu %lu %llu %u " \
+	"%lu %u" \
+	"\n", \
+	(stat).ios[STAT_READ], (stat).merges[STAT_READ], \
+	(unsigned long long)(stat).sectors[STAT_READ], \
+	(unsigned int)div_u64((stat).nsecs[STAT_READ], NSEC_PER_MSEC), \
+	(stat).ios[STAT_WRITE], (stat).merges[STAT_WRITE], \
+	(unsigned long long)(stat).sectors[STAT_WRITE], \
+	(unsigned int)div_u64((stat).nsecs[STAT_WRITE], NSEC_PER_MSEC), \
+	(inflight), jiffies_to_msecs((stat).io_ticks), \
+	(unsigned int)div_u64((stat).nsecs[STAT_READ] + \
+			      (stat).nsecs[STAT_WRITE] + \
+			      (stat).nsecs[STAT_DISCARD] + \
+			      (stat).nsecs[STAT_FLUSH], NSEC_PER_MSEC), \
+	(stat).ios[STAT_DISCARD], (stat).merges[STAT_DISCARD], \
+	(unsigned long long)(stat).sectors[STAT_DISCARD], \
+	(unsigned int)div_u64((stat).nsecs[STAT_DISCARD], NSEC_PER_MSEC), \
+	(stat).ios[STAT_FLUSH], \
+	(unsigned int)div_u64((stat).nsecs[STAT_FLUSH], NSEC_PER_MSEC)
+
 void set_capacity(struct gendisk *disk, sector_t sectors)
 {
 	struct block_device *bdev = disk->part0;
@@ -929,17 +953,13 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
 }
 
-ssize_t part_stat_show(struct device *dev,
-		       struct device_attribute *attr, char *buf)
+static unsigned int part_get_stat(struct block_device *bdev,
+				  struct disk_stats *stat)
+
 {
-	struct block_device *bdev = dev_to_bdev(dev);
 	struct request_queue *q = bdev_get_queue(bdev);
-	struct disk_stats stat;
 	unsigned int inflight;
 
-	if (!blk_queue_io_stat(q))
-		return sprintf(buf, "io accounting is disabled\n");
-
 	if (queue_is_mq(q))
 		inflight = blk_mq_in_flight(q, bdev);
 	else
@@ -950,35 +970,23 @@ ssize_t part_stat_show(struct device *dev,
 		update_io_ticks(bdev, jiffies, true);
 		part_stat_unlock();
 	}
-	part_stat_read_all(bdev, &stat);
-	return sprintf(buf,
-		"%8lu %8lu %8llu %8u "
-		"%8lu %8lu %8llu %8u "
-		"%8u %8u %8u "
-		"%8lu %8lu %8llu %8u "
-		"%8lu %8u"
-		"\n",
-		stat.ios[STAT_READ],
-		stat.merges[STAT_READ],
-		(unsigned long long)stat.sectors[STAT_READ],
-		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
-		stat.ios[STAT_WRITE],
-		stat.merges[STAT_WRITE],
-		(unsigned long long)stat.sectors[STAT_WRITE],
-		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
-		inflight,
-		jiffies_to_msecs(stat.io_ticks),
-		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
-				      stat.nsecs[STAT_WRITE] +
-				      stat.nsecs[STAT_DISCARD] +
-				      stat.nsecs[STAT_FLUSH],
-						NSEC_PER_MSEC),
-		stat.ios[STAT_DISCARD],
-		stat.merges[STAT_DISCARD],
-		(unsigned long long)stat.sectors[STAT_DISCARD],
-		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
-		stat.ios[STAT_FLUSH],
-		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
+	part_stat_read_all(bdev, stat);
+
+	return inflight;
+}
+
+ssize_t part_stat_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
+{
+	struct block_device *bdev = dev_to_bdev(dev);
+	struct disk_stats stat;
+	unsigned int inflight;
+
+	if (!blk_queue_io_stat(bdev_get_queue(bdev)))
+		return sprintf(buf, "io accounting is disabled\n");
+
+	inflight = part_get_stat(bdev, &stat);
+	return sprintf(buf, part_stat_info(inflight, stat));
 }
 
 ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
@@ -1212,51 +1220,11 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			continue;
 		if (!blk_queue_io_stat(gp->queue))
 			continue;
-		if (queue_is_mq(gp->queue))
-			inflight = blk_mq_in_flight(gp->queue, hd);
-		else
-			inflight = part_in_flight(hd);
-
-		if (inflight) {
-			part_stat_lock();
-			update_io_ticks(hd, jiffies, true);
-			part_stat_unlock();
-		}
-		part_stat_read_all(hd, &stat);
-		seq_printf(seqf, "%4d %7d %pg "
-			   "%lu %lu %lu %u "
-			   "%lu %lu %lu %u "
-			   "%u %u %u "
-			   "%lu %lu %lu %u "
-			   "%lu %u"
-			   "\n",
-			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
-			   stat.ios[STAT_READ],
-			   stat.merges[STAT_READ],
-			   stat.sectors[STAT_READ],
-			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
-							NSEC_PER_MSEC),
-			   stat.ios[STAT_WRITE],
-			   stat.merges[STAT_WRITE],
-			   stat.sectors[STAT_WRITE],
-			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
-							NSEC_PER_MSEC),
-			   inflight,
-			   jiffies_to_msecs(stat.io_ticks),
-			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
-						 stat.nsecs[STAT_WRITE] +
-						 stat.nsecs[STAT_DISCARD] +
-						 stat.nsecs[STAT_FLUSH],
-							NSEC_PER_MSEC),
-			   stat.ios[STAT_DISCARD],
-			   stat.merges[STAT_DISCARD],
-			   stat.sectors[STAT_DISCARD],
-			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
-						 NSEC_PER_MSEC),
-			   stat.ios[STAT_FLUSH],
-			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
-						 NSEC_PER_MSEC)
-			);
+
+		inflight = part_get_stat(hd, &stat);
+		seq_printf(seqf, "%4d %7d %pg ",
+			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd);
+		seq_printf(seqf, part_stat_info(inflight, stat));
 	}
 	rcu_read_unlock();
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH -next 3/3] block: update nsecs[] in part_get_stat()
  2022-03-17 11:26 [PATCH 0/3] optimizations for io accounting Yu Kuai
  2022-03-17 11:26 ` [PATCH 1/3] block: don't show disk stats if io accounting is disabled Yu Kuai
  2022-03-17 11:26 ` [PATCH 2/3] block: factor out common code for part_stat_show() and diskstats_show() Yu Kuai
@ 2022-03-17 11:26 ` Yu Kuai
  2022-04-01  3:42   ` yukuai (C)
  2022-03-25  7:29 ` [PATCH 0/3] optimizations for io accounting yukuai (C)
  3 siblings, 1 reply; 14+ messages in thread
From: Yu Kuai @ 2022-03-17 11:26 UTC (permalink / raw)
  To: axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yukuai3, yi.zhang

commit 86d7331299fd("block: update io_ticks when io hang") fixed that
%util will be zero for iostat when io is hanged, however, avgqu-sz is
still zero while it represents the number of io that are hunged. On the
other hand, for some slow device, if an io is started before and done
after diskstats is read, the avgqu-sz will be miscalculated.

To fix the problem, update 'nsecs[]' when part_stat_show() or
diskstats_show() is called. In order to do that, add 'stat_time' in
struct block_device and 'rq_stat_time' in struct request to record the
time. And during iteration, update 'nsecs[]' for each inflight request.

Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bdev.c              |  2 ++
 block/blk-mq.c            | 63 ++++++++++++++++++++++++++++++++++++++-
 block/blk-mq.h            |  2 ++
 block/genhd.c             | 25 ++++++++++++++--
 include/linux/blk-mq.h    |  2 ++
 include/linux/blk_types.h |  5 ++++
 6 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 13de871fa816..5dced478f190 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -487,9 +487,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	bdev = I_BDEV(inode);
 	mutex_init(&bdev->bd_fsfreeze_mutex);
 	spin_lock_init(&bdev->bd_size_lock);
+	spin_lock_init(&bdev->bd_stat_lock);
 	bdev->bd_partno = partno;
 	bdev->bd_inode = inode;
 	bdev->bd_queue = disk->queue;
+	bdev->stat_time = 0;
 	bdev->bd_stats = alloc_percpu(struct disk_stats);
 	if (!bdev->bd_stats) {
 		iput(inode);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c33fb378b168..7a14d4058b14 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -149,6 +149,48 @@ unsigned int blk_mq_in_flight(struct request_queue *q,
 	return mi.inflight[0] + mi.inflight[1];
 }
 
+static bool blk_mq_check_inflight_stat(struct request *rq, void *priv,
+				       bool reserved)
+{
+	struct mq_inflight *mi = priv;
+
+	if ((!mi->part->bd_partno || rq->part == mi->part) &&
+	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
+		u64 stat_time;
+
+		mi->inflight[rq_data_dir(rq)]++;
+		if (!rq->part)
+			return true;
+
+		stat_time = READ_ONCE(rq->stat_time_ns);
+		/*
+		 * This might fail if 'req->stat_time_ns' happen to be updated
+		 * in blk_account_io_done().
+		 */
+		if (likely(cmpxchg(&rq->stat_time_ns, stat_time,
+			    rq->part->stat_time) == stat_time)) {
+			int sgrp = op_stat_group(req_op(rq));
+			u64 duation = stat_time ?
+				rq->part->stat_time - stat_time :
+				rq->part->stat_time - rq->start_time_ns;
+
+			part_stat_add(rq->part, nsecs[sgrp], duation);
+		}
+	}
+
+	return true;
+}
+
+unsigned int blk_mq_in_flight_stat(struct request_queue *q,
+		struct block_device *part)
+{
+	struct mq_inflight mi = { .part = part };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_stat, &mi);
+
+	return mi.inflight[0] + mi.inflight[1];
+}
+
 void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 		unsigned int inflight[2])
 {
@@ -368,6 +410,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->start_time_ns = ktime_get_ns();
 	else
 		rq->start_time_ns = 0;
+	rq->stat_time_ns = 0;
 	rq->part = NULL;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 	rq->alloc_time_ns = alloc_time_ns;
@@ -874,7 +917,25 @@ static void __blk_account_io_done(struct request *req, u64 now)
 	part_stat_lock();
 	update_io_ticks(req->part, jiffies, true);
 	part_stat_inc(req->part, ios[sgrp]);
-	part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+
+	if (queue_is_mq(req->q)) {
+		u64 stat_time = READ_ONCE(req->stat_time_ns);
+
+		/*
+		 * This might fail if 'req->stat_time_ns' happen to be updated
+		 * in part_get_stat().
+		 */
+		if (likely(cmpxchg(&req->stat_time_ns, stat_time, now) ==
+			   stat_time)) {
+			u64 duation = stat_time ? now - stat_time :
+						  now - req->start_time_ns;
+
+			part_stat_add(req->part, nsecs[sgrp], duation);
+		}
+	} else {
+		part_stat_add(req->part, nsecs[sgrp],
+			      now - req->part->stat_time);
+	}
 	part_stat_unlock();
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7d1acf8e1e51..37941f5f1d08 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -189,6 +189,8 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 
 unsigned int blk_mq_in_flight(struct request_queue *q,
 		struct block_device *part);
+unsigned int blk_mq_in_flight_stat(struct request_queue *q,
+		struct block_device *part);
 void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 		unsigned int inflight[2]);
 
diff --git a/block/genhd.c b/block/genhd.c
index f2c7de2e7ca9..31603c7aff49 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -953,6 +953,19 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
 }
 
+static void part_set_stat_time(struct block_device *part)
+{
+	u64 now = ktime_get_ns();
+
+again:
+	part->stat_time = now;
+
+	if (part->bd_partno) {
+		part = bdev_whole(part);
+		goto again;
+	}
+}
+
 static unsigned int part_get_stat(struct block_device *bdev,
 				  struct disk_stats *stat)
 
@@ -960,10 +973,16 @@ static unsigned int part_get_stat(struct block_device *bdev,
 	struct request_queue *q = bdev_get_queue(bdev);
 	unsigned int inflight;
 
-	if (queue_is_mq(q))
-		inflight = blk_mq_in_flight(q, bdev);
-	else
+	if (queue_is_mq(q)) {
+		spin_lock(&bdev->bd_stat_lock);
+		part_stat_lock();
+		part_set_stat_time(bdev);
+		inflight = blk_mq_in_flight_stat(q, bdev);
+		part_stat_unlock();
+		spin_unlock(&bdev->bd_stat_lock);
+	} else {
 		inflight = part_in_flight(bdev);
+	}
 
 	if (inflight) {
 		part_stat_lock();
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e512da636e0f..e98f07d1741c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -108,6 +108,8 @@ struct request {
 	u64 start_time_ns;
 	/* Time that I/O was submitted to the device. */
 	u64 io_start_time_ns;
+	/* Time that I/O was accounted in part_get_stat() */
+	u64 stat_time_ns;
 
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c6ed4a559e6a..5cd399b5670d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -65,6 +65,11 @@ struct block_device {
 	struct super_block	*bd_fsfreeze_sb;
 
 	struct partition_meta_info *bd_meta_info;
+
+	/* Prevent part_get_stat() to be called concurrently */
+	spinlock_t		bd_stat_lock;
+	/* Will be set when part_stat_show() or diskstats_show() is called */
+	u64			stat_time;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] block: don't show disk stats if io accounting is disabled
  2022-03-17 11:26 ` [PATCH 1/3] block: don't show disk stats if io accounting is disabled Yu Kuai
@ 2022-03-17 14:06   ` Bart Van Assche
  2022-03-18  1:36     ` yukuai (C)
  2022-03-25  8:46   ` Christoph Hellwig
  1 sibling, 1 reply; 14+ messages in thread
From: Bart Van Assche @ 2022-03-17 14:06 UTC (permalink / raw)
  To: Yu Kuai, axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yi.zhang

On 3/17/22 04:26, Yu Kuai wrote:
> If io accounting is disabled, there is no point to handle such device
> in diskstats_show(), and it can be confused for users because all fields
> in iostat are zero while the disk is handling io.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>   block/genhd.c | 5 +++++
>   1 file changed, 5 insertions(+)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index c3b32c665aec..e5307f512185 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -937,6 +937,9 @@ ssize_t part_stat_show(struct device *dev,
>   	struct disk_stats stat;
>   	unsigned int inflight;
>   
> +	if (!blk_queue_io_stat(q))
> +		return sprintf(buf, "io accounting is disabled\n");
> +

Hmm ... the above looks sub-optimal to me. Has it been considered to 
return an error code instead or even better to hide the stat attribute 
if I/O accounting is disabled? The latter can be achieved by modifying 
disk_visible().

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] block: don't show disk stats if io accounting is disabled
  2022-03-17 14:06   ` Bart Van Assche
@ 2022-03-18  1:36     ` yukuai (C)
  2022-03-18  3:10       ` Bart Van Assche
  0 siblings, 1 reply; 14+ messages in thread
From: yukuai (C) @ 2022-03-18  1:36 UTC (permalink / raw)
  To: Bart Van Assche, axboe, mpatocka, snitzer
  Cc: linux-block, linux-kernel, yi.zhang

在 2022/03/17 22:06, Bart Van Assche 写道:
> On 3/17/22 04:26, Yu Kuai wrote:
>> If io accounting is disabled, there is no point to handle such device
>> in diskstats_show(), and it can be confused for users because all fields
>> in iostat are zero while the disk is handling io.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   block/genhd.c | 5 +++++
>>   1 file changed, 5 insertions(+)
>>
>> diff --git a/block/genhd.c b/block/genhd.c
>> index c3b32c665aec..e5307f512185 100644
>> --- a/block/genhd.c
>> +++ b/block/genhd.c
>> @@ -937,6 +937,9 @@ ssize_t part_stat_show(struct device *dev,
>>       struct disk_stats stat;
>>       unsigned int inflight;
>> +    if (!blk_queue_io_stat(q))
>> +        return sprintf(buf, "io accounting is disabled\n");
>> +
> 
> Hmm ... the above looks sub-optimal to me. Has it been considered to 
> return an error code instead or even better to hide the stat attribute 
> if I/O accounting is disabled? The latter can be achieved by modifying 
> disk_visible().

Hi,

It's right this way is much better, i'll hide the 'stat' in next
iteration.

BTW, do you have any suggestion about patch 3?

Thanks,
Kuai

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] block: don't show disk stats if io accounting is disabled
  2022-03-18  1:36     ` yukuai (C)
@ 2022-03-18  3:10       ` Bart Van Assche
  2022-03-18  6:12         ` yukuai (C)
  0 siblings, 1 reply; 14+ messages in thread
From: Bart Van Assche @ 2022-03-18  3:10 UTC (permalink / raw)
  To: yukuai (C), axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yi.zhang

On 3/17/22 18:36, yukuai (C) wrote:
> 在 2022/03/17 22:06, Bart Van Assche 写道:
>> On 3/17/22 04:26, Yu Kuai wrote:
>>> If io accounting is disabled, there is no point to handle such device
>>> in diskstats_show(), and it can be confused for users because all fields
>>> in iostat are zero while the disk is handling io.
>>>
>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>> ---
>>>   block/genhd.c | 5 +++++
>>>   1 file changed, 5 insertions(+)
>>>
>>> diff --git a/block/genhd.c b/block/genhd.c
>>> index c3b32c665aec..e5307f512185 100644
>>> --- a/block/genhd.c
>>> +++ b/block/genhd.c
>>> @@ -937,6 +937,9 @@ ssize_t part_stat_show(struct device *dev,
>>>       struct disk_stats stat;
>>>       unsigned int inflight;
>>> +    if (!blk_queue_io_stat(q))
>>> +        return sprintf(buf, "io accounting is disabled\n");
>>> +
>>
>> Hmm ... the above looks sub-optimal to me. Has it been considered to 
>> return an error code instead or even better to hide the stat attribute 
>> if I/O accounting is disabled? The latter can be achieved by modifying 
>> disk_visible().
 >
> It's right this way is much better, i'll hide the 'stat' in next
> iteration.

Please note that modifying disk_visible() only is not sufficient. 
sysfs_update_group() needs to be called to trigger a call to 
disk_visible() if a variable has changed that affects the return value 
of disk_visible().

> BTW, do you have any suggestion about patch 3?

I need more time to analyze that patch.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] block: don't show disk stats if io accounting is disabled
  2022-03-18  3:10       ` Bart Van Assche
@ 2022-03-18  6:12         ` yukuai (C)
  0 siblings, 0 replies; 14+ messages in thread
From: yukuai (C) @ 2022-03-18  6:12 UTC (permalink / raw)
  To: Bart Van Assche, axboe, mpatocka, snitzer
  Cc: linux-block, linux-kernel, yi.zhang

在 2022/03/18 11:10, Bart Van Assche 写道:
> On 3/17/22 18:36, yukuai (C) wrote:
>> 在 2022/03/17 22:06, Bart Van Assche 写道:
>>> On 3/17/22 04:26, Yu Kuai wrote:
>>>> If io accounting is disabled, there is no point to handle such device
>>>> in diskstats_show(), and it can be confused for users because all 
>>>> fields
>>>> in iostat are zero while the disk is handling io.
>>>>
>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>>> ---
>>>>   block/genhd.c | 5 +++++
>>>>   1 file changed, 5 insertions(+)
>>>>
>>>> diff --git a/block/genhd.c b/block/genhd.c
>>>> index c3b32c665aec..e5307f512185 100644
>>>> --- a/block/genhd.c
>>>> +++ b/block/genhd.c
>>>> @@ -937,6 +937,9 @@ ssize_t part_stat_show(struct device *dev,
>>>>       struct disk_stats stat;
>>>>       unsigned int inflight;
>>>> +    if (!blk_queue_io_stat(q))
>>>> +        return sprintf(buf, "io accounting is disabled\n");
>>>> +
>>>
>>> Hmm ... the above looks sub-optimal to me. Has it been considered to 
>>> return an error code instead or even better to hide the stat 
>>> attribute if I/O accounting is disabled? The latter can be achieved 
>>> by modifying disk_visible().
>  >
>> It's right this way is much better, i'll hide the 'stat' in next
>> iteration.
> 
> Please note that modifying disk_visible() only is not sufficient. 
> sysfs_update_group() needs to be called to trigger a call to 
> disk_visible() if a variable has changed that affects the return value 
> of disk_visible().

Hi,

Thanks for your advice, I wonder can the queue flag
'QUEUE_FLAG_IO_STAT' change after the disk is created? I don't
see it can change anywhere. In this case, perhaps we don't need
to consider calling sysfs_update_group().

> 
>> BTW, do you have any suggestion about patch 3?
> 
> I need more time to analyze that patch.

Thanks for sending time on the patch.

Kuai

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] optimizations for io accounting
  2022-03-17 11:26 [PATCH 0/3] optimizations for io accounting Yu Kuai
                   ` (2 preceding siblings ...)
  2022-03-17 11:26 ` [PATCH -next 3/3] block: update nsecs[] in part_get_stat() Yu Kuai
@ 2022-03-25  7:29 ` yukuai (C)
  3 siblings, 0 replies; 14+ messages in thread
From: yukuai (C) @ 2022-03-25  7:29 UTC (permalink / raw)
  To: axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yi.zhang

friently ping ...

在 2022/03/17 19:26, Yu Kuai 写道:
> Yu Kuai (3):
>    block: don't show disk stats if io accounting is disabled
>    block: factor out common code for part_stat_show() and
>      diskstats_show()
>    block: update nsecs[] in part_stat_show() and diskstats_show()
> 
>   block/bdev.c              |   2 +
>   block/blk-mq.c            |  63 +++++++++++++++-
>   block/blk-mq.h            |   2 +
>   block/genhd.c             | 154 ++++++++++++++++++--------------------
>   include/linux/blk-mq.h    |   2 +
>   include/linux/blk_types.h |   5 ++
>   6 files changed, 146 insertions(+), 82 deletions(-)
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] block: factor out common code for part_stat_show() and diskstats_show()
  2022-03-17 11:26 ` [PATCH 2/3] block: factor out common code for part_stat_show() and diskstats_show() Yu Kuai
@ 2022-03-25  8:46   ` Christoph Hellwig
  2022-03-26  1:11     ` yukuai (C)
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2022-03-25  8:46 UTC (permalink / raw)
  To: Yu Kuai; +Cc: axboe, mpatocka, snitzer, linux-block, linux-kernel, yi.zhang

On Thu, Mar 17, 2022 at 07:26:52PM +0800, Yu Kuai wrote:
> part_stat_show() and diskstats_show() are very similar, just factor out
> common code.

Well, it doesn't really "factor" much, but creates a big and pretty
unmaintainble macro.  I don't really see the benefit here.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] block: don't show disk stats if io accounting is disabled
  2022-03-17 11:26 ` [PATCH 1/3] block: don't show disk stats if io accounting is disabled Yu Kuai
  2022-03-17 14:06   ` Bart Van Assche
@ 2022-03-25  8:46   ` Christoph Hellwig
  1 sibling, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2022-03-25  8:46 UTC (permalink / raw)
  To: Yu Kuai; +Cc: axboe, mpatocka, snitzer, linux-block, linux-kernel, yi.zhang

On Thu, Mar 17, 2022 at 07:26:51PM +0800, Yu Kuai wrote:
> If io accounting is disabled, there is no point to handle such device
> in diskstats_show(), and it can be confused for users because all fields
> in iostat are zero while the disk is handling io.

But changing the output will break existing users looking at it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] block: factor out common code for part_stat_show() and diskstats_show()
  2022-03-25  8:46   ` Christoph Hellwig
@ 2022-03-26  1:11     ` yukuai (C)
  0 siblings, 0 replies; 14+ messages in thread
From: yukuai (C) @ 2022-03-26  1:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, mpatocka, snitzer, linux-block, linux-kernel, yi.zhang

在 2022/03/25 16:46, Christoph Hellwig 写道:
> On Thu, Mar 17, 2022 at 07:26:52PM +0800, Yu Kuai wrote:
>> part_stat_show() and diskstats_show() are very similar, just factor out
>> common code.
> 
> Well, it doesn't really "factor" much, but creates a big and pretty
> unmaintainble macro.  I don't really see the benefit here.

Hi,

Thanks for your advice, I'll remove this patch.

BTW, do you have any suggestion about patch 3 ?

Thanks,
Kuai
> .
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH -next 3/3] block: update nsecs[] in part_get_stat()
  2022-03-17 11:26 ` [PATCH -next 3/3] block: update nsecs[] in part_get_stat() Yu Kuai
@ 2022-04-01  3:42   ` yukuai (C)
  2022-04-08  6:52     ` yukuai (C)
  0 siblings, 1 reply; 14+ messages in thread
From: yukuai (C) @ 2022-04-01  3:42 UTC (permalink / raw)
  To: axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yi.zhang

在 2022/03/17 19:26, Yu Kuai 写道:
> commit 86d7331299fd("block: update io_ticks when io hang") fixed that
> %util will be zero for iostat when io is hanged, however, avgqu-sz is
> still zero while it represents the number of io that are hunged. On the
> other hand, for some slow device, if an io is started before and done
> after diskstats is read, the avgqu-sz will be miscalculated.
> 
> To fix the problem, update 'nsecs[]' when part_stat_show() or
> diskstats_show() is called. In order to do that, add 'stat_time' in
> struct block_device and 'rq_stat_time' in struct request to record the
> time. And during iteration, update 'nsecs[]' for each inflight request.
Friendly ping... Does anyone have any suggestions on this apporch ?

Thanks,
Kuai
> 
> Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>   block/bdev.c              |  2 ++
>   block/blk-mq.c            | 63 ++++++++++++++++++++++++++++++++++++++-
>   block/blk-mq.h            |  2 ++
>   block/genhd.c             | 25 ++++++++++++++--
>   include/linux/blk-mq.h    |  2 ++
>   include/linux/blk_types.h |  5 ++++
>   6 files changed, 95 insertions(+), 4 deletions(-)
> 
> diff --git a/block/bdev.c b/block/bdev.c
> index 13de871fa816..5dced478f190 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -487,9 +487,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
>   	bdev = I_BDEV(inode);
>   	mutex_init(&bdev->bd_fsfreeze_mutex);
>   	spin_lock_init(&bdev->bd_size_lock);
> +	spin_lock_init(&bdev->bd_stat_lock);
>   	bdev->bd_partno = partno;
>   	bdev->bd_inode = inode;
>   	bdev->bd_queue = disk->queue;
> +	bdev->stat_time = 0;
>   	bdev->bd_stats = alloc_percpu(struct disk_stats);
>   	if (!bdev->bd_stats) {
>   		iput(inode);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c33fb378b168..7a14d4058b14 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -149,6 +149,48 @@ unsigned int blk_mq_in_flight(struct request_queue *q,
>   	return mi.inflight[0] + mi.inflight[1];
>   }
>   
> +static bool blk_mq_check_inflight_stat(struct request *rq, void *priv,
> +				       bool reserved)
> +{
> +	struct mq_inflight *mi = priv;
> +
> +	if ((!mi->part->bd_partno || rq->part == mi->part) &&
> +	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
> +		u64 stat_time;
> +
> +		mi->inflight[rq_data_dir(rq)]++;
> +		if (!rq->part)
> +			return true;
> +
> +		stat_time = READ_ONCE(rq->stat_time_ns);
> +		/*
> +		 * This might fail if 'req->stat_time_ns' happen to be updated
> +		 * in blk_account_io_done().
> +		 */
> +		if (likely(cmpxchg(&rq->stat_time_ns, stat_time,
> +			    rq->part->stat_time) == stat_time)) {
> +			int sgrp = op_stat_group(req_op(rq));
> +			u64 duation = stat_time ?
> +				rq->part->stat_time - stat_time :
> +				rq->part->stat_time - rq->start_time_ns;
> +
> +			part_stat_add(rq->part, nsecs[sgrp], duation);
> +		}
> +	}
> +
> +	return true;
> +}
> +
> +unsigned int blk_mq_in_flight_stat(struct request_queue *q,
> +		struct block_device *part)
> +{
> +	struct mq_inflight mi = { .part = part };
> +
> +	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_stat, &mi);
> +
> +	return mi.inflight[0] + mi.inflight[1];
> +}
> +
>   void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
>   		unsigned int inflight[2])
>   {
> @@ -368,6 +410,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>   		rq->start_time_ns = ktime_get_ns();
>   	else
>   		rq->start_time_ns = 0;
> +	rq->stat_time_ns = 0;
>   	rq->part = NULL;
>   #ifdef CONFIG_BLK_RQ_ALLOC_TIME
>   	rq->alloc_time_ns = alloc_time_ns;
> @@ -874,7 +917,25 @@ static void __blk_account_io_done(struct request *req, u64 now)
>   	part_stat_lock();
>   	update_io_ticks(req->part, jiffies, true);
>   	part_stat_inc(req->part, ios[sgrp]);
> -	part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
> +
> +	if (queue_is_mq(req->q)) {
> +		u64 stat_time = READ_ONCE(req->stat_time_ns);
> +
> +		/*
> +		 * This might fail if 'req->stat_time_ns' happen to be updated
> +		 * in part_get_stat().
> +		 */
> +		if (likely(cmpxchg(&req->stat_time_ns, stat_time, now) ==
> +			   stat_time)) {
> +			u64 duation = stat_time ? now - stat_time :
> +						  now - req->start_time_ns;
> +
> +			part_stat_add(req->part, nsecs[sgrp], duation);
> +		}
> +	} else {
> +		part_stat_add(req->part, nsecs[sgrp],
> +			      now - req->part->stat_time);
> +	}
>   	part_stat_unlock();
>   }
>   
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 7d1acf8e1e51..37941f5f1d08 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -189,6 +189,8 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
>   
>   unsigned int blk_mq_in_flight(struct request_queue *q,
>   		struct block_device *part);
> +unsigned int blk_mq_in_flight_stat(struct request_queue *q,
> +		struct block_device *part);
>   void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
>   		unsigned int inflight[2]);
>   
> diff --git a/block/genhd.c b/block/genhd.c
> index f2c7de2e7ca9..31603c7aff49 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -953,6 +953,19 @@ ssize_t part_size_show(struct device *dev,
>   	return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
>   }
>   
> +static void part_set_stat_time(struct block_device *part)
> +{
> +	u64 now = ktime_get_ns();
> +
> +again:
> +	part->stat_time = now;
> +
> +	if (part->bd_partno) {
> +		part = bdev_whole(part);
> +		goto again;
> +	}
> +}
> +
>   static unsigned int part_get_stat(struct block_device *bdev,
>   				  struct disk_stats *stat)
>   
> @@ -960,10 +973,16 @@ static unsigned int part_get_stat(struct block_device *bdev,
>   	struct request_queue *q = bdev_get_queue(bdev);
>   	unsigned int inflight;
>   
> -	if (queue_is_mq(q))
> -		inflight = blk_mq_in_flight(q, bdev);
> -	else
> +	if (queue_is_mq(q)) {
> +		spin_lock(&bdev->bd_stat_lock);
> +		part_stat_lock();
> +		part_set_stat_time(bdev);
> +		inflight = blk_mq_in_flight_stat(q, bdev);
> +		part_stat_unlock();
> +		spin_unlock(&bdev->bd_stat_lock);
> +	} else {
>   		inflight = part_in_flight(bdev);
> +	}
>   
>   	if (inflight) {
>   		part_stat_lock();
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index e512da636e0f..e98f07d1741c 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -108,6 +108,8 @@ struct request {
>   	u64 start_time_ns;
>   	/* Time that I/O was submitted to the device. */
>   	u64 io_start_time_ns;
> +	/* Time that I/O was accounted in part_get_stat() */
> +	u64 stat_time_ns;
>   
>   #ifdef CONFIG_BLK_WBT
>   	unsigned short wbt_flags;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index c6ed4a559e6a..5cd399b5670d 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -65,6 +65,11 @@ struct block_device {
>   	struct super_block	*bd_fsfreeze_sb;
>   
>   	struct partition_meta_info *bd_meta_info;
> +
> +	/* Prevent part_get_stat() to be called concurrently */
> +	spinlock_t		bd_stat_lock;
> +	/* Will be set when part_stat_show() or diskstats_show() is called */
> +	u64			stat_time;
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   	bool			bd_make_it_fail;
>   #endif
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH -next 3/3] block: update nsecs[] in part_get_stat()
  2022-04-01  3:42   ` yukuai (C)
@ 2022-04-08  6:52     ` yukuai (C)
  0 siblings, 0 replies; 14+ messages in thread
From: yukuai (C) @ 2022-04-08  6:52 UTC (permalink / raw)
  To: axboe, mpatocka, snitzer; +Cc: linux-block, linux-kernel, yi.zhang

在 2022/04/01 11:42, yukuai (C) 写道:
> 在 2022/03/17 19:26, Yu Kuai 写道:
>> commit 86d7331299fd("block: update io_ticks when io hang") fixed that
>> %util will be zero for iostat when io is hanged, however, avgqu-sz is
>> still zero while it represents the number of io that are hunged. On the
>> other hand, for some slow device, if an io is started before and done
>> after diskstats is read, the avgqu-sz will be miscalculated.
>>
>> To fix the problem, update 'nsecs[]' when part_stat_show() or
>> diskstats_show() is called. In order to do that, add 'stat_time' in
>> struct block_device and 'rq_stat_time' in struct request to record the
>> time. And during iteration, update 'nsecs[]' for each inflight request.
> Friendly ping... Does anyone have any suggestions on this apporch ?
friendlig ping ...
> 
> Thanks,
> Kuai
>>
>> Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to 
>> less precise counting")
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   block/bdev.c              |  2 ++
>>   block/blk-mq.c            | 63 ++++++++++++++++++++++++++++++++++++++-
>>   block/blk-mq.h            |  2 ++
>>   block/genhd.c             | 25 ++++++++++++++--
>>   include/linux/blk-mq.h    |  2 ++
>>   include/linux/blk_types.h |  5 ++++
>>   6 files changed, 95 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/bdev.c b/block/bdev.c
>> index 13de871fa816..5dced478f190 100644
>> --- a/block/bdev.c
>> +++ b/block/bdev.c
>> @@ -487,9 +487,11 @@ struct block_device *bdev_alloc(struct gendisk 
>> *disk, u8 partno)
>>       bdev = I_BDEV(inode);
>>       mutex_init(&bdev->bd_fsfreeze_mutex);
>>       spin_lock_init(&bdev->bd_size_lock);
>> +    spin_lock_init(&bdev->bd_stat_lock);
>>       bdev->bd_partno = partno;
>>       bdev->bd_inode = inode;
>>       bdev->bd_queue = disk->queue;
>> +    bdev->stat_time = 0;
>>       bdev->bd_stats = alloc_percpu(struct disk_stats);
>>       if (!bdev->bd_stats) {
>>           iput(inode);
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index c33fb378b168..7a14d4058b14 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -149,6 +149,48 @@ unsigned int blk_mq_in_flight(struct 
>> request_queue *q,
>>       return mi.inflight[0] + mi.inflight[1];
>>   }
>> +static bool blk_mq_check_inflight_stat(struct request *rq, void *priv,
>> +                       bool reserved)
>> +{
>> +    struct mq_inflight *mi = priv;
>> +
>> +    if ((!mi->part->bd_partno || rq->part == mi->part) &&
>> +        blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
>> +        u64 stat_time;
>> +
>> +        mi->inflight[rq_data_dir(rq)]++;
>> +        if (!rq->part)
>> +            return true;
>> +
>> +        stat_time = READ_ONCE(rq->stat_time_ns);
>> +        /*
>> +         * This might fail if 'req->stat_time_ns' happen to be updated
>> +         * in blk_account_io_done().
>> +         */
>> +        if (likely(cmpxchg(&rq->stat_time_ns, stat_time,
>> +                rq->part->stat_time) == stat_time)) {
>> +            int sgrp = op_stat_group(req_op(rq));
>> +            u64 duation = stat_time ?
>> +                rq->part->stat_time - stat_time :
>> +                rq->part->stat_time - rq->start_time_ns;
>> +
>> +            part_stat_add(rq->part, nsecs[sgrp], duation);
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +unsigned int blk_mq_in_flight_stat(struct request_queue *q,
>> +        struct block_device *part)
>> +{
>> +    struct mq_inflight mi = { .part = part };
>> +
>> +    blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_stat, &mi);
>> +
>> +    return mi.inflight[0] + mi.inflight[1];
>> +}
>> +
>>   void blk_mq_in_flight_rw(struct request_queue *q, struct 
>> block_device *part,
>>           unsigned int inflight[2])
>>   {
>> @@ -368,6 +410,7 @@ static struct request *blk_mq_rq_ctx_init(struct 
>> blk_mq_alloc_data *data,
>>           rq->start_time_ns = ktime_get_ns();
>>       else
>>           rq->start_time_ns = 0;
>> +    rq->stat_time_ns = 0;
>>       rq->part = NULL;
>>   #ifdef CONFIG_BLK_RQ_ALLOC_TIME
>>       rq->alloc_time_ns = alloc_time_ns;
>> @@ -874,7 +917,25 @@ static void __blk_account_io_done(struct request 
>> *req, u64 now)
>>       part_stat_lock();
>>       update_io_ticks(req->part, jiffies, true);
>>       part_stat_inc(req->part, ios[sgrp]);
>> -    part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
>> +
>> +    if (queue_is_mq(req->q)) {
>> +        u64 stat_time = READ_ONCE(req->stat_time_ns);
>> +
>> +        /*
>> +         * This might fail if 'req->stat_time_ns' happen to be updated
>> +         * in part_get_stat().
>> +         */
>> +        if (likely(cmpxchg(&req->stat_time_ns, stat_time, now) ==
>> +               stat_time)) {
>> +            u64 duation = stat_time ? now - stat_time :
>> +                          now - req->start_time_ns;
>> +
>> +            part_stat_add(req->part, nsecs[sgrp], duation);
>> +        }
>> +    } else {
>> +        part_stat_add(req->part, nsecs[sgrp],
>> +                  now - req->part->stat_time);
>> +    }
>>       part_stat_unlock();
>>   }
>> diff --git a/block/blk-mq.h b/block/blk-mq.h
>> index 7d1acf8e1e51..37941f5f1d08 100644
>> --- a/block/blk-mq.h
>> +++ b/block/blk-mq.h
>> @@ -189,6 +189,8 @@ static inline bool blk_mq_hw_queue_mapped(struct 
>> blk_mq_hw_ctx *hctx)
>>   unsigned int blk_mq_in_flight(struct request_queue *q,
>>           struct block_device *part);
>> +unsigned int blk_mq_in_flight_stat(struct request_queue *q,
>> +        struct block_device *part);
>>   void blk_mq_in_flight_rw(struct request_queue *q, struct 
>> block_device *part,
>>           unsigned int inflight[2]);
>> diff --git a/block/genhd.c b/block/genhd.c
>> index f2c7de2e7ca9..31603c7aff49 100644
>> --- a/block/genhd.c
>> +++ b/block/genhd.c
>> @@ -953,6 +953,19 @@ ssize_t part_size_show(struct device *dev,
>>       return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
>>   }
>> +static void part_set_stat_time(struct block_device *part)
>> +{
>> +    u64 now = ktime_get_ns();
>> +
>> +again:
>> +    part->stat_time = now;
>> +
>> +    if (part->bd_partno) {
>> +        part = bdev_whole(part);
>> +        goto again;
>> +    }
>> +}
>> +
>>   static unsigned int part_get_stat(struct block_device *bdev,
>>                     struct disk_stats *stat)
>> @@ -960,10 +973,16 @@ static unsigned int part_get_stat(struct 
>> block_device *bdev,
>>       struct request_queue *q = bdev_get_queue(bdev);
>>       unsigned int inflight;
>> -    if (queue_is_mq(q))
>> -        inflight = blk_mq_in_flight(q, bdev);
>> -    else
>> +    if (queue_is_mq(q)) {
>> +        spin_lock(&bdev->bd_stat_lock);
>> +        part_stat_lock();
>> +        part_set_stat_time(bdev);
>> +        inflight = blk_mq_in_flight_stat(q, bdev);
>> +        part_stat_unlock();
>> +        spin_unlock(&bdev->bd_stat_lock);
>> +    } else {
>>           inflight = part_in_flight(bdev);
>> +    }
>>       if (inflight) {
>>           part_stat_lock();
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index e512da636e0f..e98f07d1741c 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -108,6 +108,8 @@ struct request {
>>       u64 start_time_ns;
>>       /* Time that I/O was submitted to the device. */
>>       u64 io_start_time_ns;
>> +    /* Time that I/O was accounted in part_get_stat() */
>> +    u64 stat_time_ns;
>>   #ifdef CONFIG_BLK_WBT
>>       unsigned short wbt_flags;
>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>> index c6ed4a559e6a..5cd399b5670d 100644
>> --- a/include/linux/blk_types.h
>> +++ b/include/linux/blk_types.h
>> @@ -65,6 +65,11 @@ struct block_device {
>>       struct super_block    *bd_fsfreeze_sb;
>>       struct partition_meta_info *bd_meta_info;
>> +
>> +    /* Prevent part_get_stat() to be called concurrently */
>> +    spinlock_t        bd_stat_lock;
>> +    /* Will be set when part_stat_show() or diskstats_show() is 
>> called */
>> +    u64            stat_time;
>>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>>       bool            bd_make_it_fail;
>>   #endif
>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-04-08  6:52 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-17 11:26 [PATCH 0/3] optimizations for io accounting Yu Kuai
2022-03-17 11:26 ` [PATCH 1/3] block: don't show disk stats if io accounting is disabled Yu Kuai
2022-03-17 14:06   ` Bart Van Assche
2022-03-18  1:36     ` yukuai (C)
2022-03-18  3:10       ` Bart Van Assche
2022-03-18  6:12         ` yukuai (C)
2022-03-25  8:46   ` Christoph Hellwig
2022-03-17 11:26 ` [PATCH 2/3] block: factor out common code for part_stat_show() and diskstats_show() Yu Kuai
2022-03-25  8:46   ` Christoph Hellwig
2022-03-26  1:11     ` yukuai (C)
2022-03-17 11:26 ` [PATCH -next 3/3] block: update nsecs[] in part_get_stat() Yu Kuai
2022-04-01  3:42   ` yukuai (C)
2022-04-08  6:52     ` yukuai (C)
2022-03-25  7:29 ` [PATCH 0/3] optimizations for io accounting yukuai (C)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).