linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] block/diskstats: accumulate all per-cpu counters in one pass
@ 2019-04-01 15:59 Konstantin Khlebnikov
  2019-04-01 15:59 ` [PATCH 2/3] block/diskstats: remove redundant and inaccurate time_in_queue counter Konstantin Khlebnikov
  2019-04-01 15:59 ` [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks Konstantin Khlebnikov
  0 siblings, 2 replies; 6+ messages in thread
From: Konstantin Khlebnikov @ 2019-04-01 15:59 UTC (permalink / raw)
  To: linux-block, Jens Axboe, linux-kernel; +Cc: Mikulas Patocka, Mike Snitzer

This is faster and makes code much smaller:

./scripts/bloat-o-meter vmlinux.before vmlinux.after
add/remove: 1/0 grow/shrink: 0/2 up/down: 176/-1348 (-1172)
Function                                     old     new   delta
part_stat_read_all                             -     176    +176
diskstats_show                              1190     569    -621
part_stat_show                              1064     337    -727
Total: Before=13762189, After=13761017, chg -0.01%

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 block/genhd.c             |   62 +++++++++++++++++++++++++++++++++++----------
 block/partition-generic.c |   31 ++++++++++++-----------
 include/linux/genhd.h     |    5 +---
 3 files changed, 67 insertions(+), 31 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 703267865f14..2f986af81ba1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -45,6 +45,34 @@ static void disk_add_events(struct gendisk *disk);
 static void disk_del_events(struct gendisk *disk);
 static void disk_release_events(struct gendisk *disk);
 
+#ifdef CONFIG_SMP
+void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
+{
+	int cpu;
+
+	memset(stat, 0, sizeof(struct disk_stats));
+	for_each_possible_cpu(cpu) {
+		struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu);
+		int group;
+
+		for (group = 0; group < NR_STAT_GROUPS; group++) {
+			stat->nsecs[group] += ptr->nsecs[group];
+			stat->sectors[group] += ptr->sectors[group];
+			stat->ios[group] += ptr->ios[group];
+			stat->merges[group] += ptr->merges[group];
+		}
+
+		stat->io_ticks += ptr->io_ticks;
+		stat->time_in_queue += ptr->time_in_queue;
+	}
+}
+#else /* CONFIG_SMP */
+void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
+{
+	memcpy(stat, &part->dkstats, sizeof(struct disk_stats));
+}
+#endif /* CONFIG_SMP */
+
 void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 {
 	if (queue_is_mq(q))
@@ -1341,6 +1369,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	struct hd_struct *hd;
 	char buf[BDEVNAME_SIZE];
 	unsigned int inflight;
+	struct disk_stats stat;
 
 	/*
 	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1352,7 +1381,9 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 
 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
+		part_stat_read_all(hd, &stat);
 		inflight = part_in_flight(gp->queue, hd);
+
 		seq_printf(seqf, "%4d %7d %s "
 			   "%lu %lu %lu %u "
 			   "%lu %lu %lu %u "
@@ -1360,21 +1391,24 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   "%lu %lu %lu %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
-			   part_stat_read(hd, ios[STAT_READ]),
-			   part_stat_read(hd, merges[STAT_READ]),
-			   part_stat_read(hd, sectors[STAT_READ]),
-			   (unsigned int)part_stat_read_msecs(hd, STAT_READ),
-			   part_stat_read(hd, ios[STAT_WRITE]),
-			   part_stat_read(hd, merges[STAT_WRITE]),
-			   part_stat_read(hd, sectors[STAT_WRITE]),
-			   (unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
+			   stat.ios[STAT_READ],
+			   stat.merges[STAT_READ],
+			   stat.sectors[STAT_READ],
+			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
+							NSEC_PER_MSEC),
+			   stat.ios[STAT_WRITE],
+			   stat.merges[STAT_WRITE],
+			   stat.sectors[STAT_WRITE],
+			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
+							NSEC_PER_MSEC),
 			   inflight,
-			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
-			   jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
-			   part_stat_read(hd, ios[STAT_DISCARD]),
-			   part_stat_read(hd, merges[STAT_DISCARD]),
-			   part_stat_read(hd, sectors[STAT_DISCARD]),
-			   (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD)
+			   jiffies_to_msecs(stat.io_ticks),
+			   jiffies_to_msecs(stat.time_in_queue),
+			   stat.ios[STAT_DISCARD],
+			   stat.merges[STAT_DISCARD],
+			   stat.sectors[STAT_DISCARD],
+			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
+							NSEC_PER_MSEC)
 			);
 	}
 	disk_part_iter_exit(&piter);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 8e596a8dff32..621e0d9f3c92 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -120,30 +120,33 @@ ssize_t part_stat_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);
 	struct request_queue *q = part_to_disk(p)->queue;
+	struct disk_stats stat;
 	unsigned int inflight;
 
+	part_stat_read_all(p, &stat);
 	inflight = part_in_flight(q, p);
+
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
 		"%8u %8u %8u "
 		"%8lu %8lu %8llu %8u"
 		"\n",
-		part_stat_read(p, ios[STAT_READ]),
-		part_stat_read(p, merges[STAT_READ]),
-		(unsigned long long)part_stat_read(p, sectors[STAT_READ]),
-		(unsigned int)part_stat_read_msecs(p, STAT_READ),
-		part_stat_read(p, ios[STAT_WRITE]),
-		part_stat_read(p, merges[STAT_WRITE]),
-		(unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
-		(unsigned int)part_stat_read_msecs(p, STAT_WRITE),
+		stat.ios[STAT_READ],
+		stat.merges[STAT_READ],
+		(unsigned long long)stat.sectors[STAT_READ],
+		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
+		stat.ios[STAT_WRITE],
+		stat.merges[STAT_WRITE],
+		(unsigned long long)stat.sectors[STAT_WRITE],
+		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
 		inflight,
-		jiffies_to_msecs(part_stat_read(p, io_ticks)),
-		jiffies_to_msecs(part_stat_read(p, time_in_queue)),
-		part_stat_read(p, ios[STAT_DISCARD]),
-		part_stat_read(p, merges[STAT_DISCARD]),
-		(unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
-		(unsigned int)part_stat_read_msecs(p, STAT_DISCARD));
+		jiffies_to_msecs(stat.io_ticks),
+		jiffies_to_msecs(stat.time_in_queue),
+		stat.ios[STAT_DISCARD],
+		stat.merges[STAT_DISCARD],
+		(unsigned long long)stat.sectors[STAT_DISCARD],
+		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC));
 }
 
 ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 06c0fd594097..7f981be190b4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -357,9 +357,6 @@ static inline void free_part_stats(struct hd_struct *part)
 
 #endif /* CONFIG_SMP */
 
-#define part_stat_read_msecs(part, which)				\
-	div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC)
-
 #define part_stat_read_accum(part, field)				\
 	(part_stat_read(part, field[STAT_READ]) +			\
 	 part_stat_read(part, field[STAT_WRITE]) +			\
@@ -391,6 +388,8 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_local_read_cpu(gendiskp, field, cpu)			\
 	local_read(&(part_stat_get_cpu(gendiskp, field, cpu)))
 
+void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat);
+
 unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part);
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 		       unsigned int inflight[2]);


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/3] block/diskstats: remove redundant and inaccurate time_in_queue counter
  2019-04-01 15:59 [PATCH 1/3] block/diskstats: accumulate all per-cpu counters in one pass Konstantin Khlebnikov
@ 2019-04-01 15:59 ` Konstantin Khlebnikov
  2019-04-01 15:59 ` [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks Konstantin Khlebnikov
  1 sibling, 0 replies; 6+ messages in thread
From: Konstantin Khlebnikov @ 2019-04-01 15:59 UTC (permalink / raw)
  To: linux-block, Jens Axboe, linux-kernel; +Cc: Mikulas Patocka, Mike Snitzer

Diskstat's column "time_in_queue" must be equal to sum "read ticks",
"write ticks" and "discard ticks". But it is accounted separately and
in jiffies rather than nanoseconds as other times. As a result total
time is not even close to the sum, especially for fast ssd disks.

In Documentation/block/stat.txt these fields are defined as product of
time when disk had such requests and count of them. But that is equal to
sum of running time of completed requests plus running time of in-flight
requests. Latter part is small or even zero if disk is idle.

This patch removes field time_in_queue and replaces with simple sum.

Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 block/bio.c               |    1 -
 block/blk-core.c          |    1 -
 block/genhd.c             |    6 ++++--
 block/partition-generic.c |    5 ++++-
 include/linux/genhd.h     |    1 -
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index b64cedc7f87c..c0a60f3e9b7b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1772,7 +1772,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
 
 	update_io_ticks(part, now);
 	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
-	part_stat_add(part, time_in_queue, duration);
 	part_dec_in_flight(q, part, op_is_write(req_op));
 
 	part_stat_unlock();
diff --git a/block/blk-core.c b/block/blk-core.c
index 4673ebe42255..d89168b167e9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1337,7 +1337,6 @@ void blk_account_io_done(struct request *req, u64 now)
 		update_io_ticks(part, jiffies);
 		part_stat_inc(part, ios[sgrp]);
 		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
-		part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
 		part_dec_in_flight(req->q, part, rq_data_dir(req));
 
 		hd_struct_put(part);
diff --git a/block/genhd.c b/block/genhd.c
index 2f986af81ba1..7e53d6af08fb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -63,7 +63,6 @@ void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
 		}
 
 		stat->io_ticks += ptr->io_ticks;
-		stat->time_in_queue += ptr->time_in_queue;
 	}
 }
 #else /* CONFIG_SMP */
@@ -1403,7 +1402,10 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 							NSEC_PER_MSEC),
 			   inflight,
 			   jiffies_to_msecs(stat.io_ticks),
-			   jiffies_to_msecs(stat.time_in_queue),
+			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
+						 stat.nsecs[STAT_WRITE] +
+						 stat.nsecs[STAT_DISCARD],
+							NSEC_PER_MSEC),
 			   stat.ios[STAT_DISCARD],
 			   stat.merges[STAT_DISCARD],
 			   stat.sectors[STAT_DISCARD],
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 621e0d9f3c92..935df55c0c91 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -142,7 +142,10 @@ ssize_t part_stat_show(struct device *dev,
 		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
 		inflight,
 		jiffies_to_msecs(stat.io_ticks),
-		jiffies_to_msecs(stat.time_in_queue),
+		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
+				      stat.nsecs[STAT_WRITE] +
+				      stat.nsecs[STAT_DISCARD],
+						NSEC_PER_MSEC),
 		stat.ios[STAT_DISCARD],
 		stat.merges[STAT_DISCARD],
 		(unsigned long long)stat.sectors[STAT_DISCARD],
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7f981be190b4..2f5a9ed7e86e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -89,7 +89,6 @@ struct disk_stats {
 	unsigned long ios[NR_STAT_GROUPS];
 	unsigned long merges[NR_STAT_GROUPS];
 	unsigned long io_ticks;
-	unsigned long time_in_queue;
 	local_t in_flight[2];
 };
 


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks
  2019-04-01 15:59 [PATCH 1/3] block/diskstats: accumulate all per-cpu counters in one pass Konstantin Khlebnikov
  2019-04-01 15:59 ` [PATCH 2/3] block/diskstats: remove redundant and inaccurate time_in_queue counter Konstantin Khlebnikov
@ 2019-04-01 15:59 ` Konstantin Khlebnikov
  2019-05-07  9:13   ` Alan Jenkins
  1 sibling, 1 reply; 6+ messages in thread
From: Konstantin Khlebnikov @ 2019-04-01 15:59 UTC (permalink / raw)
  To: linux-block, Jens Axboe, linux-kernel; +Cc: Mikulas Patocka, Mike Snitzer

Currently io_ticks is approximated by adding one at each
start and end of requests if jiffies has changed.
This works perfectly for requests shorter than a jiffy.

Fix for slow requests is simple: at the end of request add
count of jiffies passed since last update.

Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 block/bio.c           |    8 ++++----
 block/blk-core.c      |    4 ++--
 include/linux/genhd.h |    2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index c0a60f3e9b7b..245056797999 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1729,14 +1729,14 @@ void bio_check_pages_dirty(struct bio *bio)
 	schedule_work(&bio_dirty_work);
 }
 
-void update_io_ticks(struct hd_struct *part, unsigned long now)
+void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
 {
 	unsigned long stamp;
 again:
 	stamp = READ_ONCE(part->stamp);
 	if (unlikely(stamp != now)) {
 		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
-			__part_stat_add(part, io_ticks, 1);
+			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
 		}
 	}
 	if (part->partno) {
@@ -1752,7 +1752,7 @@ void generic_start_io_acct(struct request_queue *q, int op,
 
 	part_stat_lock();
 
-	update_io_ticks(part, jiffies);
+	update_io_ticks(part, jiffies, false);
 	part_stat_inc(part, ios[sgrp]);
 	part_stat_add(part, sectors[sgrp], sectors);
 	part_inc_in_flight(q, part, op_is_write(op));
@@ -1770,7 +1770,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
 
 	part_stat_lock();
 
-	update_io_ticks(part, now);
+	update_io_ticks(part, now, true);
 	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
 	part_dec_in_flight(q, part, op_is_write(req_op));
 
diff --git a/block/blk-core.c b/block/blk-core.c
index d89168b167e9..6e8f0b9e7731 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1334,7 +1334,7 @@ void blk_account_io_done(struct request *req, u64 now)
 		part_stat_lock();
 		part = req->part;
 
-		update_io_ticks(part, jiffies);
+		update_io_ticks(part, jiffies, true);
 		part_stat_inc(part, ios[sgrp]);
 		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
 		part_dec_in_flight(req->q, part, rq_data_dir(req));
@@ -1375,7 +1375,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
 		rq->part = part;
 	}
 
-	update_io_ticks(part, jiffies);
+	update_io_ticks(part, jiffies, false);
 
 	part_stat_unlock();
 }
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2f5a9ed7e86e..8ece8e02c609 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -410,7 +410,7 @@ static inline void free_part_info(struct hd_struct *part)
 	kfree(part->info);
 }
 
-void update_io_ticks(struct hd_struct *part, unsigned long now);
+void update_io_ticks(struct hd_struct *part, unsigned long now, bool end);
 
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks
  2019-04-01 15:59 ` [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks Konstantin Khlebnikov
@ 2019-05-07  9:13   ` Alan Jenkins
  2019-05-07  9:13     ` Alan Jenkins
  2019-06-09 10:50     ` Konstantin Khlebnikov
  0 siblings, 2 replies; 6+ messages in thread
From: Alan Jenkins @ 2019-05-07  9:13 UTC (permalink / raw)
  To: Konstantin Khlebnikov, linux-block, Jens Axboe, linux-kernel
  Cc: Mikulas Patocka, Mike Snitzer

On 01/04/2019 16:59, Konstantin Khlebnikov wrote:
> Currently io_ticks is approximated by adding one at each
> start and end of requests if jiffies has changed.
> This works perfectly for requests shorter than a jiffy.
> 
> Fix for slow requests is simple: at the end of request add
> count of jiffies passed since last update.
> 
> Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

Thanks for working on this!  I noticed the problem behaviour using the 
Fedora 29 kernel [1].  I wasn't sure how it could be fixed.  Now I found 
this patch series, but I still have some questions :-).

[1] 
https://unix.stackexchange.com/questions/517132/dd-is-running-at-full-speed-but-i-only-see-20-disk-utilization-why

With these patches, `atopsar -d 2` shows about 100% "busy" when running 
a simple `dd` command, instead of 20 or 35%.  So that looked promising.

I saw some samples showing 112, 113, and 114% utilization. 
Unfortunately I'm not sure exactly how to reproduce that.  I think it 
happened during filesystem buffered writes (i.e. `dd` without 
`oflag=direct`), with the IO scheduler set to "none", on my SATA HDD.

Getting some "101% busy" samples seemed fairly easy to trigger, but I am 
not sure whether that is just a rounding error in `atopsar` :-(.


Q1)

 > Fix for slow requests is simple: at the end of request add
 > count of jiffies passed since last update.

Even considering the simple case of a single CPU, the approximation may 
be "less accurate" when requests complete out of order.  Is that right?

  t        1      10     20   30
  io1      start              end
  io2             start  end
  io_ticks 1      2      11   21

            ^^^^^^
               \
                 9 ticks not accounted as "busy"


At least, I found something fun happens if I run `dd if=/dev/urandom 
of=~/t bs=1M oflag=direct` at the same time as `dd if=/dev/sda 
of=/dev/null bs=1M oflag=direct` .  With scheduler=none, it reduces 
"busy" from 100%, down to 97%.  With scheduler=mq-deadline, it reduces 
"busy" from 100% to 60% :-).  Even though the system "iowait" is about 
100% (equivalent to one CPU).

(Environment: My /dev/sda max read speed is about 150MB/s.  My 
/dev/urandom read speed is about 140 MB/s.  I have 4 logical CPUs).

It feels like it should be possible to improve io_ticks, by basically 
changing the condition in your fix, from (end==true), to (inflight>0). 
Would that make sense?


Q2) But what most confuses me, is that I think `io_ticks` is defined as 
a per-cpu field.  And the per-cpu fields are summed into one value, 
which is reported to userspace.

I have tried playing with a couple `dd iflag=direct` readers, using 
`taskset` to pin them to different CPUs, but I only got 98-102% "busy". 
It did not rise to 200% :-).

i) Is it possible to see 200% "busy", due to per-cpu ioticks?

ii) If so, then can per-cpu ioticks also cause us to see 100% "busy", 
when IO's were only inflight for 50% of the time (but on two different 
CPUs)?

     If it is possible to trigger both of these cases, this metric seems 
very difficult to trust and use in any reliable way.  It seems 
preferable to disable it altogether and force people to use more 
trustworthy metrics.  E.g. system-wide "iowait", and iostat field 11 
"weighted # of milliseconds spent doing I/Os" which `iostat` uses to 
show "average queue length".

     Or the "special pleading" approach?  Should io_ticks be accounted at 
the level of the hardware queue, and be disabled if the device is 
using more than one hardware queue?


Q3) In case I am mistaken in some way, and Q2 is not an issue at all:

I still think reporting over 100% utilization is something new.  At 
least the comments I see were removed in the "Fixes" commit seem to 
agree.  That one error of 14% ("114% busy") that I saw, seems fairly big 
:-).

I wonder if we can better explain how much of a rough approximation this 
metric is now, e.g. in Documentation/iostats.txt ?

So far I don't know what the real description and limitations would be...

I think it would be understandable e.g. if we were able to say "busy%" 
should show around 100% when the device is used around 100% of the time, 
and definitely 0% when it is idle, but is probably not as accurate as 
"iowait". ("iowait" reported by the CPU scheduler.  Not to say these are 
the same or equivalent.  And I understand "iowait" is another ball of 
approximations and confusion.)


Regards
Alan


> ---
>   block/bio.c           |    8 ++++----
>   block/blk-core.c      |    4 ++--
>   include/linux/genhd.h |    2 +-
>   3 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index c0a60f3e9b7b..245056797999 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1729,14 +1729,14 @@ void bio_check_pages_dirty(struct bio *bio)
>   	schedule_work(&bio_dirty_work);
>   }
>   
> -void update_io_ticks(struct hd_struct *part, unsigned long now)
> +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>   {
>   	unsigned long stamp;
>   again:
>   	stamp = READ_ONCE(part->stamp);
>   	if (unlikely(stamp != now)) {
>   		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
> -			__part_stat_add(part, io_ticks, 1);
> +			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
>   		}
>   	}
>   	if (part->partno) {
> @@ -1752,7 +1752,7 @@ void generic_start_io_acct(struct request_queue *q, int op,
>   
>   	part_stat_lock();
>   
> -	update_io_ticks(part, jiffies);
> +	update_io_ticks(part, jiffies, false);
>   	part_stat_inc(part, ios[sgrp]);
>   	part_stat_add(part, sectors[sgrp], sectors);
>   	part_inc_in_flight(q, part, op_is_write(op));
> @@ -1770,7 +1770,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
>   
>   	part_stat_lock();
>   
> -	update_io_ticks(part, now);
> +	update_io_ticks(part, now, true);
>   	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
>   	part_dec_in_flight(q, part, op_is_write(req_op));
>   
> diff --git a/block/blk-core.c b/block/blk-core.c
> index d89168b167e9..6e8f0b9e7731 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1334,7 +1334,7 @@ void blk_account_io_done(struct request *req, u64 now)
>   		part_stat_lock();
>   		part = req->part;
>   
> -		update_io_ticks(part, jiffies);
> +		update_io_ticks(part, jiffies, true);
>   		part_stat_inc(part, ios[sgrp]);
>   		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
>   		part_dec_in_flight(req->q, part, rq_data_dir(req));
> @@ -1375,7 +1375,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
>   		rq->part = part;
>   	}
>   
> -	update_io_ticks(part, jiffies);
> +	update_io_ticks(part, jiffies, false);
>   
>   	part_stat_unlock();
>   }
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 2f5a9ed7e86e..8ece8e02c609 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -410,7 +410,7 @@ static inline void free_part_info(struct hd_struct *part)
>   	kfree(part->info);
>   }
>   
> -void update_io_ticks(struct hd_struct *part, unsigned long now);
> +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end);
>   
>   /* block/genhd.c */
>   extern void device_add_disk(struct device *parent, struct gendisk *disk,
> 
> 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks
  2019-05-07  9:13   ` Alan Jenkins
@ 2019-05-07  9:13     ` Alan Jenkins
  2019-06-09 10:50     ` Konstantin Khlebnikov
  1 sibling, 0 replies; 6+ messages in thread
From: Alan Jenkins @ 2019-05-07  9:13 UTC (permalink / raw)
  To: linux-block; +Cc: linux-kernel, linux-block

On 01/04/2019 16:59, Konstantin Khlebnikov wrote:
> Currently io_ticks is approximated by adding one at each
> start and end of requests if jiffies has changed.
> This works perfectly for requests shorter than a jiffy.
> 
> Fix for slow requests is simple: at the end of request add
> count of jiffies passed since last update.
> 
> Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

Thanks for working on this!  I noticed the problem behaviour using the 
Fedora 29 kernel [1].  I wasn't sure how it could be fixed.  Now I found 
this patch series, but I still have some questions :-).

[1] 
https://unix.stackexchange.com/questions/517132/dd-is-running-at-full-speed-but-i-only-see-20-disk-utilization-why

With these patches, `atopsar -d 2` shows about 100% "busy" when running 
a simple `dd` command, instead of 20 or 35%.  So that looked promising.

I saw some samples showing 112, 113, and 114% utilization. 
Unfortunately I'm not sure exactly how to reproduce that.  I think it 
happened during filesystem buffered writes (i.e. `dd` without 
`oflag=direct`), with the IO scheduler set to "none", on my SATA HDD.

Getting some "101% busy" samples seemed fairly easy to trigger, but I am 
not sure whether that is just a rounding error in `atopsar` :-(.


Q1)

 > Fix for slow requests is simple: at the end of request add
 > count of jiffies passed since last update.

Even considering the simple case of a single CPU, the approximation may 
be "less accurate" when requests complete out of order.  Is that right?

  t        1      10     20   30
  io1      start              end
  io2             start  end
  io_ticks 1      2      11   21

            ^^^^^^
               \
                 9 ticks not accounted as "busy"


At least, I found something fun happens if I run `dd if=/dev/urandom 
of=~/t bs=1M oflag=direct` at the same time as `dd if=/dev/sda 
of=/dev/null bs=1M oflag=direct` .  With scheduler=none, it reduces 
"busy" from 100%, down to 97%.  With scheduler=mq-deadline, it reduces 
"busy" from 100% to 60% :-).  Even though the system "iowait" is about 
100% (equivalent to one CPU).

(Environment: My /dev/sda max read speed is about 150MB/s.  My 
/dev/urandom read speed is about 140 MB/s.  I have 4 logical CPUs).

It feels like it should be possible to improve io_ticks, by basically 
changing the condition in your fix, from (end==true), to (inflight>0). 
Would that make sense?


Q2) But what most confuses me, is that I think `io_ticks` is defined as 
a per-cpu field.  And the per-cpu fields are summed into one value, 
which is reported to userspace.

I have tried playing with a couple `dd iflag=direct` readers, using 
`taskset` to pin them to different CPUs, but I only got 98-102% "busy". 
It did not rise to 200% :-).

i) Is it possible to see 200% "busy", due to per-cpu ioticks?

ii) If so, then can per-cpu ioticks also cause us to see 100% "busy", 
when IO's were only inflight for 50% of the time (but on two different 
CPUs)?

     If it is possible to trigger both of these cases, this metric seems 
very difficult to trust and use in any reliable way.  It seems 
preferable to disable it altogether and force people to use more 
trustworthy metrics.  E.g. system-wide "iowait", and iostat field 11 
"weighted # of milliseconds spent doing I/Os" which `iostat` uses to 
show "average queue length".

     Or the "special pleading" approach?  Should io_ticks be accounted at 
the level of the hardware queue, and be disabled if the device is 
using more than one hardware queue?


Q3) In case I am mistaken in some way, and Q2 is not an issue at all:

I still think reporting over 100% utilization is something new.  At 
least, the comments that were removed in the "Fixes" commit seem to 
agree.  That one error of 14% ("114% busy") that I saw seems fairly big 
:-).

I wonder if we can better explain how much of a rough approximation this 
metric is now, e.g. in Documentation/iostats.txt ?

So far I don't know what the real description and limitations would be...

I think it would be understandable e.g. if we were able to say "busy%" 
should show around 100% when the device is used around 100% of the time, 
and definitely 0% when it is idle, but is probably not as accurate as 
"iowait". ("iowait" reported by the CPU scheduler.  Not to say these are 
the same or equivalent.  And I understand "iowait" is another ball of 
approximations and confusion.)


Regards
Alan


> ---
>   block/bio.c           |    8 ++++----
>   block/blk-core.c      |    4 ++--
>   include/linux/genhd.h |    2 +-
>   3 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index c0a60f3e9b7b..245056797999 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1729,14 +1729,14 @@ void bio_check_pages_dirty(struct bio *bio)
>   	schedule_work(&bio_dirty_work);
>   }
>   
> -void update_io_ticks(struct hd_struct *part, unsigned long now)
> +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>   {
>   	unsigned long stamp;
>   again:
>   	stamp = READ_ONCE(part->stamp);
>   	if (unlikely(stamp != now)) {
>   		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
> -			__part_stat_add(part, io_ticks, 1);
> +			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
>   		}
>   	}
>   	if (part->partno) {
> @@ -1752,7 +1752,7 @@ void generic_start_io_acct(struct request_queue *q, int op,
>   
>   	part_stat_lock();
>   
> -	update_io_ticks(part, jiffies);
> +	update_io_ticks(part, jiffies, false);
>   	part_stat_inc(part, ios[sgrp]);
>   	part_stat_add(part, sectors[sgrp], sectors);
>   	part_inc_in_flight(q, part, op_is_write(op));
> @@ -1770,7 +1770,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
>   
>   	part_stat_lock();
>   
> -	update_io_ticks(part, now);
> +	update_io_ticks(part, now, true);
>   	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
>   	part_dec_in_flight(q, part, op_is_write(req_op));
>   
> diff --git a/block/blk-core.c b/block/blk-core.c
> index d89168b167e9..6e8f0b9e7731 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1334,7 +1334,7 @@ void blk_account_io_done(struct request *req, u64 now)
>   		part_stat_lock();
>   		part = req->part;
>   
> -		update_io_ticks(part, jiffies);
> +		update_io_ticks(part, jiffies, true);
>   		part_stat_inc(part, ios[sgrp]);
>   		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
>   		part_dec_in_flight(req->q, part, rq_data_dir(req));
> @@ -1375,7 +1375,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
>   		rq->part = part;
>   	}
>   
> -	update_io_ticks(part, jiffies);
> +	update_io_ticks(part, jiffies, false);
>   
>   	part_stat_unlock();
>   }
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 2f5a9ed7e86e..8ece8e02c609 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -410,7 +410,7 @@ static inline void free_part_info(struct hd_struct *part)
>   	kfree(part->info);
>   }
>   
> -void update_io_ticks(struct hd_struct *part, unsigned long now);
> +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end);
>   
>   /* block/genhd.c */
>   extern void device_add_disk(struct device *parent, struct gendisk *disk,
> 
> 



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks
  2019-05-07  9:13   ` Alan Jenkins
  2019-05-07  9:13     ` Alan Jenkins
@ 2019-06-09 10:50     ` Konstantin Khlebnikov
  1 sibling, 0 replies; 6+ messages in thread
From: Konstantin Khlebnikov @ 2019-06-09 10:50 UTC (permalink / raw)
  To: Alan Jenkins, linux-block, Jens Axboe, linux-kernel
  Cc: Mikulas Patocka, Mike Snitzer

On 07.05.2019 12:13, Alan Jenkins wrote:
> On 01/04/2019 16:59, Konstantin Khlebnikov wrote:
>> Currently io_ticks is approximated by adding one at each
>> start and end of requests if jiffies has changed.
>> This works perfectly for requests shorter than a jiffy.
>>
>> Fix for slow requests is simple: at the end of request add
>> count of jiffies passed since last update.
>>
>> Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
>> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> 
> Thanks for working on this!  I noticed the problem behaviour using the Fedora 29 kernel [1].  I wasn't sure how it could be fixed.  Now I 
> found this patch series, but I still have some questions :-).
> 
> [1] https://unix.stackexchange.com/questions/517132/dd-is-running-at-full-speed-but-i-only-see-20-disk-utilization-why
> 
> With these patches, `atopsar -d 2` shows about 100% "busy" when running a simple `dd` command, instead of 20 or 35%.  So that looked promising.
> 
> I saw some samples showing 112, 113, and 114% utilization. Unfortunately I'm not sure exactly how to reproduce that.  I think it happened 
> during filesystem buffered writes (i.e. `dd` without `oflag=direct`), with the IO scheduler set to "none", on my SATA HDD.
> 
> Getting some "101% busy" samples seemed fairly easy to trigger, but I am not sure whether that is just a rounding error in `atopsar` :-(.

I suppose atop/sar divides delta(io_ticks) by delta(real_time).

With my patch io_ticks accounts the whole operation when it ends;
it could have started during a previous interval of atop statistics.
As a result delta(io_ticks) will be bigger than delta(real_time).

> 
> 
> Q1)
> 
>  > Fix for slow requests is simple: at the end of request add
>  > count of jiffies passed since last update.
> 
> Even considering the simple case of a single CPU, the approximation may be "less accurate" when requests complete out of order.  Is that right?
> 
>   t        1      10     20   30
>   io1      start              end
>   io2             start  end
>   io_ticks 1      2      11   21
> 
>             ^^^^^^
>                \
>                  9 ticks not accounted as "busy"

Yep. Without my patch there will also be a 9-tick gap between the ends.

> 
> 
> At least, I found something fun happens if I run `dd if=/dev/urandom of=~/t bs=1M oflag=direct` at the same time as `dd if=/dev/sda 
> of=/dev/null bs=1M oflag=direct` .  With scheduler=none, it reduces "busy" from 100%, down to 97%.  With scheduler=mq-deadline, it reduces 
> "busy" from 100% to 60% :-).  Even though the system "iowait" is about 100% (equivalent to one CPU).
> 
> (Environment: My /dev/sda max read speed is about 150MB/s.  My /dev/urandom read speed is about 140 MB/s.  I have 4 logical CPUs).
> 
> It feels like it should be possible to improve io_ticks, by basically changing the condition in your fix, from (end==true), to (inflight>0). 
> Would that make sense?

inflight is now a per-cpu counter for performance reasons.
A request starts on one cpu and completes on another, so it's hard to check the exact count.

> 
> 
> Q2) But what most confuses me, is that I think `io_ticks` is defined as a per-cpu field.  And the per-cpu fields are summed into one value, 
> which is reported to userspace.
> 
> I have tried playing with a couple `dd iflag=direct` readers, using `taskset` to pin them to different CPUs, but I only got 98-102% "busy". 
> It did not rise to 200% :-).
> 
> i) Is it possible to see 200% "busy", due to per-cpu ioticks?

Nope. Per-cpu io_ticks accounts steps of a single (per-partition) atomic time stamp.

	stamp = READ_ONCE(part->stamp);
	if (unlikely(stamp != now)) {
		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
		}
	}

> 
> ii) If so, then can per-cpu ioticks also cause us to see 100% "busy", when IO's were only inflight for 50% of the time (but on two different 
> CPUs)?
> 
>      If it is possible to trigger both of these cases, this metric seems very difficult to trust and use in any reliable way.  It seems 
> preferable to disable it altogether and force people to use more trustworthy metrics.  E.g. system-wide "iowait", and iostat field 11 
> "weighted # of milliseconds spent doing I/Os" which `iostat` uses to show "average queue length".

Yep. io_ticks is a legacy counter. The "weighted" counters are better suited to modern parallel hardware.

> 
>      Or the "special pleading" approach?  Should io_ticks be accounted at the level of the hardware queue, and be disabled if the device is using more than one hardware queue?
> 
> 
> Q3) In case I am mistaken in some way, and Q2 is not an issue at all:
> 
> I still think reporting over 100% utilization is something new.  At least the comments I see were removed in the "Fixes" commit seem to 
> agree.  That one error of 14% ("114% busy") that I saw, seems fairly big :-).
> 
> I wonder if we can better explain how much of a rough approximation this metric is now, e.g. in Documentation/iostats.txt ?
> 
> So far I don't know what the real description and limitations would be...
> 
> I think it would be understandable e.g. if we were able to say "busy%" should show around 100% when the device is used around 100% of the 
> time, and definitely 0% when it is idle, but is probably not as accurate as "iowait". ("iowait" reported by the CPU scheduler.  Not to say 
> these are the same or equivalent.  And I understand "iowait" is another ball of approximations and confusion.)

Current approach emulates legacy io_ticks with minimal effort.

Right now io_ticks counts the jiffies in which at least one request was started or completed.
Time between these events is not accounted. Probably it would be better to document it this way.

> 
> 
> Regards
> Alan
> 
> 
>> ---
>>   block/bio.c           |    8 ++++----
>>   block/blk-core.c      |    4 ++--
>>   include/linux/genhd.h |    2 +-
>>   3 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/bio.c b/block/bio.c
>> index c0a60f3e9b7b..245056797999 100644
>> --- a/block/bio.c
>> +++ b/block/bio.c
>> @@ -1729,14 +1729,14 @@ void bio_check_pages_dirty(struct bio *bio)
>>       schedule_work(&bio_dirty_work);
>>   }
>> -void update_io_ticks(struct hd_struct *part, unsigned long now)
>> +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
>>   {
>>       unsigned long stamp;
>>   again:
>>       stamp = READ_ONCE(part->stamp);
>>       if (unlikely(stamp != now)) {
>>           if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
>> -            __part_stat_add(part, io_ticks, 1);
>> +            __part_stat_add(part, io_ticks, end ? now - stamp : 1);
>>           }
>>       }
>>       if (part->partno) {
>> @@ -1752,7 +1752,7 @@ void generic_start_io_acct(struct request_queue *q, int op,
>>       part_stat_lock();
>> -    update_io_ticks(part, jiffies);
>> +    update_io_ticks(part, jiffies, false);
>>       part_stat_inc(part, ios[sgrp]);
>>       part_stat_add(part, sectors[sgrp], sectors);
>>       part_inc_in_flight(q, part, op_is_write(op));
>> @@ -1770,7 +1770,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
>>       part_stat_lock();
>> -    update_io_ticks(part, now);
>> +    update_io_ticks(part, now, true);
>>       part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
>>       part_dec_in_flight(q, part, op_is_write(req_op));
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index d89168b167e9..6e8f0b9e7731 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -1334,7 +1334,7 @@ void blk_account_io_done(struct request *req, u64 now)
>>           part_stat_lock();
>>           part = req->part;
>> -        update_io_ticks(part, jiffies);
>> +        update_io_ticks(part, jiffies, true);
>>           part_stat_inc(part, ios[sgrp]);
>>           part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
>>           part_dec_in_flight(req->q, part, rq_data_dir(req));
>> @@ -1375,7 +1375,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
>>           rq->part = part;
>>       }
>> -    update_io_ticks(part, jiffies);
>> +    update_io_ticks(part, jiffies, false);
>>       part_stat_unlock();
>>   }
>> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
>> index 2f5a9ed7e86e..8ece8e02c609 100644
>> --- a/include/linux/genhd.h
>> +++ b/include/linux/genhd.h
>> @@ -410,7 +410,7 @@ static inline void free_part_info(struct hd_struct *part)
>>       kfree(part->info);
>>   }
>> -void update_io_ticks(struct hd_struct *part, unsigned long now);
>> +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end);
>>   /* block/genhd.c */
>>   extern void device_add_disk(struct device *parent, struct gendisk *disk,
>>
>>
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2019-06-09 10:57 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-01 15:59 [PATCH 1/3] block/diskstats: accumulate all per-cpu counters in one pass Konstantin Khlebnikov
2019-04-01 15:59 ` [PATCH 2/3] block/diskstats: remove redundant and inaccurate time_in_queue counter Konstantin Khlebnikov
2019-04-01 15:59 ` [PATCH 3/3] block/diskstats: more accurate approximation of io_ticks for slow disks Konstantin Khlebnikov
2019-05-07  9:13   ` Alan Jenkins
2019-05-07  9:13     ` Alan Jenkins
2019-06-09 10:50     ` Konstantin Khlebnikov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).