LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Dennis Zhou <dennisszhou@gmail.com>
To: Jens Axboe <axboe@kernel.dk>, Tejun Heo <tj@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Josef Bacik <josef@toxicpanda.com>
Cc: kernel-team@fb.com, linux-block@vger.kernel.org,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	"Dennis Zhou (Facebook)" <dennisszhou@gmail.com>
Subject: [PATCH 15/15] blkcg: add average latency tracking to blk-cgroup
Date: Thu, 30 Aug 2018 21:53:56 -0400
Message-ID: <20180831015356.69796-16-dennisszhou@gmail.com> (raw)
In-Reply-To: <20180831015356.69796-1-dennisszhou@gmail.com>

From: "Dennis Zhou (Facebook)" <dennisszhou@gmail.com>

Latency is an important metric for understanding whether or not you're
receiving adequate service from your block devices. blk-iolatency
demonstrates the utility of such information.

This patch introduces a moving average to track latency to blk-cgroup.
The value can be found in all non-root cgroups in io.stat. A bio's
latency is counted and propagated up to, but excluding, the root cgroup.
It uses a minimum window of 1s and windows only elapse with active bios.
A single value is contributed to the moving average from each window.
The percpu stats are long running, thus each interval requires
calculating the delta between the previous read and current read.

Signed-off-by: Dennis Zhou <dennisszhou@gmail.com>
---
 Documentation/admin-guide/cgroup-v2.rst |   6 +-
 block/bio.c                             |   3 +
 block/blk-cgroup.c                      | 117 +++++++++++++++++++++++-
 include/linux/blk-cgroup.h              |   9 ++
 4 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 2dc8f95077aa..1cdc0e4279c5 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1521,9 +1521,9 @@ IO Latency Interface Files
 
 	  avg_lat
 		This is an exponential moving average with a decay rate of 1/exp
-		bound by the sampling interval.  The decay rate interval can be
-		calculated by multiplying the win value in io.stat by the
-		corresponding number of samples based on the win value.
+		every 12 samples, with a sampling interval of 1s.  A window only
+		elapses on IO activity; idle periods extend the most recent
+		window.
 
 	  win
 		The sampling window size in milliseconds.  This is the minimum
diff --git a/block/bio.c b/block/bio.c
index a0b816811e7d..2739e6f5acb7 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1720,6 +1720,9 @@ void bio_endio(struct bio *bio)
 	if (!bio_integrity_endio(bio))
 		return;
 
+	if (bio->bi_blkg && bio->bi_blkg->parent)
+		blkg_record_latency(bio);
+
 	if (bio->bi_disk)
 		rq_qos_done_bio(bio->bi_disk->queue, bio);
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1eaf097e38b0..b720ca629eea 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -17,6 +17,7 @@
 #include <linux/ioprio.h>
 #include <linux/kdev_t.h>
 #include <linux/module.h>
+#include <linux/sched/loadavg.h>
 #include <linux/sched/signal.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
@@ -32,6 +33,18 @@
 
 #define MAX_KEY_LEN 100
 
+/*
+ * This constant is used to fake the fixed-point moving average calculation
+ * just like load average for blkg->lat_avg.  The call to CALC_LOAD folds
+ * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
+ * window size is fixed to 1s, so BLKG_EXP_12s is the corresponding value
+ * to create a 1/exp decay rate every 12s when windows elapse immediately.
+ * Note, windows only elapse with IO activity and idle periods extend the
+ * most recent window.
+ */
+#define BLKG_EXP_12s 1884
+#define BLKG_STAT_WIN_SIZE NSEC_PER_SEC
+
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
  * blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -72,6 +85,9 @@ static void blkg_free(struct blkcg_gq *blkg)
 	if (!blkg)
 		return;
 
+	if (blkg->rq_stat)
+		free_percpu(blkg->rq_stat);
+
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
@@ -120,7 +136,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 				   gfp_t gfp_mask)
 {
 	struct blkcg_gq *blkg;
-	int i;
+	int i, cpu;
 
 	/* alloc and init base part */
 	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
@@ -159,6 +175,20 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 		pd->plid = i;
 	}
 
+	/* init rq_stats */
+	blkg->rq_stat = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
+					   __alignof__(struct blk_rq_stat),
+					   gfp_mask);
+	if (!blkg->rq_stat)
+		goto err_free;
+	for_each_possible_cpu(cpu) {
+		struct blk_rq_stat *s;
+		s = per_cpu_ptr(blkg->rq_stat, cpu);
+		blk_rq_stat_init(s);
+	}
+	blk_rq_stat_init(&blkg->last_rq_stat);
+	atomic64_set(&blkg->win_start, ktime_to_ns(ktime_get()));
+
 	return blkg;
 
 err_free:
@@ -981,7 +1011,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 		const char *dname;
 		char *buf;
 		struct blkg_rwstat rwstat;
-		u64 rbytes, wbytes, rios, wios, dbytes, dios;
+		u64 rbytes, wbytes, rios, wios, dbytes, dios, avg_lat;
 		size_t size = seq_get_buf(sf, &buf), off = 0;
 		int i;
 		bool has_stats = false;
@@ -1012,14 +1042,16 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
 		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 
+		avg_lat = div64_u64(blkg->lat_avg, NSEC_PER_USEC);
+
 		spin_unlock_irq(blkg->q->queue_lock);
 
 		if (rbytes || wbytes || rios || wios) {
 			has_stats = true;
 			off += scnprintf(buf+off, size-off,
-					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
-					 rbytes, wbytes, rios, wios,
-					 dbytes, dios);
+				 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu avg_lat=%llu",
+				 rbytes, wbytes, rios, wios, dbytes, dios,
+				 avg_lat);
 		}
 
 		if (!blkcg_debug_stats)
@@ -1638,6 +1670,81 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * This aggregates the latency of all bios under this cgroup and then
+ * advances the moving average window.  A window contributes a single
+ * value to the moving average regardless of how many IOs occurred.
+ */
+static void blkg_aggregate_latency(struct blkcg_gq *blkg)
+{
+	struct blk_rq_stat rq_stat;
+	struct blk_rq_stat *last_rq_stat;
+	u64 mean;
+	int cpu;
+
+	blk_rq_stat_init(&rq_stat);
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		struct blk_rq_stat *s;
+		s = per_cpu_ptr(blkg->rq_stat, cpu);
+		blk_rq_stat_sum(&rq_stat, s);
+	}
+	preempt_enable();
+
+	last_rq_stat = &blkg->last_rq_stat;
+
+	mean = div64_u64(rq_stat.nr_samples * rq_stat.mean -
+			 last_rq_stat->nr_samples * last_rq_stat->mean,
+			 rq_stat.nr_samples - last_rq_stat->nr_samples);
+	CALC_LOAD(blkg->lat_avg, BLKG_EXP_12s, mean);
+	blkg->last_rq_stat = rq_stat;
+}
+
+/**
+ * blkg_record_latency - records the latency of a bio
+ * @bio: bio of interest
+ *
+ * This records the latency of a bio in all nodes up to root, excluding root.
+ */
+void blkg_record_latency(struct bio *bio)
+{
+	u64 now = ktime_to_ns(ktime_get());
+	u64 start = bio_issue_time(&bio->bi_issue);
+	u64 win_start, req_time;
+	struct blkcg_gq *blkg;
+	struct blk_rq_stat *rq_stat;
+	bool issue_as_root = bio_issue_as_root_blkg(bio);
+
+	blkg = bio->bi_blkg;
+	if (!blkg)
+		return;
+
+	/*
+	 * Truncate now the same way the bio's issue time is truncated so
+	 * that the two timestamps can be compared and subtracted directly.
+	 */
+	now = __bio_issue_time(now);
+
+	if (now <= start || issue_as_root)
+		return;
+
+	req_time = now - start;
+
+	while (blkg && blkg->parent) {
+		rq_stat = get_cpu_ptr(blkg->rq_stat);
+		blk_rq_stat_add(rq_stat, req_time);
+		put_cpu_ptr(rq_stat);
+
+		win_start = atomic64_read(&blkg->win_start);
+		if (now > win_start && (now - win_start) >= BLKG_STAT_WIN_SIZE)
+			if (atomic64_cmpxchg(&blkg->win_start,
+					     win_start, now) == win_start)
+				blkg_aggregate_latency(blkg);
+
+		blkg = blkg->parent;
+	}
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay.  We only call this when we are adding delay, in case it's been a
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 0134cdd270b8..215af051f876 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -136,6 +136,11 @@ struct blkcg_gq {
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
+	struct blk_rq_stat __percpu	*rq_stat;
+	struct blk_rq_stat		last_rq_stat;
+	atomic64_t			win_start;
+	u64				lat_avg;
+
 	struct rcu_head			rcu_head;
 
 	atomic_t			use_delay;
@@ -895,6 +900,8 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
 	}
 }
 
+void blkg_record_latency(struct bio *bio);
+
 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
 void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
 void blkcg_maybe_throttle_current(void);
@@ -917,6 +924,8 @@ struct blkcg_policy {
 
 #define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
+static inline void blkg_record_latency(struct bio *bio) {}
+
 static inline void blkcg_maybe_throttle_current(void) { }
 static inline bool blk_cgroup_congested(void) { return false; }
 
-- 
2.17.1


  parent reply index

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-08-31  1:53 [PATCH 00/15] blkcg ref count refactor/cleanup + blkcg avg_lat Dennis Zhou
2018-08-31  1:53 ` [PATCH 01/15] Revert "blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()" Dennis Zhou
2018-08-31  1:53 ` [PATCH 02/15] blkcg: delay blkg destruction until after writeback has finished Dennis Zhou
2018-08-31 15:27   ` Josef Bacik
2018-08-31 20:19     ` Dennis Zhou
2018-08-31  1:53 ` [PATCH 03/15] blkcg: use tryget logic when associating a blkg with a bio Dennis Zhou
2018-08-31 15:30   ` Josef Bacik
2018-08-31 20:20     ` Dennis Zhou
2018-08-31  1:53 ` [PATCH 04/15] blkcg: fix ref count issue with bio_blkcg using task_css Dennis Zhou
2018-08-31 15:35   ` Josef Bacik
2018-08-31 23:04     ` Tejun Heo
2018-09-06 15:21     ` Dennis Zhou
2018-08-31  1:53 ` [PATCH 05/15] blkcg: update blkg_lookup_create to do locking Dennis Zhou
2018-08-31 15:37   ` Josef Bacik
2018-08-31 23:09   ` Tejun Heo
2018-08-31  1:53 ` [PATCH 06/15] blkcg: always associate a bio with a blkg Dennis Zhou
2018-08-31  9:01   ` kbuild test robot
2018-08-31 10:02   ` kbuild test robot
2018-08-31 23:16   ` Tejun Heo
2018-09-06 20:41     ` Dennis Zhou
2018-09-07  3:03   ` [LKP] [blkcg] c02c58dab2: WARNING:at_block/blk-throttle.c:#blk_throtl_bio kernel test robot
2018-08-31  1:53 ` [PATCH 07/15] blkcg: consolidate bio_issue_init and blkg association Dennis Zhou
2018-08-31  9:19   ` kbuild test robot
2018-08-31 11:11   ` kbuild test robot
2018-08-31 15:42   ` Josef Bacik
2018-09-06 20:43     ` Dennis Zhou
2018-08-31 23:45   ` Tejun Heo
2018-08-31  1:53 ` [PATCH 08/15] blkcg: associate a blkg for pages being evicted by swap Dennis Zhou
2018-08-31 15:44   ` Josef Bacik
2018-08-31 23:47   ` Tejun Heo
2018-08-31  1:53 ` [PATCH 09/15] blkcg: associate writeback bios with a blkg Dennis Zhou
2018-08-31 15:45   ` Josef Bacik
2018-08-31 23:53   ` Tejun Heo
2018-08-31  1:53 ` [PATCH 10/15] blkcg: remove bio->bi_css and instead use bio->bi_blkg Dennis Zhou
2018-08-31 15:46   ` Josef Bacik
2018-09-01  0:13   ` Tejun Heo
2018-08-31  1:53 ` [PATCH 11/15] blkcg: remove additional reference to the css Dennis Zhou
2018-09-01  0:26   ` Tejun Heo
2018-09-06 20:45     ` Dennis Zhou
2018-08-31  1:53 ` [PATCH 12/15] blkcg: cleanup and make blk_get_rl use blkg_lookup_create Dennis Zhou
2018-09-01  0:29   ` Tejun Heo
2018-09-11  2:37   ` [LKP] [blkcg] 22f657e287: general_protection_fault:#[##] kernel test robot
2018-08-31  1:53 ` [PATCH 13/15] blkcg: change blkg reference counting to use percpu_ref Dennis Zhou
2018-08-31 15:49   ` Josef Bacik
2018-09-01  0:31   ` Tejun Heo
2018-09-06 20:46     ` Dennis Zhou
2018-09-07  3:08   ` [LKP] [blkcg] 6ef69a3a0b: WARNING:suspicious_RCU_usage kernel test robot
2018-08-31  1:53 ` [PATCH 14/15] blkcg: rename blkg_try_get to blkg_tryget Dennis Zhou
2018-08-31 15:50   ` Josef Bacik
2018-09-01  0:32   ` Tejun Heo
2018-08-31  1:53 ` Dennis Zhou [this message]
2018-08-31 10:22   ` [PATCH 15/15] blkcg: add average latency tracking to blk-cgroup kbuild test robot
2018-08-31 11:38   ` kbuild test robot
2018-09-01  0:35 ` [PATCH 00/15] blkcg ref count refactor/cleanup + blkcg avg_lat Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180831015356.69796-16-dennisszhou@gmail.com \
    --to=dennisszhou@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=josef@toxicpanda.com \
    --cc=kernel-team@fb.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git
	git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git