* [PATCH 0/7] Adjust hybrid polling sleep time
@ 2019-04-30  7:34 Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 1/7] blk-iolatency: Fix zero mean in previous stats Pavel Begunkov (Silence)
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

Sleep time for adaptive hybrid polling is coarse and can be improved
to decrease CPU load. Use a variation of the 3-sigma rule and runtime
tuning.

This approach gives up to a 2x CPU load reduction while keeping the
same latency distribution and throughput.
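
In short (details in patches 4 and 7), the per-bucket sleep time
becomes roughly

	sleep_ns = mean - 4 * lmd	(lmd = left mean deviation)
	sleep_ns = max(sleep_ns, mean / 2)

with a runtime fallback to the plain (mean / 2) estimate when too many
completions turn out to be overslept.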

Pavel Begunkov (7):
  blk-iolatency: Fix zero mean in previous stats
  blk-stats: Introduce explicit stat staging buffers
  blk-mq: Fix disabled hybrid polling
  blk-stats: Add left mean deviation to blk_stats
  blk-mq: Precalculate hybrid polling time
  blk-mq: Track num of overslept by hybrid poll rqs
  blk-mq: Adjust hybrid poll sleep time

 block/blk-core.c          |   7 +-
 block/blk-iolatency.c     |  60 ++++++++++----
 block/blk-mq-debugfs.c    |  14 ++--
 block/blk-mq.c            | 163 ++++++++++++++++++++++++++++----------
 block/blk-stat.c          |  67 +++++++++++++---
 block/blk-stat.h          |  15 +++-
 include/linux/blk_types.h |   9 +++
 include/linux/blkdev.h    |  17 +++-
 8 files changed, 271 insertions(+), 81 deletions(-)

-- 
2.21.0



* [PATCH 1/7] blk-iolatency: Fix zero mean in previous stats
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
@ 2019-04-30  7:34 ` Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 2/7] blk-stats: Introduce explicit stat staging buffers Pavel Begunkov (Silence)
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

blk_rq_stat_sum() expects its src argument (struct blk_rq_stat) to
have a valid batch field and doesn't maintain it for dst. Thus, a
former dst must not later be passed as src. iolatency_check_latencies()
violates that, making iolat->cur_stat.rqs.mean always 0 for non-SSD
devices.

Use two distinct functions instead: one to collect intermediate stats
(i.e. with a valid batch), and another to merge already accumulated
stats (i.e. with a valid mean).
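
For illustration only (not part of the patch), a tiny standalone model
of the two kinds of summation, with made-up sample values. The
batch-based collection reads src->batch, which an already-collected
stat no longer carries, so its mean is silently lost; the mean-based
merge keeps it:

#include <stdio.h>
#include <stdint.h>

struct stat {
	uint64_t batch;		/* sum of samples, valid only while staging */
	uint64_t mean;		/* valid only after collection */
	uint32_t nr_samples;
};

/* like blk_rq_stat_collect(): src is a per-cpu stat, batch is valid */
static void collect(struct stat *dst, const struct stat *src)
{
	if (!src->nr_samples)
		return;
	dst->mean = (src->batch + dst->mean * dst->nr_samples) /
		    (dst->nr_samples + src->nr_samples);
	dst->nr_samples += src->nr_samples;
}

/* like blk_rq_stat_merge(): both stats already carry a valid mean */
static void merge(struct stat *dst, const struct stat *src)
{
	if (!src->nr_samples)
		return;
	dst->mean = (src->mean * src->nr_samples +
		     dst->mean * dst->nr_samples) /
		    (dst->nr_samples + src->nr_samples);
	dst->nr_samples += src->nr_samples;
}

int main(void)
{
	struct stat percpu = { .batch = 400, .nr_samples = 4 };	/* mean 100 */
	struct stat round = { 0 }, total = { 0 };

	collect(&round, &percpu);	/* round.mean == 100, round.batch == 0 */
	collect(&total, &round);	/* the bug: batch is 0, mean becomes 0 */
	printf("summed as staging: mean=%llu\n",
	       (unsigned long long)total.mean);

	total = (struct stat){ 0 };
	merge(&total, &round);		/* correct: total.mean == 100 */
	printf("merged: mean=%llu\n", (unsigned long long)total.mean);
	return 0;
}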

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-iolatency.c | 21 ++++++++++++++++-----
 block/blk-stat.c      | 20 ++++++++++++++++++--
 block/blk-stat.h      |  3 ++-
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 507212d75ee2..4010152ebeb2 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -198,7 +198,7 @@ static inline void latency_stat_init(struct iolatency_grp *iolat,
 		blk_rq_stat_init(&stat->rqs);
 }
 
-static inline void latency_stat_sum(struct iolatency_grp *iolat,
+static inline void latency_stat_merge(struct iolatency_grp *iolat,
 				    struct latency_stat *sum,
 				    struct latency_stat *stat)
 {
@@ -206,7 +206,18 @@ static inline void latency_stat_sum(struct iolatency_grp *iolat,
 		sum->ps.total += stat->ps.total;
 		sum->ps.missed += stat->ps.missed;
 	} else
-		blk_rq_stat_sum(&sum->rqs, &stat->rqs);
+		blk_rq_stat_merge(&sum->rqs, &stat->rqs);
+}
+
+static inline void latency_stat_collect(struct iolatency_grp *iolat,
+					struct latency_stat *sum,
+					struct latency_stat *stat)
+{
+	if (iolat->ssd) {
+		sum->ps.total += stat->ps.total;
+		sum->ps.missed += stat->ps.missed;
+	} else
+		blk_rq_stat_collect(&sum->rqs, &stat->rqs);
 }
 
 static inline void latency_stat_record_time(struct iolatency_grp *iolat,
@@ -530,7 +541,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
 	for_each_online_cpu(cpu) {
 		struct latency_stat *s;
 		s = per_cpu_ptr(iolat->stats, cpu);
-		latency_stat_sum(iolat, &stat, s);
+		latency_stat_collect(iolat, &stat, s);
 		latency_stat_init(iolat, s);
 	}
 	preempt_enable();
@@ -551,7 +562,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
 	/* Somebody beat us to the punch, just bail. */
 	spin_lock_irqsave(&lat_info->lock, flags);
 
-	latency_stat_sum(iolat, &iolat->cur_stat, &stat);
+	latency_stat_merge(iolat, &iolat->cur_stat, &stat);
 	lat_info->nr_samples -= iolat->nr_samples;
 	lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
 	iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);
@@ -912,7 +923,7 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
 	for_each_online_cpu(cpu) {
 		struct latency_stat *s;
 		s = per_cpu_ptr(iolat->stats, cpu);
-		latency_stat_sum(iolat, &stat, s);
+		latency_stat_collect(iolat, &stat, s);
 	}
 	preempt_enable();
 
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 696a04176e4d..a6da68af45db 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -25,7 +25,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat)
 }
 
 /* src is a per-cpu stat, mean isn't initialized */
-void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+void blk_rq_stat_collect(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 {
 	if (!src->nr_samples)
 		return;
@@ -39,6 +39,21 @@ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 	dst->nr_samples += src->nr_samples;
 }
 
+void blk_rq_stat_merge(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+{
+	if (!src->nr_samples)
+		return;
+
+	dst->min = min(dst->min, src->min);
+	dst->max = max(dst->max, src->max);
+
+	dst->mean = div_u64(src->mean * src->nr_samples +
+				dst->mean * dst->nr_samples,
+				dst->nr_samples + src->nr_samples);
+
+	dst->nr_samples += src->nr_samples;
+}
+
 void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
 {
 	stat->min = min(stat->min, value);
@@ -89,7 +104,8 @@ static void blk_stat_timer_fn(struct timer_list *t)
 
 		cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
 		for (bucket = 0; bucket < cb->buckets; bucket++) {
-			blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+			blk_rq_stat_collect(&cb->stat[bucket],
+					    &cpu_stat[bucket]);
 			blk_rq_stat_init(&cpu_stat[bucket]);
 		}
 	}
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 17b47a86eefb..5597ecc34ef5 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -165,7 +165,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
 }
 
 void blk_rq_stat_add(struct blk_rq_stat *, u64);
-void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
+void blk_rq_stat_collect(struct blk_rq_stat *dst, struct blk_rq_stat *src);
+void blk_rq_stat_merge(struct blk_rq_stat *dst, struct blk_rq_stat *src);
 void blk_rq_stat_init(struct blk_rq_stat *);
 
 #endif
-- 
2.21.0



* [PATCH 2/7] blk-stats: Introduce explicit stat staging buffers
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 1/7] blk-iolatency: Fix zero mean in previous stats Pavel Begunkov (Silence)
@ 2019-04-30  7:34 ` Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 3/7] blk-mq: Fix disabled hybrid polling Pavel Begunkov (Silence)
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

struct blk_rq_stat could be in one of two implicit states, which use
different sets of fields:
1. per-cpu intermediate (i.e. staging) (keeps batch, invalid mean)
2. calculated stats (see blk_rq_stat_collect) (w/o batch, w/ mean)

blk_rq_stat_*() expect their arguments to be in the right state, but
it is not documented which. That's error-prone.

Split blk_rq_stat into two structs, one per state. That requires some
code duplication, but
1. prevents misuse (compile-time type-system check)
2. reduces the memory needed
3. makes it easier to extend the stats

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-iolatency.c     | 41 +++++++++++++++++++++++++++++----------
 block/blk-stat.c          | 30 +++++++++++++++++-----------
 block/blk-stat.h          |  8 +++++---
 include/linux/blk_types.h |  6 ++++++
 4 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 4010152ebeb2..df9d37398a0f 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -129,9 +129,16 @@ struct latency_stat {
 	};
 };
 
+struct latency_stat_staging {
+	union {
+		struct percentile_stats ps;
+		struct blk_rq_stat_staging rqs;
+	};
+};
+
 struct iolatency_grp {
 	struct blkg_policy_data pd;
-	struct latency_stat __percpu *stats;
+	struct latency_stat_staging __percpu *stats;
 	struct latency_stat cur_stat;
 	struct blk_iolatency *blkiolat;
 	struct rq_depth rq_depth;
@@ -198,6 +205,16 @@ static inline void latency_stat_init(struct iolatency_grp *iolat,
 		blk_rq_stat_init(&stat->rqs);
 }
 
+static inline void latency_stat_init_staging(struct iolatency_grp *iolat,
+					     struct latency_stat_staging *stat)
+{
+	if (iolat->ssd) {
+		stat->ps.total = 0;
+		stat->ps.missed = 0;
+	} else
+		blk_rq_stat_init_staging(&stat->rqs);
+}
+
 static inline void latency_stat_merge(struct iolatency_grp *iolat,
 				    struct latency_stat *sum,
 				    struct latency_stat *stat)
@@ -211,7 +228,7 @@ static inline void latency_stat_merge(struct iolatency_grp *iolat,
 
 static inline void latency_stat_collect(struct iolatency_grp *iolat,
 					struct latency_stat *sum,
-					struct latency_stat *stat)
+					struct latency_stat_staging *stat)
 {
 	if (iolat->ssd) {
 		sum->ps.total += stat->ps.total;
@@ -223,7 +240,8 @@ static inline void latency_stat_collect(struct iolatency_grp *iolat,
 static inline void latency_stat_record_time(struct iolatency_grp *iolat,
 					    u64 req_time)
 {
-	struct latency_stat *stat = get_cpu_ptr(iolat->stats);
+	struct latency_stat_staging *stat = get_cpu_ptr(iolat->stats);
+
 	if (iolat->ssd) {
 		if (req_time >= iolat->min_lat_nsec)
 			stat->ps.missed++;
@@ -539,10 +557,11 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
 	latency_stat_init(iolat, &stat);
 	preempt_disable();
 	for_each_online_cpu(cpu) {
-		struct latency_stat *s;
+		struct latency_stat_staging *s;
+
 		s = per_cpu_ptr(iolat->stats, cpu);
 		latency_stat_collect(iolat, &stat, s);
-		latency_stat_init(iolat, s);
+		latency_stat_init_staging(iolat, s);
 	}
 	preempt_enable();
 
@@ -921,7 +940,8 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
 	latency_stat_init(iolat, &stat);
 	preempt_disable();
 	for_each_online_cpu(cpu) {
-		struct latency_stat *s;
+		struct latency_stat_staging *s;
+
 		s = per_cpu_ptr(iolat->stats, cpu);
 		latency_stat_collect(iolat, &stat, s);
 	}
@@ -965,8 +985,8 @@ static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
 	iolat = kzalloc_node(sizeof(*iolat), gfp, node);
 	if (!iolat)
 		return NULL;
-	iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
-				       __alignof__(struct latency_stat), gfp);
+	iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat_staging),
+				__alignof__(struct latency_stat_staging), gfp);
 	if (!iolat->stats) {
 		kfree(iolat);
 		return NULL;
@@ -989,9 +1009,10 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
 		iolat->ssd = false;
 
 	for_each_possible_cpu(cpu) {
-		struct latency_stat *stat;
+		struct latency_stat_staging *stat;
+
 		stat = per_cpu_ptr(iolat->stats, cpu);
-		latency_stat_init(iolat, stat);
+		latency_stat_init_staging(iolat, stat);
 	}
 
 	latency_stat_init(iolat, &iolat->cur_stat);
diff --git a/block/blk-stat.c b/block/blk-stat.c
index a6da68af45db..13f93249fd5f 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -17,15 +17,22 @@ struct blk_queue_stats {
 	bool enable_accounting;
 };
 
+void blk_rq_stat_init_staging(struct blk_rq_stat_staging *stat)
+{
+	stat->min = -1ULL;
+	stat->max = 0;
+	stat->batch = 0;
+	stat->nr_samples = 0;
+}
+
 void blk_rq_stat_init(struct blk_rq_stat *stat)
 {
 	stat->min = -1ULL;
 	stat->max = stat->nr_samples = stat->mean = 0;
-	stat->batch = 0;
 }
 
-/* src is a per-cpu stat, mean isn't initialized */
-void blk_rq_stat_collect(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+void blk_rq_stat_collect(struct blk_rq_stat *dst,
+			 struct blk_rq_stat_staging *src)
 {
 	if (!src->nr_samples)
 		return;
@@ -54,7 +61,7 @@ void blk_rq_stat_merge(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 	dst->nr_samples += src->nr_samples;
 }
 
-void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
+void blk_rq_stat_add(struct blk_rq_stat_staging *stat, u64 value)
 {
 	stat->min = min(stat->min, value);
 	stat->max = max(stat->max, value);
@@ -66,7 +73,7 @@ void blk_stat_add(struct request *rq, u64 now)
 {
 	struct request_queue *q = rq->q;
 	struct blk_stat_callback *cb;
-	struct blk_rq_stat *stat;
+	struct blk_rq_stat_staging *stat;
 	int bucket;
 	u64 value;
 
@@ -100,13 +107,13 @@ static void blk_stat_timer_fn(struct timer_list *t)
 		blk_rq_stat_init(&cb->stat[bucket]);
 
 	for_each_online_cpu(cpu) {
-		struct blk_rq_stat *cpu_stat;
+		struct blk_rq_stat_staging *cpu_stat;
 
 		cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
 		for (bucket = 0; bucket < cb->buckets; bucket++) {
 			blk_rq_stat_collect(&cb->stat[bucket],
 					    &cpu_stat[bucket]);
-			blk_rq_stat_init(&cpu_stat[bucket]);
+			blk_rq_stat_init_staging(&cpu_stat[bucket]);
 		}
 	}
 
@@ -130,8 +137,9 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
 		kfree(cb);
 		return NULL;
 	}
-	cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
-				      __alignof__(struct blk_rq_stat));
+	cb->cpu_stat = __alloc_percpu(
+				buckets * sizeof(struct blk_rq_stat_staging),
+				__alignof__(struct blk_rq_stat_staging));
 	if (!cb->cpu_stat) {
 		kfree(cb->stat);
 		kfree(cb);
@@ -154,11 +162,11 @@ void blk_stat_add_callback(struct request_queue *q,
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct blk_rq_stat *cpu_stat;
+		struct blk_rq_stat_staging *cpu_stat;
 
 		cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
 		for (bucket = 0; bucket < cb->buckets; bucket++)
-			blk_rq_stat_init(&cpu_stat[bucket]);
+			blk_rq_stat_init_staging(&cpu_stat[bucket]);
 	}
 
 	spin_lock(&q->stats->lock);
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 5597ecc34ef5..e5c753fbd6e6 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -30,7 +30,7 @@ struct blk_stat_callback {
 	/**
 	 * @cpu_stat: Per-cpu statistics buckets.
 	 */
-	struct blk_rq_stat __percpu *cpu_stat;
+	struct blk_rq_stat_staging __percpu *cpu_stat;
 
 	/**
 	 * @bucket_fn: Given a request, returns which statistics bucket it
@@ -164,9 +164,11 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
 	mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
 }
 
-void blk_rq_stat_add(struct blk_rq_stat *, u64);
-void blk_rq_stat_collect(struct blk_rq_stat *dst, struct blk_rq_stat *src);
+void blk_rq_stat_add(struct blk_rq_stat_staging *stat, u64);
+void blk_rq_stat_collect(struct blk_rq_stat *dst,
+			 struct blk_rq_stat_staging *src);
 void blk_rq_stat_merge(struct blk_rq_stat *dst, struct blk_rq_stat *src);
 void blk_rq_stat_init(struct blk_rq_stat *);
+void blk_rq_stat_init_staging(struct blk_rq_stat_staging *stat);
 
 #endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 791fee35df88..5718a4e2e731 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -446,7 +446,13 @@ struct blk_rq_stat {
 	u64 min;
 	u64 max;
 	u32 nr_samples;
+};
+
+struct blk_rq_stat_staging {
+	u64 min;
+	u64 max;
 	u64 batch;
+	u32 nr_samples;
 };
 
 #endif /* __LINUX_BLK_TYPES_H */
-- 
2.21.0



* [PATCH 3/7] blk-mq: Fix disabled hybrid polling
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 1/7] blk-iolatency: Fix zero mean in previous stats Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 2/7] blk-stats: Introduce explicit stat staging buffers Pavel Begunkov (Silence)
@ 2019-04-30  7:34 ` Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 4/7] blk-stats: Add left mean deviation to blk_stats Pavel Begunkov (Silence)
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

Commit 4bc6339a583cec650b05 ("block: move blk_stat_add() to
__blk_mq_end_request()") moved blk_stat_add(), so now it's called
after blk_update_request(), which zeroes rq->__data_len. Without the
length, blk_stat_add() can't calculate the stat bucket and returns an
error, effectively disabling hybrid polling.

Move it back to __blk_mq_complete_request().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-mq.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fc60ed7e940e..cc3f73e4e01c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -535,11 +535,6 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 	if (blk_mq_need_time_stamp(rq))
 		now = ktime_get_ns();
 
-	if (rq->rq_flags & RQF_STATS) {
-		blk_mq_poll_stats_start(rq->q);
-		blk_stat_add(rq, now);
-	}
-
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq, now);
 
@@ -578,6 +573,11 @@ static void __blk_mq_complete_request(struct request *rq)
 	int cpu;
 
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
+
+	if (rq->rq_flags & RQF_STATS) {
+		blk_mq_poll_stats_start(rq->q);
+		blk_stat_add(rq, ktime_get_ns());
+	}
 	/*
 	 * Most of single queue controllers, there is only one irq vector
 	 * for handling IO completion, and the only irq's affinity is set
-- 
2.21.0



* [PATCH 4/7] blk-stats: Add left mean deviation to blk_stats
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
                   ` (2 preceding siblings ...)
  2019-04-30  7:34 ` [PATCH 3/7] blk-mq: Fix disabled hybrid polling Pavel Begunkov (Silence)
@ 2019-04-30  7:34 ` Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 5/7] blk-mq: Precalculate hybrid polling time Pavel Begunkov (Silence)
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

The basic idea is to use the 3-sigma rule to guess the adaptive
polling sleep time. An exact standard deviation calculation could
easily overflow u64, so the mean absolute deviation (MAD) is used as
an approximation. As only the left bound is needed, MAD is replaced by
the left mean deviation (LMD) to increase accuracy.
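
For illustration only (not part of the patch), a standalone sketch of
how the left mean deviation is accumulated, mirroring the lmd_batch
handling added to blk_rq_stat_add() below; the sample values and the
previous-round mean are made up:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t samples[] = { 80, 90, 100, 110, 300 };	/* completion times, ns */
	uint64_t mean_last = 120;	/* mean from the previous round */
	uint64_t batch = 0, lmd_batch = 0;
	int i, n = sizeof(samples) / sizeof(samples[0]);

	for (i = 0; i < n; i++) {
		batch += samples[i];
		/* only deviations below the mean contribute to the LMD */
		if (samples[i] < mean_last)
			lmd_batch += mean_last - samples[i];
	}

	/* mean = 680 / 5 = 136, lmd = (40 + 30 + 20 + 10) / 5 = 20 */
	printf("mean=%llu lmd=%llu\n",
	       (unsigned long long)(batch / n),
	       (unsigned long long)(lmd_batch / n));
	return 0;
}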

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-mq-debugfs.c    | 10 ++++++----
 block/blk-stat.c          | 21 +++++++++++++++++++--
 block/blk-stat.h          |  6 ++++++
 include/linux/blk_types.h |  3 +++
 4 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ec1d18cb643c..b62bd4468db3 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -27,12 +27,14 @@
 
 static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
 {
-	if (stat->nr_samples) {
-		seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
-			   stat->nr_samples, stat->mean, stat->min, stat->max);
-	} else {
+	if (!stat->nr_samples) {
 		seq_puts(m, "samples=0");
+		return;
 	}
+
+	seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu, lmd=%llu",
+		   stat->nr_samples, stat->mean, stat->min, stat->max,
+		   stat->lmd);
 }
 
 static int queue_poll_stat_show(void *data, struct seq_file *m)
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 13f93249fd5f..e1915a4e41b9 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -17,14 +17,21 @@ struct blk_queue_stats {
 	bool enable_accounting;
 };
 
-void blk_rq_stat_init_staging(struct blk_rq_stat_staging *stat)
+void blk_rq_stat_reset(struct blk_rq_stat_staging *stat)
 {
 	stat->min = -1ULL;
 	stat->max = 0;
 	stat->batch = 0;
+	stat->lmd_batch = 0;
 	stat->nr_samples = 0;
 }
 
+void blk_rq_stat_init_staging(struct blk_rq_stat_staging *stat)
+{
+	blk_rq_stat_reset(stat);
+	stat->mean_last = 0;
+}
+
 void blk_rq_stat_init(struct blk_rq_stat *stat)
 {
 	stat->min = -1ULL;
@@ -42,8 +49,12 @@ void blk_rq_stat_collect(struct blk_rq_stat *dst,
 
 	dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
 				dst->nr_samples + src->nr_samples);
+	dst->lmd = div_u64(src->lmd_batch + dst->lmd * dst->nr_samples,
+				dst->nr_samples + src->nr_samples);
 
 	dst->nr_samples += src->nr_samples;
+	/* pass mean back for lmd computation */
+	src->mean_last = dst->mean;
 }
 
 void blk_rq_stat_merge(struct blk_rq_stat *dst, struct blk_rq_stat *src)
@@ -57,6 +68,9 @@ void blk_rq_stat_merge(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 	dst->mean = div_u64(src->mean * src->nr_samples +
 				dst->mean * dst->nr_samples,
 				dst->nr_samples + src->nr_samples);
+	dst->lmd = div_u64(src->lmd * src->nr_samples +
+				dst->lmd * dst->nr_samples,
+				dst->nr_samples + src->nr_samples);
 
 	dst->nr_samples += src->nr_samples;
 }
@@ -67,6 +81,9 @@ void blk_rq_stat_add(struct blk_rq_stat_staging *stat, u64 value)
 	stat->max = max(stat->max, value);
 	stat->batch += value;
 	stat->nr_samples++;
+
+	if (value < stat->mean_last)
+		stat->lmd_batch += stat->mean_last - value;
 }
 
 void blk_stat_add(struct request *rq, u64 now)
@@ -113,7 +130,7 @@ static void blk_stat_timer_fn(struct timer_list *t)
 		for (bucket = 0; bucket < cb->buckets; bucket++) {
 			blk_rq_stat_collect(&cb->stat[bucket],
 					    &cpu_stat[bucket]);
-			blk_rq_stat_init_staging(&cpu_stat[bucket]);
+			blk_rq_stat_reset(&cpu_stat[bucket]);
 		}
 	}
 
diff --git a/block/blk-stat.h b/block/blk-stat.h
index e5c753fbd6e6..ad81b2ce58bf 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -170,5 +170,11 @@ void blk_rq_stat_collect(struct blk_rq_stat *dst,
 void blk_rq_stat_merge(struct blk_rq_stat *dst, struct blk_rq_stat *src);
 void blk_rq_stat_init(struct blk_rq_stat *);
 void blk_rq_stat_init_staging(struct blk_rq_stat_staging *stat);
+/*
+ * Prepare stat for the next statistics round. Similar to
+ * blk_rq_stat_init_staging(), but retains some information
+ * about the previous round (see mean_last).
+ */
+void blk_rq_stat_reset(struct blk_rq_stat_staging *stat);
 
 #endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 5718a4e2e731..fe0ad7b2e6ca 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -445,13 +445,16 @@ struct blk_rq_stat {
 	u64 mean;
 	u64 min;
 	u64 max;
+	u64 lmd; /* left mean deviation */
 	u32 nr_samples;
 };
 
 struct blk_rq_stat_staging {
+	u64 mean_last;
 	u64 min;
 	u64 max;
 	u64 batch;
+	u64 lmd_batch;
 	u32 nr_samples;
 };
 
-- 
2.21.0



* [PATCH 5/7] blk-mq: Precalculate hybrid polling time
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
                   ` (3 preceding siblings ...)
  2019-04-30  7:34 ` [PATCH 4/7] blk-stats: Add left mean deviation to blk_stats Pavel Begunkov (Silence)
@ 2019-04-30  7:34 ` Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 6/7] blk-mq: Track num of overslept by hybrid poll rqs Pavel Begunkov (Silence)
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

Calculating the sleep time for adaptive hybrid polling on a
per-request basis could become time-consuming in the future.
Precalculate it once per statistics-gathering round.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-core.c       |  5 ++++-
 block/blk-mq-debugfs.c |  4 ++--
 block/blk-mq.c         | 39 ++++++++++++++++++++++-----------------
 include/linux/blkdev.h |  8 +++++++-
 4 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a55389ba8779..daadce545e43 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -474,7 +474,7 @@ static void blk_timeout_work(struct work_struct *work)
 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	struct request_queue *q;
-	int ret;
+	int ret, bucket;
 
 	q = kmem_cache_alloc_node(blk_requestq_cachep,
 				gfp_mask | __GFP_ZERO, node_id);
@@ -536,6 +536,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (blkcg_init_queue(q))
 		goto fail_ref;
 
+	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++)
+		q->poll_info[bucket].sleep_ns = 0;
+
 	return q;
 
 fail_ref:
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b62bd4468db3..ab55446cb570 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -44,11 +44,11 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
 
 	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
 		seq_printf(m, "read  (%d Bytes): ", 1 << (9+bucket));
-		print_stat(m, &q->poll_stat[2*bucket]);
+		print_stat(m, &q->poll_info[2*bucket].stat);
 		seq_puts(m, "\n");
 
 		seq_printf(m, "write (%d Bytes): ",  1 << (9+bucket));
-		print_stat(m, &q->poll_stat[2*bucket+1]);
+		print_stat(m, &q->poll_info[2*bucket+1].stat);
 		seq_puts(m, "\n");
 	}
 	return 0;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cc3f73e4e01c..4e54a004e345 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3312,14 +3312,32 @@ static void blk_mq_poll_stats_start(struct request_queue *q)
 	blk_stat_activate_msecs(q->poll_cb, 100);
 }
 
+static void blk_mq_update_poll_info(struct poll_info *pi,
+				    struct blk_rq_stat *stat)
+{
+	u64 sleep_ns;
+
+	if (!stat->nr_samples)
+		sleep_ns = 0;
+	else
+		sleep_ns = (stat->mean + 1) / 2;
+
+	pi->stat = *stat;
+	pi->sleep_ns = sleep_ns;
+}
+
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
 {
 	struct request_queue *q = cb->data;
 	int bucket;
 
 	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
-		if (cb->stat[bucket].nr_samples)
-			q->poll_stat[bucket] = cb->stat[bucket];
+		if (cb->stat[bucket].nr_samples) {
+			struct poll_info *pi = &q->poll_info[bucket];
+			struct blk_rq_stat *stat = &cb->stat[bucket];
+
+			blk_mq_update_poll_info(pi, stat);
+		}
 	}
 }
 
@@ -3327,7 +3345,6 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx,
 				       struct request *rq)
 {
-	unsigned long ret = 0;
 	int bucket;
 
 	/*
@@ -3337,23 +3354,11 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 	if (!blk_poll_stats_enable(q))
 		return 0;
 
-	/*
-	 * As an optimistic guess, use half of the mean service time
-	 * for this type of request. We can (and should) make this smarter.
-	 * For instance, if the completion latencies are tight, we can
-	 * get closer than just half the mean. This is especially
-	 * important on devices where the completion latencies are longer
-	 * than ~10 usec. We do use the stats for the relevant IO size
-	 * if available which does lead to better estimates.
-	 */
 	bucket = blk_mq_poll_stats_bkt(rq);
 	if (bucket < 0)
-		return ret;
-
-	if (q->poll_stat[bucket].nr_samples)
-		ret = (q->poll_stat[bucket].mean + 1) / 2;
+		return 0;
 
-	return ret;
+	return q->poll_info[bucket].sleep_ns;
 }
 
 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 317ab30d2904..40c77935fd61 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -385,6 +385,12 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+struct poll_info
+{
+	struct blk_rq_stat	stat;
+	u64			sleep_ns;
+};
+
 struct request_queue {
 	/*
 	 * Together with queue_head for cacheline sharing
@@ -477,7 +483,7 @@ struct request_queue {
 	int			poll_nsec;
 
 	struct blk_stat_callback	*poll_cb;
-	struct blk_rq_stat	poll_stat[BLK_MQ_POLL_STATS_BKTS];
+	struct poll_info	poll_info[BLK_MQ_POLL_STATS_BKTS];
 
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
-- 
2.21.0



* [PATCH 6/7] blk-mq: Track num of overslept by hybrid poll rqs
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
                   ` (4 preceding siblings ...)
  2019-04-30  7:34 ` [PATCH 5/7] blk-mq: Precalculate hybrid polling time Pavel Begunkov (Silence)
@ 2019-04-30  7:34 ` Pavel Begunkov (Silence)
  2019-04-30  7:34 ` [PATCH 7/7] blk-mq: Adjust hybrid poll sleep time Pavel Begunkov (Silence)
  2019-05-24  9:06 ` [PATCH 0/7] Adjust hybrid polling " Pavel Begunkov
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

To fine-tune the adaptive polling sleep time, we need to know how
accurate the current estimate is, which can be derived from the ratio
of missed (i.e. overslept) requests.

The miss count is collected under the assumption that a request
normally needs to busy-poll for some time after waking up before it
completes. If it is already completed by the first poll call, that
counts as a miss.
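
For illustration only (not part of the patch), the miss test added in
blk_mq_record_stats() below boils down to the following (hypothetical
numbers); a completion whose I/O time already exceeds the bucket mean
is ignored as a timer-lag outlier:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

int main(void)
{
	uint64_t mean_ns = 100000;	/* bucket mean from the last round */
	uint64_t io_time_ns = 97000;	/* completion time of this request */
	/* RQF_MQ_POLLED is set only after the first ->poll() call returns */
	bool polled = false;

	/* completed before being polled and not an oversleep outlier */
	bool miss = !polled && io_time_ns < mean_ns;

	printf("counted as miss: %s\n", miss ? "yes" : "no");
	return 0;
}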

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-core.c       |  4 +-
 block/blk-mq.c         | 94 ++++++++++++++++++++++++++++++------------
 block/blk-stat.c       |  2 +-
 include/linux/blkdev.h |  9 ++++
 4 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index daadce545e43..88d8ec4268ca 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -536,8 +536,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (blkcg_init_queue(q))
 		goto fail_ref;
 
-	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++)
+	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
 		q->poll_info[bucket].sleep_ns = 0;
+		atomic_set(&q->poll_info[bucket].nr_misses, 0);
+	}
 
 	return q;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e54a004e345..ec7cde754c2f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -528,6 +528,34 @@ void blk_mq_free_request(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
+static inline void blk_mq_record_stats(struct request *rq, u64 now)
+{
+	int bucket = blk_mq_poll_stats_bkt(rq);
+
+	if (bucket >= 0 && !(rq->rq_flags & RQF_MQ_POLLED)) {
+		struct poll_info *pi;
+		u64 threshold;
+
+		pi = &rq->q->poll_info[bucket];
+		/*
+		 * Even if the time for hybrid polling predicted well, the
+		 * completion could oversleep because of a timer's lag. Try
+		 * to detect and skip accounting for such outliers.
+		 */
+		threshold = pi->stat.mean;
+
+		/*
+		 * Ideally, miss count should be close to 0,
+		 * so should not happen often.
+		 */
+		if (blk_rq_io_time(rq, now) < threshold)
+			atomic_inc(&pi->nr_misses);
+	}
+
+	blk_mq_poll_stats_start(rq->q);
+	blk_stat_add(rq, now);
+}
+
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
 	u64 now = 0;
@@ -574,10 +602,8 @@ static void __blk_mq_complete_request(struct request *rq)
 
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 
-	if (rq->rq_flags & RQF_STATS) {
-		blk_mq_poll_stats_start(rq->q);
-		blk_stat_add(rq, ktime_get_ns());
-	}
+	if (rq->rq_flags & RQF_STATS)
+		blk_mq_record_stats(rq, ktime_get_ns());
 	/*
 	 * Most of single queue controllers, there is only one irq vector
 	 * for handling IO completion, and the only irq's affinity is set
@@ -3316,14 +3342,25 @@ static void blk_mq_update_poll_info(struct poll_info *pi,
 				    struct blk_rq_stat *stat)
 {
 	u64 sleep_ns;
+	u32 nr_misses, nr_samples;
+
+	nr_samples = stat->nr_samples;
+	nr_misses = atomic_read(&pi->nr_misses);
+	if (nr_misses > nr_samples)
+		nr_misses = nr_samples;
 
-	if (!stat->nr_samples)
+	if (!nr_samples)
 		sleep_ns = 0;
 	else
 		sleep_ns = (stat->mean + 1) / 2;
 
+	/*
+	 * Use miss ratio here to adjust sleep time
+	 */
+
 	pi->stat = *stat;
 	pi->sleep_ns = sleep_ns;
+	atomic_set(&pi->nr_misses, 0);
 }
 
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
@@ -3389,10 +3426,6 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 
 	rq->rq_flags |= RQF_MQ_POLL_SLEPT;
 
-	/*
-	 * This will be replaced with the stats tracking code, using
-	 * 'avg_completion_time / 2' as the pre-sleep target.
-	 */
 	kt = nsecs;
 
 	mode = HRTIMER_MODE_REL;
@@ -3417,30 +3450,34 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 }
 
 static bool blk_mq_poll_hybrid(struct request_queue *q,
-			       struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
+			       struct blk_mq_hw_ctx *hctx,
+			       struct request *rq)
 {
-	struct request *rq;
-
 	if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
 		return false;
 
-	if (!blk_qc_t_is_internal(cookie))
-		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-	else {
-		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
-		/*
-		 * With scheduling, if the request has completed, we'll
-		 * get a NULL return here, as we clear the sched tag when
-		 * that happens. The request still remains valid, like always,
-		 * so we should be safe with just the NULL check.
-		 */
-		if (!rq)
-			return false;
-	}
+	/*
+	 * With scheduling, if the request has completed, we'll
+	 * get a NULL request here, as we clear the sched tag when
+	 * that happens. The request still remains valid, like always,
+	 * so we should be safe with just the NULL check.
+	 */
+	if (!rq)
+		return false;
 
 	return blk_mq_poll_hybrid_sleep(q, hctx, rq);
 }
 
+static inline struct request *qc_t_to_request(struct blk_mq_hw_ctx *hctx,
+		blk_qc_t cookie)
+{
+	struct blk_mq_tags *tags;
+
+	tags = blk_qc_t_is_internal(cookie) ? hctx->sched_tags : hctx->tags;
+
+	return blk_mq_tag_to_rq(tags, blk_qc_t_to_tag(cookie));
+}
+
 /**
  * blk_poll - poll for IO completions
  * @q:  the queue
@@ -3456,6 +3493,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
 int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
 	struct blk_mq_hw_ctx *hctx;
+	struct request *rq;
 	long state;
 
 	if (!blk_qc_t_valid(cookie) ||
@@ -3466,6 +3504,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 		blk_flush_plug_list(current->plug, false);
 
 	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+	rq = qc_t_to_request(hctx, cookie);
 
 	/*
 	 * If we sleep, have the caller restart the poll loop to reset
@@ -3474,7 +3513,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 	 * the IO isn't complete, we'll get called again and will go
 	 * straight to the busy poll loop.
 	 */
-	if (blk_mq_poll_hybrid(q, hctx, cookie))
+	if (blk_mq_poll_hybrid(q, hctx, rq))
 		return 1;
 
 	hctx->poll_considered++;
@@ -3486,6 +3525,9 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 		hctx->poll_invoked++;
 
 		ret = q->mq_ops->poll(hctx);
+		if (rq)
+			rq->rq_flags |= RQF_MQ_POLLED;
+
 		if (ret > 0) {
 			hctx->poll_success++;
 			__set_current_state(TASK_RUNNING);
diff --git a/block/blk-stat.c b/block/blk-stat.c
index e1915a4e41b9..33b7b9c35791 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -94,7 +94,7 @@ void blk_stat_add(struct request *rq, u64 now)
 	int bucket;
 	u64 value;
 
-	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
+	value = blk_rq_io_time(rq, now);
 
 	blk_throtl_stat_add(rq, value);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 40c77935fd61..36f17ed1376a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -109,6 +109,9 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
+/* Request has been polled at least once */
+#define RQF_MQ_POLLED		((__force req_flags_t)(1 << 22))
+
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -389,6 +392,7 @@ struct poll_info
 {
 	struct blk_rq_stat	stat;
 	u64			sleep_ns;
+	atomic_t		nr_misses;
 };
 
 struct request_queue {
@@ -924,6 +928,11 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+static inline u64 blk_rq_io_time(struct request *rq, u64 now)
+{
+	return (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
+}
+
 /*
  * Some commands like WRITE SAME have a payload or data transfer size which
  * is different from the size of the request.  Any driver that supports such
-- 
2.21.0



* [PATCH 7/7] blk-mq: Adjust hybrid poll sleep time
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
                   ` (5 preceding siblings ...)
  2019-04-30  7:34 ` [PATCH 6/7] blk-mq: Track num of overslept by hybrid poll rqs Pavel Begunkov (Silence)
@ 2019-04-30  7:34 ` Pavel Begunkov (Silence)
  2019-05-24  9:06 ` [PATCH 0/7] Adjust hybrid polling " Pavel Begunkov
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-04-30  7:34 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

Sleeping for (mean / 2) in adaptive polling is often too pessimistic.
Use a variation of the 3-sigma rule (mean - 4 * lmd) instead and tune
it at runtime using the percentage of missed (i.e. overslept) requests:
1. if more than ~3% of requests are missed, fall back to (mean / 2)
2. if more than ~0.4% are missed, scale the sleep time down
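
For illustration only (not part of the patch), a standalone sketch of
how the miss ratio maps onto the ilog2-based thresholds defined below
(the sample counts are made up; __builtin_clz stands in for the
kernel's ilog2()):

#include <stdio.h>

int main(void)
{
	unsigned int nr_samples = 1000;
	unsigned int nr_misses = 40;			/* 4% overslept */
	/* ilog2(nr_samples / nr_misses) = ilog2(25) = 4 */
	unsigned int ratio = 31 - __builtin_clz(nr_samples / nr_misses);

	if (ratio <= 4)		/* BLK_POLL_FALLBACK_THRESHOLD, ~3.1% */
		printf("ratio=%u: fall back to mean / 2\n", ratio);
	else if (ratio <= 7)	/* BLK_POLL_THROTTLE_THRESHOLD, ~0.4% */
		printf("ratio=%u: scale the sleep time down by 1/4\n", ratio);
	else
		printf("ratio=%u: keep mean - 4 * lmd\n", ratio);
	return 0;
}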

Pitfalls:
1. any missed request increases the mean, synergistically increasing
both the mean and the sleep time, so scale down fast in that case
2. even if the sleep time is predicted well, the sleep loop itself
could greatly oversleep; try to detect that and skip the miss
accounting

Tested on an NVMe SSD:
{4K,8K} read-only workloads give a similar latency distribution (up to
7 nines) and halve the CPU load (50% -> 25%). The new method even
outperforms the old one a bit (in terms of throughput and latencies),
presumably because it alleviates the 2nd pitfall.
For a write-only workload it falls back to (mean / 2).

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/blk-mq.c | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec7cde754c2f..efa44a617bea 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3338,10 +3338,21 @@ static void blk_mq_poll_stats_start(struct request_queue *q)
 	blk_stat_activate_msecs(q->poll_cb, 100);
 }
 
+/*
+ * Thresholds are ilog2(nr_requests / nr_misses)
+ * To calculate tolerated miss ratio from it, use
+ * f(x) ~= 2 ^ -(x + 1)
+ *
+ * fallback ~ 3.1%
+ * throttle ~ 0.4%
+ */
+#define BLK_POLL_FALLBACK_THRESHOLD	4
+#define BLK_POLL_THROTTLE_THRESHOLD	7
+
 static void blk_mq_update_poll_info(struct poll_info *pi,
 				    struct blk_rq_stat *stat)
 {
-	u64 sleep_ns;
+	u64 half_mean, indent, sleep_ns;
 	u32 nr_misses, nr_samples;
 
 	nr_samples = stat->nr_samples;
@@ -3349,14 +3360,33 @@ static void blk_mq_update_poll_info(struct poll_info *pi,
 	if (nr_misses > nr_samples)
 		nr_misses = nr_samples;
 
-	if (!nr_samples)
+	half_mean = (stat->mean + 1) / 2;
+	indent = stat->lmd * 4;
+
+	if (!stat->nr_samples) {
 		sleep_ns = 0;
-	else
-		sleep_ns = (stat->mean + 1) / 2;
+	} else if (!stat->lmd || stat->mean <= indent) {
+		sleep_ns = half_mean;
+	} else {
+		int ratio = INT_MAX;
 
-	/*
-	 * Use miss ratio here to adjust sleep time
-	 */
+		sleep_ns = stat->mean - indent;
+
+		/*
+		 * If a completion is overslept, the observable time will
+		 * be greater than the actual, so increasing mean. It
+		 * also increases sleep time estimation, synergistically
+		 * backfiring on mean. Need to scale down / fallback early.
+		 */
+		if (nr_misses)
+			ratio = ilog2(nr_samples / nr_misses);
+		if (ratio <= BLK_POLL_FALLBACK_THRESHOLD)
+			sleep_ns = half_mean;
+		else if (ratio <= BLK_POLL_THROTTLE_THRESHOLD)
+			sleep_ns -= sleep_ns / 4;
+
+		sleep_ns = max(sleep_ns, half_mean);
+	}
 
 	pi->stat = *stat;
 	pi->sleep_ns = sleep_ns;
-- 
2.21.0



* Re: [PATCH 0/7] Adjust hybrid polling sleep time
  2019-04-30  7:34 [PATCH 0/7] Adjust hybrid polling sleep time Pavel Begunkov (Silence)
                   ` (6 preceding siblings ...)
  2019-04-30  7:34 ` [PATCH 7/7] blk-mq: Adjust hybrid poll sleep time Pavel Begunkov (Silence)
@ 2019-05-24  9:06 ` Pavel Begunkov
  7 siblings, 0 replies; 9+ messages in thread
From: Pavel Begunkov @ 2019-05-24  9:06 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel

Any suggestions?

You might also want to consider (and hopefully apply) the first 3
patches separately, as they are bug fixes (e.g. hybrid polling turned
out to be disabled).
Would it be better for me to split the patchset?


On 4/30/2019 10:34 AM, Pavel Begunkov (Silence) wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> Sleep time for adaptive hybrid polling is coarse and can be improved to
> decrease CPU load. Use variation of the 3-sigma rule and runtime
> tuning.
> 
> This approach gives up to 2x CPU load reduction keeping the same latency
> distribution and throughput.
> 
> Pavel Begunkov (7):
>   blk-iolatency: Fix zero mean in previous stats
>   blk-stats: Introduce explicit stat staging buffers
>   blk-mq: Fix disabled hybrid polling
>   blk-stats: Add left mean deviation to blk_stats
>   blk-mq: Precalculate hybrid polling time
>   blk-mq: Track num of overslept by hybrid poll rqs
>   blk-mq: Adjust hybrid poll sleep time
> 
>  block/blk-core.c          |   7 +-
>  block/blk-iolatency.c     |  60 ++++++++++----
>  block/blk-mq-debugfs.c    |  14 ++--
>  block/blk-mq.c            | 163 ++++++++++++++++++++++++++++----------
>  block/blk-stat.c          |  67 +++++++++++++---
>  block/blk-stat.h          |  15 +++-
>  include/linux/blk_types.h |   9 +++
>  include/linux/blkdev.h    |  17 +++-
>  8 files changed, 271 insertions(+), 81 deletions(-)
> 

-- 
Yours sincerely,
Pavel Begunkov

