From: Ming Lei
Date: Thu, 3 Nov 2016 19:17:20 +0800
Subject: Re: [PATCH 1/4] block: add scalable completion tracking of requests
To: Jens Axboe
Cc: Jens Axboe, Linux Kernel Mailing List, linux-block, Christoph Hellwig
In-Reply-To: <1478034325-28232-2-git-send-email-axboe@fb.com>
References: <1478034325-28232-1-git-send-email-axboe@fb.com> <1478034325-28232-2-git-send-email-axboe@fb.com>

On Wed, Nov 2, 2016 at 5:05 AM, Jens Axboe wrote:
> For legacy block, we simply track them in the request queue. For
> blk-mq, we track them on a per-sw queue basis, which we can then
> sum up through the hardware queues and finally to a per device
> state.
>
> The stats are tracked in, roughly, 0.1s interval windows.
>
> Add sysfs files to display the stats.
>
> Signed-off-by: Jens Axboe
> ---
>  block/Makefile            |   2 +-
>  block/blk-core.c          |   4 +
>  block/blk-mq-sysfs.c      |  47 ++++++++++
>  block/blk-mq.c            |  14 +++
>  block/blk-mq.h            |   3 +
>  block/blk-stat.c          | 226 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/blk-stat.h          |  37 ++++++++
>  block/blk-sysfs.c         |  26 ++++++
>  include/linux/blk_types.h |  16 ++++
>  include/linux/blkdev.h    |   4 +
>  10 files changed, 378 insertions(+), 1 deletion(-)
>  create mode 100644 block/blk-stat.c
>  create mode 100644 block/blk-stat.h
>
> diff --git a/block/Makefile b/block/Makefile
> index 934dac73fb37..2528c596f7ec 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -5,7 +5,7 @@
>  obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
> 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
> 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
> -			blk-lib.o blk-mq.o blk-mq-tag.o \
> +			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
> 			blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
> 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
> 			badblocks.o partitions/
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 0bfaa54d3e9f..ca77c725b4e5 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2462,6 +2462,8 @@ void blk_start_request(struct request *req)
>  {
>  	blk_dequeue_request(req);
>
> +	blk_stat_set_issue_time(&req->issue_stat);
> +
>  	/*
>  	 * We are now handing the request to the hardware, initialize
>  	 * resid_len to full count and add the timeout handler.
> @@ -2529,6 +2531,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
>
>  	trace_block_rq_complete(req->q, req, nr_bytes);
>
> +	blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);

blk_update_request() is often called lockless, so it isn't good to do it
here.
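To make that concrete: blk_stat_add() does plain read-modify-write on the
shared per-queue blk_rq_stat, so two completions running on different CPUs
can interleave and silently lose samples (or mix min/max/batch from different
windows). A stand-alone user-space sketch of the same pattern (hypothetical
names, obviously not kernel code) shows the lost updates:

/*
 * Illustration only: two "completers" doing the unlocked read-modify-write
 * that blk_stat_add() does on a shared stat structure. The final counters
 * usually come up short of 2 * N_SAMPLES because increments are lost.
 *
 * Build with: gcc -O2 -pthread race-demo.c
 */
#include <pthread.h>
#include <stdio.h>

#define N_SAMPLES	1000000

struct fake_stat {
	long long batch;
	long long nr_batch;
};

static struct fake_stat stat_shared;

static void *completer(void *arg)
{
	long long i;

	for (i = 0; i < N_SAMPLES; i++) {
		/* same tail pattern as blk_stat_add() */
		stat_shared.batch += 1;		/* unlocked RMW, can be lost */
		stat_shared.nr_batch++;		/* unlocked RMW, can be lost */
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, completer, NULL);
	pthread_create(&b, NULL, completer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	printf("expected %d, got batch=%lld nr_batch=%lld\n",
	       2 * N_SAMPLES, stat_shared.batch, stat_shared.nr_batch);
	return 0;
}

With two writers the totals routinely come up short, and the same thing can
happen to the window statistics updated from blk_update_request().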
> +
>  	if (!req->bio)
>  		return false;
>
> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> index 01fb455d3377..633c79a538ea 100644
> --- a/block/blk-mq-sysfs.c
> +++ b/block/blk-mq-sysfs.c
> @@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
>  	return ret;
>  }
>
> +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct blk_mq_ctx *ctx;
> +	unsigned int i;
> +
> +	hctx_for_each_ctx(hctx, ctx, i) {
> +		blk_stat_init(&ctx->stat[0]);
> +		blk_stat_init(&ctx->stat[1]);
> +	}
> +}
> +
> +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
> +					  const char *page, size_t count)
> +{
> +	blk_mq_stat_clear(hctx);
> +	return count;
> +}
> +
> +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
> +{
> +	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
> +			pre, (long long) stat->nr_samples,
> +			(long long) stat->mean, (long long) stat->min,
> +			(long long) stat->max);
> +}
> +
> +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
> +{
> +	struct blk_rq_stat stat[2];
> +	ssize_t ret;
> +
> +	blk_stat_init(&stat[0]);
> +	blk_stat_init(&stat[1]);
> +
> +	blk_hctx_stat_get(hctx, stat);
> +
> +	ret = print_stat(page, &stat[0], "read :");
> +	ret += print_stat(page + ret, &stat[1], "write:");
> +	return ret;
> +}
> +
>  static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
>  	.attr = {.name = "dispatched", .mode = S_IRUGO },
>  	.show = blk_mq_sysfs_dispatched_show,
> @@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
>  	.show = blk_mq_hw_sysfs_poll_show,
>  	.store = blk_mq_hw_sysfs_poll_store,
>  };
> +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
> +	.attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
> +	.show = blk_mq_hw_sysfs_stat_show,
> +	.store = blk_mq_hw_sysfs_stat_store,
> +};
>
>  static struct attribute *default_hw_ctx_attrs[] = {
>  	&blk_mq_hw_sysfs_queued.attr,
> @@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
>  	&blk_mq_hw_sysfs_cpus.attr,
>  	&blk_mq_hw_sysfs_active.attr,
>  	&blk_mq_hw_sysfs_poll.attr,
> +	&blk_mq_hw_sysfs_stat.attr,
>  	NULL,
>  };
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 2da1a0ee3318..4555a76d22a7 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -30,6 +30,7 @@
>  #include "blk.h"
>  #include "blk-mq.h"
>  #include "blk-mq-tag.h"
> +#include "blk-stat.h"
>
>  static DEFINE_MUTEX(all_q_mutex);
>  static LIST_HEAD(all_q_list);
> @@ -376,10 +377,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
>  	put_cpu();
>  }
>
> +static void blk_mq_stat_add(struct request *rq)
> +{
> +	struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
> +
> +	blk_stat_add(stat, rq);
> +}
> +
>  static void __blk_mq_complete_request(struct request *rq)
> +{
>  	struct request_queue *q = rq->q;
>
> +	blk_mq_stat_add(rq);

It is still possible for rqs belonging to the same sw queue to complete on
different CPUs, so the same issue as with the legacy queue exists here.
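Not suggesting a concrete fix here, but just to sketch one direction that
would keep the completion path off shared state: keep the window buckets
per-CPU and fold them when the stats are read. Completely untested, and the
q->cpu_stat member below is made up (it would have to be allocated with
alloc_percpu() at queue init):

/* hypothetical per-cpu bucket hanging off the request_queue */
struct blk_cpu_stat {
	struct blk_rq_stat stat[2];
};

static void blk_mq_stat_add_percpu(struct request *rq)
{
	struct blk_cpu_stat *s = get_cpu_ptr(rq->q->cpu_stat);	/* made-up member */

	/* only ever touches this CPU's buckets */
	blk_stat_add(&s->stat[rq_data_dir(rq)], rq);
	put_cpu_ptr(rq->q->cpu_stat);
}

/* read side: fold every CPU's buckets into dst[0]/dst[1] */
static void blk_stat_get_percpu(struct request_queue *q, struct blk_rq_stat *dst)
{
	int cpu;

	blk_stat_init(&dst[0]);
	blk_stat_init(&dst[1]);

	for_each_possible_cpu(cpu) {
		struct blk_cpu_stat *s = per_cpu_ptr(q->cpu_stat, cpu);

		blk_stat_sum(&dst[0], &s->stat[0]);
		blk_stat_sum(&dst[1], &s->stat[1]);
	}
}

The read side is still racy against concurrent writers, but at least the
update path never shares a bucket between CPUs.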
> +
>  	if (!q->softirq_done_fn)
>  		blk_mq_end_request(rq, rq->errors);
>  	else
> @@ -423,6 +433,8 @@ void blk_mq_start_request(struct request *rq)
>  	if (unlikely(blk_bidi_rq(rq)))
>  		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
>
> +	blk_stat_set_issue_time(&rq->issue_stat);
> +
>  	blk_add_timer(rq);
>
>  	/*
> @@ -1708,6 +1720,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>  		spin_lock_init(&__ctx->lock);
>  		INIT_LIST_HEAD(&__ctx->rq_list);
>  		__ctx->queue = q;
> +		blk_stat_init(&__ctx->stat[0]);
> +		blk_stat_init(&__ctx->stat[1]);
>
>  		/* If the cpu isn't online, the cpu is mapped to first hctx */
>  		if (!cpu_online(i))
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index e5d25249028c..8cf16cb69f64 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -1,6 +1,8 @@
>  #ifndef INT_BLK_MQ_H
>  #define INT_BLK_MQ_H
>
> +#include "blk-stat.h"
> +
>  struct blk_mq_tag_set;
>
>  struct blk_mq_ctx {
> @@ -18,6 +20,7 @@ struct blk_mq_ctx {
>
>  	/* incremented at completion time */
>  	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
> +	struct blk_rq_stat	stat[2];
>
>  	struct request_queue	*queue;
>  	struct kobject		kobj;
> diff --git a/block/blk-stat.c b/block/blk-stat.c
> new file mode 100644
> index 000000000000..642afdc6d0f8
> --- /dev/null
> +++ b/block/blk-stat.c
> @@ -0,0 +1,226 @@
> +/*
> + * Block stat tracking code
> + *
> + * Copyright (C) 2016 Jens Axboe
> + */
> +#include <linux/kernel.h>
> +#include <linux/blk-mq.h>
> +
> +#include "blk-stat.h"
> +#include "blk-mq.h"
> +
> +static void blk_stat_flush_batch(struct blk_rq_stat *stat)
> +{
> +	if (!stat->nr_batch)
> +		return;
> +	if (!stat->nr_samples)
> +		stat->mean = div64_s64(stat->batch, stat->nr_batch);
> +	else {
> +		stat->mean = div64_s64((stat->mean * stat->nr_samples) +
> +					stat->batch,
> +					stat->nr_samples + stat->nr_batch);
> +	}
> +
> +	stat->nr_samples += stat->nr_batch;
> +	stat->nr_batch = stat->batch = 0;
> +}
> +
> +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
> +{
> +	if (!src->nr_samples)
> +		return;
> +
> +	blk_stat_flush_batch(src);
> +
> +	dst->min = min(dst->min, src->min);
> +	dst->max = max(dst->max, src->max);
> +
> +	if (!dst->nr_samples)
> +		dst->mean = src->mean;
> +	else {
> +		dst->mean = div64_s64((src->mean * src->nr_samples) +
> +					(dst->mean * dst->nr_samples),
> +					dst->nr_samples + src->nr_samples);
> +	}
> +	dst->nr_samples += src->nr_samples;
> +}
> +
> +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	struct blk_mq_ctx *ctx;
> +	uint64_t latest = 0;
> +	int i, j, nr;
> +
> +	blk_stat_init(&dst[0]);
> +	blk_stat_init(&dst[1]);
> +
> +	nr = 0;
> +	do {
> +		uint64_t newest = 0;
> +
> +		queue_for_each_hw_ctx(q, hctx, i) {
> +			hctx_for_each_ctx(hctx, ctx, j) {
> +				if (!ctx->stat[0].nr_samples &&
> +				    !ctx->stat[1].nr_samples)
> +					continue;
> +				if (ctx->stat[0].time > newest)
> +					newest = ctx->stat[0].time;
> +				if (ctx->stat[1].time > newest)
> +					newest = ctx->stat[1].time;
> +			}
> +		}
> +
> +		/*
> +		 * No samples
> +		 */
> +		if (!newest)
> +			break;
> +
> +		if (newest > latest)
> +			latest = newest;
> +
> +		queue_for_each_hw_ctx(q, hctx, i) {
> +			hctx_for_each_ctx(hctx, ctx, j) {
> +				if (ctx->stat[0].time == newest) {
> +					blk_stat_sum(&dst[0], &ctx->stat[0]);
> +					nr++;
> +				}
> +				if (ctx->stat[1].time == newest) {
> +					blk_stat_sum(&dst[1], &ctx->stat[1]);
> +					nr++;
> +				}
> +			}
> +		}
> +		/*
> +		 * If we race on finding an entry, just loop back again.
> +		 * Should be very rare.
> +		 */
> +	} while (!nr);
> +
> +	dst[0].time = dst[1].time = latest;
> +}
> +
> +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
> +{
> +	if (q->mq_ops)
> +		blk_mq_stat_get(q, dst);
> +	else {
> +		memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
> +		memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
> +	}
> +}
> +
> +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
> +{
> +	struct blk_mq_ctx *ctx;
> +	unsigned int i, nr;
> +
> +	nr = 0;
> +	do {
> +		uint64_t newest = 0;
> +
> +		hctx_for_each_ctx(hctx, ctx, i) {
> +			if (!ctx->stat[0].nr_samples &&
> +			    !ctx->stat[1].nr_samples)
> +				continue;
> +
> +			if (ctx->stat[0].time > newest)
> +				newest = ctx->stat[0].time;
> +			if (ctx->stat[1].time > newest)
> +				newest = ctx->stat[1].time;
> +		}
> +
> +		if (!newest)
> +			break;
> +
> +		hctx_for_each_ctx(hctx, ctx, i) {
> +			if (ctx->stat[0].time == newest) {
> +				blk_stat_sum(&dst[0], &ctx->stat[0]);
> +				nr++;
> +			}
> +			if (ctx->stat[1].time == newest) {
> +				blk_stat_sum(&dst[1], &ctx->stat[1]);
> +				nr++;
> +			}
> +		}
> +		/*
> +		 * If we race on finding an entry, just loop back again.
> +		 * Should be very rare, as the window is only updated
> +		 * occasionally
> +		 */
> +	} while (!nr);
> +}
> +
> +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
> +{
> +	stat->min = -1ULL;
> +	stat->max = stat->nr_samples = stat->mean = 0;
> +	stat->batch = stat->nr_batch = 0;
> +	stat->time = time_now & BLK_STAT_NSEC_MASK;
> +}
> +
> +void blk_stat_init(struct blk_rq_stat *stat)
> +{
> +	__blk_stat_init(stat, ktime_to_ns(ktime_get()));
> +}
> +
> +static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
> +{
> +	return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
> +}
> +
> +bool blk_stat_is_current(struct blk_rq_stat *stat)
> +{
> +	return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
> +}
> +
> +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
> +{
> +	s64 now, value;
> +
> +	now = __blk_stat_time(ktime_to_ns(ktime_get()));
> +	if (now < blk_stat_time(&rq->issue_stat))
> +		return;
> +
> +	if (!__blk_stat_is_current(stat, now))
> +		__blk_stat_init(stat, now);
> +
> +	value = now - blk_stat_time(&rq->issue_stat);
> +	if (value > stat->max)
> +		stat->max = value;
> +	if (value < stat->min)
> +		stat->min = value;
> +
> +	if (stat->batch + value < stat->batch ||
> +	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
> +		blk_stat_flush_batch(stat);
> +
> +	stat->batch += value;
> +	stat->nr_batch++;
> +}
> +
> +void blk_stat_clear(struct request_queue *q)
> +{
> +	if (q->mq_ops) {
> +		struct blk_mq_hw_ctx *hctx;
> +		struct blk_mq_ctx *ctx;
> +		int i, j;
> +
> +		queue_for_each_hw_ctx(q, hctx, i) {
> +			hctx_for_each_ctx(hctx, ctx, j) {
> +				blk_stat_init(&ctx->stat[0]);
> +				blk_stat_init(&ctx->stat[1]);
> +			}
> +		}
> +	} else {
> +		blk_stat_init(&q->rq_stats[0]);
> +		blk_stat_init(&q->rq_stats[1]);
> +	}
> +}
> +
> +void blk_stat_set_issue_time(struct blk_issue_stat *stat)
> +{
> +	stat->time = (stat->time & BLK_STAT_MASK) |
> +			(ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
> +}
> diff --git a/block/blk-stat.h b/block/blk-stat.h
> new file mode 100644
> index 000000000000..26b1f45dff26
> --- /dev/null
> +++ b/block/blk-stat.h
> @@ -0,0 +1,37 @@
> +#ifndef BLK_STAT_H
> +#define BLK_STAT_H
> +
> +/*
> + * ~0.13s window as a power-of-2 (2^27 nsecs)
> + */
> +#define BLK_STAT_NSEC		134217728ULL
> +#define BLK_STAT_NSEC_MASK	~(BLK_STAT_NSEC - 1)
> +
> +/*
> + * Upper 3 bits can be used elsewhere
> + */
> +#define BLK_STAT_RES_BITS	3
> +#define BLK_STAT_SHIFT		(64 - BLK_STAT_RES_BITS)
> +#define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SHIFT) - 1)
> +#define BLK_STAT_MASK		~BLK_STAT_TIME_MASK
> +
> +void blk_stat_add(struct blk_rq_stat *, struct request *);
> +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
> +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
> +void blk_stat_clear(struct request_queue *q);
> +void blk_stat_init(struct blk_rq_stat *);
> +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
> +bool blk_stat_is_current(struct blk_rq_stat *);
> +void blk_stat_set_issue_time(struct blk_issue_stat *);
> +
> +static inline u64 __blk_stat_time(u64 time)
> +{
> +	return time & BLK_STAT_TIME_MASK;
> +}
> +
> +static inline u64 blk_stat_time(struct blk_issue_stat *stat)
> +{
> +	return __blk_stat_time(stat->time);
> +}
> +
> +#endif
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 488c2e28feb8..5bb4648f434a 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -401,6 +401,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
>  	return queue_var_show(blk_queue_dax(q), page);
>  }
>
> +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
> +{
> +	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
> +			pre, (long long) stat->nr_samples,
> +			(long long) stat->mean, (long long) stat->min,
> +			(long long) stat->max);
> +}
> +
> +static ssize_t queue_stats_show(struct request_queue *q, char *page)
> +{
> +	struct blk_rq_stat stat[2];
> +	ssize_t ret;
> +
> +	blk_queue_stat_get(q, stat);
> +
> +	ret = print_stat(page, &stat[0], "read :");
> +	ret += print_stat(page + ret, &stat[1], "write:");
> +	return ret;
> +}
> +
>  static struct queue_sysfs_entry queue_requests_entry = {
>  	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
>  	.show = queue_requests_show,
> @@ -553,6 +573,11 @@ static struct queue_sysfs_entry queue_dax_entry = {
>  	.show = queue_dax_show,
>  };
>
> +static struct queue_sysfs_entry queue_stats_entry = {
> +	.attr = {.name = "stats", .mode = S_IRUGO },
> +	.show = queue_stats_show,
> +};
> +
>  static struct attribute *default_attrs[] = {
>  	&queue_requests_entry.attr,
>  	&queue_ra_entry.attr,
> @@ -582,6 +607,7 @@ static struct attribute *default_attrs[] = {
>  	&queue_poll_entry.attr,
>  	&queue_wc_entry.attr,
>  	&queue_dax_entry.attr,
> +	&queue_stats_entry.attr,
>  	NULL,
>  };
>
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index bb921028e7c5..a59a214c39ae 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -248,4 +248,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
>  	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
>  }
>
> +struct blk_issue_stat {
> +	u64 time;
> +};
> +
> +#define BLK_RQ_STAT_BATCH	64
> +
> +struct blk_rq_stat {
> +	s64 mean;
> +	u64 min;
> +	u64 max;
> +	s32 nr_samples;
> +	s32 nr_batch;
> +	u64 batch;
> +	s64 time;
> +};
> +
>  #endif /* __LINUX_BLK_TYPES_H */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 8396da2bb698..dcd8d6e8801f 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -197,6 +197,7 @@ struct request {
>  	struct gendisk *rq_disk;
>  	struct hd_struct *part;
>  	unsigned long start_time;
> +	struct blk_issue_stat issue_stat;
>  #ifdef CONFIG_BLK_CGROUP
>  	struct request_list *rl;		/* rl this rq is alloced from */
>  	unsigned long long start_time_ns;
> @@ -490,6 +491,9 @@ struct request_queue {
>
>  	unsigned int		nr_sorted;
>  	unsigned int		in_flight[2];
> +
> +	struct blk_rq_stat	rq_stats[2];
> +
>  	/*
>  	 * Number of active block driver functions for which blk_drain_queue()
>  	 * must wait. Must be incremented around functions that unlock the
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-block" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Ming Lei