From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linux-kernel-owner@vger.kernel.org>
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1756211AbcEEHwt (ORCPT <rfc822;w@1wt.eu>);
	Thu, 5 May 2016 03:52:49 -0400
Received: from mail-yw0-f193.google.com ([209.85.161.193]:36075 "EHLO
	mail-yw0-f193.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751528AbcEEHwr (ORCPT
	<rfc822;linux-kernel@vger.kernel.org>);
	Thu, 5 May 2016 03:52:47 -0400
MIME-Version: 1.0
In-Reply-To: <1461686131-22999-7-git-send-email-axboe@fb.com>
References: <1461686131-22999-1-git-send-email-axboe@fb.com>
	<1461686131-22999-7-git-send-email-axboe@fb.com>
Date: Thu, 5 May 2016 15:52:46 +0800
Message-ID: <CACVXFVPYQUEpoSHL8pdyCx41gZ3_=eEy45Sr4HOOvQF9ErGrGA@mail.gmail.com>
Subject: Re: [PATCH 6/8] block: add scalable completion tracking of requests
From: Ming Lei <tom.leiming@gmail.com>
To: Jens Axboe <axboe@fb.com>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
        Linux FS Devel <linux-fsdevel@vger.kernel.org>,
        linux-block@vger.kernel.org, Jan Kara <jack@suse.cz>,
        dchinner@redhat.com, sedat.dilek@gmail.com
Content-Type: text/plain; charset=UTF-8
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org

On Tue, Apr 26, 2016 at 11:55 PM, Jens Axboe <axboe@fb.com> wrote:
> For legacy block, we simply track them in the request queue. For
> blk-mq, we track them on a per-sw queue basis, which we can then
> sum up through the hardware queues and finally to a per device
> state.
>
> The stats are tracked in, roughly, 0.1s interval windows.
>
> Add sysfs files to display the stats.
>
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
>  block/Makefile            |   2 +-
>  block/blk-core.c          |   4 +
>  block/blk-mq-sysfs.c      |  47 ++++++++++++
>  block/blk-mq.c            |  14 ++++
>  block/blk-mq.h            |   3 +
>  block/blk-stat.c          | 184 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/blk-stat.h          |  17 +++++
>  block/blk-sysfs.c         |  26 +++++++
>  include/linux/blk_types.h |   8 ++
>  include/linux/blkdev.h    |   4 +
>  10 files changed, 308 insertions(+), 1 deletion(-)
>  create mode 100644 block/blk-stat.c
>  create mode 100644 block/blk-stat.h
>
> diff --git a/block/Makefile b/block/Makefile
> index 9eda2322b2d4..3446e0472df0 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -5,7 +5,7 @@
>  obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
>                         blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
>                         blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
> -                       blk-lib.o blk-mq.o blk-mq-tag.o \
> +                       blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
>                         blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
>                         genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
>                         badblocks.o partitions/
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 74c16fd8995d..40b57bf4852c 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2514,6 +2514,8 @@ void blk_start_request(struct request *req)
>  {
>         blk_dequeue_request(req);
>
> +       req->issue_time = ktime_to_ns(ktime_get());
> +
>         /*
>          * We are now handing the request to the hardware, initialize
>          * resid_len to full count and add the timeout handler.
> @@ -2581,6 +2583,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
>
>         trace_block_rq_complete(req->q, req, nr_bytes);
>
> +       blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);

blk_update_request() is often run without holding the queue lock, so
updating the queue's stats here might be a problem in the non-blk-mq
case. Maybe it is better to do it in blk_finish_request()?

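For blk-mq, blk_stat_add() should be avoided here, since this patch
already accounts those requests per software queue in
__blk_mq_complete_request().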

> +
>         if (!req->bio)
>                 return false;
>
> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> index 4ea4dd8a1eed..2f68015f8616 100644
> --- a/block/blk-mq-sysfs.c
> +++ b/block/blk-mq-sysfs.c
> @@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
>         return ret;
>  }
>
> +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
> +{
> +       struct blk_mq_ctx *ctx;
> +       unsigned int i;
> +
> +       hctx_for_each_ctx(hctx, ctx, i) {
> +               blk_stat_init(&ctx->stat[0]);
> +               blk_stat_init(&ctx->stat[1]);
> +       }
> +}
> +
> +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
> +                                         const char *page, size_t count)
> +{
> +       blk_mq_stat_clear(hctx);
> +       return count;
> +}
> +
> +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
> +{
> +       return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
> +                       pre, (long long) stat->nr_samples,
> +                       (long long) stat->mean, (long long) stat->min,
> +                       (long long) stat->max);
> +}
> +
> +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
> +{
> +       struct blk_rq_stat stat[2];
> +       ssize_t ret;
> +
> +       blk_stat_init(&stat[0]);
> +       blk_stat_init(&stat[1]);
> +
> +       blk_hctx_stat_get(hctx, stat);
> +
> +       ret = print_stat(page, &stat[0], "read :");
> +       ret += print_stat(page + ret, &stat[1], "write:");
> +       return ret;
> +}
> +
>  static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
>         .attr = {.name = "dispatched", .mode = S_IRUGO },
>         .show = blk_mq_sysfs_dispatched_show,
> @@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
>         .attr = {.name = "io_poll", .mode = S_IRUGO },
>         .show = blk_mq_hw_sysfs_poll_show,
>  };
> +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
> +       .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
> +       .show = blk_mq_hw_sysfs_stat_show,
> +       .store = blk_mq_hw_sysfs_stat_store,
> +};
>
>  static struct attribute *default_hw_ctx_attrs[] = {
>         &blk_mq_hw_sysfs_queued.attr,
> @@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
>         &blk_mq_hw_sysfs_cpus.attr,
>         &blk_mq_hw_sysfs_active.attr,
>         &blk_mq_hw_sysfs_poll.attr,
> +       &blk_mq_hw_sysfs_stat.attr,
>         NULL,
>  };
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 1699baf39b78..71b4a13fbf94 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -29,6 +29,7 @@
>  #include "blk.h"
>  #include "blk-mq.h"
>  #include "blk-mq-tag.h"
> +#include "blk-stat.h"
>
>  static DEFINE_MUTEX(all_q_mutex);
>  static LIST_HEAD(all_q_list);
> @@ -356,10 +357,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
>         put_cpu();
>  }
>
> +static void blk_mq_stat_add(struct request *rq)
> +{
> +       struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
> +
> +       blk_stat_add(stat, rq);
> +}
> +
>  static void __blk_mq_complete_request(struct request *rq)
>  {
>         struct request_queue *q = rq->q;
>
> +       blk_mq_stat_add(rq);
> +
>         if (!q->softirq_done_fn)
>                 blk_mq_end_request(rq, rq->errors);
>         else
> @@ -403,6 +413,8 @@ void blk_mq_start_request(struct request *rq)
>         if (unlikely(blk_bidi_rq(rq)))
>                 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
>
> +       rq->issue_time = ktime_to_ns(ktime_get());
> +
>         blk_add_timer(rq);
>
>         /*
> @@ -1761,6 +1773,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>                 spin_lock_init(&__ctx->lock);
>                 INIT_LIST_HEAD(&__ctx->rq_list);
>                 __ctx->queue = q;
> +               blk_stat_init(&__ctx->stat[0]);
> +               blk_stat_init(&__ctx->stat[1]);
>
>                 /* If the cpu isn't online, the cpu is mapped to first hctx */
>                 if (!cpu_online(i))
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 9087b11037b7..e107f700ff17 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -1,6 +1,8 @@
>  #ifndef INT_BLK_MQ_H
>  #define INT_BLK_MQ_H
>
> +#include "blk-stat.h"
> +
>  struct blk_mq_tag_set;
>
>  struct blk_mq_ctx {
> @@ -20,6 +22,7 @@ struct blk_mq_ctx {
>
>         /* incremented at completion time */
>         unsigned long           ____cacheline_aligned_in_smp rq_completed[2];
> +       struct blk_rq_stat      stat[2];
>
>         struct request_queue    *queue;
>         struct kobject          kobj;
> diff --git a/block/blk-stat.c b/block/blk-stat.c
> new file mode 100644
> index 000000000000..b38776a83173
> --- /dev/null
> +++ b/block/blk-stat.c
> @@ -0,0 +1,184 @@
> +/*
> + * Block stat tracking code
> + *
> + * Copyright (C) 2016 Jens Axboe
> + */
> +#include <linux/kernel.h>
> +#include <linux/blk-mq.h>
> +
> +#include "blk-stat.h"
> +#include "blk-mq.h"
> +
> +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
> +{
> +       if (!src->nr_samples)
> +               return;
> +
> +       dst->min = min(dst->min, src->min);
> +       dst->max = max(dst->max, src->max);
> +
> +       if (!dst->nr_samples)
> +               dst->mean = src->mean;
> +       else {
> +               dst->mean = div64_s64((src->mean * src->nr_samples) +
> +                                       (dst->mean * dst->nr_samples),
> +                                       dst->nr_samples + src->nr_samples);
> +       }
> +       dst->nr_samples += src->nr_samples;
> +}
> +
> +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
> +{
> +       struct blk_mq_hw_ctx *hctx;
> +       struct blk_mq_ctx *ctx;
> +       int i, j, nr;
> +
> +       blk_stat_init(&dst[0]);
> +       blk_stat_init(&dst[1]);
> +
> +       nr = 0;
> +       do {
> +               uint64_t newest = 0;
> +
> +               queue_for_each_hw_ctx(q, hctx, i) {
> +                       hctx_for_each_ctx(hctx, ctx, j) {
> +                               if (!ctx->stat[0].nr_samples &&
> +                                   !ctx->stat[1].nr_samples)
> +                                       continue;
> +                               if (ctx->stat[0].time > newest)
> +                                       newest = ctx->stat[0].time;
> +                               if (ctx->stat[1].time > newest)
> +                                       newest = ctx->stat[1].time;
> +                       }
> +               }
> +
> +               /*
> +                * No samples
> +                */
> +               if (!newest)
> +                       break;
> +
> +               queue_for_each_hw_ctx(q, hctx, i) {
> +                       hctx_for_each_ctx(hctx, ctx, j) {
> +                               if (ctx->stat[0].time == newest) {
> +                                       blk_stat_sum(&dst[0], &ctx->stat[0]);
> +                                       nr++;
> +                               }
> +                               if (ctx->stat[1].time == newest) {
> +                                       blk_stat_sum(&dst[1], &ctx->stat[1]);
> +                                       nr++;
> +                               }
> +                       }
> +               }
> +               /*
> +                * If we race on finding an entry, just loop back again.
> +                * Should be very rare.
> +                */
> +       } while (!nr);
> +}
> +
> +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
> +{
> +       if (q->mq_ops)
> +               blk_mq_stat_get(q, dst);
> +       else {
> +               memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
> +               memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
> +       }
> +}
> +
> +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
> +{
> +       struct blk_mq_ctx *ctx;
> +       unsigned int i, nr;
> +
> +       nr = 0;
> +       do {
> +               uint64_t newest = 0;
> +
> +               hctx_for_each_ctx(hctx, ctx, i) {
> +                       if (!ctx->stat[0].nr_samples &&
> +                           !ctx->stat[1].nr_samples)
> +                               continue;
> +
> +                       if (ctx->stat[0].time > newest)
> +                               newest = ctx->stat[0].time;
> +                       if (ctx->stat[1].time > newest)
> +                               newest = ctx->stat[1].time;
> +               }
> +
> +               if (!newest)
> +                       break;
> +
> +               hctx_for_each_ctx(hctx, ctx, i) {
> +                       if (ctx->stat[0].time == newest) {
> +                               blk_stat_sum(&dst[0], &ctx->stat[0]);
> +                               nr++;
> +                       }
> +                       if (ctx->stat[1].time == newest) {
> +                               blk_stat_sum(&dst[1], &ctx->stat[1]);
> +                               nr++;
> +                       }
> +               }
> +               /*
> +                * If we race on finding an entry, just loop back again.
> +                * Should be very rare, as the window is only updated
> +                * occasionally
> +                */
> +       } while (!nr);
> +}
> +
> +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
> +{
> +       stat->min = -1ULL;
> +       stat->max = stat->nr_samples = stat->mean = 0;
> +       stat->time = time_now & BLK_STAT_MASK;
> +}
> +
> +void blk_stat_init(struct blk_rq_stat *stat)
> +{
> +       __blk_stat_init(stat, ktime_to_ns(ktime_get()));
> +}
> +
> +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
> +{
> +       s64 delta, now, value;
> +
> +       now = ktime_to_ns(ktime_get());
> +       if (now < rq->issue_time)
> +               return;
> +
> +       if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK))
> +               __blk_stat_init(stat, now);
> +
> +       value = now - rq->issue_time;
> +       if (value > stat->max)
> +               stat->max = value;
> +       if (value < stat->min)
> +               stat->min = value;
> +
> +       delta = value - stat->mean;
> +       if (delta)
> +               stat->mean += div64_s64(delta, stat->nr_samples + 1);
> +
> +       stat->nr_samples++;
> +}
> +
> +void blk_stat_clear(struct request_queue *q)
> +{
> +       if (q->mq_ops) {
> +               struct blk_mq_hw_ctx *hctx;
> +               struct blk_mq_ctx *ctx;
> +               int i, j;
> +
> +               queue_for_each_hw_ctx(q, hctx, i) {
> +                       hctx_for_each_ctx(hctx, ctx, j) {
> +                               blk_stat_init(&ctx->stat[0]);
> +                               blk_stat_init(&ctx->stat[1]);
> +                       }
> +               }
> +       } else {
> +               blk_stat_init(&q->rq_stats[0]);
> +               blk_stat_init(&q->rq_stats[1]);
> +       }
> +}
> diff --git a/block/blk-stat.h b/block/blk-stat.h
> new file mode 100644
> index 000000000000..d77548dbf196
> --- /dev/null
> +++ b/block/blk-stat.h
> @@ -0,0 +1,17 @@
> +#ifndef BLK_STAT_H
> +#define BLK_STAT_H
> +
> +/*
> + * ~0.13s window as a power-of-2 (2^27 nsecs)
> + */
> +#define BLK_STAT_NSEC  134217728ULL
> +#define BLK_STAT_MASK  ~(BLK_STAT_NSEC - 1)
> +
> +void blk_stat_add(struct blk_rq_stat *, struct request *);
> +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
> +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
> +void blk_stat_clear(struct request_queue *q);
> +void blk_stat_init(struct blk_rq_stat *);
> +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
> +
> +#endif
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 99205965f559..6e516cc0d3d0 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -379,6 +379,26 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page,
>         return count;
>  }
>
> +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
> +{
> +       return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
> +                       pre, (long long) stat->nr_samples,
> +                       (long long) stat->mean, (long long) stat->min,
> +                       (long long) stat->max);
> +}
> +
> +static ssize_t queue_stats_show(struct request_queue *q, char *page)
> +{
> +       struct blk_rq_stat stat[2];
> +       ssize_t ret;
> +
> +       blk_queue_stat_get(q, stat);
> +
> +       ret = print_stat(page, &stat[0], "read :");
> +       ret += print_stat(page + ret, &stat[1], "write:");
> +       return ret;
> +}
> +
>  static struct queue_sysfs_entry queue_requests_entry = {
>         .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
>         .show = queue_requests_show,
> @@ -516,6 +536,11 @@ static struct queue_sysfs_entry queue_wc_entry = {
>         .store = queue_wc_store,
>  };
>
> +static struct queue_sysfs_entry queue_stats_entry = {
> +       .attr = {.name = "stats", .mode = S_IRUGO },
> +       .show = queue_stats_show,
> +};
> +
>  static struct attribute *default_attrs[] = {
>         &queue_requests_entry.attr,
>         &queue_ra_entry.attr,
> @@ -542,6 +567,7 @@ static struct attribute *default_attrs[] = {
>         &queue_random_entry.attr,
>         &queue_poll_entry.attr,
>         &queue_wc_entry.attr,
> +       &queue_stats_entry.attr,
>         NULL,
>  };
>
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 223012451c7a..2b4414fb4d8e 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -268,4 +268,12 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
>         return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
>  }
>
> +struct blk_rq_stat {
> +       s64 mean;
> +       u64 min;
> +       u64 max;
> +       s64 nr_samples;
> +       s64 time;
> +};
> +
>  #endif /* __LINUX_BLK_TYPES_H */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index eee94bd6de52..87f6703ced71 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -153,6 +153,7 @@ struct request {
>         struct gendisk *rq_disk;
>         struct hd_struct *part;
>         unsigned long start_time;
> +       s64 issue_time;

io_start_time_ns may be reused for the same purpose.

>  #ifdef CONFIG_BLK_CGROUP
>         struct request_list *rl;                /* rl this rq is alloced from */
>         unsigned long long start_time_ns;
> @@ -402,6 +403,9 @@ struct request_queue {
>
>         unsigned int            nr_sorted;
>         unsigned int            in_flight[2];
> +
> +       struct blk_rq_stat      rq_stats[2];
> +
>         /*
>          * Number of active block driver functions for which blk_drain_queue()
>          * must wait. Must be incremented around functions that unlock the
> --
> 2.8.0.rc4.6.g7e4ba36
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


-- 
Ming Lei