From: Damien Le Moal <Damien.LeMoal@wdc.com>
To: Bart Van Assche <bvanassche@acm.org>, Jens Axboe <axboe@kernel.dk>
Cc: "linux-block@vger.kernel.org" <linux-block@vger.kernel.org>,
Christoph Hellwig <hch@lst.de>, Jaegeuk Kim <jaegeuk@kernel.org>,
Niklas Cassel <Niklas.Cassel@wdc.com>,
Hannes Reinecke <hare@suse.de>
Subject: Re: [PATCH 3/4] block/mq-deadline: Stop using per-CPU counters
Date: Fri, 24 Sep 2021 10:58:35 +0000
Message-ID: <DM6PR04MB7081B7096944F8115A8BE2B6E7A49@DM6PR04MB7081.namprd04.prod.outlook.com>
In-Reply-To: <20210923232655.3907383-3-bvanassche@acm.org>

On 2021/09/24 8:27, Bart Van Assche wrote:
> Calculating the sum over all CPUs of per-CPU counters frequently is
> inefficient. Hence switch from per-CPU to individual counters. Three
> counters are protected by the mq-deadline spinlock since these are
> only accessed from contexts that already hold that spinlock. The fourth
> counter is atomic because protecting it with the mq-deadline spinlock
> would trigger lock contention.
>
> Cc: Damien Le Moal <damien.lemoal@wdc.com>
> Cc: Niklas Cassel <Niklas.Cassel@wdc.com>
> Cc: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/mq-deadline.c | 124 ++++++++++++++++++++------------------------
> 1 file changed, 56 insertions(+), 68 deletions(-)
>
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index 6deb4306bcf3..b0cfc84a4e6b 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -51,17 +51,16 @@ enum dd_prio {
>
> enum { DD_PRIO_COUNT = 3 };
>
> -/* I/O statistics per I/O priority. */
> +/*
> + * I/O statistics per I/O priority. It is fine if these counters overflow.
> + * What matters is that these counters are at least as wide as
> + * log2(max_outstanding_requests).
> + */
> struct io_stats_per_prio {
> - local_t inserted;
> - local_t merged;
> - local_t dispatched;
> - local_t completed;
> -};
> -
> -/* I/O statistics for all I/O priorities (enum dd_prio). */
> -struct io_stats {
> - struct io_stats_per_prio stats[DD_PRIO_COUNT];
> + uint32_t inserted;
> + uint32_t merged;
> + uint32_t dispatched;
> + atomic_t completed;
Why not use 64-bit types (regular unsigned long long and atomic64_t)?
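
To illustrate the wrapping property the comment above relies on (a
hypothetical standalone userspace sketch, not kernel code): the unsigned
subtraction of two wrapping counters stays exact as long as the real
difference fits in the counter width, i.e. as long as fewer than 2^32
requests are outstanding at any time.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t inserted = UINT32_MAX - 1;	/* about to wrap */
	uint32_t completed = UINT32_MAX - 5;	/* 4 requests in flight */

	inserted += 3;				/* wraps around to 1 */

	/* Unsigned subtraction still yields the in-flight count: 4 + 3. */
	assert((uint32_t)(inserted - completed) == 7);
	return 0;
}

If that bound always holds, 32 bits is indeed enough; 64-bit types would
simply make the overflow question moot.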
> };
>
> /*
> @@ -74,6 +73,7 @@ struct dd_per_prio {
> struct list_head fifo_list[DD_DIR_COUNT];
> /* Next request in FIFO order. Read, write or both are NULL. */
> struct request *next_rq[DD_DIR_COUNT];
> + struct io_stats_per_prio stats;
> };
>
> struct deadline_data {
> @@ -88,8 +88,6 @@ struct deadline_data {
> unsigned int batching; /* number of sequential requests made */
> unsigned int starved; /* times reads have starved writes */
>
> - struct io_stats __percpu *stats;
> -
> /*
> * settings that change how the i/o scheduler behaves
> */
> @@ -103,33 +101,6 @@ struct deadline_data {
> spinlock_t zone_lock;
> };
>
> -/* Count one event of type 'event_type' and with I/O priority 'prio' */
> -#define dd_count(dd, event_type, prio) do { \
> - struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
> - \
> - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
> - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
> - local_inc(&io_stats->stats[(prio)].event_type); \
> - put_cpu_ptr(io_stats); \
> -} while (0)
> -
> -/*
> - * Returns the total number of dd_count(dd, event_type, prio) calls across all
> - * CPUs. No locking or barriers since it is fine if the returned sum is slightly
> - * outdated.
> - */
> -#define dd_sum(dd, event_type, prio) ({ \
> - unsigned int cpu; \
> - u32 sum = 0; \
> - \
> - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
> - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
> - for_each_present_cpu(cpu) \
> - sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
> - stats[(prio)].event_type); \
> - sum; \
> -})
> -
> /* Maps an I/O priority class to a deadline scheduler priority. */
> static const enum dd_prio ioprio_class_to_prio[] = {
> [IOPRIO_CLASS_NONE] = DD_BE_PRIO,
> @@ -233,7 +204,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
> const u8 ioprio_class = dd_rq_ioclass(next);
> const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>
> - dd_count(dd, merged, prio);
> + lockdep_assert_held(&dd->lock);
> +
> + dd->per_prio[prio].stats.merged++;
>
> /*
> * if next expires before rq, assign its expire time to rq
> @@ -273,7 +246,11 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
> /* Number of requests queued for a given priority level. */
> static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
> {
> - return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
> + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
> +
> + lockdep_assert_held(&dd->lock);
> +
> + return stats->inserted - atomic_read(&stats->completed);
> }
>
> /*
> @@ -463,7 +440,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
> done:
> ioprio_class = dd_rq_ioclass(rq);
> prio = ioprio_class_to_prio[ioprio_class];
> - dd_count(dd, dispatched, prio);
> + dd->per_prio[prio].stats.dispatched++;
> /*
> * If the request needs its target zone locked, do it.
> */
> @@ -542,19 +519,22 @@ static void dd_exit_sched(struct elevator_queue *e)
>
> for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
> struct dd_per_prio *per_prio = &dd->per_prio[prio];
> + const struct io_stats_per_prio *stats = &per_prio->stats;
> + uint32_t queued;
>
> WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
> WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
> - WARN_ONCE(dd_queued(dd, prio) != 0,
> +
> + spin_lock(&dd->lock);
> + queued = dd_queued(dd, prio);
> + spin_unlock(&dd->lock);
> +
> + WARN_ONCE(queued,
> "statistics for priority %d: i %u m %u d %u c %u\n",
> - prio, dd_sum(dd, inserted, prio),
> - dd_sum(dd, merged, prio),
> - dd_sum(dd, dispatched, prio),
> - dd_sum(dd, completed, prio));
> + prio, stats->inserted, stats->merged,
> + stats->dispatched, atomic_read(&stats->completed));
> }
>
> - free_percpu(dd->stats);
> -
> kfree(dd);
> }
>
> @@ -578,11 +558,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
>
> eq->elevator_data = dd;
>
> - dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
> - GFP_KERNEL | __GFP_ZERO);
> - if (!dd->stats)
> - goto free_dd;
> -
> for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
> struct dd_per_prio *per_prio = &dd->per_prio[prio];
>
> @@ -604,9 +579,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
> q->elevator = eq;
> return 0;
>
> -free_dd:
> - kfree(dd);
> -
> put_eq:
> kobject_put(&eq->kobj);
> return ret;
> @@ -689,8 +661,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
> blk_req_zone_write_unlock(rq);
>
> prio = ioprio_class_to_prio[ioprio_class];
> + per_prio = &dd->per_prio[prio];
> if (!rq->elv.priv[0]) {
> - dd_count(dd, inserted, prio);
> + per_prio->stats.inserted++;
> rq->elv.priv[0] = (void *)(uintptr_t)1;
> }
>
> @@ -701,7 +674,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>
> trace_block_rq_insert(rq);
>
> - per_prio = &dd->per_prio[prio];
> if (at_head) {
> list_add(&rq->queuelist, &per_prio->dispatch);
> } else {
> @@ -779,7 +751,7 @@ static void dd_finish_request(struct request *rq)
> if (!rq->elv.priv[0])
> return;
>
> - dd_count(dd, completed, prio);
> + atomic_inc(&per_prio->stats.completed);
>
> if (blk_queue_is_zoned(q)) {
> unsigned long flags;
> @@ -966,28 +938,44 @@ static int dd_queued_show(void *data, struct seq_file *m)
> {
> struct request_queue *q = data;
> struct deadline_data *dd = q->elevator->elevator_data;
> + u32 rt, be, idle;
> +
> + spin_lock(&dd->lock);
> + rt = dd_queued(dd, DD_RT_PRIO);
> + be = dd_queued(dd, DD_BE_PRIO);
> + idle = dd_queued(dd, DD_IDLE_PRIO);
> + spin_unlock(&dd->lock);
> +
> + seq_printf(m, "%u %u %u\n", rt, be, idle);
>
> - seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
> - dd_queued(dd, DD_BE_PRIO),
> - dd_queued(dd, DD_IDLE_PRIO));
> return 0;
> }
>
> /* Number of requests owned by the block driver for a given priority. */
> static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
> {
> - return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
> - - dd_sum(dd, completed, prio);
> + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
> +
> + lockdep_assert_held(&dd->lock);
> +
> + return stats->dispatched + stats->merged -
> + atomic_read(&stats->completed);
> }
>
> static int dd_owned_by_driver_show(void *data, struct seq_file *m)
> {
> struct request_queue *q = data;
> struct deadline_data *dd = q->elevator->elevator_data;
> + u32 rt, be, idle;
> +
> + spin_lock(&dd->lock);
> + rt = dd_owned_by_driver(dd, DD_RT_PRIO);
> + be = dd_owned_by_driver(dd, DD_BE_PRIO);
> + idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
> + spin_unlock(&dd->lock);
> +
> + seq_printf(m, "%u %u %u\n", rt, be, idle);
>
> - seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
> - dd_owned_by_driver(dd, DD_BE_PRIO),
> - dd_owned_by_driver(dd, DD_IDLE_PRIO));
> return 0;
> }
>
>
Apart from the nit/comment above, looks OK to me.
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
--
Damien Le Moal
Western Digital Research