From c49ec4e8b0e4135a87c9894597901539f3e3ca08 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 21 Dec 2016 12:39:33 +0100
Subject: [PATCH 3/3] blk-mq: Split driver and scheduler tags

Add 'sched_tags' next to 'tags' in struct blk_mq_hw_ctx and also in
struct blk_mq_tag_set. Add 'sched_tag' next to 'tag' in struct
request. Modify blk_mq_update_nr_requests() such that it accepts
values larger than the queue depth. Make __blk_mq_free_request() free
both tags. Make blk_mq_alloc_tag_set() allocate both tag sets. Make
blk_mq_free_tag_set() free both tag sets. Make
blk_mq_dispatch_rq_list() allocate the driver tag.
---
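
The heart of this patch is the split between the two tag spaces: a
request is allocated out of the per-hctx 'sched_tags' set, whose size
follows q->nr_requests, and it only acquires a driver tag from the
'tags' set, whose size follows set->queue_depth, when it is actually
dispatched. Until then rq->tag stays -1. The standalone C program
below is a userspace model of that flow, not kernel code; the depths,
the alloc_tag() helper and the request struct are invented for
illustration:

#include <stdio.h>

#define SCHED_DEPTH 8	/* models q->nr_requests */
#define DRV_DEPTH   2	/* models set->queue_depth */

struct request { int tag; int sched_tag; };

/* find-first-zero-bit style tag allocator */
static int alloc_tag(unsigned long *map, int depth)
{
	for (int i = 0; i < depth; i++)
		if (!(*map & (1UL << i))) {
			*map |= 1UL << i;
			return i;
		}
	return -1;
}

int main(void)
{
	unsigned long sched_map = 0, drv_map = 0;
	struct request rqs[SCHED_DEPTH];

	/* Allocation only consumes a scheduler tag, as in
	 * __blk_mq_alloc_request(). */
	for (int i = 0; i < SCHED_DEPTH; i++) {
		rqs[i].sched_tag = alloc_tag(&sched_map, SCHED_DEPTH);
		rqs[i].tag = -1;
	}

	/* Dispatch assigns driver tags lazily, as in
	 * blk_mq_dispatch_rq_list(). */
	for (int i = 0; i < SCHED_DEPTH; i++) {
		rqs[i].tag = alloc_tag(&drv_map, DRV_DEPTH);
		if (rqs[i].tag < 0) {
			printf("rq %d: no driver tag, stays queued\n", i);
			break;
		}
		printf("rq %d: sched_tag %d -> tag %d\n",
		       i, rqs[i].sched_tag, rqs[i].tag);
	}
	return 0;
}

Running it shows the first DRV_DEPTH requests obtaining driver tags
while the rest stay queued, which is why blk_mq_dispatch_rq_list()
below breaks out of its dispatch loop when blk_mq_assign_drv_tag()
fails.
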
 block/blk-flush.c      |   9 ++-
 block/blk-mq.c         | 160 +++++++++++++++++++++++++++++++++++--------
 block/blk-mq.h         |   5 +-
 block/blk-tag.c        |   1 +
 include/linux/blk-mq.h |   2 +
 include/linux/blkdev.h |   1 +
 6 files changed, 129 insertions(+), 49 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 6a7c29d2eb3c..46d12bbfde85 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -170,6 +170,8 @@ static bool blk_flush_complete_seq(struct request *rq,
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	bool queued = false, kicked;
 
+	BUG_ON(rq->tag < 0);
+
 	BUG_ON(rq->flush.seq & seq);
 	rq->flush.seq |= seq;
 
@@ -319,6 +321,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	if (q->mq_ops) {
 		struct blk_mq_hw_ctx *hctx;
 
+		BUG_ON(first_rq->tag < 0);
+
 		flush_rq->mq_ctx = first_rq->mq_ctx;
 		flush_rq->tag = first_rq->tag;
 		fq->orig_rq = first_rq;
@@ -452,8 +456,9 @@ void blk_insert_flush(struct request *rq)
 	 * processed directly without going through flush machinery. Queue
 	 * for normal execution.
 	 */
-	if ((policy & REQ_FSEQ_DATA) &&
-	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+	if (((policy & REQ_FSEQ_DATA) &&
+	     !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) ||
+	    (q->mq_ops && blk_mq_assign_drv_tag(rq) < 0)) {
 		if (q->mq_ops)
 			blk_mq_sched_insert_request(rq, false, true, false);
 		else
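
The blk-flush.c hunks above enforce an ordering requirement rather
than a behavioural change: blk_kick_flush() hands first_rq->tag to the
flush request itself, so a request must own a driver tag before it
enters the flush state machine. The new branch in blk_insert_flush()
therefore bounces a request back to the scheduler when
blk_mq_assign_drv_tag() fails, and the two BUG_ON()s assert the
invariant afterwards. A minimal userspace model of that guard, with
get_drv_tag() standing in for blk_mq_assign_drv_tag():

#include <stdio.h>

static int drv_tags_left = 1;	/* pretend one driver tag remains */

static int get_drv_tag(void)
{
	return drv_tags_left-- > 0 ? 0 : -1;
}

static void insert_flush(int rq_id)
{
	int tag = get_drv_tag();

	if (tag < 0) {
		/* models blk_mq_sched_insert_request(): retry later */
		printf("rq %d: no driver tag, back to the scheduler\n", rq_id);
		return;
	}
	/* from here on rq->tag >= 0 holds, matching the BUG_ON()s */
	printf("rq %d: enters flush machinery with tag %d\n", rq_id, tag);
}

int main(void)
{
	insert_flush(1);	/* gets the last tag */
	insert_flush(2);	/* deferred */
	return 0;
}
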
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b68b7fc43e46..48d7968d4ed9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -220,20 +220,21 @@ EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 				       unsigned int op)
 {
-	struct blk_mq_tags *tags = data->hctx->tags;
+	struct blk_mq_tags *tags = data->hctx->sched_tags;
 	struct request *rq;
-	unsigned int tag;
+	unsigned int sched_tag;
 
-	tag = blk_mq_get_tag(data, tags);
-	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = tags->rqs[tag];
+	sched_tag = blk_mq_get_tag(data, tags);
+	if (sched_tag != BLK_MQ_TAG_FAIL) {
+		rq = tags->rqs[sched_tag];
+		rq->tag = -1;
 
 		if (blk_mq_tag_busy(data->hctx)) {
 			rq->rq_flags = RQF_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
 		}
 
-		rq->tag = tag;
+		rq->sched_tag = sched_tag;
 		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
 		return rq;
 	}
@@ -328,6 +329,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx
 			     struct request *rq)
 {
 	const int tag = rq->tag;
+	const int sched_tag = rq->sched_tag;
 	struct request_queue *q = rq->q;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
@@ -340,7 +342,13 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-	blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+	if (tag >= 0) {
+		WARN_ON_ONCE(hctx->tags->rqs[tag] != rq);
+		hctx->tags->rqs[tag] = NULL;
+		blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+	}
+	if (sched_tag >= 0)
+		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
 	blk_queue_exit(q);
 }
 
@@ -844,6 +852,26 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+int blk_mq_assign_drv_tag(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+	struct blk_mq_alloc_data data = {
+		.q = rq->q,
+		.ctx = rq->mq_ctx,
+		.hctx = hctx,
+	};
+
+	rq->tag = blk_mq_get_tag(&data, hctx->tags);
+	if (rq->tag < 0)
+		goto out;
+	WARN_ON_ONCE(hctx->tags->rqs[rq->tag]);
+	hctx->tags->rqs[rq->tag] = rq;
+
+out:
+	return rq->tag;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -866,6 +894,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list
 		struct blk_mq_queue_data bd;
 
 		rq = list_first_entry(list, struct request, queuelist);
+		if (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0)
+			break;
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -1296,7 +1326,8 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 		goto insert;
 
 	hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
-	if (blk_mq_hctx_stopped(hctx))
+	if (blk_mq_hctx_stopped(hctx) ||
+	    (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0))
 		goto insert;
 
 	new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
@@ -1592,17 +1623,19 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 }
 
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-					unsigned int hctx_idx)
+					unsigned int hctx_idx,
+					unsigned int nr_tags,
+					unsigned int reserved_tags)
 {
 	struct blk_mq_tags *tags;
 
-	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+	tags = blk_mq_init_tags(nr_tags, reserved_tags,
 				set->numa_node,
 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+	tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 set->numa_node);
 	if (!tags->rqs) {
@@ -1800,6 +1833,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
 	hctx->tags = set->tags[hctx_idx];
+	hctx->sched_tags = set->sched_tags[hctx_idx];
 
 	/*
 	 * Allocate space for all possible cpus to avoid allocation at
@@ -1881,6 +1915,38 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	}
 }
 
+static void __blk_mq_free_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx)
+{
+	if (set->sched_tags[hctx_idx]) {
+		blk_mq_free_rqs(set, set->sched_tags[hctx_idx], hctx_idx);
+		blk_mq_free_rq_map(set->sched_tags[hctx_idx]);
+		set->sched_tags[hctx_idx] = NULL;
+	}
+	if (set->tags[hctx_idx]) {
+		blk_mq_free_rq_map(set->tags[hctx_idx]);
+		set->tags[hctx_idx] = NULL;
+	}
+}
+
+static bool __blk_mq_alloc_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx,
+				    unsigned int nr_requests)
+{
+	int ret = 0;
+
+	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+					set->queue_depth, set->reserved_tags);
+	set->sched_tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+					nr_requests, 0);
+	if (set->sched_tags[hctx_idx])
+		ret = blk_mq_alloc_rqs(set, set->sched_tags[hctx_idx],
+				       hctx_idx);
+	if (!set->tags[hctx_idx] || !set->sched_tags[hctx_idx] || ret < 0) {
+		__blk_mq_free_rq_map_i(set, hctx_idx);
+		return false;
+	}
+	return true;
+}
+
 static void blk_mq_map_swqueue(struct request_queue *q,
 			       const struct cpumask *online_mask)
 {
@@ -1909,23 +1975,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 
 		hctx_idx = q->mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
-		if (!set->tags[hctx_idx]) {
-			set->tags[hctx_idx] = blk_mq_alloc_rq_map(set,
-								  hctx_idx);
-			if (blk_mq_alloc_rqs(set, set->tags[hctx_idx],
-					     hctx_idx) < 0) {
-				blk_mq_free_rq_map(set->tags[hctx_idx]);
-				set->tags[hctx_idx] = NULL;
-			}
-
+		if (!set->tags[hctx_idx] &&
+		    !__blk_mq_alloc_rq_map_i(set, hctx_idx, q->nr_requests)) {
 			/*
 			 * If tags initialization fail for some hctx,
 			 * that hctx won't be brought online. In this
 			 * case, remap the current ctx to hctx[0] which
 			 * is guaranteed to always have tags allocated
 			 */
-			if (!set->tags[hctx_idx])
-				q->mq_map[i] = 0;
+			q->mq_map[i] = 0;
 		}
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2318,26 +2376,20 @@ static int blk_mq_queue_reinit_prepare(unsigned int cpu)
 	return 0;
 }
 
-static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+				  unsigned int nr_requests)
 {
 	int i;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		set->tags[i] = blk_mq_alloc_rq_map(set, i);
-		if (!set->tags[i])
+	for (i = 0; i < set->nr_hw_queues; i++)
+		if (!__blk_mq_alloc_rq_map_i(set, i, nr_requests))
 			goto out_unwind;
-		if (blk_mq_alloc_rqs(set, set->tags[i], i) < 0)
-			goto free_rq_map;
-	}
 
 	return 0;
 
 out_unwind:
-	while (--i >= 0) {
-		blk_mq_free_rqs(set, set->tags[i], i);
-free_rq_map:
-		blk_mq_free_rq_map(set->tags[i]);
-	}
+	while (--i >= 0)
+		__blk_mq_free_rq_map_i(set, i);
 
 	return -ENOMEM;
 }
@@ -2347,14 +2399,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+				unsigned int nr_requests)
 {
 	unsigned int depth;
 	int err;
 
 	depth = set->queue_depth;
 	do {
-		err = __blk_mq_alloc_rq_maps(set);
+		err = __blk_mq_alloc_rq_maps(set, nr_requests);
 		if (!err)
 			break;
 
@@ -2385,7 +2438,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
-	int ret;
+	int ret = -ENOMEM;
 
 	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
 
@@ -2425,32 +2478,39 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (!set->tags)
 		return -ENOMEM;
 
-	ret = -ENOMEM;
+	set->sched_tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
+				       GFP_KERNEL, set->numa_node);
+	if (!set->sched_tags)
+		goto free_drv_tags;
+
 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
 				   GFP_KERNEL, set->numa_node);
 	if (!set->mq_map)
-		goto out_free_tags;
+		goto free_sched_tags;
 
 	if (set->ops->map_queues)
 		ret = set->ops->map_queues(set);
 	else
 		ret = blk_mq_map_queues(set);
 	if (ret)
-		goto out_free_mq_map;
+		goto free_mq_map;
 
-	ret = blk_mq_alloc_rq_maps(set);
+	ret = blk_mq_alloc_rq_maps(set, set->queue_depth/*q->nr_requests*/);
 	if (ret)
-		goto out_free_mq_map;
+		goto free_mq_map;
 
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;
 
-out_free_mq_map:
+free_mq_map:
 	kfree(set->mq_map);
 	set->mq_map = NULL;
-out_free_tags:
+free_sched_tags:
+	kfree(set->sched_tags);
+	set->sched_tags = NULL;
+free_drv_tags:
 	kfree(set->tags);
 	set->tags = NULL;
 	return ret;
@@ -2465,12 +2525,16 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 		if (set->tags[i]) {
 			blk_mq_free_rqs(set, set->tags[i], i);
 			blk_mq_free_rq_map(set->tags[i]);
+			blk_mq_free_rq_map(set->sched_tags[i]);
 		}
 	}
 
 	kfree(set->mq_map);
 	set->mq_map = NULL;
 
+	kfree(set->sched_tags);
+	set->sched_tags = NULL;
+
 	kfree(set->tags);
 	set->tags = NULL;
 }
@@ -2482,14 +2546,18 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	struct blk_mq_hw_ctx *hctx;
 	int i, ret;
 
-	if (!set || nr > set->queue_depth)
+	if (!set)
 		return -EINVAL;
 
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->tags)
 			continue;
-		ret = blk_mq_tag_update_depth(hctx->tags, nr);
+		ret = blk_mq_tag_update_depth(hctx->tags,
+					      min(nr, set->queue_depth));
+		if (ret)
+			break;
+		ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
 		if (ret)
 			break;
 	}
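
Note the asymmetry in __blk_mq_alloc_rq_map_i() above: only the
scheduler map is backed by preallocated requests via
blk_mq_alloc_rqs(); the driver map's rqs[] slots start out empty, are
filled by blk_mq_assign_drv_tag() at dispatch time, and are cleared
again in __blk_mq_finish_request(). A userspace sketch of that paired
allocate-or-unwind idiom (all names invented, not kernel code):

#include <stdbool.h>
#include <stdlib.h>

struct tag_map {
	void **rqs;		/* tag -> request slot */
	unsigned int depth;
};

static struct tag_map *alloc_map(unsigned int depth, bool backed)
{
	struct tag_map *map = calloc(1, sizeof(*map));

	if (!map || !(map->rqs = calloc(depth, sizeof(*map->rqs)))) {
		free(map);
		return NULL;
	}
	map->depth = depth;
	if (backed)	/* models blk_mq_alloc_rqs() */
		for (unsigned int i = 0; i < depth; i++)
			map->rqs[i] = malloc(64);
	return map;
}

static void free_map(struct tag_map *map)
{
	if (!map)
		return;
	for (unsigned int i = 0; i < map->depth; i++)
		free(map->rqs[i]);
	free(map->rqs);
	free(map);
}

/* models __blk_mq_alloc_rq_map_i(): allocate both maps or neither */
static bool alloc_map_pair(struct tag_map **drv, struct tag_map **sched,
			   unsigned int queue_depth, unsigned int nr_requests)
{
	*drv = alloc_map(queue_depth, false);	/* no request backing */
	*sched = alloc_map(nr_requests, true);	/* backed by requests */
	if (!*drv || !*sched) {
		free_map(*drv);
		free_map(*sched);
		*drv = *sched = NULL;
		return false;
	}
	return true;
}

int main(void)
{
	struct tag_map *drv, *sched;

	if (!alloc_map_pair(&drv, &sched, 2, 8))
		return 1;
	free_map(sched);
	free_map(drv);
	return 0;
}
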
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2e98dd8ccee2..0368c513c2ab 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
+int blk_mq_assign_drv_tag(struct request *rq);
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 
@@ -41,7 +42,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx);
 void blk_mq_free_rq_map(struct blk_mq_tags *tags);
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-					unsigned int hctx_idx);
+					unsigned int hctx_idx,
+					unsigned int nr_tags,
+					unsigned int reserved_tags);
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx);
 
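
With the sched_tags map in place, blk_mq_update_nr_requests() no
longer rejects nr > set->queue_depth: the driver tag depth is clamped
with min(nr, set->queue_depth) while the scheduler tag depth follows
nr directly. A throwaway C program illustrating the resulting split
(plain userspace arithmetic, not kernel code):

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	const unsigned int queue_depth = 64;	/* models set->queue_depth */

	for (unsigned int nr = 32; nr <= 256; nr *= 2)
		printf("nr_requests %3u -> driver depth %2u, sched depth %3u\n",
		       nr, min(nr, queue_depth), nr);
	return 0;
}
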
diff --git a/block/blk-tag.c b/block/blk-tag.c
index bae1decb6ec3..319a3a3eb1d7 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 	list_del_init(&rq->queuelist);
 	rq->rq_flags &= ~RQF_QUEUED;
 	rq->tag = -1;
+	rq->sched_tag = -1;
 
 	if (unlikely(bqt->tag_index[tag] == NULL))
 		printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9255ccb043f2..377594bcda8d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,6 +36,7 @@ struct blk_mq_hw_ctx {
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
+	struct blk_mq_tags	*sched_tags;
 
 	struct srcu_struct	queue_rq_srcu;
 
@@ -72,6 +73,7 @@ struct blk_mq_tag_set {
 	void			*driver_data;
 
 	struct blk_mq_tags	**tags;
+	struct blk_mq_tags	**sched_tags;
 
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7c40fb838b44..112b57bce9e9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -223,6 +223,7 @@ struct request {
 	void *special;		/* opaque pointer available for LLD use */
 
 	int tag;
+	int sched_tag;
 	int errors;
 
 	/*
-- 
2.11.0