* [PATCHSET v2 0/6] blk-mq: per-ctx tag caching
From: Jens Axboe @ 2020-01-07 16:30 UTC
  To: linux-block

Here's v2 of this patchset. It should be solid now; the previous version
didn't handle tag flushing correctly, nor did it handle multiple hardware
queue types.

The idea here is that we can reduce the cost of getting a tag for a new
request by not grabbing tags piecemeal. Add a per-ctx tag cache, and grab
a batch of tags from the shared map whenever the cache is empty. If the
cache is not empty, we can just find a free bit there.

/sys/kernel/debug/block/<dev>/<hctx>/<cpu>/tag_hit holds hit and refill
statistics for the cache, so you can check how well it's doing.

I've seen nice improvements with this in testing.
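
To illustrate the idea, here is a simplified userspace model of such a
cache (illustration only, not the actual patches: the names, the single
shared word, and the lack of atomics are assumptions; the real code keeps
one cache per software queue and hctx type, and uses cmpxchg() on the
sbitmap words):

  #include <stdint.h>
  #include <stdio.h>

  struct ctx_cache {
          uint64_t tags;                  /* cached free tags, one bit each */
          unsigned int tag_offset;        /* tag number of bit 0 in 'tags' */
          unsigned long hit, refill;      /* roughly what tag_hit reports */
  };

  /* one shared bitmap word: bit set = tag in use (like sb->map[].word) */
  static uint64_t shared_map;

  /* refill: claim every currently free bit in the shared word */
  static int refill_batch(struct ctx_cache *cc)
  {
          uint64_t free_bits = ~shared_map;

          if (!free_bits)
                  return -1;
          shared_map |= free_bits;        /* the real code uses cmpxchg() */
          cc->tags = free_bits;
          cc->tag_offset = 0;             /* only one word in this model */
          cc->refill++;
          return 0;
  }

  static int get_tag(struct ctx_cache *cc)
  {
          int bit;

          if (!cc->tags && refill_batch(cc) < 0)
                  return -1;              /* fall back to the shared path */

          cc->hit++;
          bit = __builtin_ffsll(cc->tags) - 1;
          cc->tags &= ~(1ULL << bit);
          return cc->tag_offset + bit;
  }

  int main(void)
  {
          struct ctx_cache cc = { 0 };
          int i;

          for (i = 0; i < 4; i++)
                  printf("got tag %d\n", get_tag(&cc));
          printf("hit=%lu refills=%lu\n", cc.hit, cc.refill);
          return 0;
  }

The point is that only the refill touches shared state; everything else
is a cheap find-first-bit on a private word.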

-- 
Jens Axboe




* [PATCH 1/6] sbitmap: remove cleared bitmask
From: Jens Axboe @ 2020-01-07 16:30 UTC
  To: linux-block; +Cc: Jens Axboe

This is in preparation for a better approach that doesn't require us to
maintain two sets of bitmaps.
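
For readers who have not seen the deferred-clear scheme, here is a
simplified userspace model of what is being removed (illustration only:
the names, the tiny depth, and the lack of atomics are assumptions).
Freed bits are parked in a separate 'cleared' word and only folded back
into the allocation word once it looks full; after this patch,
sbitmap_queue_clear() clears the bit in ->word directly instead:

  #include <stdio.h>

  #define DEPTH   4
  #define FULL    ((1u << DEPTH) - 1)

  static unsigned int word;       /* bit set = tag in use */
  static unsigned int cleared;    /* bits freed since the last fold */

  static void deferred_clear_bit(int bit)
  {
          cleared |= 1u << bit;           /* real code: set_bit() on ->cleared */
  }

  static int deferred_clear(void)
  {
          if (!cleared)
                  return 0;
          word &= ~cleared;               /* real code: xchg() + cmpxchg() loop */
          cleared = 0;
          return 1;
  }

  static int get_bit(void)
  {
          do {
                  unsigned int free = ~word & FULL;

                  if (free) {
                          int bit = __builtin_ffs(free) - 1;

                          word |= 1u << bit;
                          return bit;
                  }
          } while (deferred_clear());     /* word full: fold deferred frees in */

          return -1;
  }

  int main(void)
  {
          int i;

          for (i = 0; i < DEPTH; i++)
                  get_bit();              /* fill the word */
          deferred_clear_bit(1);          /* free tag 1 the deferred way */
          printf("reallocated tag %d\n", get_bit());
          return 0;
  }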

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 25 +-----------
 lib/sbitmap.c           | 88 +++++------------------------------------
 2 files changed, 10 insertions(+), 103 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index e40d019c3d9d..7cdd82e0e0dd 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -27,16 +27,6 @@ struct sbitmap_word {
 	 * @word: word holding free bits
 	 */
 	unsigned long word ____cacheline_aligned_in_smp;
-
-	/**
-	 * @cleared: word holding cleared bits
-	 */
-	unsigned long cleared ____cacheline_aligned_in_smp;
-
-	/**
-	 * @swap_lock: Held while swapping word <-> cleared
-	 */
-	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -251,7 +241,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
 					   sb->depth - scanned);
 
 		scanned += depth;
-		word = sb->map[index].word & ~sb->map[index].cleared;
+		word = sb->map[index].word;
 		if (!word)
 			goto next;
 
@@ -307,19 +297,6 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
-/*
- * This one is special, since it doesn't actually clear the bit, rather it
- * sets the corresponding bit in the ->cleared mask instead. Paired with
- * the caller doing sbitmap_deferred_clear() if a given index is full, which
- * will clear the previously freed entries in the corresponding ->word.
- */
-static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
-{
-	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
-
-	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
-}
-
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 33feec8989f1..af6d6578809f 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -9,38 +9,6 @@
 #include <linux/sbitmap.h>
 #include <linux/seq_file.h>
 
-/*
- * See if we have deferred clears that we can batch move
- */
-static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
-{
-	unsigned long mask, val;
-	bool ret = false;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sb->map[index].swap_lock, flags);
-
-	if (!sb->map[index].cleared)
-		goto out_unlock;
-
-	/*
-	 * First get a stable cleared mask, setting the old mask to 0.
-	 */
-	mask = xchg(&sb->map[index].cleared, 0);
-
-	/*
-	 * Now clear the masked bits in our free word
-	 */
-	do {
-		val = sb->map[index].word;
-	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
-
-	ret = true;
-out_unlock:
-	spin_unlock_irqrestore(&sb->map[index].swap_lock, flags);
-	return ret;
-}
-
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		      gfp_t flags, int node)
 {
@@ -80,7 +48,6 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 	for (i = 0; i < sb->map_nr; i++) {
 		sb->map[i].depth = min(depth, bits_per_word);
 		depth -= sb->map[i].depth;
-		spin_lock_init(&sb->map[i].swap_lock);
 	}
 	return 0;
 }
@@ -91,9 +58,6 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 	unsigned int bits_per_word = 1U << sb->shift;
 	unsigned int i;
 
-	for (i = 0; i < sb->map_nr; i++)
-		sbitmap_deferred_clear(sb, i);
-
 	sb->depth = depth;
 	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
 
@@ -136,24 +100,6 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
-static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
-				     unsigned int alloc_hint, bool round_robin)
-{
-	int nr;
-
-	do {
-		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth, alloc_hint,
-					!round_robin);
-		if (nr != -1)
-			break;
-		if (!sbitmap_deferred_clear(sb, index))
-			break;
-	} while (1);
-
-	return nr;
-}
-
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 {
 	unsigned int i, index;
@@ -172,8 +118,10 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
-						round_robin);
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth, alloc_hint,
+					!round_robin);
+
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -198,7 +146,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
 	index = SB_NR_TO_INDEX(sb, alloc_hint);
 
 	for (i = 0; i < sb->map_nr; i++) {
-again:
 		nr = __sbitmap_get_word(&sb->map[index].word,
 					min(sb->map[index].depth, shallow_depth),
 					SB_NR_TO_BIT(sb, alloc_hint), true);
@@ -207,9 +154,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
 			break;
 		}
 
-		if (sbitmap_deferred_clear(sb, index))
-			goto again;
-
 		/* Jump to next index. */
 		index++;
 		alloc_hint = index << sb->shift;
@@ -229,43 +173,29 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb)
 	unsigned int i;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		if (sb->map[i].word & ~sb->map[i].cleared)
+		if (sb->map[i].word)
 			return true;
 	}
 	return false;
 }
 EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);
 
-static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
+static unsigned int sbitmap_weight(const struct sbitmap *sb)
 {
 	unsigned int i, weight = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
 		const struct sbitmap_word *word = &sb->map[i];
 
-		if (set)
-			weight += bitmap_weight(&word->word, word->depth);
-		else
-			weight += bitmap_weight(&word->cleared, word->depth);
+		weight += bitmap_weight(&word->word, word->depth);
 	}
 	return weight;
 }
 
-static unsigned int sbitmap_weight(const struct sbitmap *sb)
-{
-	return __sbitmap_weight(sb, true);
-}
-
-static unsigned int sbitmap_cleared(const struct sbitmap *sb)
-{
-	return __sbitmap_weight(sb, false);
-}
-
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
 {
 	seq_printf(m, "depth=%u\n", sb->depth);
-	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
-	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
 	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
 	seq_printf(m, "map_nr=%u\n", sb->map_nr);
 }
@@ -570,7 +500,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 	 * is in use.
 	 */
 	smp_mb__before_atomic();
-	sbitmap_deferred_clear_bit(&sbq->sb, nr);
+	sbitmap_clear_bit_unlock(&sbq->sb, nr);
 
 	/*
 	 * Pairs with the memory barrier in set_current_state() to ensure the
-- 
2.24.1



* [PATCH 2/6] sbitmap: add batch tag retrieval
From: Jens Axboe @ 2020-01-07 16:30 UTC
  To: linux-block; +Cc: Jens Axboe

This allows the caller to retrieve a batch of tags, instead of getting
them one at a time.
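
A hypothetical caller (not part of this patch) might use the pair like
this; per the kerneldoc, preemption must already be disabled when calling
__sbitmap_queue_get_batch(), and the function name below is made up for
illustration:

  #include <linux/sbitmap.h>
  #include <linux/bitops.h>

  static int alloc_one_tag_via_batch(struct sbitmap_queue *sbq)
  {
          unsigned long mask;
          unsigned int offset;
          int tag;

          if (__sbitmap_queue_get_batch(sbq, &offset, &mask))
                  return -1;      /* map exhausted, use the regular path */

          tag = __ffs(mask);      /* lowest tag in the batch */
          mask &= ~(1UL << tag);
          tag += offset;

          /* a later patch keeps 'mask' cached per-ctx instead of freeing it */
          if (mask)
                  __sbitmap_queue_clear_batch(sbq, offset, mask);

          return tag;
  }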

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 21 +++++++++
 lib/sbitmap.c           | 97 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 7cdd82e0e0dd..0d686b64a4b8 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -366,6 +366,27 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
  */
 void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
 
+/**
+ * __sbitmap_queue_get_batch() - Try to allocate a batch of free tags from a
+ * &struct sbitmap_queue with preemption already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ * @offset: output; tag offset of the batch (tag number of bit 0 in @mask)
+ * @mask: output; mask of the tags that were allocated
+ *
+ * Return: Zero if successful, non-zero if not
+ */
+int __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, unsigned int *offset,
+			      unsigned long *mask);
+
+/**
+ * __sbitmap_queue_clear_batch() - Free a batch of tags
+ * @sbq: Bitmap queue to free the tags back to.
+ * @offset: tag offset of the batch (tag number of bit 0 in @mask)
+ * @mask: mask of tags to free
+ */
+void __sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, unsigned int offset,
+				 unsigned long mask);
+
 /**
  * __sbitmap_queue_get() - Try to allocate a free bit from a &struct
  * sbitmap_queue with preemption already disabled.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index af6d6578809f..530d1a1e15c6 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -137,6 +137,45 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 }
 EXPORT_SYMBOL_GPL(sbitmap_get);
 
+static int __sbitmap_get_batch(struct sbitmap *sb, unsigned int index,
+			       unsigned long *ret)
+{
+	unsigned long val, new_val;
+
+	do {
+		val = sb->map[index].word;
+
+		*ret = ~val;
+		if (sb->map[index].depth != BITS_PER_LONG)
+			*ret &= (1UL << sb->map[index].depth) - 1;
+		if (!*ret)
+			return -1;
+
+		new_val = val | *ret;
+		if (cmpxchg(&sb->map[index].word, val, new_val) == val)
+			break;
+	} while (1);
+
+	return 0;
+}
+
+static unsigned int sbitmap_get_batch(struct sbitmap *sb, unsigned int index,
+				      unsigned long *ret)
+{
+	int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		if (!__sbitmap_get_batch(sb, index, ret))
+			return index;
+
+		/* Jump to next index. */
+		if (++index >= sb->map_nr)
+			index = 0;
+	}
+
+	return -1U;
+}
+
 int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
 			unsigned long shallow_depth)
 {
@@ -348,6 +387,64 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
 
+void __sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, unsigned int index,
+				 unsigned long mask)
+{
+	index >>= sbq->sb.shift;
+	do {
+		unsigned long val = sbq->sb.map[index].word;
+		unsigned long new_val = val & ~mask;
+
+		if (cmpxchg(&sbq->sb.map[index].word, val, new_val) == val)
+			break;
+	} while (1);
+
+	/*
+	 * Pairs with the memory barrier in set_current_state() to ensure the
+	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
+	 * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
+	 * waiter. See the comment on waitqueue_active().
+	 */
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
+}
+
+int __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, unsigned int *offset,
+			      unsigned long *mask)
+{
+	struct sbitmap *sb = &sbq->sb;
+	unsigned long __mask = 0;
+	unsigned int hint, depth;
+	unsigned int index;
+
+	hint = this_cpu_read(*sbq->alloc_hint);
+	depth = READ_ONCE(sb->depth);
+	if (unlikely(hint >= depth))
+		hint = depth ? prandom_u32() % depth : 0;
+
+	index = sbitmap_get_batch(&sbq->sb, SB_NR_TO_INDEX(sb, hint), &__mask);
+
+	if (index == -1U) {
+		/* If the map is full, a hint won't do us much good. */
+		this_cpu_write(*sbq->alloc_hint, 0);
+		return 1;
+	}
+
+	/*
+	 * Only update the hint if we used it. We might not have gotten a
+	 * full 'count' worth of bits, but pretend we did. Even if we didn't,
+	 * we want to advance to the next index since we failed to get a full
+	 * batch in this one.
+	 */
+	hint = (index + 1) << sb->shift;
+	if (hint >= depth - 1)
+		hint = 0;
+	this_cpu_write(*sbq->alloc_hint, hint);
+	*offset = index << sb->shift;
+	*mask = __mask;
+	return 0;
+}
+
 int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 {
 	unsigned int hint, depth;
-- 
2.24.1



* [PATCH 3/6] blk-mq: remove 'clear_ctx_on_error'
From: Jens Axboe @ 2020-01-07 16:30 UTC
  To: linux-block; +Cc: Jens Axboe

We used to have this because getting the ctx meant disabling preemption,
but that hasn't been the case since commit c05f42206f4d.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a12b1763508d..6a68e8a246dc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -338,7 +338,6 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
 	unsigned int tag;
-	bool clear_ctx_on_error = false;
 	u64 alloc_time_ns = 0;
 
 	blk_queue_enter_live(q);
@@ -348,10 +347,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		alloc_time_ns = ktime_get_ns();
 
 	data->q = q;
-	if (likely(!data->ctx)) {
+	if (likely(!data->ctx))
 		data->ctx = blk_mq_get_ctx(q);
-		clear_ctx_on_error = true;
-	}
 	if (likely(!data->hctx))
 		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
 						data->ctx);
@@ -376,8 +373,6 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 
 	tag = blk_mq_get_tag(data);
 	if (tag == BLK_MQ_TAG_FAIL) {
-		if (clear_ctx_on_error)
-			data->ctx = NULL;
 		blk_queue_exit(q);
 		return NULL;
 	}
-- 
2.24.1



* [PATCH 4/6] blk-mq: remove ctx->queue
From: Jens Axboe @ 2020-01-07 16:30 UTC
  To: linux-block; +Cc: Jens Axboe

We only use this for a lookup in the sysfs code; replace that with
getting the queue from the default hardware queue mapping instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sysfs.c | 4 ++--
 block/blk-mq.c       | 2 --
 block/blk-mq.h       | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 062229395a50..1f3cb13f932e 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -69,7 +69,7 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
 
 	entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
 	ctx = container_of(kobj, struct blk_mq_ctx, kobj);
-	q = ctx->queue;
+	q = ctx->hctxs[0]->queue;
 
 	if (!entry->show)
 		return -EIO;
@@ -90,7 +90,7 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
 
 	entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
 	ctx = container_of(kobj, struct blk_mq_ctx, kobj);
-	q = ctx->queue;
+	q = ctx->hctxs[0]->queue;
 
 	if (!entry->store)
 		return -EIO;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6a68e8a246dc..a36764c38bfb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2439,8 +2439,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
 			INIT_LIST_HEAD(&__ctx->rq_lists[k]);
 
-		__ctx->queue = q;
-
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
 		 * not, we remain on the home node of the device
diff --git a/block/blk-mq.h b/block/blk-mq.h
index eaaca8fc1c28..d15ef0bafe29 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -32,7 +32,6 @@ struct blk_mq_ctx {
 	/* incremented at completion time */
 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
 
-	struct request_queue	*queue;
 	struct blk_mq_ctxs      *ctxs;
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
-- 
2.24.1



* [PATCH 5/6] blk-mq: add struct blk_mq_ctx_type
From: Jens Axboe @ 2020-01-07 16:30 UTC
  To: linux-block; +Cc: Jens Axboe

For now the new structure only holds the dispatch list, so there should
be no functional changes in this patch. This is in preparation for adding
more members to the per-ctx type structure.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c |  6 +++---
 block/blk-mq-sched.c   |  4 ++--
 block/blk-mq.c         | 22 +++++++++++-----------
 block/blk-mq.h         |  6 +++++-
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..e789f830ff59 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -622,14 +622,14 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
-#define CTX_RQ_SEQ_OPS(name, type)					\
+#define CTX_RQ_SEQ_OPS(name, __type)					\
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
 	__acquires(&ctx->lock)						\
 {									\
 	struct blk_mq_ctx *ctx = m->private;				\
 									\
 	spin_lock(&ctx->lock);						\
-	return seq_list_start(&ctx->rq_lists[type], *pos);		\
+	return seq_list_start(&ctx->type[__type].rq_list, *pos);		\
 }									\
 									\
 static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v,	\
@@ -637,7 +637,7 @@ static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v,	\
 {									\
 	struct blk_mq_ctx *ctx = m->private;				\
 									\
-	return seq_list_next(v, &ctx->rq_lists[type], pos);		\
+	return seq_list_next(v, &ctx->type[__type].rq_list, pos);	\
 }									\
 									\
 static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v)	\
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..52368c9005e5 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -313,7 +313,7 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 
 	lockdep_assert_held(&ctx->lock);
 
-	if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
+	if (blk_mq_bio_list_merge(q, &ctx->type[type].rq_list, bio, nr_segs)) {
 		ctx->rq_merged++;
 		return true;
 	}
@@ -335,7 +335,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
 
 	type = hctx->type;
 	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
-			!list_empty_careful(&ctx->rq_lists[type])) {
+			!list_empty_careful(&ctx->type[type].rq_list)) {
 		/* default per sw-queue merge */
 		spin_lock(&ctx->lock);
 		ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a36764c38bfb..cc48a0ffa5ec 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -953,7 +953,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 	enum hctx_type type = hctx->type;
 
 	spin_lock(&ctx->lock);
-	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
+	list_splice_tail_init(&ctx->type[type].rq_list, flush_data->list);
 	sbitmap_clear_bit(sb, bitnr);
 	spin_unlock(&ctx->lock);
 	return true;
@@ -985,13 +985,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 	struct dispatch_rq_data *dispatch_data = data;
 	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
-	enum hctx_type type = hctx->type;
+	struct blk_mq_ctx_type *type = &ctx->type[hctx->type];
 
 	spin_lock(&ctx->lock);
-	if (!list_empty(&ctx->rq_lists[type])) {
-		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
+	if (!list_empty(&type->rq_list)) {
+		dispatch_data->rq = list_entry_rq(type->rq_list.next);
 		list_del_init(&dispatch_data->rq->queuelist);
-		if (list_empty(&ctx->rq_lists[type]))
+		if (list_empty(&type->rq_list))
 			sbitmap_clear_bit(sb, bitnr);
 	}
 	spin_unlock(&ctx->lock);
@@ -1648,9 +1648,9 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 	trace_block_rq_insert(hctx->queue, rq);
 
 	if (at_head)
-		list_add(&rq->queuelist, &ctx->rq_lists[type]);
+		list_add(&rq->queuelist, &ctx->type[type].rq_list);
 	else
-		list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
+		list_add_tail(&rq->queuelist, &ctx->type[type].rq_list);
 }
 
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1701,7 +1701,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 	}
 
 	spin_lock(&ctx->lock);
-	list_splice_tail_init(list, &ctx->rq_lists[type]);
+	list_splice_tail_init(list, &ctx->type[type].rq_list);
 	blk_mq_hctx_mark_pending(hctx, ctx);
 	spin_unlock(&ctx->lock);
 }
@@ -2256,8 +2256,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	type = hctx->type;
 
 	spin_lock(&ctx->lock);
-	if (!list_empty(&ctx->rq_lists[type])) {
-		list_splice_init(&ctx->rq_lists[type], &tmp);
+	if (!list_empty(&ctx->type[type].rq_list)) {
+		list_splice_init(&ctx->type[type].rq_list, &tmp);
 		blk_mq_hctx_clear_pending(hctx, ctx);
 	}
 	spin_unlock(&ctx->lock);
@@ -2437,7 +2437,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		__ctx->cpu = i;
 		spin_lock_init(&__ctx->lock);
 		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
-			INIT_LIST_HEAD(&__ctx->rq_lists[k]);
+			INIT_LIST_HEAD(&__ctx->type[k].rq_list);
 
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d15ef0bafe29..271f16771499 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -12,13 +12,17 @@ struct blk_mq_ctxs {
 	struct blk_mq_ctx __percpu	*queue_ctx;
 };
 
+struct blk_mq_ctx_type {
+	struct list_head		rq_list;
+};
+
 /**
  * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
  */
 struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
-		struct list_head	rq_lists[HCTX_MAX_TYPES];
+		struct blk_mq_ctx_type	type[HCTX_MAX_TYPES];
 	} ____cacheline_aligned_in_smp;
 
 	unsigned int		cpu;
-- 
2.24.1



* [PATCH 6/6] blk-mq: allocate tags in batches
From: Jens Axboe @ 2020-01-07 16:30 UTC
  To: linux-block; +Cc: Jens Axboe

Instead of grabbing tags one by one, grab a batch and store it in a
local cache in the software queue. Subsequent tag allocations can then
grab free tags from there, without having to hit the shared tag map.

We flush these batches back out if we run out of tags on the hardware
queue; the intent is that this should rarely happen.

This works very well in practice, with anywhere from 40-60 batch counts
seen regularly in testing.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c |  18 +++++++
 block/blk-mq-tag.c     | 104 ++++++++++++++++++++++++++++++++++++++++-
 block/blk-mq-tag.h     |   3 ++
 block/blk-mq.c         |  16 +++++--
 block/blk-mq.h         |   5 ++
 include/linux/blk-mq.h |   2 +
 6 files changed, 144 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e789f830ff59..914be72d080e 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -659,6 +659,23 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
 CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
 CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
 
+static ssize_t ctx_tag_hit_write(void *data, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct blk_mq_ctx *ctx = data;
+
+	ctx->tag_hit = ctx->tag_refill = 0;
+	return count;
+}
+
+static int ctx_tag_hit_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_ctx *ctx = data;
+
+	seq_printf(m, "hit=%lu refills=%lu\n", ctx->tag_hit, ctx->tag_refill);
+	return 0;
+}
+
 static int ctx_dispatched_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_ctx *ctx = data;
@@ -800,6 +817,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
 	{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
 	{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
 	{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
+	{"tag_hit", 0600, ctx_tag_hit_show, ctx_tag_hit_write},
 	{"merged", 0600, ctx_merged_show, ctx_merged_write},
 	{"completed", 0600, ctx_completed_show, ctx_completed_write},
 	{},
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index fbacde454718..94c1f16c6c71 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -99,6 +99,100 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 		return __sbitmap_queue_get(bt);
 }
 
+static void ctx_flush_ipi(void *data)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+	struct sbitmap_queue *bt = &hctx->tags->bitmap_tags;
+	struct blk_mq_ctx *ctx;
+	unsigned int i;
+
+	ctx = __blk_mq_get_ctx(hctx->queue, smp_processor_id());
+
+	for (i = 0; i < hctx->queue->tag_set->nr_maps; i++) {
+		struct blk_mq_ctx_type *type = &ctx->type[i];
+
+		if (!type->tags)
+			continue;
+
+		__sbitmap_queue_clear_batch(bt, type->tag_offset, type->tags);
+		type->tags = 0;
+	}
+	atomic_dec(&hctx->flush_pending);
+}
+
+void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
+				struct blk_mq_ctx *ctx)
+{
+	atomic_inc(&hctx->flush_pending);
+	smp_call_function_single(ctx->cpu, ctx_flush_ipi, hctx, false);
+}
+
+static void blk_mq_tag_flush_batches(struct blk_mq_hw_ctx *hctx)
+{
+	if (atomic_cmpxchg(&hctx->flush_pending, 0, hctx->nr_ctx))
+		return;
+	preempt_disable();
+	if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
+		ctx_flush_ipi(hctx);
+	smp_call_function_many(hctx->cpumask, ctx_flush_ipi, hctx, false);
+	preempt_enable();
+}
+
+void blk_mq_tag_queue_flush_batches(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_tag_flush_batches(hctx);
+}
+
+static int blk_mq_get_tag_batch(struct blk_mq_alloc_data *data)
+{
+	struct blk_mq_hw_ctx *hctx = data->hctx;
+	struct blk_mq_ctx_type *type;
+	struct blk_mq_ctx *ctx = data->ctx;
+	struct blk_mq_tags *tags;
+	struct sbitmap_queue *bt;
+	int tag = -1;
+
+	if (!ctx || (data->flags & BLK_MQ_REQ_INTERNAL))
+		return -1;
+
+	tags = hctx->tags;
+	bt = &tags->bitmap_tags;
+	/* don't do batches for round-robin or (very) sparse maps */
+	if (bt->round_robin || bt->sb.shift < ilog2(BITS_PER_LONG) - 1)
+		return -1;
+
+	/* we could make do with preempt disable, but we need to block flush */
+	local_irq_disable();
+	if (unlikely(ctx->cpu != smp_processor_id()))
+		goto out;
+
+	type = &ctx->type[hctx->type];
+
+	if (type->tags) {
+get_tag:
+		ctx->tag_hit++;
+
+		tag = __ffs(type->tags);
+		type->tags &= ~(1UL << tag);
+		tag += type->tag_offset;
+out:
+		local_irq_enable();
+		return tag;
+	}
+
+	/* no current tag cache, attempt to refill a batch */
+	if (!__sbitmap_queue_get_batch(bt, &type->tag_offset, &type->tags)) {
+		ctx->tag_refill++;
+		goto get_tag;
+	}
+
+	goto out;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -116,8 +210,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		bt = &tags->breserved_tags;
 		tag_offset = 0;
 	} else {
-		bt = &tags->bitmap_tags;
 		tag_offset = tags->nr_reserved_tags;
+
+		tag = blk_mq_get_tag_batch(data);
+		if (tag != -1)
+			goto found_tag;
+
+		bt = &tags->bitmap_tags;
 	}
 
 	tag = __blk_mq_get_tag(data, bt);
@@ -152,6 +251,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
+		if (!(data->flags & BLK_MQ_REQ_RESERVED))
+			blk_mq_tag_flush_batches(data->hctx);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 15bc74acb57e..b5964fff1630 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -34,6 +34,9 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_tag_queue_flush_batches(struct request_queue *q);
+void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
+				struct blk_mq_ctx *ctx);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 						 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cc48a0ffa5ec..81140f61a7c9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2255,6 +2255,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 	type = hctx->type;
 
+	blk_mq_tag_ctx_flush_batch(hctx, ctx);
+
 	spin_lock(&ctx->lock);
 	if (!list_empty(&ctx->type[type].rq_list)) {
 		list_splice_init(&ctx->type[type].rq_list, &tmp);
@@ -2436,8 +2438,10 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 
 		__ctx->cpu = i;
 		spin_lock_init(&__ctx->lock);
-		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
+		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) {
 			INIT_LIST_HEAD(&__ctx->type[k].rq_list);
+			__ctx->type[k].tags = 0;
+		}
 
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
@@ -2521,6 +2525,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			}
 
 			hctx = blk_mq_map_queue_type(q, j, i);
+			ctx->type[j].tags = 0;
 			ctx->hctxs[j] = hctx;
 			/*
 			 * If the CPU is already set in the mask, then we've
@@ -2542,9 +2547,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			BUG_ON(!hctx->nr_ctx);
 		}
 
-		for (; j < HCTX_MAX_TYPES; j++)
+		for (; j < HCTX_MAX_TYPES; j++) {
 			ctx->hctxs[j] = blk_mq_map_queue_type(q,
 					HCTX_TYPE_DEFAULT, i);
+			ctx->type[j].tags = 0;
+		}
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
@@ -3298,8 +3305,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
 		return;
 
-	list_for_each_entry(q, &set->tag_list, tag_set_list)
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_tag_queue_flush_batches(q);
 		blk_mq_freeze_queue(q);
+	}
+
 	/*
 	 * Switch IO scheduler to 'none', cleaning up the data associated
 	 * with the previous scheduler. We will switch back once we are done
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 271f16771499..b6095cc50921 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -14,6 +14,10 @@ struct blk_mq_ctxs {
 
 struct blk_mq_ctx_type {
 	struct list_head		rq_list;
+
+	/* tag batch cache */
+	unsigned long			tags;
+	unsigned int			tag_offset;
 };
 
 /**
@@ -23,6 +27,7 @@ struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
 		struct blk_mq_ctx_type	type[HCTX_MAX_TYPES];
+		unsigned long		tag_hit, tag_refill;
 	} ____cacheline_aligned_in_smp;
 
 	unsigned int		cpu;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 11cfd6470b1a..2c6a8657a72c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,8 @@ struct blk_mq_hw_ctx {
 	 */
 	atomic_t		nr_active;
 
+	atomic_t		flush_pending;
+
 	/** @cpuhp_dead: List to store request if some CPU die. */
 	struct hlist_node	cpuhp_dead;
 	/** @kobj: Kernel object for sysfs. */
-- 
2.24.1



* Re: [PATCH 6/6] blk-mq: allocate tags in batches
From: Ming Lei @ 2020-01-15 12:07 UTC
  To: Jens Axboe; +Cc: linux-block

On Tue, Jan 07, 2020 at 09:30:37AM -0700, Jens Axboe wrote:
> Instead of grabbing tags one by one, grab a batch and store it in a
> local cache in the software queue. Subsequent tag allocations can then
> grab free tags from there, without having to hit the shared tag map.
> 
> We flush these batches back out if we run out of tags on the hardware
> queue; the intent is that this should rarely happen.
> 
> This works very well in practice, with anywhere from 40-60 batch counts
> seen regularly in testing.

Could you describe your test a bit? I am just wondering if multi-task IO
still performs as well as before.

> [...]
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index cc48a0ffa5ec..81140f61a7c9 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2255,6 +2255,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>  	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
>  	type = hctx->type;
>  
> +	blk_mq_tag_ctx_flush_batch(hctx, ctx);

When blk_mq_hctx_notify_dead() is called, the 'cpu' is already offline,
so the flush via IPI may not work as expected.


Thanks,
Ming

