* [PATCHSET v3] sbitmap optimizations
@ 2018-11-30  2:09 Jens Axboe
  2018-11-30  2:09 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
  2018-11-30  2:09 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
  0 siblings, 2 replies; 3+ messages in thread
From: Jens Axboe @ 2018-11-30  2:09 UTC (permalink / raw)
  To: linux-block, osandov

The v2 posting got screwed up somehow, so I'm sending a v3 just to make
sure things are sane.

Changes:

- Dropped the alignment patch; it should not be needed unless we have
  some sort of debugging enabled.

- Fixed the fumbled wakeup optimization; it's important that we match
  prep + finish with that interface.

-- 
Jens Axboe




* [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30  2:09 [PATCHSET v3] sbitmap optimizations Jens Axboe
@ 2018-11-30  2:09 ` Jens Axboe
  2018-11-30  2:09 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
  1 sibling, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2018-11-30  2:09 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

sbitmap maintains a set of words that we use to set and clear bits, with
each bit representing a tag for blk-mq. Even though we spread the bits
out and maintain a hint cache, any particular bit we allocate still ends
up being cleared in the exact same spot.

This introduces batched clearing of bits. Instead of clearing a given
bit, we set the same bit in a cleared/free mask. If we fail to allocate
a bit from a given word, we check the cleared mask and batch-move those
cleared bits back into the free word at that time. This trades 64 atomic
bitops for 2 cmpxchg().
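
As a rough userspace sketch of the idea (C11 atomics stand in for the
kernel's cmpxchg() loops, and the toy_* names are made up just for this
example):

#include <stdatomic.h>
#include <stdbool.h>

struct toy_word {
	_Atomic unsigned long word;	/* 1 = currently allocated */
	_Atomic unsigned long cleared;	/* 1 = freed, not yet folded back */
};

/* freeing a bit only marks it in ->cleared, ->word is left untouched */
static void toy_deferred_clear_bit(struct toy_word *w, int bit)
{
	atomic_fetch_or(&w->cleared, 1UL << bit);
}

/* called when allocation from ->word fails: fold the frees back in */
static bool toy_fold_cleared(struct toy_word *w)
{
	unsigned long mask = atomic_exchange(&w->cleared, 0);

	if (!mask)
		return false;
	atomic_fetch_and(&w->word, ~mask);
	return true;
}

The point is that a free is just one atomic OR into ->cleared, and the
cost of actually clearing bits in ->word is paid once per batch, only
when an allocation attempt finds the word full.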

In a threaded poll test case, half the overhead of getting and clearing
tags is removed with this change. On another poll test case with a
single thread, performance is unchanged.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 26 +++++++++++++++---
 lib/sbitmap.c           | 60 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..cec685b89998 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,19 @@ struct seq_file;
  */
 struct sbitmap_word {
 	/**
-	 * @word: The bitmap word itself.
+	 * @depth: Number of bits being used in @word/@cleared
 	 */
-	unsigned long word;
+	unsigned long depth;
 
 	/**
-	 * @depth: Number of bits being used in @word.
+	 * @word: word holding free bits
 	 */
-	unsigned long depth;
+	unsigned long word ____cacheline_aligned_in_smp;
+
+	/**
+	 * @cleared: word holding cleared bits
+	 */
+	unsigned long cleared ____cacheline_aligned_in_smp;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -310,6 +315,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+/*
+ * This one is special, since it doesn't actually clear the bit, rather it
+ * sets the corresponding bit in the ->cleared mask instead. Paired with
+ * the caller doing sbitmap_deferred_clear() if a given index is full, which
+ * will clear the previously freed entries in the corresponding ->word.
+ */
+static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
+
+	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
+}
+
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 45cab6bbc1c7..2316f53f3e1d 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -111,6 +111,58 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
+/*
+ * See if we have deferred clears that we can batch move
+ */
+static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
+{
+	unsigned long mask, val;
+
+	if (!sb->map[index].cleared)
+		return false;
+
+	/*
+	 * First get a stable cleared mask, setting the old mask to 0.
+	 */
+	do {
+		mask = sb->map[index].cleared;
+	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
+
+	/*
+	 * Now clear the masked bits in our free word
+	 */
+	do {
+		val = sb->map[index].word;
+	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
+
+	/*
+	 * If someone found ->cleared == 0 before we wrote ->word, then
+	 * they could have failed when they should not have. Check for
+	 * waiters.
+	 */
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(container_of(sb, struct sbitmap_queue, sb));
+	return true;
+}
+
+static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
+				     unsigned int alloc_hint, bool round_robin)
+{
+	int nr;
+
+	do {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth, alloc_hint,
+					!round_robin);
+		if (nr != -1)
+			break;
+		if (!sbitmap_deferred_clear(sb, index))
+			break;
+	} while (1);
+
+	return nr;
+}
+
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 {
 	unsigned int i, index;
@@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth, alloc_hint,
-					!round_robin);
+		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
+						round_robin);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -514,7 +565,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
-	sbitmap_clear_bit_unlock(&sbq->sb, nr);
+	sbitmap_deferred_clear_bit(&sbq->sb, nr);
+
 	/*
 	 * Pairs with the memory barrier in set_current_state() to ensure the
 	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
-- 
2.17.1



* [PATCH 2/2] sbitmap: optimize wakeup check
  2018-11-30  2:09 [PATCHSET v3] sbitmap optimizations Jens Axboe
  2018-11-30  2:09 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
@ 2018-11-30  2:09 ` Jens Axboe
  1 sibling, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2018-11-30  2:09 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

Even if we have no waiters on any of the sbitmap_queue wait states, we
still have to loop over every entry to check for them. We do this for
every IO, so the cost adds up.

Shift a bit of the cost to the slow path, where we actually have
waiters. Wrap prepare_to_wait_exclusive() and finish_wait() so we can
maintain an internal count of how many waiters are currently active,
and then simply check that count in sbq_wake_ptr() instead of looping
when we don't have any sleepers.

Convert the two users of sbitmap that wait on tags, blk-mq-tag and the
iSCSI target, to the new helpers.
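
A converted waiter ends up with roughly the shape below (a condensed
sketch of the hunks further down, not verbatim kernel code; the sbq,
cpu and tag variables are placeholders):

	struct sbq_wait_state *ws = &sbq->ws[0];
	unsigned int cpu;
	DEFINE_WAIT(wait);
	int tag;

	for (;;) {
		/* bumps sbq->ws_active before adding us to the waitqueue */
		sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_UNINTERRUPTIBLE);
		tag = sbitmap_queue_get(sbq, &cpu);	/* retry once armed */
		if (tag >= 0)
			break;
		io_schedule();
		/* drops ws_active again before we re-arm on the next pass */
		sbitmap_finish_wait(sbq, ws, &wait);
	}
	/* pairs with the prepare from the final, successful iteration */
	sbitmap_finish_wait(sbq, ws, &wait);

If a prepare is ever left unmatched, ws_active never returns to zero and
the fast-path check in sbq_wake_ptr() stops helping.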

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c                       | 13 ++++++-------
 drivers/target/iscsi/iscsi_target_util.c |  9 ++++++---
 include/linux/sbitmap.h                  | 19 +++++++++++++++++++
 lib/sbitmap.c                            | 21 +++++++++++++++++++++
 4 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 87bc5df72d48..4704046ce493 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -154,12 +154,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
-		prepare_to_wait_exclusive(&ws->wait, &wait,
-						TASK_UNINTERRUPTIBLE);
+		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
-		if (tag != -1)
+		if (tag != -1) {
+			sbitmap_finish_wait(bt, ws, &wait);
 			break;
+		}
 
 		if (data->ctx)
 			blk_mq_put_ctx(data->ctx);
@@ -167,6 +168,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		bt_prev = bt;
 		io_schedule();
 
+		sbitmap_finish_wait(bt, ws, &wait);
+
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
 						data->ctx->cpu);
@@ -176,8 +179,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		else
 			bt = &tags->bitmap_tags;
 
-		finish_wait(&ws->wait, &wait);
-
 		/*
 		 * If destination hw queue is changed, fake wake up on
 		 * previous queue for compensating the wake up miss, so
@@ -192,8 +193,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (drop_ctx && data->ctx)
 		blk_mq_put_ctx(data->ctx);
 
-	finish_wait(&ws->wait, &wait);
-
 found_tag:
 	return tag + tag_offset;
 }
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 36b742932c72..37db1f80a219 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -152,22 +152,25 @@ static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
 	int tag = -1;
 	DEFINE_WAIT(wait);
 	struct sbq_wait_state *ws;
+	struct sbitmap_queue *sbq;
 
 	if (state == TASK_RUNNING)
 		return tag;
 
-	ws = &se_sess->sess_tag_pool.ws[0];
+	sbq = &se_sess->sess_tag_pool;
+	ws = &sbq->ws[0];
 	for (;;) {
-		prepare_to_wait_exclusive(&ws->wait, &wait, state);
+		sbitmap_prepare_to_wait(sbq, ws, &wait, state);
 		if (signal_pending_state(state, current))
 			break;
 		tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
 		if (tag >= 0)
 			break;
 		schedule();
+		sbitmap_finish_wait(sbq, ws, &wait);
 	}
+	sbitmap_finish_wait(sbq, ws, &wait);
 
-	finish_wait(&ws->wait, &wait);
 	return tag;
 }
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index cec685b89998..a65d5b67a611 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -130,6 +130,11 @@ struct sbitmap_queue {
 	 */
 	struct sbq_wait_state *ws;
 
+	/*
+	 * @ws_active: count of currently active ws waitqueues
+	 */
+	atomic_t ws_active;
+
 	/**
 	 * @round_robin: Allocate bits in strict round-robin order.
 	 */
@@ -549,4 +554,18 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
  */
 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
 
+/*
+ * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
+ * internal state.
+ */
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+				struct sbq_wait_state *ws,
+				struct wait_queue_entry *wait, int state);
+
+/*
+ * Must be paired with sbitmap_prepare_to_wait().
+ */
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+				struct wait_queue_entry *wait);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2316f53f3e1d..ba51f538011a 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -381,6 +381,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	sbq->min_shallow_depth = UINT_MAX;
 	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
 	atomic_set(&sbq->wake_index, 0);
+	atomic_set(&sbq->ws_active, 0);
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
 	if (!sbq->ws) {
@@ -496,6 +497,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
 
+	if (!atomic_read(&sbq->ws_active))
+		return NULL;
+
 	wake_index = atomic_read(&sbq->wake_index);
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
 		struct sbq_wait_state *ws = &sbq->ws[wake_index];
@@ -636,3 +640,20 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_show);
+
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+			     struct sbq_wait_state *ws,
+			     struct wait_queue_entry *wait, int state)
+{
+	atomic_inc(&sbq->ws_active);
+	prepare_to_wait_exclusive(&ws->wait, wait, state);
+}
+EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);
+
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+			 struct wait_queue_entry *wait)
+{
+	finish_wait(&ws->wait, wait);
+	atomic_dec(&sbq->ws_active);
+}
+EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
-- 
2.17.1


