* [PATCHSET v4] sbitmap optimizations
@ 2018-11-30 16:01 Jens Axboe
  2018-11-30 16:01 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
  2018-11-30 16:01 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
  0 siblings, 2 replies; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 16:01 UTC (permalink / raw)
  To: linux-block, osandov

This version tests out solid, and we're still seeing the same
improvements.

Changes:

- Lock the map index for the move. This eliminates the race completely,
  since it's no longer possible to find ->cleared == 0 while the swap
  of bits is in progress. The previous version was fine for users that
  re-check after having added themselves to the waitqueue, but we have
  users that don't re-check after getting a failure. This works for
  both.

- Add the new states to the blk-mq debugfs output.

- Wrap the wait queue entry in an sbitmap-specific wait, so we can
  ensure that it is accounted properly. This means that any kind of
  prep+finish on the waitqueue will work fine, just like before.

-- 
Jens Axboe




* [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 16:01 [PATCHSET v4] sbitmap optimizations Jens Axboe
@ 2018-11-30 16:01 ` Jens Axboe
  2018-11-30 20:03   ` Omar Sandoval
  2018-11-30 16:01 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
  1 sibling, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 16:01 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

sbitmap maintains a set of words that we use to set and clear bits, with
each bit representing a tag for blk-mq. Even though we spread the bits
out and maintain a hint cache, one particular bit allocated will end up
being cleared in the exact same spot.

This introduces batched clearing of bits. Instead of clearing a given
bit, the same bit is set in a cleared/free mask instead. If we fail
allocating a bit from a given word, then we check the free mask, and
batch move those cleared bits at that time. This trades 64 atomic bitops
for 2 cmpxchg().

In a threaded poll test case, half the overhead of getting and clearing
tags is removed with this change. On another poll test case with a
single thread, performance is unchanged.
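
As an illustration of the scheme (not the kernel code itself), here is a
minimal user-space C11 sketch under stated assumptions: the names
word_pair, deferred_clear_bit() and deferred_clear() are invented for
this example, and the swap_lock serialization added in this version is
omitted. Freeing is a single atomic OR into the cleared mask; only when
an allocation attempt finds the word exhausted are the accumulated bits
folded back, using two compare-and-swap loops.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct word_pair {
	_Atomic unsigned long word;	/* bits currently handed out */
	_Atomic unsigned long cleared;	/* bits freed, not yet given back */
};

/* Freeing a bit: one atomic OR into ->cleared, ->word is not touched. */
static void deferred_clear_bit(struct word_pair *wp, unsigned int bit)
{
	atomic_fetch_or(&wp->cleared, 1UL << bit);
}

/* Allocation slow path: fold the deferred frees back into ->word. */
static bool deferred_clear(struct word_pair *wp)
{
	unsigned long mask, val;

	/* Grab a stable snapshot of ->cleared and zero it out. */
	mask = atomic_load(&wp->cleared);
	do {
		if (!mask)
			return false;
	} while (!atomic_compare_exchange_weak(&wp->cleared, &mask, 0));

	/* Drop those bits from ->word, making them allocatable again. */
	val = atomic_load(&wp->word);
	while (!atomic_compare_exchange_weak(&wp->word, &val, val & ~mask))
		;
	return true;
}

int main(void)
{
	struct word_pair wp;

	atomic_init(&wp.word, ~0UL);	/* everything allocated */
	atomic_init(&wp.cleared, 0);

	deferred_clear_bit(&wp, 3);	/* cheap "free" of bit 3 */
	deferred_clear_bit(&wp, 7);	/* cheap "free" of bit 7 */
	deferred_clear(&wp);		/* both move back in one batch */
	printf("word=%lx\n", atomic_load(&wp.word));
	return 0;
}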

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 31 +++++++++++++---
 lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..07f117ee19dc 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,24 @@ struct seq_file;
  */
 struct sbitmap_word {
 	/**
-	 * @word: The bitmap word itself.
+	 * @depth: Number of bits being used in @word/@cleared
 	 */
-	unsigned long word;
+	unsigned long depth;
 
 	/**
-	 * @depth: Number of bits being used in @word.
+	 * @word: word holding free bits
 	 */
-	unsigned long depth;
+	unsigned long word ____cacheline_aligned_in_smp;
+
+	/**
+	 * @cleared: word holding cleared bits
+	 */
+	unsigned long cleared ____cacheline_aligned_in_smp;
+
+	/**
+	 * @swap_lock: Held while swapping word <-> cleared
+	 */
+	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+/*
+ * This one is special, since it doesn't actually clear the bit, rather it
+ * sets the corresponding bit in the ->cleared mask instead. Paired with
+ * the caller doing sbitmap_batch_clear() if a given index is full, which
+ * will clear the previously freed entries in the corresponding ->word.
+ */
+static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
+
+	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
+}
+
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 45cab6bbc1c7..f6a9553617bd 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -59,6 +59,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 	for (i = 0; i < sb->map_nr; i++) {
 		sb->map[i].depth = min(depth, bits_per_word);
 		depth -= sb->map[i].depth;
+		spin_lock_init(&sb->map[i].swap_lock);
 	}
 	return 0;
 }
@@ -111,6 +112,57 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
+/*
+ * See if we have deferred clears that we can batch move
+ */
+static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
+{
+	unsigned long mask, val;
+	bool ret = false;
+
+	spin_lock(&sb->map[index].swap_lock);
+
+	if (!sb->map[index].cleared)
+		goto out_unlock;
+
+	/*
+	 * First get a stable cleared mask, setting the old mask to 0.
+	 */
+	do {
+		mask = sb->map[index].cleared;
+	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
+
+	/*
+	 * Now clear the masked bits in our free word
+	 */
+	do {
+		val = sb->map[index].word;
+	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
+
+	ret = true;
+out_unlock:
+	spin_unlock(&sb->map[index].swap_lock);
+	return ret;
+}
+
+static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
+				     unsigned int alloc_hint, bool round_robin)
+{
+	int nr;
+
+	do {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth, alloc_hint,
+					!round_robin);
+		if (nr != -1)
+			break;
+		if (!sbitmap_deferred_clear(sb, index))
+			break;
+	} while (1);
+
+	return nr;
+}
+
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 {
 	unsigned int i, index;
@@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth, alloc_hint,
-					!round_robin);
+		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
+						round_robin);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -206,23 +257,37 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
 }
 EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
 
-unsigned int sbitmap_weight(const struct sbitmap *sb)
+static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
 {
 	unsigned int i, weight = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
 		const struct sbitmap_word *word = &sb->map[i];
 
-		weight += bitmap_weight(&word->word, word->depth);
+		if (set)
+			weight += bitmap_weight(&word->word, word->depth);
+		else
+			weight += bitmap_weight(&word->cleared, word->depth);
 	}
 	return weight;
 }
+
+unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, true);
+}
 EXPORT_SYMBOL_GPL(sbitmap_weight);
 
+static unsigned int sbitmap_cleared(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, false);
+}
+
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
 {
 	seq_printf(m, "depth=%u\n", sb->depth);
-	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
+	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
 	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
 	seq_printf(m, "map_nr=%u\n", sb->map_nr);
 }
@@ -514,7 +579,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
-	sbitmap_clear_bit_unlock(&sbq->sb, nr);
+	sbitmap_deferred_clear_bit(&sbq->sb, nr);
+
 	/*
 	 * Pairs with the memory barrier in set_current_state() to ensure the
 	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
-- 
2.17.1



* [PATCH 2/2] sbitmap: optimize wakeup check
  2018-11-30 16:01 [PATCHSET v4] sbitmap optimizations Jens Axboe
  2018-11-30 16:01 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
@ 2018-11-30 16:01 ` Jens Axboe
  2018-11-30 21:37   ` Omar Sandoval
  1 sibling, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 16:01 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

Even if we have no waiters on any of the sbitmap_queue wait states, we
still have to loop every entry to check. We do this for every IO, so
the cost adds up.

Shift a bit of the cost to the slow path, when we actually have waiters.
Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain
an internal count of how many are currently active. Then we can simply
check this count in sbq_wake_ptr() and not have to loop if we don't
have any sleepers.

Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI.
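
For reference, the caller pattern the new helpers expect looks roughly
like the sketch below. This is a hypothetical consumer modeled on the
iSCSI conversion further down; example_get_tag() and its arguments are
invented for illustration. The wrappers keep sbq->ws_active in sync, so
sbq_wake_ptr() can return early when nobody is actually sleeping.

#include <linux/sbitmap.h>
#include <linux/sched.h>

/* Hypothetical consumer: block until a tag is available in @sbq. */
static int example_get_tag(struct sbitmap_queue *sbq, unsigned int *cpu)
{
	struct sbq_wait_state *ws = &sbq->ws[0];
	DEFINE_SBQ_WAIT(wait);
	int tag;

	for (;;) {
		/* Bumps sbq->ws_active (only once) and queues us exclusively. */
		sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_UNINTERRUPTIBLE);
		tag = sbitmap_queue_get(sbq, cpu);
		if (tag >= 0)
			break;
		schedule();
	}

	/* Dequeues us and drops sbq->ws_active again. */
	sbitmap_finish_wait(sbq, ws, &wait);
	return tag;
}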

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c                       | 11 ++++----
 drivers/target/iscsi/iscsi_target_util.c | 12 +++++----
 include/linux/sbitmap.h                  | 34 ++++++++++++++++++++++++
 lib/sbitmap.c                            | 28 +++++++++++++++++++
 4 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 87bc5df72d48..2089c6c62f44 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct sbitmap_queue *bt;
 	struct sbq_wait_state *ws;
-	DEFINE_WAIT(wait);
+	DEFINE_SBQ_WAIT(wait);
 	unsigned int tag_offset;
 	bool drop_ctx;
 	int tag;
@@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
-		prepare_to_wait_exclusive(&ws->wait, &wait,
-						TASK_UNINTERRUPTIBLE);
+		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
 		if (tag != -1)
@@ -167,6 +166,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		bt_prev = bt;
 		io_schedule();
 
+		sbitmap_finish_wait(bt, ws, &wait);
+
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
 						data->ctx->cpu);
@@ -176,8 +177,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		else
 			bt = &tags->bitmap_tags;
 
-		finish_wait(&ws->wait, &wait);
-
 		/*
 		 * If destination hw queue is changed, fake wake up on
 		 * previous queue for compensating the wake up miss, so
@@ -192,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (drop_ctx && data->ctx)
 		blk_mq_put_ctx(data->ctx);
 
-	finish_wait(&ws->wait, &wait);
+	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
 	return tag + tag_offset;
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 36b742932c72..86987da86dd6 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
 static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
 {
 	int tag = -1;
-	DEFINE_WAIT(wait);
+	DEFINE_SBQ_WAIT(wait);
 	struct sbq_wait_state *ws;
+	struct sbitmap_queue *sbq;
 
 	if (state == TASK_RUNNING)
 		return tag;
 
-	ws = &se_sess->sess_tag_pool.ws[0];
+	sbq = &se_sess->sess_tag_pool;
+	ws = &sbq->ws[0];
 	for (;;) {
-		prepare_to_wait_exclusive(&ws->wait, &wait, state);
+		sbitmap_prepare_to_wait(sbq, ws, &wait, state);
 		if (signal_pending_state(state, current))
 			break;
-		tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
+		tag = sbitmap_queue_get(sbq, cpup);
 		if (tag >= 0)
 			break;
 		schedule();
 	}
 
-	finish_wait(&ws->wait, &wait);
+	sbitmap_finish_wait(sbq, ws, &wait);
 	return tag;
 }
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 07f117ee19dc..f0f49bbb2617 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -135,6 +135,11 @@ struct sbitmap_queue {
 	 */
 	struct sbq_wait_state *ws;
 
+	/*
+	 * @ws_active: count of currently active ws waitqueues
+	 */
+	atomic_t ws_active;
+
 	/**
 	 * @round_robin: Allocate bits in strict round-robin order.
 	 */
@@ -554,4 +559,33 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
  */
 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
 
+struct sbq_wait {
+	int accounted;
+	struct wait_queue_entry wait;
+};
+
+#define DEFINE_SBQ_WAIT(name)							\
+	struct sbq_wait name = {						\
+		.accounted = 0,							\
+		.wait = {							\
+			.private	= current,				\
+			.func		= autoremove_wake_function,		\
+			.entry		= LIST_HEAD_INIT((name).wait.entry),	\
+		}								\
+	}
+
+/*
+ * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
+ * internal state.
+ */
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+				struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait, int state);
+
+/*
+ * Must be paired with sbitmap_prepare_to_wait().
+ */
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index f6a9553617bd..90a295cc782c 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -395,6 +395,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	sbq->min_shallow_depth = UINT_MAX;
 	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
 	atomic_set(&sbq->wake_index, 0);
+	atomic_set(&sbq->ws_active, 0);
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
 	if (!sbq->ws) {
@@ -510,6 +511,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
 
+	if (!atomic_read(&sbq->ws_active))
+		return NULL;
+
 	wake_index = atomic_read(&sbq->wake_index);
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
 		struct sbq_wait_state *ws = &sbq->ws[wake_index];
@@ -635,6 +639,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 
 	seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
 	seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
+	seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));
 
 	seq_puts(m, "ws={\n");
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
@@ -650,3 +655,26 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_show);
+
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+			     struct sbq_wait_state *ws,
+			     struct sbq_wait *sbq_wait, int state)
+{
+	if (!sbq_wait->accounted) {
+		atomic_inc(&sbq->ws_active);
+		sbq_wait->accounted = 1;
+	}
+	prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
+}
+EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);
+
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+			 struct sbq_wait *sbq_wait)
+{
+	finish_wait(&ws->wait, &sbq_wait->wait);
+	if (sbq_wait->accounted) {
+		atomic_dec(&sbq->ws_active);
+		sbq_wait->accounted = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
-- 
2.17.1



* Re: [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 16:01 ` [PATCH 1/2] sbitmap: amortize cost of clearing bits Jens Axboe
@ 2018-11-30 20:03   ` Omar Sandoval
  2018-11-30 20:10     ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Omar Sandoval @ 2018-11-30 20:03 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:
> sbitmap maintains a set of words that we use to set and clear bits, with
> each bit representing a tag for blk-mq. Even though we spread the bits
> out and maintain a hint cache, one particular bit allocated will end up
> being cleared in the exact same spot.
> 
> This introduces batched clearing of bits. Instead of clearing a given
> bit, the same bit is set in a cleared/free mask instead. If we fail
> allocating a bit from a given word, then we check the free mask, and
> batch move those cleared bits at that time. This trades 64 atomic bitops
> for 2 cmpxchg().
> 
> In a threaded poll test case, half the overhead of getting and clearing
> tags is removed with this change. On another poll test case with a
> single thread, performance is unchanged.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  include/linux/sbitmap.h | 31 +++++++++++++---
>  lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
>  2 files changed, 100 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index 804a50983ec5..07f117ee19dc 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -30,14 +30,24 @@ struct seq_file;
>   */
>  struct sbitmap_word {
>  	/**
> -	 * @word: The bitmap word itself.
> +	 * @depth: Number of bits being used in @word/@cleared
>  	 */
> -	unsigned long word;
> +	unsigned long depth;
>  
>  	/**
> -	 * @depth: Number of bits being used in @word.
> +	 * @word: word holding free bits
>  	 */
> -	unsigned long depth;
> +	unsigned long word ____cacheline_aligned_in_smp;

Still splitting up word and depth in separate cachelines?

> +	/**
> +	 * @cleared: word holding cleared bits
> +	 */
> +	unsigned long cleared ____cacheline_aligned_in_smp;
> +
> +	/**
> +	 * @swap_lock: Held while swapping word <-> cleared
> +	 */
> +	spinlock_t swap_lock;
>  } ____cacheline_aligned_in_smp;
>  
>  /**
> @@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
>  	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
>  }
>  
> +/*
> + * This one is special, since it doesn't actually clear the bit, rather it
> + * sets the corresponding bit in the ->cleared mask instead. Paired with
> + * the caller doing sbitmap_batch_clear() if a given index is full, which
> + * will clear the previously freed entries in the corresponding ->word.
> + */
> +static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
> +{
> +	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
> +
> +	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
> +}
> +
>  static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
>  					    unsigned int bitnr)
>  {
> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index 45cab6bbc1c7..f6a9553617bd 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -59,6 +59,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
>  	for (i = 0; i < sb->map_nr; i++) {
>  		sb->map[i].depth = min(depth, bits_per_word);
>  		depth -= sb->map[i].depth;
> +		spin_lock_init(&sb->map[i].swap_lock);
>  	}
>  	return 0;
>  }
> @@ -111,6 +112,57 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
>  	return nr;
>  }
>  
> +/*
> + * See if we have deferred clears that we can batch move
> + */
> +static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
> +{
> +	unsigned long mask, val;
> +	bool ret = false;
> +
> +	spin_lock(&sb->map[index].swap_lock);
> +
> +	if (!sb->map[index].cleared)
> +		goto out_unlock;
> +
> +	/*
> +	 * First get a stable cleared mask, setting the old mask to 0.
> +	 */
> +	do {
> +		mask = sb->map[index].cleared;
> +	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
> +
> +	/*
> +	 * Now clear the masked bits in our free word
> +	 */
> +	do {
> +		val = sb->map[index].word;
> +	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
> +
> +	ret = true;
> +out_unlock:
> +	spin_unlock(&sb->map[index].swap_lock);
> +	return ret;
> +}

Okay, I couldn't find any holes in this one :)

> +static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
> +				     unsigned int alloc_hint, bool round_robin)
> +{
> +	int nr;
> +
> +	do {
> +		nr = __sbitmap_get_word(&sb->map[index].word,
> +					sb->map[index].depth, alloc_hint,
> +					!round_robin);
> +		if (nr != -1)
> +			break;
> +		if (!sbitmap_deferred_clear(sb, index))
> +			break;
> +	} while (1);
> +
> +	return nr;
> +}
> +
>  int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
>  {
>  	unsigned int i, index;
> @@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
>  		alloc_hint = 0;
>  
>  	for (i = 0; i < sb->map_nr; i++) {
> -		nr = __sbitmap_get_word(&sb->map[index].word,
> -					sb->map[index].depth, alloc_hint,
> -					!round_robin);
> +		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
> +						round_robin);
>  		if (nr != -1) {
>  			nr += index << sb->shift;
>  			break;
> @@ -206,23 +257,37 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
>  }
>  EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
>  
> -unsigned int sbitmap_weight(const struct sbitmap *sb)
> +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
>  {
>  	unsigned int i, weight = 0;
>  
>  	for (i = 0; i < sb->map_nr; i++) {
>  		const struct sbitmap_word *word = &sb->map[i];
>  
> -		weight += bitmap_weight(&word->word, word->depth);
> +		if (set)
> +			weight += bitmap_weight(&word->word, word->depth);

Should probably do
			weight -= bitmap_weight(&word->cleared, word->depth);

too, right?

> +		else
> +			weight += bitmap_weight(&word->cleared, word->depth);
>  	}
>  	return weight;
>  }
> +
> +unsigned int sbitmap_weight(const struct sbitmap *sb)
> +{
> +	return __sbitmap_weight(sb, true);
> +}
>  EXPORT_SYMBOL_GPL(sbitmap_weight);
>  
> +static unsigned int sbitmap_cleared(const struct sbitmap *sb)
> +{
> +	return __sbitmap_weight(sb, false);
> +}
> +
>  void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
>  {
>  	seq_printf(m, "depth=%u\n", sb->depth);
> -	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
> +	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
> +	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
>  	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
>  	seq_printf(m, "map_nr=%u\n", sb->map_nr);
>  }
> @@ -514,7 +579,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
>  void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>  			 unsigned int cpu)
>  {
> -	sbitmap_clear_bit_unlock(&sbq->sb, nr);
> +	sbitmap_deferred_clear_bit(&sbq->sb, nr);
> +
>  	/*
>  	 * Pairs with the memory barrier in set_current_state() to ensure the
>  	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
> -- 
> 2.17.1
> 


* Re: [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 20:03   ` Omar Sandoval
@ 2018-11-30 20:10     ` Jens Axboe
  2018-11-30 21:41       ` Omar Sandoval
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2018-11-30 20:10 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: linux-block

On 11/30/18 1:03 PM, Omar Sandoval wrote:
> On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:
>> sbitmap maintains a set of words that we use to set and clear bits, with
>> each bit representing a tag for blk-mq. Even though we spread the bits
>> out and maintain a hint cache, one particular bit allocated will end up
>> being cleared in the exact same spot.
>>
>> This introduces batched clearing of bits. Instead of clearing a given
>> bit, the same bit is set in a cleared/free mask instead. If we fail
>> allocating a bit from a given word, then we check the free mask, and
>> batch move those cleared bits at that time. This trades 64 atomic bitops
>> for 2 cmpxchg().
>>
>> In a threaded poll test case, half the overhead of getting and clearing
>> tags is removed with this change. On another poll test case with a
>> single thread, performance is unchanged.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  include/linux/sbitmap.h | 31 +++++++++++++---
>>  lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
>>  2 files changed, 100 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
>> index 804a50983ec5..07f117ee19dc 100644
>> --- a/include/linux/sbitmap.h
>> +++ b/include/linux/sbitmap.h
>> @@ -30,14 +30,24 @@ struct seq_file;
>>   */
>>  struct sbitmap_word {
>>  	/**
>> -	 * @word: The bitmap word itself.
>> +	 * @depth: Number of bits being used in @word/@cleared
>>  	 */
>> -	unsigned long word;
>> +	unsigned long depth;
>>  
>>  	/**
>> -	 * @depth: Number of bits being used in @word.
>> +	 * @word: word holding free bits
>>  	 */
>> -	unsigned long depth;
>> +	unsigned long word ____cacheline_aligned_in_smp;
> 
> Still splitting up word and depth in separate cachelines?

Yeah, I mentioned that in one of the other postings, there's still a
definite win to doing that.

> Okay, I couldn't find any holes in this one :)

Good to hear that :-)

>> -unsigned int sbitmap_weight(const struct sbitmap *sb)
>> +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
>>  {
>>  	unsigned int i, weight = 0;
>>  
>>  	for (i = 0; i < sb->map_nr; i++) {
>>  		const struct sbitmap_word *word = &sb->map[i];
>>  
>> -		weight += bitmap_weight(&word->word, word->depth);
>> +		if (set)
>> +			weight += bitmap_weight(&word->word, word->depth);
> 
> Should probably do
> 			weight -= bitmap_weight(&word->cleared, word->depth);
> 
> too, right?

We only use these for the debugfs stuff, how about I just make it static
instead?


-- 
Jens Axboe



* Re: [PATCH 2/2] sbitmap: optimize wakeup check
  2018-11-30 16:01 ` [PATCH 2/2] sbitmap: optimize wakeup check Jens Axboe
@ 2018-11-30 21:37   ` Omar Sandoval
  0 siblings, 0 replies; 8+ messages in thread
From: Omar Sandoval @ 2018-11-30 21:37 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Fri, Nov 30, 2018 at 09:01:18AM -0700, Jens Axboe wrote:
> Even if we have no waiters on any of the sbitmap_queue wait states, we
> still have to loop every entry to check. We do this for every IO, so
> the cost adds up.
> 
> Shift a bit of the cost to the slow path, when we actually have waiters.
> Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain
> an internal count of how many are currently active. Then we can simply
> check this count in sbq_wake_ptr() and not have to loop if we don't
> have any sleepers.
> 
> Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI.

Reviewed-by: Omar Sandoval <osandov@fb.com>

> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq-tag.c                       | 11 ++++----
>  drivers/target/iscsi/iscsi_target_util.c | 12 +++++----
>  include/linux/sbitmap.h                  | 34 ++++++++++++++++++++++++
>  lib/sbitmap.c                            | 28 +++++++++++++++++++
>  4 files changed, 74 insertions(+), 11 deletions(-)


* Re: [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30 20:10     ` Jens Axboe
@ 2018-11-30 21:41       ` Omar Sandoval
  0 siblings, 0 replies; 8+ messages in thread
From: Omar Sandoval @ 2018-11-30 21:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Fri, Nov 30, 2018 at 01:10:47PM -0700, Jens Axboe wrote:
> On 11/30/18 1:03 PM, Omar Sandoval wrote:
> > On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:
> >> sbitmap maintains a set of words that we use to set and clear bits, with
> >> each bit representing a tag for blk-mq. Even though we spread the bits
> >> out and maintain a hint cache, one particular bit allocated will end up
> >> being cleared in the exact same spot.
> >>
> >> This introduces batched clearing of bits. Instead of clearing a given
> >> bit, the same bit is set in a cleared/free mask instead. If we fail
> >> allocating a bit from a given word, then we check the free mask, and
> >> batch move those cleared bits at that time. This trades 64 atomic bitops
> >> for 2 cmpxchg().
> >>
> >> In a threaded poll test case, half the overhead of getting and clearing
> >> tags is removed with this change. On another poll test case with a
> >> single thread, performance is unchanged.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> >>  include/linux/sbitmap.h | 31 +++++++++++++---
> >>  lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
> >>  2 files changed, 100 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> >> index 804a50983ec5..07f117ee19dc 100644
> >> --- a/include/linux/sbitmap.h
> >> +++ b/include/linux/sbitmap.h
> >> @@ -30,14 +30,24 @@ struct seq_file;
> >>   */
> >>  struct sbitmap_word {
> >>  	/**
> >> -	 * @word: The bitmap word itself.
> >> +	 * @depth: Number of bits being used in @word/@cleared
> >>  	 */
> >> -	unsigned long word;
> >> +	unsigned long depth;
> >>  
> >>  	/**
> >> -	 * @depth: Number of bits being used in @word.
> >> +	 * @word: word holding free bits
> >>  	 */
> >> -	unsigned long depth;
> >> +	unsigned long word ____cacheline_aligned_in_smp;
> > 
> > Still splitting up word and depth in separate cachelines?
> 
> Yeah, I mentioned that in one of the other postings, there's still a
> definite win to doing that.
> 
> > Okay, I couldn't find any holes in this one :)
> 
> Good to hear that :-)
> 
> >> -unsigned int sbitmap_weight(const struct sbitmap *sb)
> >> +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
> >>  {
> >>  	unsigned int i, weight = 0;
> >>  
> >>  	for (i = 0; i < sb->map_nr; i++) {
> >>  		const struct sbitmap_word *word = &sb->map[i];
> >>  
> >> -		weight += bitmap_weight(&word->word, word->depth);
> >> +		if (set)
> >> +			weight += bitmap_weight(&word->word, word->depth);
> > 
> > Should probably do
> > 			weight -= bitmap_weight(&word->cleared, word->depth);
> > 
> > too, right?
> 
> We only use these for the debugfs stuff, how about I just make it static
> instead?

Yeah, with that,

Reviewed-by: Omar Sandoval <osandov@fb.com>


* [PATCH 1/2] sbitmap: amortize cost of clearing bits
  2018-11-30  2:09 [PATCHSET v3] sbitmap optimizations Jens Axboe
@ 2018-11-30  2:09 ` Jens Axboe
  0 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2018-11-30  2:09 UTC (permalink / raw)
  To: linux-block, osandov; +Cc: Jens Axboe

sbitmap maintains a set of words that we use to set and clear bits, with
each bit representing a tag for blk-mq. Even though we spread the bits
out and maintain a hint cache, one particular bit allocated will end up
being cleared in the exact same spot.

This introduces batched clearing of bits. Instead of clearing a given
bit, the same bit is set in a cleared/free mask instead. If we fail
allocating a bit from a given word, then we check the free mask, and
batch move those cleared bits at that time. This trades 64 atomic bitops
for 2 cmpxchg().

In a threaded poll test case, half the overhead of getting and clearing
tags is removed with this change. On another poll test case with a
single thread, performance is unchanged.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 26 +++++++++++++++---
 lib/sbitmap.c           | 60 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..cec685b89998 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,19 @@ struct seq_file;
  */
 struct sbitmap_word {
 	/**
-	 * @word: The bitmap word itself.
+	 * @depth: Number of bits being used in @word/@cleared
 	 */
-	unsigned long word;
+	unsigned long depth;
 
 	/**
-	 * @depth: Number of bits being used in @word.
+	 * @word: word holding free bits
 	 */
-	unsigned long depth;
+	unsigned long word ____cacheline_aligned_in_smp;
+
+	/**
+	 * @cleared: word holding cleared bits
+	 */
+	unsigned long cleared ____cacheline_aligned_in_smp;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -310,6 +315,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+/*
+ * This one is special, since it doesn't actually clear the bit, rather it
+ * sets the corresponding bit in the ->cleared mask instead. Paired with
+ * the caller doing sbitmap_batch_clear() if a given index is full, which
+ * will clear the previously freed entries in the corresponding ->word.
+ */
+static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
+
+	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
+}
+
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 45cab6bbc1c7..2316f53f3e1d 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -111,6 +111,58 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
+/*
+ * See if we have deferred clears that we can batch move
+ */
+static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
+{
+	unsigned long mask, val;
+
+	if (!sb->map[index].cleared)
+		return false;
+
+	/*
+	 * First get a stable cleared mask, setting the old mask to 0.
+	 */
+	do {
+		mask = sb->map[index].cleared;
+	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
+
+	/*
+	 * Now clear the masked bits in our free word
+	 */
+	do {
+		val = sb->map[index].word;
+	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
+
+	/*
+	 * If someone found ->cleared == 0 before we wrote ->word, then
+	 * they could have failed when they should not have. Check for
+	 * waiters.
+	 */
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(container_of(sb, struct sbitmap_queue, sb));
+	return true;
+}
+
+static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
+				     unsigned int alloc_hint, bool round_robin)
+{
+	int nr;
+
+	do {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth, alloc_hint,
+					!round_robin);
+		if (nr != -1)
+			break;
+		if (!sbitmap_deferred_clear(sb, index))
+			break;
+	} while (1);
+
+	return nr;
+}
+
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 {
 	unsigned int i, index;
@@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth, alloc_hint,
-					!round_robin);
+		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
+						round_robin);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -514,7 +565,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
-	sbitmap_clear_bit_unlock(&sbq->sb, nr);
+	sbitmap_deferred_clear_bit(&sbq->sb, nr);
+
 	/*
 	 * Pairs with the memory barrier in set_current_state() to ensure the
 	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
-- 
2.17.1


