From: John Garry <john.garry@huawei.com>
To: <axboe@kernel.dk>, <linux-block@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <linux-scsi@vger.kernel.org>,
John Garry <john.garry@huawei.com>
Subject: [RFC PATCH 2/2] sbitmap: Spread sbitmap word allocation over NUMA nodes
Date: Tue, 10 May 2022 19:14:34 +0800 [thread overview]
Message-ID: <1652181274-136198-3-git-send-email-john.garry@huawei.com> (raw)
In-Reply-To: <1652181274-136198-1-git-send-email-john.garry@huawei.com>
Currently sbitmap words are allocated in a single array. That array is
allocated at the NUMA node passed to sbitmap_init_node().
However often the sbitmap will be accessed by all the CPUs in the system -
for example, when BLK_MQ_F_TAG_HCTX_SHARED is set for the blk-mq tagset.
This can lead to performance issues where all CPUs in the system are doing
cross-NUMA node accesses to read/set/unset sbitmap tags.
Improve this by spreading the word allocations across all NUMA nodes as
evenly as possible. We set the per-CPU hint to fall within range of words
allocated for the NUMA node to which that CPU belongs.
Known issues:
- sbitmap resize does not work well for this scheme
- Updating the hint on sbitmap_get() failure, and on sbitmap_put() when the
hint is outside the range of the CPU's NUMA node words, needs improvement.
- Add intelligence for sub-arrays to be allocated at a single node, e.g.
when node != NUMA_NO_NODE in sbitmap_init_node()
Signed-off-by: John Garry <john.garry@huawei.com>
---
include/linux/sbitmap.h | 5 ++++
lib/sbitmap.c | 63 +++++++++++++++++++++++++++++++++--------
2 files changed, 56 insertions(+), 12 deletions(-)
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 46268f391e32..6d897032dbc6 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -60,6 +60,11 @@ struct sbitmap {
*/
unsigned int map_nr;
+ /**
+ * @map_nr_per_node: Number of words being used per NUMA node.
+ */
+ unsigned int map_nr_per_node;
+
/**
* @round_robin: Allocate bits in strict round-robin order.
*/
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 64fb9800ed8c..99c87fbfa1a1 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -8,6 +8,17 @@
#include <linux/random.h>
#include <linux/sbitmap.h>
#include <linux/seq_file.h>
+#include <linux/mm.h>
+
+static unsigned int sbitmap_get_new_hint(struct sbitmap *sb, int cpu)
+{
+ unsigned int shift = sb->shift;
+ unsigned int map_nr_per_node = sb->map_nr_per_node;
+ unsigned int bit_per_node = map_nr_per_node << shift;
+ unsigned int hint_base = bit_per_node * cpu_to_node(cpu);
+
+ return hint_base + (prandom_u32() % bit_per_node);
+}
static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
{
@@ -20,8 +31,10 @@ static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
if (depth && !sb->round_robin) {
int i;
- for_each_possible_cpu(i)
- *per_cpu_ptr(sb->alloc_hint, i) = prandom_u32() % depth;
+ for_each_possible_cpu(i) {
+ *per_cpu_ptr(sb->alloc_hint, i) =
+ sbitmap_get_new_hint(sb, i);
+ }
}
return 0;
}
@@ -86,7 +99,8 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
{
unsigned int bits_per_word;
struct sbitmap_word *map;
- int index;
+ int index, num_nodes = num_online_nodes();
+ int nid, map_nr_cnt;
if (shift < 0)
shift = sbitmap_calculate_shift(depth);
@@ -105,6 +119,11 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
return 0;
}
+ if (sb->map_nr < num_nodes) {
+ sb->map_nr_per_node = 1;
+ } else {
+ sb->map_nr_per_node = sb->map_nr / num_nodes;
+ }
if (alloc_hint) {
if (init_alloc_hint(sb, flags))
return -ENOMEM;
@@ -113,23 +132,43 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
}
sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
- if (!sb->map) {
- free_percpu(sb->alloc_hint);
- return -ENOMEM;
- }
-
- map = kvzalloc_node(sb->map_nr * sizeof(**sb->map), flags, node);
- if (!map)
- return -ENOMEM;
+ if (!sb->map)
+ goto err_map;
- for (index = 0; index < sb->map_nr; index++, map++) {
+ for (index = 0, nid = 0; index < sb->map_nr; index++, map++, map_nr_cnt++) {
struct sbitmap_word **_map;
+ if ((index % sb->map_nr_per_node) == 0) {
+ int cnt;
+
+ if (index == 0) {
+ cnt = sb->map_nr_per_node +
+ (sb->map_nr % sb->map_nr_per_node);
+ } else {
+ cnt = sb->map_nr_per_node;
+ }
+
+ map = kvzalloc_node(cnt * sizeof(**sb->map), flags, nid);
+ if (!map)
+ goto err_map_numa;
+ nid++;
+ }
+
_map = &sb->map[index];
*_map = map;
}
return 0;
+err_map_numa:
+ for (index = 0; index < sb->map_nr; index++, map++) {
+ if ((index % sb->map_nr_per_node) == 0) {
+ kfree(map);
+ }
+ }
+err_map:
+ free_percpu(sb->alloc_hint);
+
+ return -ENOMEM;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);
--
2.26.2
next prev parent reply other threads:[~2022-05-10 11:21 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-05-10 11:14 [RFC PATCH 0/2] sbitmap: NUMA node spreading John Garry
2022-05-10 11:14 ` [RFC PATCH 1/2] sbitmap: Make sbitmap.map a double pointer John Garry
2022-05-10 11:14 ` John Garry [this message]
2022-05-10 12:50 ` [RFC PATCH 0/2] sbitmap: NUMA node spreading Jens Axboe
2022-05-10 13:44 ` John Garry
2022-05-10 14:34 ` Jens Axboe
2022-05-10 15:03 ` John Garry
2022-05-11 2:07 ` Ming Lei
2022-05-11 9:57 ` John Garry
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1652181274-136198-3-git-send-email-john.garry@huawei.com \
--to=john.garry@huawei.com \
--cc=axboe@kernel.dk \
--cc=linux-block@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).