[PATCH v2 9/9] mm: vmalloc: Set nr_nodes/node_size based on CPU-cores

From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
To: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>
Cc: LKML <linux-kernel@vger.kernel.org>, Baoquan He <bhe@redhat.com>,
	Lorenzo Stoakes <lstoakes@gmail.com>,
	Christoph Hellwig <hch@infradead.org>,
	Matthew Wilcox <willy@infradead.org>,
	"Liam R . Howlett" <Liam.Howlett@oracle.com>,
	Dave Chinner <david@fromorbit.com>,
	"Paul E . McKenney" <paulmck@kernel.org>,
	Joel Fernandes <joel@joelfernandes.org>,
	Uladzislau Rezki <urezki@gmail.com>,
	Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Subject: [PATCH v2 9/9] mm: vmalloc: Set nr_nodes/node_size based on CPU-cores
Date: Tue, 29 Aug 2023 10:11:42 +0200	[thread overview]
Message-ID: <20230829081142.3619-10-urezki@gmail.com> (raw)
In-Reply-To: <20230829081142.3619-1-urezki@gmail.com>

The density ratio is set to 2, i.e. two users (CPUs) share one
node. For example, if there are 6 cores in a system, "nr_nodes"
is 3.

The "node_size" also depends on the number of possible CPUs.
A high-threshold limit is hard-coded and set to SZ_4M.

For 32-bit systems and for single/dual-core systems, access to
the global vmap heap is not balanced. Such small systems do not
suffer from lock contention because of their limited number of
CPU cores.

Test on AMD Ryzen Threadripper 3970X 32-Core Processor:
sudo ./test_vmalloc.sh run_test_mask=127 nr_threads=64

<default perf>
 94.17%     0.90%  [kernel]    [k] _raw_spin_lock
 93.27%    93.05%  [kernel]    [k] native_queued_spin_lock_slowpath
 74.69%     0.25%  [kernel]    [k] __vmalloc_node_range
 72.64%     0.01%  [kernel]    [k] __get_vm_area_node
 72.04%     0.89%  [kernel]    [k] alloc_vmap_area
 42.17%     0.00%  [kernel]    [k] vmalloc
 32.53%     0.00%  [kernel]    [k] __vmalloc_node
 24.91%     0.25%  [kernel]    [k] vfree
 24.32%     0.01%  [kernel]    [k] remove_vm_area
 22.63%     0.21%  [kernel]    [k] find_unlink_vmap_area
 15.51%     0.00%  [unknown]   [k] 0xffffffffc09a74ac
 14.35%     0.00%  [kernel]    [k] ret_from_fork_asm
 14.35%     0.00%  [kernel]    [k] ret_from_fork
 14.35%     0.00%  [kernel]    [k] kthread
<default perf>
   vs
<patch-series perf>
 74.32%     2.42%  [kernel]    [k] __vmalloc_node_range
 69.58%     0.01%  [kernel]    [k] vmalloc
 54.21%     1.17%  [kernel]    [k] __alloc_pages_bulk
 48.13%    47.91%  [kernel]    [k] clear_page_orig
 43.60%     0.01%  [unknown]   [k] 0xffffffffc082f16f
 32.06%     0.00%  [kernel]    [k] ret_from_fork_asm
 32.06%     0.00%  [kernel]    [k] ret_from_fork
 32.06%     0.00%  [kernel]    [k] kthread
 31.30%     0.00%  [unknown]   [k] 0xffffffffc082f889
 22.98%     4.16%  [kernel]    [k] vfree
 14.36%     0.28%  [kernel]    [k] __get_vm_area_node
 13.43%     3.35%  [kernel]    [k] alloc_vmap_area
 10.86%     0.04%  [kernel]    [k] remove_vm_area
  8.89%     2.75%  [kernel]    [k] _raw_spin_lock
  7.19%     0.00%  [unknown]   [k] 0xffffffffc082fba3
  6.65%     1.37%  [kernel]    [k] free_unref_page
  6.13%     6.11%  [kernel]    [k] native_queued_spin_lock_slowpath
<patch-series perf>

This confirms that the native_queued_spin_lock_slowpath bottleneck
can be considered negligible for the patch-series version.

The throughput is ~16x higher (first run: default, 24m3s; second
run: patch-series, 1m28s):

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=127 nr_threads=64
Run the test with following parameters: run_test_mask=127 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    24m3.305s
user    0m0.361s
sys     0m0.013s
urezki@pc638:~$

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=127 nr_threads=64
Run the test with following parameters: run_test_mask=127 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    1m28.382s
user    0m0.014s
sys     0m0.026s
urezki@pc638:~$

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9cce012aecdb..08990f630c21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -796,6 +796,9 @@ struct vmap_node {
 	atomic_t fill_in_progress;
 };
 
+#define MAX_NODES U8_MAX	/* Upper clamp applied to nr_nodes. */
+#define MAX_NODE_SIZE SZ_4M	/* NOTE(review): unreferenced here; node_size cap open-codes SZ_4M. */
+
 static struct vmap_node *nodes, snode;
 static __read_mostly unsigned int nr_nodes = 1;
 static __read_mostly unsigned int node_size = 1;
@@ -4803,11 +4806,24 @@ static void vmap_init_free_space(void)
 	}
 }
 
+static unsigned int calculate_nr_nodes(void)
+{
+	unsigned int cpus = num_present_cpus();
+
+	/* Zero or one present CPU: use the possible-CPU count instead. */
+	if (cpus < 2)
+		cpus = num_possible_cpus();
+
+	/* Density factor: one node per two CPUs, kept in [1, MAX_NODES]. */
+	return clamp_t(unsigned int, cpus / 2, 1, MAX_NODES);
+}
+
 static void vmap_init_nodes(void)
 {
 	struct vmap_node *vn;
 	int i;
 
+	nr_nodes = calculate_nr_nodes();
 	nodes = &snode;
 
 	if (nr_nodes > 1) {
@@ -4830,6 +4846,16 @@ static void vmap_init_nodes(void)
 		INIT_LIST_HEAD(&vn->free.head);
 		spin_lock_init(&vn->free.lock);
 	}
+
+	/*
+	 * Scale a node size to number of CPUs. Each power of two
+	 * value doubles a node size. A high-threshold limit is set
+	 * to 4M.
+	 */
+#if BITS_PER_LONG == 64
+	if (nr_nodes > 1)
+		node_size = min(SZ_64K << fls(num_possible_cpus()), SZ_4M);
+#endif
 }
 
 void __init vmalloc_init(void)
-- 
2.30.2