All of lore.kernel.org
 help / color / mirror / Atom feed
From: Huang Shijie <shijie@os.amperecomputing.com>
To: gregkh@linuxfoundation.org
Cc: patches@amperecomputing.com, rafael@kernel.org,
	paul.walmsley@sifive.com, palmer@dabbelt.com,
	aou@eecs.berkeley.edu, yury.norov@gmail.com, kuba@kernel.org,
	vschneid@redhat.com, mingo@kernel.org, akpm@linux-foundation.org,
	vbabka@suse.cz, rppt@kernel.org, tglx@linutronix.de,
	jpoimboe@kernel.org, ndesaulniers@google.com,
	mikelley@microsoft.com, mhiramat@kernel.org, arnd@arndb.de,
	linux-kernel@vger.kernel.org, linux-riscv@lists.infradead.org,
	linux-arm-kernel@lists.infradead.org, catalin.marinas@arm.com,
	will@kernel.org, mark.rutland@arm.com, mpe@ellerman.id.au,
	linuxppc-dev@lists.ozlabs.org, chenhuacai@kernel.org,
	jiaxun.yang@flygoat.com, linux-mips@vger.kernel.org,
	cl@os.amperecomputing.com,
	Huang Shijie <shijie@os.amperecomputing.com>
Subject: [PATCH] init: refactor the generic cpu_to_node for NUMA
Date: Thu, 18 Jan 2024 11:14:12 +0800	[thread overview]
Message-ID: <20240118031412.3300-1-shijie@os.amperecomputing.com> (raw)

(0) The ARCHs which support NUMA are:
       arm64, loongarch, powerpc, riscv,
       sparc, mips, s390, x86,

(1) Some ARCHs in (0) override the generic cpu_to_node(), such as:
       sparc, mips, s390, x86.

    Since these ARCHs have their own cpu_to_node(), we do not care
    about them.

(2) The ARCHs enable NUMA and use the generic cpu_to_node.
    From (0) and (1), we know that four ARCHs support NUMA and
    use the generic cpu_to_node:
        arm64, loongarch, powerpc, riscv,

    The generic cpu_to_node depends on percpu "numa_node".

    (2.1) The loongarch sets "numa_node" in:
          start_kernel --> smp_prepare_boot_cpu()

    (2.2) The arm64, powerpc, riscv set "numa_node" in:
       	  start_kernel --> arch_call_rest_init() --> rest_init()
       	               --> kernel_init() --> kernel_init_freeable()
                       --> smp_prepare_cpus()

    (2.3) The first place calling the cpu_to_node() is early_trace_init():
          start_kernel --> early_trace_init()--> __ring_buffer_alloc()
	               --> rb_allocate_cpu_buffer()

    (2.4) So it is safe for loongarch. But for arm64, powerpc and riscv,
          there are at least four places in the common code where
	  the cpu_to_node() is called before it is initialized:
	   a.) early_trace_init()         in kernel/trace/trace.c
	   b.) sched_init()               in kernel/sched/core.c
	   c.) init_sched_fair_class()    in kernel/sched/fair.c
	   d.) workqueue_init_early()     in kernel/workqueue.c

(3) In order to fix the issue, the patch refactors the generic cpu_to_node:
    (3.1) change cpu_to_node to a function pointer,
          and export it for kernel modules.

    (3.2) introduce _cpu_to_node() which is the original cpu_to_node().

    (3.3) introduce smp_prepare_boot_cpu_start() to wrap the original
          smp_prepare_boot_cpu(), and set cpu_to_node with
	  early_cpu_to_node which works fine for arm64, powerpc,
	  riscv and loongarch.

    (3.4) introduce smp_prepare_cpus_done() to wrap the original
          smp_prepare_cpus().
	  The "numa_node" is ready after smp_prepare_cpus(),
	  then set cpu_to_node with _cpu_to_node().

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 drivers/base/arch_numa.c | 11 +++++++++++
 include/linux/topology.h |  6 ++----
 init/main.c              | 29 +++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 5b59d133b6af..867a477fa975 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -61,6 +61,17 @@ EXPORT_SYMBOL(cpumask_of_node);
 
 #endif
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+int _cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+int (*cpu_to_node)(int cpu);
+EXPORT_SYMBOL(cpu_to_node);
+#endif
+#endif
+
 static void numa_update_cpu(unsigned int cpu, bool remove)
 {
 	int nid = cpu_to_node(cpu);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..e7ce2bae11dd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -91,10 +91,8 @@ static inline int numa_node_id(void)
 #endif
 
 #ifndef cpu_to_node
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(numa_node, cpu);
-}
+extern int (*cpu_to_node)(int cpu);
+extern int _cpu_to_node(int cpu);
 #endif
 
 #ifndef set_numa_node
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..b142e9c51161 100644
--- a/init/main.c
+++ b/init/main.c
@@ -870,6 +870,18 @@ static void __init print_unknown_bootoptions(void)
 	memblock_free(unknown_options, len);
 }
 
+static void __init smp_prepare_boot_cpu_start(void)
+{
+	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* The early_cpu_to_node should be ready now. */
+	cpu_to_node = early_cpu_to_node;
+#endif
+#endif
+}
+
 asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
 void start_kernel(void)
 {
@@ -899,7 +911,7 @@ void start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
-	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+	smp_prepare_boot_cpu_start();
 	boot_cpu_hotplug_init();
 
 	pr_notice("Kernel command line: %s\n", saved_command_line);
@@ -1519,6 +1531,19 @@ void __init console_on_rootfs(void)
 	fput(file);
 }
 
+static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
+{
+	/* Different ARCHs may override smp_prepare_cpus() */
+	smp_prepare_cpus(setup_max_cpus);
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* Change to the formal function. */
+	cpu_to_node = _cpu_to_node;
+#endif
+#endif
+}
+
 static noinline void __init kernel_init_freeable(void)
 {
 	/* Now the scheduler is fully set up and can do blocking allocations */
@@ -1531,7 +1556,7 @@ static noinline void __init kernel_init_freeable(void)
 
 	cad_pid = get_pid(task_pid(current));
 
-	smp_prepare_cpus(setup_max_cpus);
+	smp_prepare_cpus_done(setup_max_cpus);
 
 	workqueue_init();
 
-- 
2.40.1


WARNING: multiple messages have this Message-ID (diff)
From: Huang Shijie <shijie@os.amperecomputing.com>
To: gregkh@linuxfoundation.org
Cc: patches@amperecomputing.com, rafael@kernel.org,
	paul.walmsley@sifive.com, palmer@dabbelt.com,
	aou@eecs.berkeley.edu, yury.norov@gmail.com, kuba@kernel.org,
	vschneid@redhat.com, mingo@kernel.org, akpm@linux-foundation.org,
	vbabka@suse.cz, rppt@kernel.org, tglx@linutronix.de,
	jpoimboe@kernel.org, ndesaulniers@google.com,
	mikelley@microsoft.com, mhiramat@kernel.org, arnd@arndb.de,
	linux-kernel@vger.kernel.org, linux-riscv@lists.infradead.org,
	linux-arm-kernel@lists.infradead.org, catalin.marinas@arm.com,
	will@kernel.org, mark.rutland@arm.com, mpe@ellerman.id.au,
	linuxppc-dev@lists.ozlabs.org, chenhuacai@kernel.org,
	jiaxun.yang@flygoat.com, linux-mips@vger.kernel.org,
	cl@os.amperecomputing.com,
	Huang Shijie <shijie@os.amperecomputing.com>
Subject: [PATCH] init: refactor the generic cpu_to_node for NUMA
Date: Thu, 18 Jan 2024 11:14:12 +0800	[thread overview]
Message-ID: <20240118031412.3300-1-shijie@os.amperecomputing.com> (raw)

(0) The ARCHs which support NUMA are:
       arm64, loongarch, powerpc, riscv,
       sparc, mips, s390, x86,

(1) Some ARCHs in (0) override the generic cpu_to_node(), such as:
       sparc, mips, s390, x86.

    Since these ARCHs have their own cpu_to_node(), we do not care
    about them.

(2) The ARCHs enable NUMA and use the generic cpu_to_node.
    From (0) and (1), we know that four ARCHs support NUMA and
    use the generic cpu_to_node:
        arm64, loongarch, powerpc, riscv,

    The generic cpu_to_node depends on percpu "numa_node".

    (2.1) The loongarch sets "numa_node" in:
          start_kernel --> smp_prepare_boot_cpu()

    (2.2) The arm64, powerpc, riscv set "numa_node" in:
       	  start_kernel --> arch_call_rest_init() --> rest_init()
       	               --> kernel_init() --> kernel_init_freeable()
                       --> smp_prepare_cpus()

    (2.3) The first place calling the cpu_to_node() is early_trace_init():
          start_kernel --> early_trace_init()--> __ring_buffer_alloc()
	               --> rb_allocate_cpu_buffer()

    (2.4) So it is safe for loongarch. But for arm64, powerpc and riscv,
          there are at least four places in the common code where
	  the cpu_to_node() is called before it is initialized:
	   a.) early_trace_init()         in kernel/trace/trace.c
	   b.) sched_init()               in kernel/sched/core.c
	   c.) init_sched_fair_class()    in kernel/sched/fair.c
	   d.) workqueue_init_early()     in kernel/workqueue.c

(3) In order to fix the issue, the patch refactors the generic cpu_to_node:
    (3.1) change cpu_to_node to a function pointer,
          and export it for kernel modules.

    (3.2) introduce _cpu_to_node() which is the original cpu_to_node().

    (3.3) introduce smp_prepare_boot_cpu_start() to wrap the original
          smp_prepare_boot_cpu(), and set cpu_to_node with
	  early_cpu_to_node which works fine for arm64, powerpc,
	  riscv and loongarch.

    (3.4) introduce smp_prepare_cpus_done() to wrap the original
          smp_prepare_cpus().
	  The "numa_node" is ready after smp_prepare_cpus(),
	  then set cpu_to_node with _cpu_to_node().

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 drivers/base/arch_numa.c | 11 +++++++++++
 include/linux/topology.h |  6 ++----
 init/main.c              | 29 +++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 5b59d133b6af..867a477fa975 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -61,6 +61,17 @@ EXPORT_SYMBOL(cpumask_of_node);
 
 #endif
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+int _cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+int (*cpu_to_node)(int cpu);
+EXPORT_SYMBOL(cpu_to_node);
+#endif
+#endif
+
 static void numa_update_cpu(unsigned int cpu, bool remove)
 {
 	int nid = cpu_to_node(cpu);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..e7ce2bae11dd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -91,10 +91,8 @@ static inline int numa_node_id(void)
 #endif
 
 #ifndef cpu_to_node
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(numa_node, cpu);
-}
+extern int (*cpu_to_node)(int cpu);
+extern int _cpu_to_node(int cpu);
 #endif
 
 #ifndef set_numa_node
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..b142e9c51161 100644
--- a/init/main.c
+++ b/init/main.c
@@ -870,6 +870,18 @@ static void __init print_unknown_bootoptions(void)
 	memblock_free(unknown_options, len);
 }
 
+static void __init smp_prepare_boot_cpu_start(void)
+{
+	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* The early_cpu_to_node should be ready now. */
+	cpu_to_node = early_cpu_to_node;
+#endif
+#endif
+}
+
 asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
 void start_kernel(void)
 {
@@ -899,7 +911,7 @@ void start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
-	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+	smp_prepare_boot_cpu_start();
 	boot_cpu_hotplug_init();
 
 	pr_notice("Kernel command line: %s\n", saved_command_line);
@@ -1519,6 +1531,19 @@ void __init console_on_rootfs(void)
 	fput(file);
 }
 
+static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
+{
+	/* Different ARCHs may override smp_prepare_cpus() */
+	smp_prepare_cpus(setup_max_cpus);
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* Change to the formal function. */
+	cpu_to_node = _cpu_to_node;
+#endif
+#endif
+}
+
 static noinline void __init kernel_init_freeable(void)
 {
 	/* Now the scheduler is fully set up and can do blocking allocations */
@@ -1531,7 +1556,7 @@ static noinline void __init kernel_init_freeable(void)
 
 	cad_pid = get_pid(task_pid(current));
 
-	smp_prepare_cpus(setup_max_cpus);
+	smp_prepare_cpus_done(setup_max_cpus);
 
 	workqueue_init();
 
-- 
2.40.1


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

WARNING: multiple messages have this Message-ID (diff)
From: Huang Shijie <shijie@os.amperecomputing.com>
To: gregkh@linuxfoundation.org
Cc: patches@amperecomputing.com, rafael@kernel.org,
	paul.walmsley@sifive.com, palmer@dabbelt.com,
	aou@eecs.berkeley.edu, yury.norov@gmail.com, kuba@kernel.org,
	vschneid@redhat.com, mingo@kernel.org, akpm@linux-foundation.org,
	vbabka@suse.cz, rppt@kernel.org, tglx@linutronix.de,
	jpoimboe@kernel.org, ndesaulniers@google.com,
	mikelley@microsoft.com, mhiramat@kernel.org, arnd@arndb.de,
	linux-kernel@vger.kernel.org, linux-riscv@lists.infradead.org,
	linux-arm-kernel@lists.infradead.org, catalin.marinas@arm.com,
	will@kernel.org, mark.rutland@arm.com, mpe@ellerman.id.au,
	linuxppc-dev@lists.ozlabs.org, chenhuacai@kernel.org,
	jiaxun.yang@flygoat.com, linux-mips@vger.kernel.org,
	cl@os.amperecomputing.com,
	Huang Shijie <shijie@os.amperecomputing.com>
Subject: [PATCH] init: refactor the generic cpu_to_node for NUMA
Date: Thu, 18 Jan 2024 11:14:12 +0800	[thread overview]
Message-ID: <20240118031412.3300-1-shijie@os.amperecomputing.com> (raw)

(0) The ARCHs which support NUMA are:
       arm64, loongarch, powerpc, riscv,
       sparc, mips, s390, x86,

(1) Some ARCHs in (0) override the generic cpu_to_node(), such as:
       sparc, mips, s390, x86.

    Since these ARCHs have their own cpu_to_node(), we do not care
    about them.

(2) The ARCHs enable NUMA and use the generic cpu_to_node.
    From (0) and (1), we know that four ARCHs support NUMA and
    use the generic cpu_to_node:
        arm64, loongarch, powerpc, riscv,

    The generic cpu_to_node depends on percpu "numa_node".

    (2.1) The loongarch sets "numa_node" in:
          start_kernel --> smp_prepare_boot_cpu()

    (2.2) The arm64, powerpc, riscv set "numa_node" in:
       	  start_kernel --> arch_call_rest_init() --> rest_init()
       	               --> kernel_init() --> kernel_init_freeable()
                       --> smp_prepare_cpus()

    (2.3) The first place calling the cpu_to_node() is early_trace_init():
          start_kernel --> early_trace_init()--> __ring_buffer_alloc()
	               --> rb_allocate_cpu_buffer()

    (2.4) So it is safe for loongarch. But for arm64, powerpc and riscv,
          there are at least four places in the common code where
	  the cpu_to_node() is called before it is initialized:
	   a.) early_trace_init()         in kernel/trace/trace.c
	   b.) sched_init()               in kernel/sched/core.c
	   c.) init_sched_fair_class()    in kernel/sched/fair.c
	   d.) workqueue_init_early()     in kernel/workqueue.c

(3) In order to fix the issue, the patch refactors the generic cpu_to_node:
    (3.1) change cpu_to_node to a function pointer,
          and export it for kernel modules.

    (3.2) introduce _cpu_to_node() which is the original cpu_to_node().

    (3.3) introduce smp_prepare_boot_cpu_start() to wrap the original
          smp_prepare_boot_cpu(), and set cpu_to_node with
	  early_cpu_to_node which works fine for arm64, powerpc,
	  riscv and loongarch.

    (3.4) introduce smp_prepare_cpus_done() to wrap the original
          smp_prepare_cpus().
	  The "numa_node" is ready after smp_prepare_cpus(),
	  then set cpu_to_node with _cpu_to_node().

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 drivers/base/arch_numa.c | 11 +++++++++++
 include/linux/topology.h |  6 ++----
 init/main.c              | 29 +++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 5b59d133b6af..867a477fa975 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -61,6 +61,17 @@ EXPORT_SYMBOL(cpumask_of_node);
 
 #endif
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+int _cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+int (*cpu_to_node)(int cpu);
+EXPORT_SYMBOL(cpu_to_node);
+#endif
+#endif
+
 static void numa_update_cpu(unsigned int cpu, bool remove)
 {
 	int nid = cpu_to_node(cpu);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..e7ce2bae11dd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -91,10 +91,8 @@ static inline int numa_node_id(void)
 #endif
 
 #ifndef cpu_to_node
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(numa_node, cpu);
-}
+extern int (*cpu_to_node)(int cpu);
+extern int _cpu_to_node(int cpu);
 #endif
 
 #ifndef set_numa_node
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..b142e9c51161 100644
--- a/init/main.c
+++ b/init/main.c
@@ -870,6 +870,18 @@ static void __init print_unknown_bootoptions(void)
 	memblock_free(unknown_options, len);
 }
 
+static void __init smp_prepare_boot_cpu_start(void)
+{
+	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* The early_cpu_to_node should be ready now. */
+	cpu_to_node = early_cpu_to_node;
+#endif
+#endif
+}
+
 asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
 void start_kernel(void)
 {
@@ -899,7 +911,7 @@ void start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
-	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+	smp_prepare_boot_cpu_start();
 	boot_cpu_hotplug_init();
 
 	pr_notice("Kernel command line: %s\n", saved_command_line);
@@ -1519,6 +1531,19 @@ void __init console_on_rootfs(void)
 	fput(file);
 }
 
+static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
+{
+	/* Different ARCHs may override smp_prepare_cpus() */
+	smp_prepare_cpus(setup_max_cpus);
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* Change to the formal function. */
+	cpu_to_node = _cpu_to_node;
+#endif
+#endif
+}
+
 static noinline void __init kernel_init_freeable(void)
 {
 	/* Now the scheduler is fully set up and can do blocking allocations */
@@ -1531,7 +1556,7 @@ static noinline void __init kernel_init_freeable(void)
 
 	cad_pid = get_pid(task_pid(current));
 
-	smp_prepare_cpus(setup_max_cpus);
+	smp_prepare_cpus_done(setup_max_cpus);
 
 	workqueue_init();
 
-- 
2.40.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

WARNING: multiple messages have this Message-ID (diff)
From: Huang Shijie <shijie@os.amperecomputing.com>
To: gregkh@linuxfoundation.org
Cc: mark.rutland@arm.com, rafael@kernel.org, catalin.marinas@arm.com,
	jiaxun.yang@flygoat.com, mikelley@microsoft.com,
	linux-riscv@lists.infradead.org, will@kernel.org,
	mingo@kernel.org, vschneid@redhat.com, arnd@arndb.de,
	chenhuacai@kernel.org, cl@os.amperecomputing.com, vbabka@suse.cz,
	kuba@kernel.org, patches@amperecomputing.com,
	linux-mips@vger.kernel.org, aou@eecs.berkeley.edu,
	yury.norov@gmail.com, paul.walmsley@sifive.com,
	tglx@linutronix.de, jpoimboe@kernel.org,
	linux-arm-kernel@lists.infradead.org,
	Huang Shijie <shijie@os.amperecomputing.com>,
	ndesaulniers@google.com, linux-kernel@vger.kernel.org,
	palmer@dabbelt.com, mhiramat@kernel.org,
	akpm@linux-foundation.org, linuxppc-dev@lists.ozlabs.org,
	rppt@kernel.org
Subject: [PATCH] init: refactor the generic cpu_to_node for NUMA
Date: Thu, 18 Jan 2024 11:14:12 +0800	[thread overview]
Message-ID: <20240118031412.3300-1-shijie@os.amperecomputing.com> (raw)

(0) The ARCHs which support NUMA are:
       arm64, loongarch, powerpc, riscv,
       sparc, mips, s390, x86,

(1) Some ARCHs in (0) override the generic cpu_to_node(), such as:
       sparc, mips, s390, x86.

    Since these ARCHs have their own cpu_to_node(), we do not care
    about them.

(2) The ARCHs enable NUMA and use the generic cpu_to_node.
    From (0) and (1), we know that four ARCHs support NUMA and
    use the generic cpu_to_node:
        arm64, loongarch, powerpc, riscv,

    The generic cpu_to_node depends on percpu "numa_node".

    (2.1) The loongarch sets "numa_node" in:
          start_kernel --> smp_prepare_boot_cpu()

    (2.2) The arm64, powerpc, riscv set "numa_node" in:
       	  start_kernel --> arch_call_rest_init() --> rest_init()
       	               --> kernel_init() --> kernel_init_freeable()
                       --> smp_prepare_cpus()

    (2.3) The first place calling the cpu_to_node() is early_trace_init():
          start_kernel --> early_trace_init()--> __ring_buffer_alloc()
	               --> rb_allocate_cpu_buffer()

    (2.4) So it is safe for loongarch. But for arm64, powerpc and riscv,
          there are at least four places in the common code where
	  the cpu_to_node() is called before it is initialized:
	   a.) early_trace_init()         in kernel/trace/trace.c
	   b.) sched_init()               in kernel/sched/core.c
	   c.) init_sched_fair_class()    in kernel/sched/fair.c
	   d.) workqueue_init_early()     in kernel/workqueue.c

(3) In order to fix the issue, the patch refactors the generic cpu_to_node:
    (3.1) change cpu_to_node to a function pointer,
          and export it for kernel modules.

    (3.2) introduce _cpu_to_node() which is the original cpu_to_node().

    (3.3) introduce smp_prepare_boot_cpu_start() to wrap the original
          smp_prepare_boot_cpu(), and set cpu_to_node with
	  early_cpu_to_node which works fine for arm64, powerpc,
	  riscv and loongarch.

    (3.4) introduce smp_prepare_cpus_done() to wrap the original
          smp_prepare_cpus().
	  The "numa_node" is ready after smp_prepare_cpus(),
	  then set cpu_to_node with _cpu_to_node().

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 drivers/base/arch_numa.c | 11 +++++++++++
 include/linux/topology.h |  6 ++----
 init/main.c              | 29 +++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 5b59d133b6af..867a477fa975 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -61,6 +61,17 @@ EXPORT_SYMBOL(cpumask_of_node);
 
 #endif
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+int _cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+int (*cpu_to_node)(int cpu);
+EXPORT_SYMBOL(cpu_to_node);
+#endif
+#endif
+
 static void numa_update_cpu(unsigned int cpu, bool remove)
 {
 	int nid = cpu_to_node(cpu);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..e7ce2bae11dd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -91,10 +91,8 @@ static inline int numa_node_id(void)
 #endif
 
 #ifndef cpu_to_node
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(numa_node, cpu);
-}
+extern int (*cpu_to_node)(int cpu);
+extern int _cpu_to_node(int cpu);
 #endif
 
 #ifndef set_numa_node
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..b142e9c51161 100644
--- a/init/main.c
+++ b/init/main.c
@@ -870,6 +870,18 @@ static void __init print_unknown_bootoptions(void)
 	memblock_free(unknown_options, len);
 }
 
+static void __init smp_prepare_boot_cpu_start(void)
+{
+	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* The early_cpu_to_node should be ready now. */
+	cpu_to_node = early_cpu_to_node;
+#endif
+#endif
+}
+
 asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
 void start_kernel(void)
 {
@@ -899,7 +911,7 @@ void start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
-	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+	smp_prepare_boot_cpu_start();
 	boot_cpu_hotplug_init();
 
 	pr_notice("Kernel command line: %s\n", saved_command_line);
@@ -1519,6 +1531,19 @@ void __init console_on_rootfs(void)
 	fput(file);
 }
 
+static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
+{
+	/* Different ARCHs may override smp_prepare_cpus() */
+	smp_prepare_cpus(setup_max_cpus);
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+	/* Change to the formal function. */
+	cpu_to_node = _cpu_to_node;
+#endif
+#endif
+}
+
 static noinline void __init kernel_init_freeable(void)
 {
 	/* Now the scheduler is fully set up and can do blocking allocations */
@@ -1531,7 +1556,7 @@ static noinline void __init kernel_init_freeable(void)
 
 	cad_pid = get_pid(task_pid(current));
 
-	smp_prepare_cpus(setup_max_cpus);
+	smp_prepare_cpus_done(setup_max_cpus);
 
 	workqueue_init();
 
-- 
2.40.1


             reply	other threads:[~2024-01-18  3:15 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-18  3:14 Huang Shijie [this message]
2024-01-18  3:14 ` [PATCH] init: refactor the generic cpu_to_node for NUMA Huang Shijie
2024-01-18  3:14 ` Huang Shijie
2024-01-18  3:14 ` Huang Shijie
2024-01-18  9:27 ` Greg KH
2024-01-18  9:27   ` Greg KH
2024-01-18  9:27   ` Greg KH
2024-01-18  9:27   ` Greg KH
2024-01-18  9:43   ` Shijie Huang
2024-01-18  9:43     ` Shijie Huang
2024-01-18  9:43     ` Shijie Huang
2024-01-18  9:43     ` Shijie Huang
2024-01-28 17:58 ` kernel test robot
2024-01-28 17:58   ` kernel test robot
2024-01-28 17:58   ` kernel test robot
2024-01-28 17:58   ` kernel test robot
2024-01-28 19:43 ` kernel test robot
2024-01-28 19:43   ` kernel test robot
2024-01-28 19:43   ` kernel test robot
2024-01-28 19:43   ` kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240118031412.3300-1-shijie@os.amperecomputing.com \
    --to=shijie@os.amperecomputing.com \
    --cc=akpm@linux-foundation.org \
    --cc=aou@eecs.berkeley.edu \
    --cc=arnd@arndb.de \
    --cc=catalin.marinas@arm.com \
    --cc=chenhuacai@kernel.org \
    --cc=cl@os.amperecomputing.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=jiaxun.yang@flygoat.com \
    --cc=jpoimboe@kernel.org \
    --cc=kuba@kernel.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mips@vger.kernel.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=mark.rutland@arm.com \
    --cc=mhiramat@kernel.org \
    --cc=mikelley@microsoft.com \
    --cc=mingo@kernel.org \
    --cc=mpe@ellerman.id.au \
    --cc=ndesaulniers@google.com \
    --cc=palmer@dabbelt.com \
    --cc=patches@amperecomputing.com \
    --cc=paul.walmsley@sifive.com \
    --cc=rafael@kernel.org \
    --cc=rppt@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=vbabka@suse.cz \
    --cc=vschneid@redhat.com \
    --cc=will@kernel.org \
    --cc=yury.norov@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.