All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask()
@ 2021-10-07 14:35 Eric Dumazet
  2021-10-12 12:47 ` Peter Zijlstra
  2021-10-29  8:09 ` [tip: x86/apic] x86/apic: Reduce " tip-bot2 for Eric Dumazet
  0 siblings, 2 replies; 4+ messages in thread
From: Eric Dumazet @ 2021-10-07 14:35 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-kernel, Eric Dumazet, Eric Dumazet, Dave Hansen,
	Borislav Petkov, Peter Zijlstra

From: Eric Dumazet <edumazet@google.com>

Using per-cpu storage for @x86_cpu_to_logical_apicid
is not optimal.

Broadcast IPI will need at least one cache line
per cpu to access this field.

__x2apic_send_IPI_mask() is using standard bitmask operators.

By converting x86_cpu_to_logical_apicid to an array,
we divide the number of needed cache lines by 16,
because each cache line holds 16 values. The CPU
prefetcher can also kick in nicely.

Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.

Tested on a dual socket host with 256 cpus,
cost for a full broadcast is now 11 usec instead of 33 usec.

v2: use a dynamically allocated array, as suggested by Peter.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb69a8859ff10824315388aeb49c2ccfad9..e696e22d0531976f7cba72ed17443592eac72c13 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,15 @@ struct cluster_mask {
 	struct cpumask	mask;
 };
 
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
+
 static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
 static struct cluster_mask *cluster_hotplug_mask;
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 static void x2apic_send_IPI(int cpu, int vector)
 {
-	u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+	u32 dest = x86_cpu_to_logical_apicid[cpu];
 
 	/* x2apic MSRs are special and need a special fence: */
 	weak_wrmsr_fence();
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 
 		dest = 0;
 		for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
-			dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+			dest |= x86_cpu_to_logical_apicid[clustercpu];
 
 		if (!dest)
 			continue;
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)
 
 static u32 x2apic_calc_apicid(unsigned int cpu)
 {
-	return per_cpu(x86_cpu_to_logical_apicid, cpu);
+	return x86_cpu_to_logical_apicid[cpu];
 }
 
 static void init_x2apic_ldr(void)
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
 	u32 cluster, apicid = apic_read(APIC_LDR);
 	unsigned int cpu;
 
-	this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+	x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
 
 	if (cmsk)
 		goto update;
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
 
 static int x2apic_cluster_probe(void)
 {
+	u32 slots;
+
 	if (!x2apic_mode)
 		return 0;
 
+	slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+	x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+	if (!x86_cpu_to_logical_apicid)
+		return 0;
+
 	if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
 			      x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
 		pr_err("Failed to register X2APIC_PREPARE\n");
+		kfree(x86_cpu_to_logical_apicid);
+		x86_cpu_to_logical_apicid = NULL;
 		return 0;
 	}
 	init_x2apic_ldr();
-- 
2.33.0.882.g93a45727a2-goog


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask()
  2021-10-07 14:35 [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask() Eric Dumazet
@ 2021-10-12 12:47 ` Peter Zijlstra
  2021-10-29  0:14   ` Eric Dumazet
  2021-10-29  8:09 ` [tip: x86/apic] x86/apic: Reduce " tip-bot2 for Eric Dumazet
  1 sibling, 1 reply; 4+ messages in thread
From: Peter Zijlstra @ 2021-10-12 12:47 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Thomas Gleixner, linux-kernel, Eric Dumazet, Dave Hansen,
	Borislav Petkov

On Thu, Oct 07, 2021 at 07:35:56AM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> Using per-cpu storage for @x86_cpu_to_logical_apicid
> is not optimal.
> 
> Broadcast IPI will need at least one cache line
> per cpu to access this field.
> 
> __x2apic_send_IPI_mask() is using standard bitmask operators.
> 
> By converting x86_cpu_to_logical_apicid to an array,
> we divide by 16x number of needed cache lines, because
> we find 16 values per cache line. CPU prefetcher can
> kick nicely.
> 
> Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.
> 
> Tested on a dual socket host with 256 cpus,
> cost for a full broadcast is now 11 usec instead of 33 usec.
> 
> v2: use a dynamically allocated array, as suggested by Peter.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask()
  2021-10-12 12:47 ` Peter Zijlstra
@ 2021-10-29  0:14   ` Eric Dumazet
  0 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2021-10-29  0:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, Thomas Gleixner, linux-kernel, Dave Hansen,
	Borislav Petkov

On Tue, Oct 12, 2021 at 5:47 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Oct 07, 2021 at 07:35:56AM -0700, Eric Dumazet wrote:
> > From: Eric Dumazet <edumazet@google.com>
> >
> > Using per-cpu storage for @x86_cpu_to_logical_apicid
> > is not optimal.
> >
> > Broadcast IPI will need at least one cache line
> > per cpu to access this field.
> >
> > __x2apic_send_IPI_mask() is using standard bitmask operators.
> >
> > By converting x86_cpu_to_logical_apicid to an array,
> > we divide by 16x number of needed cache lines, because
> > we find 16 values per cache line. CPU prefetcher can
> > kick nicely.
> >
> > Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.
> >
> > Tested on a dual socket host with 256 cpus,
> > cost for a full broadcast is now 11 usec instead of 33 usec.
> >
> > v2: use a dynamically allocated array, as suggested by Peter.
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Has this patch been merged into the x86 maintainers' tree yet?

Thanks for reviewing !

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tip: x86/apic] x86/apic: Reduce cache line misses in __x2apic_send_IPI_mask()
  2021-10-07 14:35 [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask() Eric Dumazet
  2021-10-12 12:47 ` Peter Zijlstra
@ 2021-10-29  8:09 ` tip-bot2 for Eric Dumazet
  1 sibling, 0 replies; 4+ messages in thread
From: tip-bot2 for Eric Dumazet @ 2021-10-29  8:09 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Eric Dumazet, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the x86/apic branch of tip:

Commit-ID:     cc95a07fef06a2c7917acd827b3a8322772969eb
Gitweb:        https://git.kernel.org/tip/cc95a07fef06a2c7917acd827b3a8322772969eb
Author:        Eric Dumazet <edumazet@google.com>
AuthorDate:    Thu, 07 Oct 2021 07:35:56 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 29 Oct 2021 10:02:17 +02:00

x86/apic: Reduce cache line misses in __x2apic_send_IPI_mask()

Using per-cpu storage for @x86_cpu_to_logical_apicid is not optimal.

Broadcast IPI will need at least one cache line per cpu to access this
field.

__x2apic_send_IPI_mask() is using standard bitmask operators.

By converting x86_cpu_to_logical_apicid to an array, we divide the
number of needed cache lines by 16, because each cache line holds 16
values. The CPU prefetcher can also kick in nicely.

Also move @cluster_masks to READ_MOSTLY section to avoid false sharing.

Tested on a dual socket host with 256 cpus, cost for a full broadcast
is now 11 usec instead of 33 usec.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211007143556.574911-1-eric.dumazet@gmail.com
---
 arch/x86/kernel/apic/x2apic_cluster.c | 27 ++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb..e696e22 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,15 @@ struct cluster_mask {
 	struct cpumask	mask;
 };
 
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
+
 static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
 static struct cluster_mask *cluster_hotplug_mask;
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 static void x2apic_send_IPI(int cpu, int vector)
 {
-	u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+	u32 dest = x86_cpu_to_logical_apicid[cpu];
 
 	/* x2apic MSRs are special and need a special fence: */
 	weak_wrmsr_fence();
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 
 		dest = 0;
 		for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
-			dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+			dest |= x86_cpu_to_logical_apicid[clustercpu];
 
 		if (!dest)
 			continue;
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)
 
 static u32 x2apic_calc_apicid(unsigned int cpu)
 {
-	return per_cpu(x86_cpu_to_logical_apicid, cpu);
+	return x86_cpu_to_logical_apicid[cpu];
 }
 
 static void init_x2apic_ldr(void)
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
 	u32 cluster, apicid = apic_read(APIC_LDR);
 	unsigned int cpu;
 
-	this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+	x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
 
 	if (cmsk)
 		goto update;
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
 
 static int x2apic_cluster_probe(void)
 {
+	u32 slots;
+
 	if (!x2apic_mode)
 		return 0;
 
+	slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+	x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+	if (!x86_cpu_to_logical_apicid)
+		return 0;
+
 	if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
 			      x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
 		pr_err("Failed to register X2APIC_PREPARE\n");
+		kfree(x86_cpu_to_logical_apicid);
+		x86_cpu_to_logical_apicid = NULL;
 		return 0;
 	}
 	init_x2apic_ldr();

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-10-29  8:09 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-07 14:35 [PATCH v2] x86/apic: reduce cache line misses in __x2apic_send_IPI_mask() Eric Dumazet
2021-10-12 12:47 ` Peter Zijlstra
2021-10-29  0:14   ` Eric Dumazet
2021-10-29  8:09 ` [tip: x86/apic] x86/apic: Reduce " tip-bot2 for Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.