* [PATCH 0/3] x86: clflush related adjustments
From: Jan Beulich @ 2016-02-10 12:40 UTC
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

1: avoid flush IPI when possible
2: use CLFLUSHOPT when available
3: rename X86_FEATURE_{CLFLSH -> CLFLUSH}

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH 1/3] x86: avoid flush IPI when possible
From: Jan Beulich @ 2016-02-10 12:56 UTC
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
there's no need to IPI other CPUs if this is the only flushing being
requested. (As a secondary change, move a local variable into the scope
where it's actually needed.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -91,9 +91,8 @@ void write_cr3(unsigned long cr3)
     local_irq_restore(flags);
 }
 
-void flush_area_local(const void *va, unsigned int flags)
+unsigned int flush_area_local(const void *va, unsigned int flags)
 {
-    const struct cpuinfo_x86 *c = &current_cpu_data;
     unsigned int order = (flags - 1) & FLUSH_ORDER_MASK;
     unsigned long irqfl;
 
@@ -130,6 +129,7 @@ void flush_area_local(const void *va, un
 
     if ( flags & FLUSH_CACHE )
     {
+        const struct cpuinfo_x86 *c = &current_cpu_data;
         unsigned long i, sz = 0;
 
         if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
@@ -147,8 +147,11 @@ void flush_area_local(const void *va, un
         else
         {
             wbinvd();
+            flags &= ~FLUSH_CACHE;
         }
     }
 
     local_irq_restore(irqfl);
+
+    return flags & FLUSH_CACHE;
 }
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -205,26 +205,30 @@ static unsigned int flush_flags;
 
 void invalidate_interrupt(struct cpu_user_regs *regs)
 {
+    unsigned int flags = flush_flags;
     ack_APIC_irq();
     perfc_incr(ipis);
-    if ( !__sync_local_execstate() ||
-         (flush_flags & (FLUSH_TLB_GLOBAL | FLUSH_CACHE)) )
-        flush_area_local(flush_va, flush_flags);
+    if ( __sync_local_execstate() )
+        flags &= ~FLUSH_TLB;
+    flush_area_local(flush_va, flags);
     cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
 }
 
 void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
 {
+    unsigned int cpu = smp_processor_id();
+
     ASSERT(local_irq_is_enabled());
 
-    if ( cpumask_test_cpu(smp_processor_id(), mask) )
-        flush_area_local(va, flags);
+    if ( cpumask_test_cpu(cpu, mask) )
+        flags &= ~flush_area_local(va, flags);
 
-    if ( !cpumask_subset(mask, cpumask_of(smp_processor_id())) )
+    if ( (flags & ~FLUSH_ORDER_MASK) &&
+         !cpumask_subset(mask, cpumask_of(cpu)) )
     {
         spin_lock(&flush_lock);
         cpumask_and(&flush_cpumask, mask, &cpu_online_map);
-        cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
+        cpumask_clear_cpu(cpu, &flush_cpumask);
         flush_va      = va;
         flush_flags   = flags;
         send_IPI_mask(&flush_cpumask, INVALIDATE_TLB_VECTOR);
--- a/xen/include/asm-x86/flushtlb.h
+++ b/xen/include/asm-x86/flushtlb.h
@@ -87,7 +87,7 @@ void write_cr3(unsigned long cr3);
 #define FLUSH_CACHE      0x400
 
 /* Flush local TLBs/caches. */
-void flush_area_local(const void *va, unsigned int flags);
+unsigned int flush_area_local(const void *va, unsigned int flags);
 #define flush_local(flags) flush_area_local(NULL, flags)
 
 /* Flush specified CPUs' TLBs/caches */
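
In effect the new contract is: flush_area_local() returns those of the
requested FLUSH_* bits whose effect was already coherency domain wide,
so the caller may drop them from the cross-CPU request. A condensed,
illustrative restatement of the flush_area_mask() logic above, not
additional code in the patch:

    unsigned int done = flush_area_local(va, flags); /* FLUSH_CACHE iff CLFLUSH used */

    flags &= ~done;                  /* remote CPUs need not repeat that work */
    if ( flags & ~FLUSH_ORDER_MASK ) /* anything left besides the order bits? */
        send_IPI_mask(&flush_cpumask, INVALIDATE_TLB_VECTOR);

Likewise the IPI handler now merely masks off FLUSH_TLB when
__sync_local_execstate() has already flushed the TLB as a side effect,
instead of skipping flush_area_local() altogether.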





* [PATCH 2/3] x86: use CLFLUSHOPT when available
From: Jan Beulich @ 2016-02-10 12:57 UTC
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

Also drop an unnecessary va adjustment in the code being touched.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -139,10 +139,12 @@ unsigned int flush_area_local(const void
              c->x86_clflush_size && c->x86_cache_size && sz &&
              ((sz >> 10) < c->x86_cache_size) )
         {
-            va = (const void *)((unsigned long)va & ~(sz - 1));
+            alternative(ASM_NOP3, "sfence", X86_FEATURE_CLFLUSHOPT);
             for ( i = 0; i < sz; i += c->x86_clflush_size )
-                 asm volatile ( "clflush %0"
-                                : : "m" (((const char *)va)[i]) );
+                 alternative_input("rex clflush %0",
+                                   "data16 clflush %0",
+                                   X86_FEATURE_CLFLUSHOPT,
+                                   "m" (((const char *)va)[i]));
         }
         else
         {
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -163,6 +163,7 @@
 #define X86_FEATURE_ADX		(7*32+19) /* ADCX, ADOX instructions */
 #define X86_FEATURE_SMAP	(7*32+20) /* Supervisor Mode Access Prevention */
 #define X86_FEATURE_PCOMMIT	(7*32+22) /* PCOMMIT instruction */
+#define X86_FEATURE_CLFLUSHOPT	(7*32+23) /* CLFLUSHOPT instruction */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 8 */
 #define X86_FEATURE_PKU	(8*32+ 3) /* Protection Keys for Userspace */
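
For reference, CLFLUSHOPT is the data16 (66) prefixed encoding of
CLFLUSH (66 0F AE /7), and unlike CLFLUSH it is weakly ordered: its
executions are not ordered with one another or with writes to other
cache lines, so an SFENCE is needed where ordering matters. A minimal
standalone sketch of the pattern, with the fence ahead of the loop as
in the patch (a hypothetical helper assuming CLFLUSHOPT support, not
the patched Xen code):

    static void flush_range_clflushopt(const char *buf, unsigned long len,
                                       unsigned int line_size)
    {
        unsigned long i;

        asm volatile ( "sfence" ::: "memory" );  /* order against earlier stores */
        for ( i = 0; i < len; i += line_size )
            asm volatile ( "data16 clflush %0"   /* i.e. CLFLUSHOPT */
                           : : "m" (buf[i]) );
    }

The rex prefix on the plain CLFLUSH side of the alternative only pads
that encoding to the same length as the data16 one, so the alternatives
framework can patch the site in place.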





* [PATCH 3/3] x86: rename X86_FEATURE_{CLFLSH -> CLFLUSH}
From: Jan Beulich @ 2016-02-10 12:57 UTC
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

This is both more natural and in line with a Linux change (between 3.14
and 3.15).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -206,7 +206,7 @@ static void __init early_cpu_detect(void
 	c->x86_mask = eax & 15;
 	edx &= ~cleared_caps[cpufeat_word(X86_FEATURE_FPU)];
 	ecx &= ~cleared_caps[cpufeat_word(X86_FEATURE_XMM3)];
-	if (edx & cpufeat_mask(X86_FEATURE_CLFLSH))
+	if (edx & cpufeat_mask(X86_FEATURE_CLFLUSH))
 		c->x86_cache_alignment = ((ebx >> 8) & 0xff) * 8;
 	/* Leaf 0x1 capabilities filled in early for Xen. */
 	c->x86_capability[cpufeat_word(X86_FEATURE_FPU)] = edx;
@@ -251,7 +251,7 @@ static void generic_identify(struct cpui
 	c->x86_capability[cpufeat_word(X86_FEATURE_FPU)] = edx;
 	c->x86_capability[cpufeat_word(X86_FEATURE_XMM3)] = ecx;
 
-	if ( cpu_has(c, X86_FEATURE_CLFLSH) )
+	if ( cpu_has(c, X86_FEATURE_CLFLUSH) )
 		c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
 
 	if ( (c->cpuid_level >= CPUID_PM_LEAF) &&
--- a/xen/include/asm-x86/amd.h
+++ b/xen/include/asm-x86/amd.h
@@ -11,17 +11,17 @@
 
 /* Family 0Fh, Revision C */
 #define AMD_FEATURES_K8_REV_C_ECX  0
-#define AMD_FEATURES_K8_REV_C_EDX (					    \
-	cpufeat_mask(X86_FEATURE_FPU)   | cpufeat_mask(X86_FEATURE_VME)   | \
-	cpufeat_mask(X86_FEATURE_DE)    | cpufeat_mask(X86_FEATURE_PSE)   | \
-	cpufeat_mask(X86_FEATURE_TSC)   | cpufeat_mask(X86_FEATURE_MSR)   | \
-	cpufeat_mask(X86_FEATURE_PAE)   | cpufeat_mask(X86_FEATURE_MCE)   | \
-	cpufeat_mask(X86_FEATURE_CX8)   | cpufeat_mask(X86_FEATURE_APIC)  | \
-	cpufeat_mask(X86_FEATURE_SEP)   | cpufeat_mask(X86_FEATURE_MTRR)  | \
-	cpufeat_mask(X86_FEATURE_PGE)   | cpufeat_mask(X86_FEATURE_MCA)   | \
-	cpufeat_mask(X86_FEATURE_CMOV)  | cpufeat_mask(X86_FEATURE_PAT)   | \
-	cpufeat_mask(X86_FEATURE_PSE36) | cpufeat_mask(X86_FEATURE_CLFLSH)| \
-	cpufeat_mask(X86_FEATURE_MMX)   | cpufeat_mask(X86_FEATURE_FXSR)  | \
+#define AMD_FEATURES_K8_REV_C_EDX (					     \
+	cpufeat_mask(X86_FEATURE_FPU)   | cpufeat_mask(X86_FEATURE_VME)    | \
+	cpufeat_mask(X86_FEATURE_DE)    | cpufeat_mask(X86_FEATURE_PSE)    | \
+	cpufeat_mask(X86_FEATURE_TSC)   | cpufeat_mask(X86_FEATURE_MSR)    | \
+	cpufeat_mask(X86_FEATURE_PAE)   | cpufeat_mask(X86_FEATURE_MCE)    | \
+	cpufeat_mask(X86_FEATURE_CX8)   | cpufeat_mask(X86_FEATURE_APIC)   | \
+	cpufeat_mask(X86_FEATURE_SEP)   | cpufeat_mask(X86_FEATURE_MTRR)   | \
+	cpufeat_mask(X86_FEATURE_PGE)   | cpufeat_mask(X86_FEATURE_MCA)    | \
+	cpufeat_mask(X86_FEATURE_CMOV)  | cpufeat_mask(X86_FEATURE_PAT)    | \
+	cpufeat_mask(X86_FEATURE_PSE36) | cpufeat_mask(X86_FEATURE_CLFLUSH)| \
+	cpufeat_mask(X86_FEATURE_MMX)   | cpufeat_mask(X86_FEATURE_FXSR)   | \
 	cpufeat_mask(X86_FEATURE_XMM)   | cpufeat_mask(X86_FEATURE_XMM2))
 #define AMD_EXTFEATURES_K8_REV_C_ECX  0
 #define AMD_EXTFEATURES_K8_REV_C_EDX  (					       \
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -32,7 +32,7 @@
 #define X86_FEATURE_PAT		(0*32+16) /* Page Attribute Table */
 #define X86_FEATURE_PSE36	(0*32+17) /* 36-bit PSEs */
 #define X86_FEATURE_PN		(0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLSH	(0*32+19) /* Supports the CLFLUSH instruction */
+#define X86_FEATURE_CLFLUSH	(0*32+19) /* Supports the CLFLUSH instruction */
 #define X86_FEATURE_DS		(0*32+21) /* Debug Store */
 #define X86_FEATURE_ACPI	(0*32+22) /* ACPI via MSR */
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
@@ -197,7 +197,7 @@
 #define cpu_has_ht		boot_cpu_has(X86_FEATURE_HT)
 #define cpu_has_mp		1
 #define cpu_has_nx		boot_cpu_has(X86_FEATURE_NX)
-#define cpu_has_clflush		boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_clflush		boot_cpu_has(X86_FEATURE_CLFLUSH)
 #define cpu_has_page1gb		boot_cpu_has(X86_FEATURE_PAGE1GB)
 #define cpu_has_fsgsbase	boot_cpu_has(X86_FEATURE_FSGSBASE)
 #define cpu_has_aperfmperf	boot_cpu_has(X86_FEATURE_APERFMPERF)





* Re: [PATCH 1/3] x86: avoid flush IPI when possible
From: Andrew Cooper @ 2016-02-10 15:00 UTC
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 10/02/16 12:56, Jan Beulich wrote:
> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,

I can't parse this sentence.

CLFLUSH states "Invalidates from every level of the cache hierarchy in
the cache coherence domain"

WBINVD however states "The instruction then issues a special-function
bus cycle that directs external caches to also write back modified data
and another bus cycle to indicate that the external caches should be
invalidated."

I think we need input from Intel and AMD here as to the behaviour and
terminology here, and in particular, where the coherency domain
boundaries are.  All CPUs, even across multiple sockets, see coherent
caching, but it is unclear whether this qualifies them to be in the same
cache coherency domain per the instruction spec.

In particular, given the architecture of 8-socket systems and 45MB of
RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 
Causing 45MB of data to move to remote memory controllers all at once
would cause a massive system stall.

~Andrew


* Re: [PATCH 2/3] x86: use CLFLUSHOPT when available
From: Andrew Cooper @ 2016-02-10 15:03 UTC
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser



On 10/02/16 12:57, Jan Beulich wrote:
> Also drop an unnecessary va adjustment in the code being touched.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/flushtlb.c
> +++ b/xen/arch/x86/flushtlb.c
> @@ -139,10 +139,12 @@ unsigned int flush_area_local(const void
>               c->x86_clflush_size && c->x86_cache_size && sz &&
>               ((sz >> 10) < c->x86_cache_size) )
>          {
> -            va = (const void *)((unsigned long)va & ~(sz - 1));
> +            alternative(ASM_NOP3, "sfence", X86_FEATURE_CLFLUSHOPT);

Why separate?  This would be better in the lower alternative(), with one
single nop making up the difference in length.  That way, processors
without CLFLUSHOPT don't suffer the 1 cycle instruction decode stall
from the redundant rex prefix.

~Andrew

>              for ( i = 0; i < sz; i += c->x86_clflush_size )
> -                 asm volatile ( "clflush %0"
> -                                : : "m" (((const char *)va)[i]) );
> +                 alternative_input("rex clflush %0",
> +                                   "data16 clflush %0",
> +                                   X86_FEATURE_CLFLUSHOPT,
> +                                   "m" (((const char *)va)[i]));
>          }
>          else
>          {
> --- a/xen/include/asm-x86/cpufeature.h
> +++ b/xen/include/asm-x86/cpufeature.h
> @@ -163,6 +163,7 @@
>  #define X86_FEATURE_ADX		(7*32+19) /* ADCX, ADOX instructions */
>  #define X86_FEATURE_SMAP	(7*32+20) /* Supervisor Mode Access Prevention */
>  #define X86_FEATURE_PCOMMIT	(7*32+22) /* PCOMMIT instruction */
> +#define X86_FEATURE_CLFLUSHOPT	(7*32+23) /* CLFLUSHOPT instruction */
>  
>  /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 8 */
>  #define X86_FEATURE_PKU	(8*32+ 3) /* Protection Keys for Userspace */

* Re: [PATCH 3/3] x86: rename X86_FEATURE_{CLFLSH -> CLFLUSH}
From: Andrew Cooper @ 2016-02-10 15:04 UTC
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 10/02/16 12:57, Jan Beulich wrote:
> This is both more natural and in line with a Linux change (between 3.14
> and 3.15).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH 1/3] x86: avoid flush IPI when possible
From: Jan Beulich @ 2016-02-10 15:37 UTC
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 10.02.16 at 16:00, <andrew.cooper3@citrix.com> wrote:
> On 10/02/16 12:56, Jan Beulich wrote:
>> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
> 
> I can't parse this sentence.

Should have been "..., is a cache coherency domain wide flush, ..." -
does it read any better then?

> CLFLUSH states "Invalidates from every level of the cache hierarchy in
> the cache coherence domain"
> 
> WBINVD however states "The instruction then issues a special-function
> bus cycle that directs external caches to also write back modified data
> and another bus cycle to indicate that the external caches should be
> invalidated."
> 
> I think we need input from Intel and AMD here as to the behaviour and
> terminology here, and in particular, where the coherency domain
> boundaries are.  All CPUs, even across multiple sockets, see coherent
> caching, but it is unclear whether this qualifies them to be in the same
> cache coherency domain per the instruction spec.

With Linux already doing what this patch switches us to, I'm not sure
we need much extra input.

> In particular, given the architecture of 8-socket systems and 45MB of
> RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 

Not everywhere, just on the local socket (assuming there's no external
cache).

> Causing 45MB of data to move to remote memory controllers all at once
> would cause a massive system stall.

That's why it takes (as we know) so long. See the figure in SDM Vol 3
section "Invalidating Caches and TLBs".

Jan


* Re: [PATCH 2/3] x86: use CLFLUSHOPT when available
From: Jan Beulich @ 2016-02-10 15:39 UTC
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 10.02.16 at 16:03, <andrew.cooper3@citrix.com> wrote:
> On 10/02/16 12:57, Jan Beulich wrote:
>> Also drop an unnecessary va adjustment in the code being touched.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/arch/x86/flushtlb.c
>> +++ b/xen/arch/x86/flushtlb.c
>> @@ -139,10 +139,12 @@ unsigned int flush_area_local(const void
>>               c->x86_clflush_size && c->x86_cache_size && sz &&
>>               ((sz >> 10) < c->x86_cache_size) )
>>          {
>> -            va = (const void *)((unsigned long)va & ~(sz - 1));
>> +            alternative(ASM_NOP3, "sfence", X86_FEATURE_CLFLUSHOPT);
> 
> Why separate?  This would be better in the lower alternative(), with one
> single nop making up the difference in length.  That way, processors
> without CLFLUSHOPT don't suffer the 1 cycle instruction decode stall
> from the redundant rex prefix.

Why would we want the fence inside the loop? A single fence is
sufficient for the entire flush.

Also if we're worried about the REX decode, this could easily be a
NOP instead, just that I'm not certain which one in the end is less
decode overhead.

Jan


* Re: [PATCH 2/3] x86: use CLFLUSHOPT when available
From: Andrew Cooper @ 2016-02-10 16:27 UTC
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

On 10/02/16 15:39, Jan Beulich wrote:
>>>> On 10.02.16 at 16:03, <andrew.cooper3@citrix.com> wrote:
>> On 10/02/16 12:57, Jan Beulich wrote:
>>> Also drop an unnecessary va adjustment in the code being touched.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>
>>> --- a/xen/arch/x86/flushtlb.c
>>> +++ b/xen/arch/x86/flushtlb.c
>>> @@ -139,10 +139,12 @@ unsigned int flush_area_local(const void
>>>               c->x86_clflush_size && c->x86_cache_size && sz &&
>>>               ((sz >> 10) < c->x86_cache_size) )
>>>          {
>>> -            va = (const void *)((unsigned long)va & ~(sz - 1));
>>> +            alternative(ASM_NOP3, "sfence", X86_FEATURE_CLFLUSHOPT);
>> Why separate?  This would be better in the lower alternative(), with one
>> single nop making up the difference in length.  That way, processors
>> without CLFLUSHOPT don't suffer the 1 cycle instruction decode stall
>> from the redundant rex prefix.
> Why would we want the fence inside the loop? A single fence is
> sufficient for the entire flush.

Ah yes - of course.

>
> Also if we're worried about the REX decode, this could easily be a
> NOP instead, just that I'm not certain which one in the end is less
> decode overhead.

A redundant prefix will generally have a lower overhead than a full new
instruction.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH 1/3] x86: avoid flush IPI when possible
From: Andrew Cooper @ 2016-02-10 17:51 UTC
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

On 10/02/16 15:37, Jan Beulich wrote:
>>>> On 10.02.16 at 16:00, <andrew.cooper3@citrix.com> wrote:
>> On 10/02/16 12:56, Jan Beulich wrote:
>>> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
>> I can't parse this sentence.
> Should have been "..., is a cache coherency domain wide flush, ..." -
> does it read any better then?

I believe, given the code in the patch, your intent is "if we WBINVD, we
don't need to IPI other cores for cache flushing reasons".

However, given your comment below...

>
>> CLFLUSH states "Invalidates from every level of the cache hierarchy in
>> the cache coherence domain"
>>
>> WBINVD however states "The instruction then issues a special-function
>> bus cycle that directs external caches to also write back modified data
>> and another bus cycle to indicate that the external caches should be
>> invalidated."
>>
>> I think we need input from Intel and AMD here as to the behaviour and
>> terminology here, and in particular, where the coherency domain
>> boundaries are.  All CPUs, even across multiple sockets, see coherent
>> caching, but it is unclear whether this qualifies them to be in the same
>> cache coherency domain per the instruction spec.
> With Linux already doing what this patch switches us to, I'm not sure
> we need much extra input.
>
>> In particular, given the architecture of 8-socket systems and 45MB of
>> RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 
> Not everywhere, just on the local socket (assuming there's no external
> cache).

If this is true, then it is clearly not safe to omit the IPIs.

>
>> Causing 45MB of data to move to remote memory controllers all at once
>> would cause a massive system stall.
> That's why it takes (as we know) so long. See the figure in SDM Vol 3
> section "Invalidating Caches and TLBs".

I presume you mean Figure 2-10. WBINVD Invalidation of Shared and
Non-Shared Cache Hierarchy?

This quite clearly shows that WBINVD will not invalidate or write back
the L1 caches for other cores in the same processor.

Have I misunderstood the logic for choosing when to omit the IPIs?

~Andrew


* Re: [PATCH 1/3] x86: avoid flush IPI when possible
From: Jan Beulich @ 2016-02-11 10:48 UTC
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 10.02.16 at 18:51, <andrew.cooper3@citrix.com> wrote:
> On 10/02/16 15:37, Jan Beulich wrote:
>>>>> On 10.02.16 at 16:00, <andrew.cooper3@citrix.com> wrote:
>>> On 10/02/16 12:56, Jan Beulich wrote:
>>>> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
>>> I can't parse this sentence.
>> Should have been "..., is a cache coherency domain wide flush, ..." -
>> does it read any better then?
> 
> I believe, given the code in the patch, your intent is "if we WBINVD, we
> don't need to IPI other cores for cache flushing reasons".

I don't see how this can be read from the sentence. The primary
statement it makes is "CLFLUSH is a cache coherency domain wide
flush". A secondary statement is that this is different from WBINVD.

> However, given your comment below...
> 
>>
>>> CLFLUSH states "Invalidates from every level of the cache hierarchy in
>>> the cache coherence domain"
>>>
>>> WBINVD however states "The instruction then issues a special-function
>>> bus cycle that directs external caches to also write back modified data
>>> and another bus cycle to indicate that the external caches should be
>>> invalidated."
>>>
>>> I think we need input from Intel and AMD here as to the behaviour and
>>> terminology here, and in particular, where the coherency domain
>>> boundaries are.  All CPUs, even across multiple sockets, see coherent
>>> caching, but it is unclear whether this qualifies them to be in the same
>>> cache coherency domain per the instruction spec.
>> With Linux already doing what this patch switches us to, I'm not sure
>> we need much extra input.
>>
>>> In particular, given the architecture of 8-socket systems and 45MB of
>>> RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 
>> Not everywhere, just on the local socket (assuming there's no external
>> cache).
> 
> If this is true, then it is clearly not safe to omit the IPIs.

When using CLFLUSH it is safe, while when using WBINVD it's not.

>>> Causing 45MB of data to move to remote memory controllers all at once
>>> would cause a massive system stall.
>> That's why it takes (as we know) so long. See the figure in SDM Vol 3
>> section "Invalidating Caches and TLBs".
> 
> I presume you mean Figure 2-10. WBINVD Invalidation of Shared and
> Non-Shared Cache Hierarchy?
> 
> This quite clearly shows that WBINVD will not invalidate or write back
> the L1 caches for other cores in the same processor.
> 
> Have I misunderstood the logic for choosing when to omit the IPIs?

I'm afraid you did, or else I must have introduced a (latent,
because I didn't notice any issues so far) bug.

Jan


Thread overview: 12 messages
2016-02-10 12:40 [PATCH 0/3] x86: clflush related adjustments Jan Beulich
2016-02-10 12:56 ` [PATCH 1/3] x86: avoid flush IPI when possible Jan Beulich
2016-02-10 15:00   ` Andrew Cooper
2016-02-10 15:37     ` Jan Beulich
2016-02-10 17:51       ` Andrew Cooper
2016-02-11 10:48         ` Jan Beulich
2016-02-10 12:57 ` [PATCH 2/3] x86: use CLFLUSHOPT when available Jan Beulich
2016-02-10 15:03   ` Andrew Cooper
2016-02-10 15:39     ` Jan Beulich
2016-02-10 16:27       ` Andrew Cooper
2016-02-10 12:57 ` [PATCH 3/3] x86: rename X86_FEATURE_{CLFLSH -> CLFLUSH} Jan Beulich
2016-02-10 15:04   ` Andrew Cooper
