* [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-10 15:43 ` Waiman Long
  0 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-10 15:43 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Peter Zijlstra, Ingo Molnar, Thomas Gleixner, H. Peter Anvin
  Cc: linux-arch, x86, linux-kernel, virtualization, xen-devel, kvm,
	Pan Xinhui, Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross, Waiman Long

It was found that when running a fio sequential write test with an XFS
ramdisk on a VM running on a 2-socket x86-64 system, the %CPU times as
reported by perf were as follows:

 69.75%  0.59%  fio  [k] down_write
 69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
 67.12%  1.12%  fio  [k] rwsem_down_write_failed
 63.48% 52.77%  fio  [k] osq_lock
  9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
  3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted

Making vcpu_is_preempted() a callee-save function has a relatively
high cost on x86-64, primarily because of at least one more cacheline of
data access from saving and restoring registers (8 of them) to and from
the stack, as well as one more level of function call. As
vcpu_is_preempted() is called within the spinlock, mutex and rwsem
slowpaths, there isn't much to gain by making it callee-save. So it
is now changed to a normal function call instead.
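
As an illustration (a simplified sketch, not the actual expansion of
PV_CALLEE_SAVE_REGS_THUNK()), a callee-save thunk on x86-64 amounts to
wrapping the real C function in something like the following; the cost
described above is the extra call level plus the eight register
pushes/pops and the stack traffic they generate:

asm(
".pushsection .text;"
"example_callee_save_thunk:"			/* hypothetical name */
"push %rcx; push %rdx; push %rsi; push %rdi;"	/* save caller-saved GPRs */
"push %r8;  push %r9;  push %r10; push %r11;"
"call __kvm_vcpu_is_preempted;"			/* real C function, result in %rax */
"pop  %r11; pop  %r10; pop  %r9;  pop  %r8;"	/* restore them */
"pop  %rdi; pop  %rsi; pop  %rdx; pop  %rcx;"
"ret;"
".popsection");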

With this patch applied on both bare metal and a KVM guest on a 2-socket
16-core 32-thread system with 16 parallel jobs (8 on each socket), the
aggregate bandwidths of the fio test on an XFS ramdisk were as follows:

                       Bare Metal                KVM Guest
   I/O Type      w/o patch    with patch   w/o patch    with patch
   --------      ---------    ----------   ---------    ----------
   random read   8650.5 MB/s  8560.9 MB/s  7602.9 MB/s  8196.1 MB/s  
   seq read      9104.8 MB/s  9397.2 MB/s  8293.7 MB/s  8566.9 MB/s
   random write  1623.8 MB/s  1626.7 MB/s  1590.6 MB/s  1700.7 MB/s
   seq write     1626.4 MB/s  1624.9 MB/s  1604.8 MB/s  1726.3 MB/s

The perf data (on the KVM guest) now became:

 70.78%  0.58%  fio  [k] down_write
 70.20%  0.01%  fio  [k] call_rwsem_down_write_failed
 69.70%  1.17%  fio  [k] rwsem_down_write_failed
 59.91% 55.42%  fio  [k] osq_lock
 10.14% 10.14%  fio  [k] __kvm_vcpu_is_preempted

On bare metal, the patch doesn't introduce any performance
regression. On a KVM guest, it produces a noticeable performance
improvement (up to 7%).

Signed-off-by: Waiman Long <longman@redhat.com>
---
 v1->v2:
  - Rerun the fio test on a different system on both bare-metal and a
    KVM guest. Both sockets were utilized in this test.
  - The commit log was updated with new performance numbers, but the
    patch wasn't changed.
  - Drop patch 2.

 arch/x86/include/asm/paravirt.h       | 2 +-
 arch/x86/include/asm/paravirt_types.h | 2 +-
 arch/x86/kernel/kvm.c                 | 7 ++-----
 arch/x86/kernel/paravirt-spinlocks.c  | 6 ++----
 arch/x86/xen/spinlock.c               | 4 +---
 5 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 864f57b..2515885 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -676,7 +676,7 @@ static __always_inline void pv_kick(int cpu)
 
 static __always_inline bool pv_vcpu_is_preempted(int cpu)
 {
-	return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
+	return PVOP_CALL1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
 }
 
 #endif /* SMP && PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index bb2de45..88dc852 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -309,7 +309,7 @@ struct pv_lock_ops {
 	void (*wait)(u8 *ptr, u8 val);
 	void (*kick)(int cpu);
 
-	struct paravirt_callee_save vcpu_is_preempted;
+	bool (*vcpu_is_preempted)(int cpu);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 099fcba..eb3753d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -595,7 +595,6 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
 
 	return !!src->preempted;
 }
-PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
@@ -614,10 +613,8 @@ void __init kvm_spinlock_init(void)
 	pv_lock_ops.wait = kvm_wait;
 	pv_lock_ops.kick = kvm_kick_cpu;
 
-	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
-		pv_lock_ops.vcpu_is_preempted =
-			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
-	}
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+		pv_lock_ops.vcpu_is_preempted = __kvm_vcpu_is_preempted;
 }
 
 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 6259327..da050bc 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -24,12 +24,10 @@ __visible bool __native_vcpu_is_preempted(int cpu)
 {
 	return false;
 }
-PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
 
 bool pv_is_native_vcpu_is_preempted(void)
 {
-	return pv_lock_ops.vcpu_is_preempted.func ==
-		__raw_callee_save___native_vcpu_is_preempted;
+	return pv_lock_ops.vcpu_is_preempted == __native_vcpu_is_preempted;
 }
 
 struct pv_lock_ops pv_lock_ops = {
@@ -38,7 +36,7 @@ struct pv_lock_ops pv_lock_ops = {
 	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
 	.wait = paravirt_nop,
 	.kick = paravirt_nop,
-	.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+	.vcpu_is_preempted = __native_vcpu_is_preempted,
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 25a7c43..c85bb8f 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,8 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
 	per_cpu(irq_name, cpu) = NULL;
 }
 
-PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
-
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -138,7 +136,7 @@ void __init xen_init_spinlocks(void)
 	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 	pv_lock_ops.wait = xen_qlock_wait;
 	pv_lock_ops.kick = xen_qlock_kick;
-	pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
+	pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
 }
 
 static __init int xen_parse_nopvspin(char *arg)
-- 
1.8.3.1

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-10 15:43 ` Waiman Long
@ 2017-02-10 16:19   ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-10 16:19 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Fri, Feb 10, 2017 at 10:43:09AM -0500, Waiman Long wrote:
> It was found when running fio sequential write test with a XFS ramdisk
> on a VM running on a 2-socket x86-64 system, the %CPU times as reported
> by perf were as follows:
> 
>  69.75%  0.59%  fio  [k] down_write
>  69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
>  67.12%  1.12%  fio  [k] rwsem_down_write_failed
>  63.48% 52.77%  fio  [k] osq_lock
>   9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
>   3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted
> 

Thinking about this again, wouldn't something like the below also work?


diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 099fcba4981d..6aa33702c15c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -589,6 +589,7 @@ static void kvm_wait(u8 *ptr, u8 val)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_X86_32
 __visible bool __kvm_vcpu_is_preempted(int cpu)
 {
 	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
@@ -597,6 +598,31 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
 }
 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 
+#else
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(int);
+
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+FRAME_BEGIN
+"push %rdi;"
+"push %rdx;"
+"movslq  %edi, %rdi;"
+"movq    $steal_time+16, %rax;"
+"movq    __per_cpu_offset(,%rdi,8), %rdx;"
+"cmpb    $0, (%rdx,%rax);"
+"setne   %al;"
+"pop %rdx;"
+"pop %rdi;"
+FRAME_END
+"ret;"
+".popsection");
+
+#endif
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-10 15:43 ` Waiman Long
@ 2017-02-10 16:22   ` Paolo Bonzini
  -1 siblings, 0 replies; 69+ messages in thread
From: Paolo Bonzini @ 2017-02-10 16:22 UTC (permalink / raw)
  To: Waiman Long, Jeremy Fitzhardinge, Chris Wright, Alok Kataria,
	Rusty Russell, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	H. Peter Anvin
  Cc: linux-arch, x86, linux-kernel, virtualization, xen-devel, kvm,
	Pan Xinhui, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross



On 10/02/2017 16:43, Waiman Long wrote:
> It was found when running fio sequential write test with a XFS ramdisk
> on a VM running on a 2-socket x86-64 system, the %CPU times as reported
> by perf were as follows:
> 
>  69.75%  0.59%  fio  [k] down_write
>  69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
>  67.12%  1.12%  fio  [k] rwsem_down_write_failed
>  63.48% 52.77%  fio  [k] osq_lock
>   9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
>   3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted
> 
> Making vcpu_is_preempted() a callee-save function has a relatively
> high cost on x86-64 primarily due to at least one more cacheline of
> data access from the saving and restoring of registers (8 of them)
> to and from stack as well as one more level of function call. As
> vcpu_is_preempted() is called within the spinlock, mutex and rwsem
> slowpaths, there isn't much to gain by making it callee-save. So it
> is now changed to a normal function call instead.
> 
> With this patch applied on both bare-metal & KVM guest on a 2-socekt
> 16-core 32-thread system with 16 parallel jobs (8 on each socket), the
> aggregrate bandwidth of the fio test on an XFS ramdisk were as follows:
> 
>                        Bare Metal                KVM Guest
>    I/O Type      w/o patch    with patch   w/o patch    with patch
>    --------      ---------    ----------   ---------    ----------
>    random read   8650.5 MB/s  8560.9 MB/s  7602.9 MB/s  8196.1 MB/s  
>    seq read      9104.8 MB/s  9397.2 MB/s  8293.7 MB/s  8566.9 MB/s
>    random write  1623.8 MB/s  1626.7 MB/s  1590.6 MB/s  1700.7 MB/s
>    seq write     1626.4 MB/s  1624.9 MB/s  1604.8 MB/s  1726.3 MB/s
> 
> The perf data (on KVM guest) now became:
> 
>  70.78%  0.58%  fio  [k] down_write
>  70.20%  0.01%  fio  [k] call_rwsem_down_write_failed
>  69.70%  1.17%  fio  [k] rwsem_down_write_failed
>  59.91% 55.42%  fio  [k] osq_lock
>  10.14% 10.14%  fio  [k] __kvm_vcpu_is_preempted
> 
> On bare metal, the patch doesn't introduce any performance
> regression. On KVM guest, it produces noticeable performance
> improvement (up to 7%).
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  v1->v2:
>   - Rerun the fio test on a different system on both bare-metal and a
>     KVM guest. Both sockets were utilized in this test.
>   - The commit log was updated with new performance numbers, but the
>     patch wasn't changed.
>   - Drop patch 2.
> 
>  arch/x86/include/asm/paravirt.h       | 2 +-
>  arch/x86/include/asm/paravirt_types.h | 2 +-
>  arch/x86/kernel/kvm.c                 | 7 ++-----
>  arch/x86/kernel/paravirt-spinlocks.c  | 6 ++----
>  arch/x86/xen/spinlock.c               | 4 +---
>  5 files changed, 7 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index 864f57b..2515885 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -676,7 +676,7 @@ static __always_inline void pv_kick(int cpu)
>  
>  static __always_inline bool pv_vcpu_is_preempted(int cpu)
>  {
> -	return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
> +	return PVOP_CALL1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
>  }
>  
>  #endif /* SMP && PARAVIRT_SPINLOCKS */
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index bb2de45..88dc852 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -309,7 +309,7 @@ struct pv_lock_ops {
>  	void (*wait)(u8 *ptr, u8 val);
>  	void (*kick)(int cpu);
>  
> -	struct paravirt_callee_save vcpu_is_preempted;
> +	bool (*vcpu_is_preempted)(int cpu);
>  };
>  
>  /* This contains all the paravirt structures: we get a convenient
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 099fcba..eb3753d 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -595,7 +595,6 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
>  
>  	return !!src->preempted;
>  }
> -PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
>  
>  /*
>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
> @@ -614,10 +613,8 @@ void __init kvm_spinlock_init(void)
>  	pv_lock_ops.wait = kvm_wait;
>  	pv_lock_ops.kick = kvm_kick_cpu;
>  
> -	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
> -		pv_lock_ops.vcpu_is_preempted =
> -			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
> -	}
> +	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
> +		pv_lock_ops.vcpu_is_preempted = __kvm_vcpu_is_preempted;
>  }
>  
>  #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
> diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
> index 6259327..da050bc 100644
> --- a/arch/x86/kernel/paravirt-spinlocks.c
> +++ b/arch/x86/kernel/paravirt-spinlocks.c
> @@ -24,12 +24,10 @@ __visible bool __native_vcpu_is_preempted(int cpu)
>  {
>  	return false;
>  }
> -PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
>  
>  bool pv_is_native_vcpu_is_preempted(void)
>  {
> -	return pv_lock_ops.vcpu_is_preempted.func ==
> -		__raw_callee_save___native_vcpu_is_preempted;
> +	return pv_lock_ops.vcpu_is_preempted == __native_vcpu_is_preempted;
>  }
>  
>  struct pv_lock_ops pv_lock_ops = {
> @@ -38,7 +36,7 @@ struct pv_lock_ops pv_lock_ops = {
>  	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
>  	.wait = paravirt_nop,
>  	.kick = paravirt_nop,
> -	.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
> +	.vcpu_is_preempted = __native_vcpu_is_preempted,
>  #endif /* SMP */
>  };
>  EXPORT_SYMBOL(pv_lock_ops);
> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
> index 25a7c43..c85bb8f 100644
> --- a/arch/x86/xen/spinlock.c
> +++ b/arch/x86/xen/spinlock.c
> @@ -114,8 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
>  	per_cpu(irq_name, cpu) = NULL;
>  }
>  
> -PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
> -
>  /*
>   * Our init of PV spinlocks is split in two init functions due to us
>   * using paravirt patching and jump labels patching and having to do
> @@ -138,7 +136,7 @@ void __init xen_init_spinlocks(void)
>  	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
>  	pv_lock_ops.wait = xen_qlock_wait;
>  	pv_lock_ops.kick = xen_qlock_kick;
> -	pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
> +	pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
>  }
>  
>  static __init int xen_parse_nopvspin(char *arg)
> 

Acked-by: Paolo Bonzini <pbonzini@redhat.com>

Thank you very much!

Paolo

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-10 16:19   ` Peter Zijlstra
@ 2017-02-10 16:35     ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-10 16:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/10/2017 11:19 AM, Peter Zijlstra wrote:
> On Fri, Feb 10, 2017 at 10:43:09AM -0500, Waiman Long wrote:
>> It was found when running fio sequential write test with a XFS ramdisk
>> on a VM running on a 2-socket x86-64 system, the %CPU times as reported
>> by perf were as follows:
>>
>>  69.75%  0.59%  fio  [k] down_write
>>  69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
>>  67.12%  1.12%  fio  [k] rwsem_down_write_failed
>>  63.48% 52.77%  fio  [k] osq_lock
>>   9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
>>   3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted
>>
> Thinking about this again, wouldn't something like the below also work?
>
>
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 099fcba4981d..6aa33702c15c 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -589,6 +589,7 @@ static void kvm_wait(u8 *ptr, u8 val)
>  	local_irq_restore(flags);
>  }
>  
> +#ifdef CONFIG_X86_32
>  __visible bool __kvm_vcpu_is_preempted(int cpu)
>  {
>  	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
> @@ -597,6 +598,31 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
>  }
>  PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
>  
> +#else
> +
> +extern bool __raw_callee_save___kvm_vcpu_is_preempted(int);
> +
> +asm(
> +".pushsection .text;"
> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
> +"__raw_callee_save___kvm_vcpu_is_preempted:"
> +FRAME_BEGIN
> +"push %rdi;"
> +"push %rdx;"
> +"movslq  %edi, %rdi;"
> +"movq    $steal_time+16, %rax;"
> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
> +"cmpb    $0, (%rdx,%rax);"
> +"setne   %al;"
> +"pop %rdx;"
> +"pop %rdi;"
> +FRAME_END
> +"ret;"
> +".popsection");
> +
> +#endif
> +
>  /*
>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>   */

That should work for now. I have done something similar for
__pv_queued_spin_unlock. However, this has the problem of creating a
dependency on the exact layout of the steal_time structure. Maybe the
constant 16 could be passed in as a parameter, offsetof(struct
kvm_steal_time, preempted), to the asm code.
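
A rough sketch of that suggestion (hypothetical; it assumes the offset
is exported through the asm-offsets machinery so that the preprocessor
sees it as a plain number, which may not be exactly how it ends up
being done):

/* arch/x86/kernel/asm-offsets_64.c, hypothetical addition: */
OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);

/*
 * arch/x86/kernel/kvm.c: with <asm/asm-offsets.h> and
 * <linux/stringify.h> included, the thunk can then use the generated
 * constant instead of the hard-coded 16:
 */
"movq    $steal_time+" __stringify(KVM_STEAL_TIME_preempted) ", %rax;"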

Cheers,
Longman

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-10 16:35     ` Waiman Long
@ 2017-02-10 17:00       ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-10 17:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/10/2017 11:35 AM, Waiman Long wrote:
> On 02/10/2017 11:19 AM, Peter Zijlstra wrote:
>> On Fri, Feb 10, 2017 at 10:43:09AM -0500, Waiman Long wrote:
>>> It was found when running fio sequential write test with a XFS ramdisk
>>> on a VM running on a 2-socket x86-64 system, the %CPU times as reported
>>> by perf were as follows:
>>>
>>>  69.75%  0.59%  fio  [k] down_write
>>>  69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
>>>  67.12%  1.12%  fio  [k] rwsem_down_write_failed
>>>  63.48% 52.77%  fio  [k] osq_lock
>>>   9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
>>>   3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted
>>>
>> Thinking about this again, wouldn't something like the below also work?
>>
>>
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index 099fcba4981d..6aa33702c15c 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -589,6 +589,7 @@ static void kvm_wait(u8 *ptr, u8 val)
>>  	local_irq_restore(flags);
>>  }
>>  
>> +#ifdef CONFIG_X86_32
>>  __visible bool __kvm_vcpu_is_preempted(int cpu)
>>  {
>>  	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
>> @@ -597,6 +598,31 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
>>  }
>>  PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
>>  
>> +#else
>> +
>> +extern bool __raw_callee_save___kvm_vcpu_is_preempted(int);
>> +
>> +asm(
>> +".pushsection .text;"
>> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
>> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
>> +"__raw_callee_save___kvm_vcpu_is_preempted:"
>> +FRAME_BEGIN
>> +"push %rdi;"
>> +"push %rdx;"
>> +"movslq  %edi, %rdi;"
>> +"movq    $steal_time+16, %rax;"
>> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
>> +"cmpb    $0, (%rdx,%rax);"
>> +"setne   %al;"
>> +"pop %rdx;"
>> +"pop %rdi;"
>> +FRAME_END
>> +"ret;"
>> +".popsection");
>> +
>> +#endif
>> +
>>  /*
>>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>>   */
> That should work for now. I have done something similar for
> __pv_queued_spin_unlock. However, this has the problem of creating a
> dependency on the exact layout of the steal_time structure. Maybe the
> constant 16 can be passed in as a parameter offsetof(struct
> kvm_steal_time, preempted) to the asm call.
>
> Cheers,
> Longman

One more thing: that will improve KVM performance, but it won't help Xen.

I looked into the assembly code for rwsem_spin_on_owner; it needs to save
and restore 2 additional registers with my patch. Doing it your way
will transfer the save and restore overhead to the assembly code.
However, __kvm_vcpu_is_preempted() is called multiple times per
invocation of rwsem_spin_on_owner. That function is simple enough that
making __kvm_vcpu_is_preempted() callee-save won't produce much compiler
optimization opportunity. The outer function rwsem_down_write_failed()
does appear to be a bit bigger (from 866 bytes to 884 bytes), though.
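
For illustration, a much-simplified sketch of the kind of call site
being discussed (hypothetical, not the real rwsem_spin_on_owner()):
with a plain call, values that live across vcpu_is_preempted() can no
longer stay in caller-saved registers, so the caller has to shuffle
them into callee-saved registers (the two extra saves/restores noted
above) or spill them to the stack, whereas a callee-save thunk lets
the compiler leave them where they are.

static bool example_spin_on_owner(struct task_struct *owner, int cpu)
{
	/* spin while the lock owner is still running on its CPU */
	while (READ_ONCE(owner->on_cpu)) {
		/* a plain call here clobbers all caller-saved registers */
		if (vcpu_is_preempted(cpu))
			return false;
		cpu_relax();
	}
	return true;
}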

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-10 17:00       ` Waiman Long
  0 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-10 17:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, H. Peter Anvin,
	xen-devel, Alok Kataria, Thomas Gleixner

On 02/10/2017 11:35 AM, Waiman Long wrote:
> On 02/10/2017 11:19 AM, Peter Zijlstra wrote:
>> On Fri, Feb 10, 2017 at 10:43:09AM -0500, Waiman Long wrote:
>>> It was found when running fio sequential write test with a XFS ramdisk
>>> on a VM running on a 2-socket x86-64 system, the %CPU times as reported
>>> by perf were as follows:
>>>
>>>  69.75%  0.59%  fio  [k] down_write
>>>  69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
>>>  67.12%  1.12%  fio  [k] rwsem_down_write_failed
>>>  63.48% 52.77%  fio  [k] osq_lock
>>>   9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
>>>   3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted
>>>
>> Thinking about this again, wouldn't something like the below also work?
>>
>>
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index 099fcba4981d..6aa33702c15c 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -589,6 +589,7 @@ static void kvm_wait(u8 *ptr, u8 val)
>>  	local_irq_restore(flags);
>>  }
>>  
>> +#ifdef CONFIG_X86_32
>>  __visible bool __kvm_vcpu_is_preempted(int cpu)
>>  {
>>  	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
>> @@ -597,6 +598,31 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
>>  }
>>  PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
>>  
>> +#else
>> +
>> +extern bool __raw_callee_save___kvm_vcpu_is_preempted(int);
>> +
>> +asm(
>> +".pushsection .text;"
>> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
>> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
>> +"__raw_callee_save___kvm_vcpu_is_preempted:"
>> +FRAME_BEGIN
>> +"push %rdi;"
>> +"push %rdx;"
>> +"movslq  %edi, %rdi;"
>> +"movq    $steal_time+16, %rax;"
>> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
>> +"cmpb    $0, (%rdx,%rax);"
>> +"setne   %al;"
>> +"pop %rdx;"
>> +"pop %rdi;"
>> +FRAME_END
>> +"ret;"
>> +".popsection");
>> +
>> +#endif
>> +
>>  /*
>>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>>   */
> That should work for now. I have done something similar for
> __pv_queued_spin_unlock. However, this has the problem of creating a
> dependency on the exact layout of the steal_time structure. Maybe the
> constant 16 can be passed in as a parameter offsetof(struct
> kvm_steal_time, preempted) to the asm call.
>
> Cheers,
> Longman

One more thing: that will improve KVM performance, but it won't help Xen.

I looked into the assembly code for rwsem_spin_on_owner; it needs to save
and restore 2 additional registers with my patch. Doing it your way will
transfer the save and restore overhead to the assembly code. However,
__kvm_vcpu_is_preempted() is called multiple times per invocation of
rwsem_spin_on_owner. That function is simple enough that making
__kvm_vcpu_is_preempted() callee-save won't produce much compiler
optimization opportunity. The outer function rwsem_down_write_failed()
does appear to be a bit bigger (from 866 bytes to 884 bytes) though.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-10 16:35     ` Waiman Long
  (?)
@ 2017-02-10 17:00     ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-10 17:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On 02/10/2017 11:35 AM, Waiman Long wrote:
> On 02/10/2017 11:19 AM, Peter Zijlstra wrote:
>> On Fri, Feb 10, 2017 at 10:43:09AM -0500, Waiman Long wrote:
>>> It was found when running fio sequential write test with a XFS ramdisk
>>> on a VM running on a 2-socket x86-64 system, the %CPU times as reported
>>> by perf were as follows:
>>>
>>>  69.75%  0.59%  fio  [k] down_write
>>>  69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
>>>  67.12%  1.12%  fio  [k] rwsem_down_write_failed
>>>  63.48% 52.77%  fio  [k] osq_lock
>>>   9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
>>>   3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted
>>>
>> Thinking about this again, wouldn't something like the below also work?
>>
>>
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index 099fcba4981d..6aa33702c15c 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -589,6 +589,7 @@ static void kvm_wait(u8 *ptr, u8 val)
>>  	local_irq_restore(flags);
>>  }
>>  
>> +#ifdef CONFIG_X86_32
>>  __visible bool __kvm_vcpu_is_preempted(int cpu)
>>  {
>>  	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
>> @@ -597,6 +598,31 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
>>  }
>>  PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
>>  
>> +#else
>> +
>> +extern bool __raw_callee_save___kvm_vcpu_is_preempted(int);
>> +
>> +asm(
>> +".pushsection .text;"
>> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
>> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
>> +"__raw_callee_save___kvm_vcpu_is_preempted:"
>> +FRAME_BEGIN
>> +"push %rdi;"
>> +"push %rdx;"
>> +"movslq  %edi, %rdi;"
>> +"movq    $steal_time+16, %rax;"
>> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
>> +"cmpb    $0, (%rdx,%rax);"
>> +"setne   %al;"
>> +"pop %rdx;"
>> +"pop %rdi;"
>> +FRAME_END
>> +"ret;"
>> +".popsection");
>> +
>> +#endif
>> +
>>  /*
>>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>>   */
> That should work for now. I have done something similar for
> __pv_queued_spin_unlock. However, this has the problem of creating a
> dependency on the exact layout of the steal_time structure. Maybe the
> constant 16 can be passed in as a parameter offsetof(struct
> kvm_steal_time, preempted) to the asm call.
>
> Cheers,
> Longman

One more thing: that will improve KVM performance, but it won't help Xen.

I looked into the assembly code for rwsem_spin_on_owner; it needs to save
and restore 2 additional registers with my patch. Doing it your way will
transfer the save and restore overhead to the assembly code. However,
__kvm_vcpu_is_preempted() is called multiple times per invocation of
rwsem_spin_on_owner. That function is simple enough that making
__kvm_vcpu_is_preempted() callee-save won't produce much compiler
optimization opportunity. The outer function rwsem_down_write_failed()
does appear to be a bit bigger (from 866 bytes to 884 bytes) though.

Cheers,
Longman




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-10 17:00       ` Waiman Long
@ 2017-02-13 10:47         ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 10:47 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Fri, Feb 10, 2017 at 12:00:43PM -0500, Waiman Long wrote:

> >> +asm(
> >> +".pushsection .text;"
> >> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
> >> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
> >> +"__raw_callee_save___kvm_vcpu_is_preempted:"
> >> +FRAME_BEGIN
> >> +"push %rdi;"
> >> +"push %rdx;"
> >> +"movslq  %edi, %rdi;"
> >> +"movq    $steal_time+16, %rax;"
> >> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
> >> +"cmpb    $0, (%rdx,%rax);"

Could we not put the $steal_time+16 displacement as an immediate in the
cmpb and save a whole register here?

That way we'd end up with something like:

asm("
push %rdi;
movslq %edi, %rdi;
movq __per_cpu_offset(,%rdi,8), %rax;
cmpb $0, %[offset](%rax);
setne %al;
pop %rdi;
" : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));

And if we could get rid of the sign extend on edi we could avoid all the
push-pop nonsense, but I'm not sure I see how to do that (then again,
this asm foo isn't my strongest point).
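
As a self-contained sketch of feeding offsetof() into the asm as an
immediate, here is a toy version (the toy_* names are made up and this is
not the kernel's per-cpu machinery; the kernel struct is kvm_steal_time,
as noted earlier in the thread):

#include <stdbool.h>
#include <stddef.h>

struct toy_steal_time {
        unsigned long steal;
        unsigned long pad;
        unsigned char preempted;        /* the field the asm tests */
};

static struct toy_steal_time toy_steal[64];

static bool toy_vcpu_is_preempted(int cpu)
{
        bool ret;

        /*
         * The [off] "i" operand becomes a compile-time displacement
         * (printed without the '$' via %c), so the structure layout is
         * no longer hard-coded as "+16" in the asm body.
         */
        asm("cmpb       $0, %c[off](%[st])\n\t"
            "setne      %[ret]"
            : [ret] "=r" (ret)
            : [st] "r" (&toy_steal[cpu]),
              [off] "i" (offsetof(struct toy_steal_time, preempted))
            : "cc");
        return ret;
}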

> >> +"setne   %al;"
> >> +"pop %rdx;"
> >> +"pop %rdi;"
> >> +FRAME_END
> >> +"ret;"
> >> +".popsection");
> >> +
> >> +#endif
> >> +
> >>  /*
> >>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
> >>   */
> > That should work for now. I have done something similar for
> > __pv_queued_spin_unlock. However, this has the problem of creating a
> > dependency on the exact layout of the steal_time structure. Maybe the
> > constant 16 can be passed in as a parameter offsetof(struct
> > kvm_steal_time, preempted) to the asm call.

Yeah, it should certainly be possible to pass that in. But ideally we'd have
GCC grow something like __attribute__((callee_saved)) or somesuch and it
would do all this for us.

> One more thing, that will improve KVM performance, but it won't help Xen.

People still use Xen? ;-) In any case, their implementation looks very
similar and could easily crib this.

> I looked into the assembly code for rwsem_spin_on_owner, It need to save
> and restore 2 additional registers with my patch. Doing it your way,
> will transfer the save and restore overhead to the assembly code.
> However, __kvm_vcpu_is_preempted() is called multiple times per
> invocation of rwsem_spin_on_owner. That function is simple enough that
> making __kvm_vcpu_is_preempted() callee-save won't produce much compiler
> optimization opportunity.

This is because of that noinline, right? Otherwise it would've been
folded and register pressure would be much higher.

> The outer function rwsem_down_write_failed()
> does appear to be a bit bigger (from 866 bytes to 884 bytes) though.

I suspect GCC is being clever and since all this is static it plays
games with the calling convention and pushes these clobbers out.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-13 10:47         ` Peter Zijlstra
  0 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 10:47 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, H. Peter Anvin,
	xen-devel, Alok Kataria, Thomas Gleixner

On Fri, Feb 10, 2017 at 12:00:43PM -0500, Waiman Long wrote:

> >> +asm(
> >> +".pushsection .text;"
> >> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
> >> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
> >> +"__raw_callee_save___kvm_vcpu_is_preempted:"
> >> +FRAME_BEGIN
> >> +"push %rdi;"
> >> +"push %rdx;"
> >> +"movslq  %edi, %rdi;"
> >> +"movq    $steal_time+16, %rax;"
> >> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
> >> +"cmpb    $0, (%rdx,%rax);"

Could we not put the $steal_time+16 displacement as an immediate in the
cmpb and save a whole register here?

That way we'd end up with something like:

asm("
push %rdi;
movslq %edi, %rdi;
movq __per_cpu_offset(,%rdi,8), %rax;
cmpb $0, %[offset](%rax);
setne %al;
pop %rdi;
" : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));

And if we could get rid of the sign extend on edi we could avoid all the
push-pop nonsense, but I'm not sure I see how to do that (then again,
this asm foo isn't my strongest point).

> >> +"setne   %al;"
> >> +"pop %rdx;"
> >> +"pop %rdi;"
> >> +FRAME_END
> >> +"ret;"
> >> +".popsection");
> >> +
> >> +#endif
> >> +
> >>  /*
> >>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
> >>   */
> > That should work for now. I have done something similar for
> > __pv_queued_spin_unlock. However, this has the problem of creating a
> > dependency on the exact layout of the steal_time structure. Maybe the
> > constant 16 can be passed in as a parameter offsetof(struct
> > kvm_steal_time, preempted) to the asm call.

Yeah, it should certainly be possible to pass that in. But ideally we'd have
GCC grow something like __attribute__((callee_saved)) or somesuch and it
would do all this for us.

> One more thing, that will improve KVM performance, but it won't help Xen.

People still use Xen? ;-) In any case, their implementation looks very
similar and could easily crib this.

> I looked into the assembly code for rwsem_spin_on_owner, It need to save
> and restore 2 additional registers with my patch. Doing it your way,
> will transfer the save and restore overhead to the assembly code.
> However, __kvm_vcpu_is_preempted() is called multiple times per
> invocation of rwsem_spin_on_owner. That function is simple enough that
> making __kvm_vcpu_is_preempted() callee-save won't produce much compiler
> optimization opportunity.

This is because of that noinline, right? Otherwise it would've been
folded and register pressure would be much higher.

> The outer function rwsem_down_write_failed()
> does appear to be a bit bigger (from 866 bytes to 884 bytes) though.

I suspect GCC is being clever and since all this is static it plays
games with the calling convention and pushes these clobbers out.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-10 17:00       ` Waiman Long
  (?)
@ 2017-02-13 10:47       ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 10:47 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On Fri, Feb 10, 2017 at 12:00:43PM -0500, Waiman Long wrote:

> >> +asm(
> >> +".pushsection .text;"
> >> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
> >> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
> >> +"__raw_callee_save___kvm_vcpu_is_preempted:"
> >> +FRAME_BEGIN
> >> +"push %rdi;"
> >> +"push %rdx;"
> >> +"movslq  %edi, %rdi;"
> >> +"movq    $steal_time+16, %rax;"
> >> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
> >> +"cmpb    $0, (%rdx,%rax);"

Could we not put the $steal_time+16 displacement as an immediate in the
cmpb and save a whole register here?

That way we'd end up with something like:

asm("
push %rdi;
movslq %edi, %rdi;
movq __per_cpu_offset(,%rdi,8), %rax;
cmpb $0, %[offset](%rax);
setne %al;
pop %rdi;
" : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));

And if we could get rid of the sign extend on edi we could avoid all the
push-pop nonsense, but I'm not sure I see how to do that (then again,
this asm foo isn't my strongest point).

> >> +"setne   %al;"
> >> +"pop %rdx;"
> >> +"pop %rdi;"
> >> +FRAME_END
> >> +"ret;"
> >> +".popsection");
> >> +
> >> +#endif
> >> +
> >>  /*
> >>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
> >>   */
> > That should work for now. I have done something similar for
> > __pv_queued_spin_unlock. However, this has the problem of creating a
> > dependency on the exact layout of the steal_time structure. Maybe the
> > constant 16 can be passed in as a parameter offsetof(struct
> > kvm_steal_time, preempted) to the asm call.

Yeah, it should certainly be possible to pass that in. But ideally we'd have
GCC grow something like __attribute__((callee_saved)) or somesuch and it
would do all this for us.

> One more thing, that will improve KVM performance, but it won't help Xen.

People still use Xen? ;-) In any case, their implementation looks very
similar and could easily crib this.

> I looked into the assembly code for rwsem_spin_on_owner, It need to save
> and restore 2 additional registers with my patch. Doing it your way,
> will transfer the save and restore overhead to the assembly code.
> However, __kvm_vcpu_is_preempted() is called multiple times per
> invocation of rwsem_spin_on_owner. That function is simple enough that
> making __kvm_vcpu_is_preempted() callee-save won't produce much compiler
> optimization opportunity.

This is because of that noinline, right? Otherwise it would've been
folded and register pressure would be much higher.

> The outer function rwsem_down_write_failed()
> does appear to be a bit bigger (from 866 bytes to 884 bytes) though.

I suspect GCC is being clever and since all this is static it plays
games with the calling convention and pushes these clobbers out.




^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:47         ` Peter Zijlstra
@ 2017-02-13 10:53           ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 10:53 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
> That way we'd end up with something like:
> 
> asm("
> push %rdi;
> movslq %edi, %rdi;
> movq __per_cpu_offset(,%rdi,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
> pop %rdi;
> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
> 
> And if we could get rid of the sign extend on edi we could avoid all the
> push-pop nonsense, but I'm not sure I see how to do that (then again,
> this asm foo isn't my strongest point).

Maybe:

movslq %edi, %rax;
movq __per_cpu_offset(,%rax,8), %rax;
cmpb $0, %[offset](%rax);
setne %al;

?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-13 10:53           ` Peter Zijlstra
  0 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 10:53 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, H. Peter Anvin,
	xen-devel, Alok Kataria, Thomas Gleixner

On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
> That way we'd end up with something like:
> 
> asm("
> push %rdi;
> movslq %edi, %rdi;
> movq __per_cpu_offset(,%rdi,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
> pop %rdi;
> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
> 
> And if we could get rid of the sign extend on edi we could avoid all the
> push-pop nonsense, but I'm not sure I see how to do that (then again,
> this asm foo isn't my strongest point).

Maybe:

movslq %edi, %rax;
movq __per_cpu_offset(,%rax,8), %rax;
cmpb $0, %[offset](%rax);
setne %al;

?

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:47         ` Peter Zijlstra
  (?)
@ 2017-02-13 10:53         ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 10:53 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
> That way we'd end up with something like:
> 
> asm("
> push %rdi;
> movslq %edi, %rdi;
> movq __per_cpu_offset(,%rdi,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
> pop %rdi;
> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
> 
> And if we could get rid of the sign extend on edi we could avoid all the
> push-pop nonsense, but I'm not sure I see how to do that (then again,
> this asm foo isn't my strongest point).

Maybe:

movslq %edi, %rax;
movq __per_cpu_offset(,%rax,8), %rax;
cmpb $0, %[offset](%rax);
setne %al;

?


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:47         ` Peter Zijlstra
                           ` (2 preceding siblings ...)
  (?)
@ 2017-02-13 19:41         ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 19:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/13/2017 05:47 AM, Peter Zijlstra wrote:
> On Fri, Feb 10, 2017 at 12:00:43PM -0500, Waiman Long wrote:
>
>>>> +asm(
>>>> +".pushsection .text;"
>>>> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
>>>> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
>>>> +"__raw_callee_save___kvm_vcpu_is_preempted:"
>>>> +FRAME_BEGIN
>>>> +"push %rdi;"
>>>> +"push %rdx;"
>>>> +"movslq  %edi, %rdi;"
>>>> +"movq    $steal_time+16, %rax;"
>>>> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
>>>> +"cmpb    $0, (%rdx,%rax);"
> Could we not put the $steal_time+16 displacement as an immediate in the
> cmpb and save a whole register here?
>
> That way we'd end up with something like:
>
> asm("
> push %rdi;
> movslq %edi, %rdi;
> movq __per_cpu_offset(,%rdi,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
> pop %rdi;
> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>
> And if we could get rid of the sign extend on edi we could avoid all the
> push-pop nonsense, but I'm not sure I see how to do that (then again,
> this asm foo isn't my strongest point).

Yes, I think that can work. I will try to run this patch to see how
things go.

>>>> +"setne   %al;"
>>>> +"pop %rdx;"
>>>> +"pop %rdi;"
>>>> +FRAME_END
>>>> +"ret;"
>>>> +".popsection");
>>>> +
>>>> +#endif
>>>> +
>>>>  /*
>>>>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>>>>   */
>>> That should work for now. I have done something similar for
>>> __pv_queued_spin_unlock. However, this has the problem of creating a
>>> dependency on the exact layout of the steal_time structure. Maybe the
>>> constant 16 can be passed in as a parameter offsetof(struct
>>> kvm_steal_time, preempted) to the asm call.
> Yeah it should be well possible to pass that in. But ideally we'd have
> GCC grow something like __attribute__((callee_saved)) or somesuch and it
> would do all this for us.

That will be really nice too. I am not too fond of working in assembly.

>> One more thing, that will improve KVM performance, but it won't help Xen.
> People still use Xen? ;-) In any case, their implementation looks very
> similar and could easily crib this.

At Red Hat, my focus will be on KVM performance. I do believe that there
are still Xen users out there, so we still need to take their interests
into consideration. Given that, I am OK with making it work better in KVM
first and then thinking about Xen later.

>> I looked into the assembly code for rwsem_spin_on_owner, It need to save
>> and restore 2 additional registers with my patch. Doing it your way,
>> will transfer the save and restore overhead to the assembly code.
>> However, __kvm_vcpu_is_preempted() is called multiple times per
>> invocation of rwsem_spin_on_owner. That function is simple enough that
>> making __kvm_vcpu_is_preempted() callee-save won't produce much compiler
>> optimization opportunity.
> This is because of that noinline, right? Otherwise it would've been
> folded and register pressure would be much higher.

Yes, I guess so. The noinline is there so that we know how much CPU time
is spent spinning rather than on other activities within the slowpath.
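
As a minimal sketch of that kind of noinline annotation (generic toy code,
not the rwsem slowpath itself):

#include <stdbool.h>

/*
 * Keeping the busy-wait loop out of line gives it its own symbol, so a
 * profiler attributes the spinning time to this function instead of
 * folding it into its caller.
 */
static __attribute__((noinline)) bool spin_on_flag(volatile int *flag,
                                                   long spins)
{
        while (spins-- > 0) {
                if (*flag)
                        return true;
        }
        return false;
}

int slowpath(volatile int *flag)
{
        if (spin_on_flag(flag, 1000000))
                return 1;
        /* otherwise fall back to blocking */
        return 0;
}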

>
>> The outer function rwsem_down_write_failed()
>> does appear to be a bit bigger (from 866 bytes to 884 bytes) though.
> I suspect GCC is being clever and since all this is static it plays
> games with the calling convention and pushes these clobbers out.
>
>

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:47         ` Peter Zijlstra
                           ` (4 preceding siblings ...)
  (?)
@ 2017-02-13 19:41         ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 19:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, H. Peter Anvin,
	xen-devel, Alok Kataria, Thomas Gleixner

On 02/13/2017 05:47 AM, Peter Zijlstra wrote:
> On Fri, Feb 10, 2017 at 12:00:43PM -0500, Waiman Long wrote:
>
>>>> +asm(
>>>> +".pushsection .text;"
>>>> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
>>>> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
>>>> +"__raw_callee_save___kvm_vcpu_is_preempted:"
>>>> +FRAME_BEGIN
>>>> +"push %rdi;"
>>>> +"push %rdx;"
>>>> +"movslq  %edi, %rdi;"
>>>> +"movq    $steal_time+16, %rax;"
>>>> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
>>>> +"cmpb    $0, (%rdx,%rax);"
> Could we not put the $steal_time+16 displacement as an immediate in the
> cmpb and save a whole register here?
>
> That way we'd end up with something like:
>
> asm("
> push %rdi;
> movslq %edi, %rdi;
> movq __per_cpu_offset(,%rdi,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
> pop %rdi;
> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>
> And if we could get rid of the sign extend on edi we could avoid all the
> push-pop nonsense, but I'm not sure I see how to do that (then again,
> this asm foo isn't my strongest point).

Yes, I think that can work. I will try to run this patch to see how
things go.

>>>> +"setne   %al;"
>>>> +"pop %rdx;"
>>>> +"pop %rdi;"
>>>> +FRAME_END
>>>> +"ret;"
>>>> +".popsection");
>>>> +
>>>> +#endif
>>>> +
>>>>  /*
>>>>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>>>>   */
>>> That should work for now. I have done something similar for
>>> __pv_queued_spin_unlock. However, this has the problem of creating a
>>> dependency on the exact layout of the steal_time structure. Maybe the
>>> constant 16 can be passed in as a parameter offsetof(struct
>>> kvm_steal_time, preempted) to the asm call.
> Yeah it should be well possible to pass that in. But ideally we'd have
> GCC grow something like __attribute__((callee_saved)) or somesuch and it
> would do all this for us.

That will be really nice too. I am not too fond of working in assembly.

>> One more thing, that will improve KVM performance, but it won't help Xen.
> People still use Xen? ;-) In any case, their implementation looks very
> similar and could easily crib this.

At Red Hat, my focus will be on KVM performance. I do believe that there
are still Xen users out there, so we still need to take their interests
into consideration. Given that, I am OK with making it work better in KVM
first and then thinking about Xen later.

>> I looked into the assembly code for rwsem_spin_on_owner, It need to save
>> and restore 2 additional registers with my patch. Doing it your way,
>> will transfer the save and restore overhead to the assembly code.
>> However, __kvm_vcpu_is_preempted() is called multiple times per
>> invocation of rwsem_spin_on_owner. That function is simple enough that
>> making __kvm_vcpu_is_preempted() callee-save won't produce much compiler
>> optimization opportunity.
> This is because of that noinline, right? Otherwise it would've been
> folded and register pressure would be much higher.

Yes, I guess so. The noinline is there so that we know how much CPU time
is spent spinning rather than on other activities within the slowpath.

>
>> The outer function rwsem_down_write_failed()
>> does appear to be a bit bigger (from 866 bytes to 884 bytes) though.
> I suspect GCC is being clever and since all this is static it plays
> games with the calling convention and pushes these clobbers out.
>
>

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:47         ` Peter Zijlstra
                           ` (3 preceding siblings ...)
  (?)
@ 2017-02-13 19:41         ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 19:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On 02/13/2017 05:47 AM, Peter Zijlstra wrote:
> On Fri, Feb 10, 2017 at 12:00:43PM -0500, Waiman Long wrote:
>
>>>> +asm(
>>>> +".pushsection .text;"
>>>> +".global __raw_callee_save___kvm_vcpu_is_preempted;"
>>>> +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
>>>> +"__raw_callee_save___kvm_vcpu_is_preempted:"
>>>> +FRAME_BEGIN
>>>> +"push %rdi;"
>>>> +"push %rdx;"
>>>> +"movslq  %edi, %rdi;"
>>>> +"movq    $steal_time+16, %rax;"
>>>> +"movq    __per_cpu_offset(,%rdi,8), %rdx;"
>>>> +"cmpb    $0, (%rdx,%rax);"
> Could we not put the $steal_time+16 displacement as an immediate in the
> cmpb and save a whole register here?
>
> That way we'd end up with something like:
>
> asm("
> push %rdi;
> movslq %edi, %rdi;
> movq __per_cpu_offset(,%rdi,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
> pop %rdi;
> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>
> And if we could get rid of the sign extend on edi we could avoid all the
> push-pop nonsense, but I'm not sure I see how to do that (then again,
> this asm foo isn't my strongest point).

Yes, I think that can work. I will try to run this patch to see how
things go.

>>>> +"setne   %al;"
>>>> +"pop %rdx;"
>>>> +"pop %rdi;"
>>>> +FRAME_END
>>>> +"ret;"
>>>> +".popsection");
>>>> +
>>>> +#endif
>>>> +
>>>>  /*
>>>>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>>>>   */
>>> That should work for now. I have done something similar for
>>> __pv_queued_spin_unlock. However, this has the problem of creating a
>>> dependency on the exact layout of the steal_time structure. Maybe the
>>> constant 16 can be passed in as a parameter offsetof(struct
>>> kvm_steal_time, preempted) to the asm call.
> Yeah it should be well possible to pass that in. But ideally we'd have
> GCC grow something like __attribute__((callee_saved)) or somesuch and it
> would do all this for us.

That will be really nice too. I am not too fond of working in assembly.

>> One more thing, that will improve KVM performance, but it won't help Xen.
> People still use Xen? ;-) In any case, their implementation looks very
> similar and could easily crib this.

At Red Hat, my focus will be on KVM performance. I do believe that there
are still Xen users out there, so we still need to take their interests
into consideration. Given that, I am OK with making it work better in KVM
first and then thinking about Xen later.

>> I looked into the assembly code for rwsem_spin_on_owner, It need to save
>> and restore 2 additional registers with my patch. Doing it your way,
>> will transfer the save and restore overhead to the assembly code.
>> However, __kvm_vcpu_is_preempted() is called multiple times per
>> invocation of rwsem_spin_on_owner. That function is simple enough that
>> making __kvm_vcpu_is_preempted() callee-save won't produce much compiler
>> optimization opportunity.
> This is because of that noinline, right? Otherwise it would've been
> folded and register pressure would be much higher.

Yes, I guess so. The noinline is there so that we know how much CPU time
is spent spinning rather than on other activities within the slowpath.

>
>> The outer function rwsem_down_write_failed()
>> does appear to be a bit bigger (from 866 bytes to 884 bytes) though.
> I suspect GCC is being clever and since all this is static it plays
> games with the calling convention and pushes these clobbers out.
>
>

Cheers,
Longman



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:53           ` Peter Zijlstra
                             ` (2 preceding siblings ...)
  (?)
@ 2017-02-13 19:42           ` Waiman Long
  2017-02-13 20:12               ` Waiman Long
  2017-02-13 20:12             ` Waiman Long
  -1 siblings, 2 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 19:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> That way we'd end up with something like:
>>
>> asm("
>> push %rdi;
>> movslq %edi, %rdi;
>> movq __per_cpu_offset(,%rdi,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>> pop %rdi;
>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>>
>> And if we could get rid of the sign extend on edi we could avoid all the
>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>> this asm foo isn't my strongest point).
> Maybe:
>
> movsql %edi, %rax;
> movq __per_cpu_offset(,%rax,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
>
> ?

Yes, that looks good to me.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:53           ` Peter Zijlstra
  (?)
@ 2017-02-13 19:42           ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 19:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, H. Peter Anvin,
	xen-devel, Alok Kataria, Thomas Gleixner

On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> That way we'd end up with something like:
>>
>> asm("
>> push %rdi;
>> movslq %edi, %rdi;
>> movq __per_cpu_offset(,%rdi,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>> pop %rdi;
>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>>
>> And if we could get rid of the sign extend on edi we could avoid all the
>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>> this asm foo isn't my strongest point).
> Maybe:
>
> movsql %edi, %rax;
> movq __per_cpu_offset(,%rax,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
>
> ?

Yes, that looks good to me.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:53           ` Peter Zijlstra
  (?)
  (?)
@ 2017-02-13 19:42           ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 19:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> That way we'd end up with something like:
>>
>> asm("
>> push %rdi;
>> movslq %edi, %rdi;
>> movq __per_cpu_offset(,%rdi,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>> pop %rdi;
>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>>
>> And if we could get rid of the sign extend on edi we could avoid all the
>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>> this asm foo isn't my strongest point).
> Maybe:
>
> movsql %edi, %rax;
> movq __per_cpu_offset(,%rax,8), %rax;
> cmpb $0, %[offset](%rax);
> setne %al;
>
> ?

Yes, that looks good to me.

Cheers,
Longman



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:53           ` Peter Zijlstra
@ 2017-02-13 20:06             ` hpa
  -1 siblings, 0 replies; 69+ messages in thread
From: hpa @ 2017-02-13 20:06 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, linux-arch, x86, linux-kernel,
	virtualization, xen-devel, kvm, Pan Xinhui, Paolo Bonzini,
	Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On February 13, 2017 2:53:43 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> That way we'd end up with something like:
>> 
>> asm("
>> push %rdi;
>> movslq %edi, %rdi;
>> movq __per_cpu_offset(,%rdi,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>> pop %rdi;
>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct
>steal_time, preempted)));
>> 
>> And if we could get rid of the sign extend on edi we could avoid all
>the
>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>> this asm foo isn't my strongest point).
>
>Maybe:
>
>movsql %edi, %rax;
>movq __per_cpu_offset(,%rax,8), %rax;
>cmpb $0, %[offset](%rax);
>setne %al;
>
>?

We could kill the zero or sign extend by changing the calling interface to pass an unsigned long instead of an int.  It is much more likely that a zero extend is free for the caller than a sign extend.
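
A small stand-alone sketch of the difference (hypothetical helpers, not the
actual paravirt interface):

#include <stdbool.h>

static __attribute__((noinline)) bool takes_int(int cpu)
{
        return cpu != 0;
}

static __attribute__((noinline)) bool takes_ulong(unsigned long cpu)
{
        return cpu != 0;
}

bool caller(unsigned int cpu)
{
        /*
         * Whoever uses the int argument as a 64-bit index has to
         * sign-extend it first (movslq).  For the unsigned long argument
         * the caller's zero extension is usually free, since any 32-bit
         * register write already clears the upper 32 bits.
         */
        return takes_int((int)cpu) ^ takes_ulong(cpu);
}
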
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-13 20:06             ` hpa
  0 siblings, 0 replies; 69+ messages in thread
From: hpa @ 2017-02-13 20:06 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, linux-kernel, virtualization,
	Chris Wright, Ingo Molnar, Paolo Bonzini, xen-devel,
	Alok Kataria, Thomas Gleixner

On February 13, 2017 2:53:43 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> That way we'd end up with something like:
>> 
>> asm("
>> push %rdi;
>> movslq %edi, %rdi;
>> movq __per_cpu_offset(,%rdi,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>> pop %rdi;
>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct
>steal_time, preempted)));
>> 
>> And if we could get rid of the sign extend on edi we could avoid all
>the
>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>> this asm foo isn't my strongest point).
>
>Maybe:
>
>movsql %edi, %rax;
>movq __per_cpu_offset(,%rax,8), %rax;
>cmpb $0, %[offset](%rax);
>setne %al;
>
>?

We could kill the zero or sign extend by changing the calling interface to pass an unsigned long instead of an int.  It is much more likely that a zero extend is free for the caller than a sign extend.
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 10:53           ` Peter Zijlstra
                             ` (4 preceding siblings ...)
  (?)
@ 2017-02-13 20:06           ` hpa
  -1 siblings, 0 replies; 69+ messages in thread
From: hpa @ 2017-02-13 20:06 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Rusty Russell, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, Paolo Bonzini,
	xen-devel, Alok Kataria, Thomas Gleixner

On February 13, 2017 2:53:43 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> That way we'd end up with something like:
>> 
>> asm("
>> push %rdi;
>> movslq %edi, %rdi;
>> movq __per_cpu_offset(,%rdi,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>> pop %rdi;
>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct
>steal_time, preempted)));
>> 
>> And if we could get rid of the sign extend on edi we could avoid all
>the
>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>> this asm foo isn't my strongest point).
>
>Maybe:
>
>movsql %edi, %rax;
>movq __per_cpu_offset(,%rax,8), %rax;
>cmpb $0, %[offset](%rax);
>setne %al;
>
>?

We could kill the zero or sign extend by changing the calling interface to pass an unsigned long instead of an int.  It is much more likely that a zero extend is free for the caller than a sign extend.
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 19:42           ` Waiman Long
@ 2017-02-13 20:12               ` Waiman Long
  2017-02-13 20:12             ` Waiman Long
  1 sibling, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 20:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/13/2017 02:42 PM, Waiman Long wrote:
> On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
>> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>>> That way we'd end up with something like:
>>>
>>> asm("
>>> push %rdi;
>>> movslq %edi, %rdi;
>>> movq __per_cpu_offset(,%rdi,8), %rax;
>>> cmpb $0, %[offset](%rax);
>>> setne %al;
>>> pop %rdi;
>>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>>>
>>> And if we could get rid of the sign extend on edi we could avoid all the
>>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>>> this asm foo isn't my strongest point).
>> Maybe:
>>
>> movsql %edi, %rax;
>> movq __per_cpu_offset(,%rax,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>>
>> ?
> Yes, that looks good to me.
>
> Cheers,
> Longman
>
Sorry, I am going to take that back. The displacement or offset can only
be up to 32 bits, so we will still need to use at least one more
register, I think.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-13 20:12               ` Waiman Long
  0 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 20:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, H. Peter Anvin,
	xen-devel, Alok Kataria, Thomas Gleixner

On 02/13/2017 02:42 PM, Waiman Long wrote:
> On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
>> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>>> That way we'd end up with something like:
>>>
>>> asm("
>>> push %rdi;
>>> movslq %edi, %rdi;
>>> movq __per_cpu_offset(,%rdi,8), %rax;
>>> cmpb $0, %[offset](%rax);
>>> setne %al;
>>> pop %rdi;
>>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>>>
>>> And if we could get rid of the sign extend on edi we could avoid all the
>>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>>> this asm foo isn't my strongest point).
>> Maybe:
>>
>> movsql %edi, %rax;
>> movq __per_cpu_offset(,%rax,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>>
>> ?
> Yes, that looks good to me.
>
> Cheers,
> Longman
>
Sorry, I am going to take that back. The displacement or offset can only
be up to 32 bits, so we will still need to use at least one more
register, I think.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 19:42           ` Waiman Long
  2017-02-13 20:12               ` Waiman Long
@ 2017-02-13 20:12             ` Waiman Long
  1 sibling, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 20:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On 02/13/2017 02:42 PM, Waiman Long wrote:
> On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
>> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>>> That way we'd end up with something like:
>>>
>>> asm("
>>> push %rdi;
>>> movslq %edi, %rdi;
>>> movq __per_cpu_offset(,%rdi,8), %rax;
>>> cmpb $0, %[offset](%rax);
>>> setne %al;
>>> pop %rdi;
>>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>>>
>>> And if we could get rid of the sign extend on edi we could avoid all the
>>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>>> this asm foo isn't my strongest point).
>> Maybe:
>>
>> movsql %edi, %rax;
>> movq __per_cpu_offset(,%rax,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>>
>> ?
> Yes, that looks good to me.
>
> Cheers,
> Longman
>
Sorry, I am going to take that back. The displacement or offset can only
be up to 32 bits, so we will still need to use at least one more
register, I think.

Cheers,
Longman



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 20:12               ` Waiman Long
@ 2017-02-13 21:52                 ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 21:52 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Mon, Feb 13, 2017 at 03:12:45PM -0500, Waiman Long wrote:
> On 02/13/2017 02:42 PM, Waiman Long wrote:
> > On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
> >> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
> >>> That way we'd end up with something like:
> >>>
> >>> asm("
> >>> push %rdi;
> >>> movslq %edi, %rdi;
> >>> movq __per_cpu_offset(,%rdi,8), %rax;
> >>> cmpb $0, %[offset](%rax);
> >>> setne %al;
> >>> pop %rdi;
> >>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
> >>>
> >>> And if we could get rid of the sign extend on edi we could avoid all the
> >>> push-pop nonsense, but I'm not sure I see how to do that (then again,
> >>> this asm foo isn't my strongest point).
> >> Maybe:
> >>
> >> movsql %edi, %rax;
> >> movq __per_cpu_offset(,%rax,8), %rax;
> >> cmpb $0, %[offset](%rax);
> >> setne %al;
> >>
> >> ?
> > Yes, that looks good to me.
> >
> > Cheers,
> > Longman
> >
> Sorry, I am going to take it back. The displacement or offset can only
> be up to 32-bit. So we will still need to use at least one more
> register, I think.

I don't think that would be a problem; I very much doubt we declare more
than 4G worth of per-cpu variables in the kernel.

In any case, use "e" or "Z" as the constraint (I never quite know when to
use which). Those are s32 and u32 displacement immediates respectively, and
should fail to compile with a semi-sensible error if the displacement is
too big.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-13 21:52                 ` Peter Zijlstra
  0 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 21:52 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, H. Peter Anvin,
	xen-devel, Alok Kataria, Thomas Gleixner

On Mon, Feb 13, 2017 at 03:12:45PM -0500, Waiman Long wrote:
> On 02/13/2017 02:42 PM, Waiman Long wrote:
> > On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
> >> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
> >>> That way we'd end up with something like:
> >>>
> >>> asm("
> >>> push %rdi;
> >>> movslq %edi, %rdi;
> >>> movq __per_cpu_offset(,%rdi,8), %rax;
> >>> cmpb $0, %[offset](%rax);
> >>> setne %al;
> >>> pop %rdi;
> >>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
> >>>
> >>> And if we could get rid of the sign extend on edi we could avoid all the
> >>> push-pop nonsense, but I'm not sure I see how to do that (then again,
> >>> this asm foo isn't my strongest point).
> >> Maybe:
> >>
> >> movsql %edi, %rax;
> >> movq __per_cpu_offset(,%rax,8), %rax;
> >> cmpb $0, %[offset](%rax);
> >> setne %al;
> >>
> >> ?
> > Yes, that looks good to me.
> >
> > Cheers,
> > Longman
> >
> Sorry, I am going to take it back. The displacement or offset can only
> be up to 32-bit. So we will still need to use at least one more
> register, I think.

I don't think that would be a problem; I very much doubt we declare more
than 4G worth of per-cpu variables in the kernel.

In any case, use "e" or "Z" as the constraint (I never quite know when to
use which). Those are s32 and u32 displacement immediates respectively, and
should fail to compile with a semi-sensible error if the displacement is
too big.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 20:12               ` Waiman Long
  (?)
@ 2017-02-13 21:52               ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 21:52 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On Mon, Feb 13, 2017 at 03:12:45PM -0500, Waiman Long wrote:
> On 02/13/2017 02:42 PM, Waiman Long wrote:
> > On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
> >> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
> >>> That way we'd end up with something like:
> >>>
> >>> asm("
> >>> push %rdi;
> >>> movslq %edi, %rdi;
> >>> movq __per_cpu_offset(,%rdi,8), %rax;
> >>> cmpb $0, %[offset](%rax);
> >>> setne %al;
> >>> pop %rdi;
> >>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
> >>>
> >>> And if we could get rid of the sign extend on edi we could avoid all the
> >>> push-pop nonsense, but I'm not sure I see how to do that (then again,
> >>> this asm foo isn't my strongest point).
> >> Maybe:
> >>
> >> movsql %edi, %rax;
> >> movq __per_cpu_offset(,%rax,8), %rax;
> >> cmpb $0, %[offset](%rax);
> >> setne %al;
> >>
> >> ?
> > Yes, that looks good to me.
> >
> > Cheers,
> > Longman
> >
> Sorry, I am going to take it back. The displacement or offset can only
> be up to 32-bit. So we will still need to use at least one more
> register, I think.

I don't think that would be a problem; I very much doubt we declare more
than 4G worth of per-cpu variables in the kernel.

In any case, use "e" or "Z" as the constraint (I never quite know when to
use which). Those are s32 and u32 displacement immediates respectively, and
should fail to compile with a semi-sensible error if the displacement is
too big.



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 20:06             ` hpa
@ 2017-02-13 21:57               ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 21:57 UTC (permalink / raw)
  To: hpa
  Cc: Waiman Long, Jeremy Fitzhardinge, Chris Wright, Alok Kataria,
	Rusty Russell, Ingo Molnar, Thomas Gleixner, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Mon, Feb 13, 2017 at 12:06:44PM -0800, hpa@zytor.com wrote:

> >Maybe:
> >
> >movsql %edi, %rax;
> >movq __per_cpu_offset(,%rax,8), %rax;
> >cmpb $0, %[offset](%rax);
> >setne %al;
> >
> >?
> 
> We could kill the zero or sign extend by changing the calling
> interface to pass an unsigned long instead of an int.  It is much more
> likely that a zero extend is free for the caller than a sign extend.

Right, Boris and I talked about that on IRC. I was wondering whether, if
the argument were u32, we could assume the top 32 bits are 0 and then use
rdi without a prior movzx.

That would allow shaving one more instruction off the sequence.

Also, the PVOP_CALL_ARG#() macros have an (unsigned long) cast in them that
doesn't make sense. That cast ends up making the calling code do explicit
sign or zero extends into the full 64-bit register for no good reason.

If one removes that cast things still compile, but I worry something
somehow relies on this weird behaviour and will come apart.
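
For reference, the cast in question is of roughly this shape (reconstructed
from the description above, not quoted from the tree):

        #define PVOP_CALL_ARG1(x)       ((unsigned long)(x))

so an int argument such as the cpu number is converted to unsigned long at
every call site, and that conversion is what makes the caller emit the
explicit extension into the full 64-bit register.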

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-13 21:57               ` Peter Zijlstra
  0 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 21:57 UTC (permalink / raw)
  To: hpa
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, linux-kernel,
	virtualization, Chris Wright, Ingo Molnar, xen-devel,
	Waiman Long, Alok Kataria, Thomas Gleixner

On Mon, Feb 13, 2017 at 12:06:44PM -0800, hpa@zytor.com wrote:

> >Maybe:
> >
> >movsql %edi, %rax;
> >movq __per_cpu_offset(,%rax,8), %rax;
> >cmpb $0, %[offset](%rax);
> >setne %al;
> >
> >?
> 
> We could kill the zero or sign extend by changing the calling
> interface to pass an unsigned long instead of an int.  It is much more
> likely that a zero extend is free for the caller than a sign extend.

Right, Boris and I talked about that on IRC. I was wondering whether, if
the argument were u32, we could assume the top 32 bits are 0 and then use
rdi without a prior movzx.

That would allow shaving one more instruction off the sequence.

Also, the PVOP_CALL_ARG#() macros have an (unsigned long) cast in them that
doesn't make sense. That cast ends up making the calling code do explicit
sign or zero extends into the full 64-bit register for no good reason.

If one removes that cast things still compile, but I worry something
somehow relies on this weird behaviour and will come apart.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 20:06             ` hpa
  (?)
  (?)
@ 2017-02-13 21:57             ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 21:57 UTC (permalink / raw)
  To: hpa
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	xen-devel, Waiman Long, Alok Kataria, Thomas Gleixner

On Mon, Feb 13, 2017 at 12:06:44PM -0800, hpa@zytor.com wrote:

> >Maybe:
> >
> >movsql %edi, %rax;
> >movq __per_cpu_offset(,%rax,8), %rax;
> >cmpb $0, %[offset](%rax);
> >setne %al;
> >
> >?
> 
> We could kill the zero or sign extend by changing the calling
> interface to pass an unsigned long instead of an int.  It is much more
> likely that a zero extend is free for the caller than a sign extend.

Right, Boris and I talked about that on IRC. I was wondering whether, if
the argument were u32, we could assume the top 32 bits are 0 and then use
rdi without a prior movzx.

That would allow shaving one more instruction off the sequence.

Also, the PVOP_CALL_ARG#() macros have an (unsigned long) cast in them that
doesn't make sense. That cast ends up making the calling code do explicit
sign or zero extends into the full 64-bit register for no good reason.

If one removes that cast things still compile, but I worry something
somehow relies on this weird behaviour and will come apart.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 21:52                 ` Peter Zijlstra
@ 2017-02-13 22:00                   ` hpa
  -1 siblings, 0 replies; 69+ messages in thread
From: hpa @ 2017-02-13 22:00 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, linux-arch, x86, linux-kernel,
	virtualization, xen-devel, kvm, Pan Xinhui, Paolo Bonzini,
	Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On February 13, 2017 1:52:20 PM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Mon, Feb 13, 2017 at 03:12:45PM -0500, Waiman Long wrote:
>> On 02/13/2017 02:42 PM, Waiman Long wrote:
>> > On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
>> >> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> >>> That way we'd end up with something like:
>> >>>
>> >>> asm("
>> >>> push %rdi;
>> >>> movslq %edi, %rdi;
>> >>> movq __per_cpu_offset(,%rdi,8), %rax;
>> >>> cmpb $0, %[offset](%rax);
>> >>> setne %al;
>> >>> pop %rdi;
>> >>> " : : [offset] "i" (((unsigned long)&steal_time) +
>offsetof(struct steal_time, preempted)));
>> >>>
>> >>> And if we could get rid of the sign extend on edi we could avoid
>all the
>> >>> push-pop nonsense, but I'm not sure I see how to do that (then
>again,
>> >>> this asm foo isn't my strongest point).
>> >> Maybe:
>> >>
>> >> movslq %edi, %rax;
>> >> movq __per_cpu_offset(,%rax,8), %rax;
>> >> cmpb $0, %[offset](%rax);
>> >> setne %al;
>> >>
>> >> ?
>> > Yes, that looks good to me.
>> >
>> > Cheers,
>> > Longman
>> >
>> Sorry, I am going to take it back. The displacement or offset can
>only
>> be up to 32-bit. So we will still need to use at least one more
>> register, I think.
>
>I don't think that would be a problem, I very much doubt we declare
>more
>than 4G worth of per-cpu variables in the kernel.
>
>In any case, use "e" or "Z" as constraint (I never quite know when to
>use which). That are s32 and u32 displacement immediates resp. and
>should fail compile with a semi-sensible failure if the displacement is
>too big.

e for signed, Z for unsigned.  Obviously you have to use a matching instruction: an immediate or displacement in a 64-bit instruction is sign-extended, in a 32-bit instruction zero-extended.  E.g.:

   movl %0,%%eax # use Z, all of %rax will be set
   movq %0,%%rax # use e
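
A stand-alone snippet (assuming GCC on x86-64; not from the patch) that
exercises both constraints:

	static inline void imm_constraint_demo(void)
	{
		/* "e": constant must fit a sign-extended 32-bit immediate. */
		asm volatile("movq %0, %%rax" : : "e" (-1L) : "rax");
		/* "Z": constant must fit a zero-extended 32-bit immediate. */
		asm volatile("movl %0, %%eax" : : "Z" (0xffffffffUL) : "rax");
	}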
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 21:52                 ` Peter Zijlstra
@ 2017-02-13 22:07                   ` hpa
  -1 siblings, 0 replies; 69+ messages in thread
From: hpa @ 2017-02-13 22:07 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, linux-arch, x86, linux-kernel,
	virtualization, xen-devel, kvm, Pan Xinhui, Paolo Bonzini,
	Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On February 13, 2017 1:52:20 PM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Mon, Feb 13, 2017 at 03:12:45PM -0500, Waiman Long wrote:
>> On 02/13/2017 02:42 PM, Waiman Long wrote:
>> > On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
>> >> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>> >>> That way we'd end up with something like:
>> >>>
>> >>> asm("
>> >>> push %rdi;
>> >>> movslq %edi, %rdi;
>> >>> movq __per_cpu_offset(,%rdi,8), %rax;
>> >>> cmpb $0, %[offset](%rax);
>> >>> setne %al;
>> >>> pop %rdi;
>> >>> " : : [offset] "i" (((unsigned long)&steal_time) +
>offsetof(struct steal_time, preempted)));
>> >>>
>> >>> And if we could get rid of the sign extend on edi we could avoid
>all the
>> >>> push-pop nonsense, but I'm not sure I see how to do that (then
>again,
>> >>> this asm foo isn't my strongest point).
>> >> Maybe:
>> >>
>> >> movslq %edi, %rax;
>> >> movq __per_cpu_offset(,%rax,8), %rax;
>> >> cmpb $0, %[offset](%rax);
>> >> setne %al;
>> >>
>> >> ?
>> > Yes, that looks good to me.
>> >
>> > Cheers,
>> > Longman
>> >
>> Sorry, I am going to take it back. The displacement or offset can
>only
>> be up to 32-bit. So we will still need to use at least one more
>> register, I think.
>
>I don't think that would be a problem, I very much doubt we declare
>more
>than 4G worth of per-cpu variables in the kernel.
>
>In any case, use "e" or "Z" as constraint (I never quite know when to
>use which). That are s32 and u32 displacement immediates resp. and
>should fail compile with a semi-sensible failure if the displacement is
>too big.

Oh, and unless you are explicitly forcing 32-bit addressing mode, displacements are always "e" (or "m" if you let gcc pick the addressing mode.)
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 20:06             ` hpa
@ 2017-02-13 22:24               ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 22:24 UTC (permalink / raw)
  To: hpa, Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, linux-arch, x86, linux-kernel,
	virtualization, xen-devel, kvm, Pan Xinhui, Paolo Bonzini,
	Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/13/2017 03:06 PM, hpa@zytor.com wrote:
> On February 13, 2017 2:53:43 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>>> That way we'd end up with something like:
>>>
>>> asm("
>>> push %rdi;
>>> movslq %edi, %rdi;
>>> movq __per_cpu_offset(,%rdi,8), %rax;
>>> cmpb $0, %[offset](%rax);
>>> setne %al;
>>> pop %rdi;
>>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct
>> steal_time, preempted)));
>>> And if we could get rid of the sign extend on edi we could avoid all
>> the
>>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>>> this asm foo isn't my strongest point).
>> Maybe:
>>
>> movslq %edi, %rax;
>> movq __per_cpu_offset(,%rax,8), %rax;
>> cmpb $0, %[offset](%rax);
>> setne %al;
>>
>> ?
> We could kill the zero or sign extend by changing the calling interface to pass an unsigned long instead of an int.  It is much more likely that a zero extend is free for the caller than a sign extend.

I have thought of that too. However, the goal is to eliminate memory
reads/writes from/to the stack. Eliminating a register sign-extend
instruction won't help much in terms of performance.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 22:24               ` Waiman Long
@ 2017-02-13 22:31                 ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-13 22:31 UTC (permalink / raw)
  To: Waiman Long
  Cc: hpa, Jeremy Fitzhardinge, Chris Wright, Alok Kataria,
	Rusty Russell, Ingo Molnar, Thomas Gleixner, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Mon, Feb 13, 2017 at 05:24:36PM -0500, Waiman Long wrote:

> >> movslq %edi, %rax;
> >> movq __per_cpu_offset(,%rax,8), %rax;
> >> cmpb $0, %[offset](%rax);
> >> setne %al;

> I have thought of that too. However, the goal is to eliminate memory
> read/write from/to stack. Eliminating a register sign-extend instruction
> won't help much in term of performance.

Problem here is that all instructions have dependencies, so if you can
get rid of the sign extend mov you kill a bunch of stall cycles (I would
expect).

But yes, peanuts vs the stack load/stores.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 21:52                 ` Peter Zijlstra
@ 2017-02-13 22:34                   ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-13 22:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/13/2017 04:52 PM, Peter Zijlstra wrote:
> On Mon, Feb 13, 2017 at 03:12:45PM -0500, Waiman Long wrote:
>> On 02/13/2017 02:42 PM, Waiman Long wrote:
>>> On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
>>>> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>>>>> That way we'd end up with something like:
>>>>>
>>>>> asm("
>>>>> push %rdi;
>>>>> movslq %edi, %rdi;
>>>>> movq __per_cpu_offset(,%rdi,8), %rax;
>>>>> cmpb $0, %[offset](%rax);
>>>>> setne %al;
>>>>> pop %rdi;
>>>>> " : : [offset] "i" (((unsigned long)&steal_time) + offsetof(struct steal_time, preempted)));
>>>>>
>>>>> And if we could get rid of the sign extend on edi we could avoid all the
>>>>> push-pop nonsense, but I'm not sure I see how to do that (then again,
>>>>> this asm foo isn't my strongest point).
>>>> Maybe:
>>>>
>>>> movslq %edi, %rax;
>>>> movq __per_cpu_offset(,%rax,8), %rax;
>>>> cmpb $0, %[offset](%rax);
>>>> setne %al;
>>>>
>>>> ?
>>> Yes, that looks good to me.
>>>
>>> Cheers,
>>> Longman
>>>
>> Sorry, I am going to take it back. The displacement or offset can only
>> be up to 32-bit. So we will still need to use at least one more
>> register, I think.
> I don't think that would be a problem, I very much doubt we declare more
> than 4G worth of per-cpu variables in the kernel.
>
> In any case, use "e" or "Z" as constraint (I never quite know when to
> use which). That are s32 and u32 displacement immediates resp. and
> should fail compile with a semi-sensible failure if the displacement is
> too big.
>
It is the address of &steal_time that will exceed the 32-bit limit.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 22:34                   ` Waiman Long
@ 2017-02-13 22:36                     ` hpa
  -1 siblings, 0 replies; 69+ messages in thread
From: hpa @ 2017-02-13 22:36 UTC (permalink / raw)
  To: Waiman Long, Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, linux-arch, x86, linux-kernel,
	virtualization, xen-devel, kvm, Pan Xinhui, Paolo Bonzini,
	Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On February 13, 2017 2:34:01 PM PST, Waiman Long <longman@redhat.com> wrote:
>On 02/13/2017 04:52 PM, Peter Zijlstra wrote:
>> On Mon, Feb 13, 2017 at 03:12:45PM -0500, Waiman Long wrote:
>>> On 02/13/2017 02:42 PM, Waiman Long wrote:
>>>> On 02/13/2017 05:53 AM, Peter Zijlstra wrote:
>>>>> On Mon, Feb 13, 2017 at 11:47:16AM +0100, Peter Zijlstra wrote:
>>>>>> That way we'd end up with something like:
>>>>>>
>>>>>> asm("
>>>>>> push %rdi;
>>>>>> movslq %edi, %rdi;
>>>>>> movq __per_cpu_offset(,%rdi,8), %rax;
>>>>>> cmpb $0, %[offset](%rax);
>>>>>> setne %al;
>>>>>> pop %rdi;
>>>>>> " : : [offset] "i" (((unsigned long)&steal_time) +
>offsetof(struct steal_time, preempted)));
>>>>>>
>>>>>> And if we could get rid of the sign extend on edi we could avoid
>all the
>>>>>> push-pop nonsense, but I'm not sure I see how to do that (then
>again,
>>>>>> this asm foo isn't my strongest point).
>>>>> Maybe:
>>>>>
>>>>> movslq %edi, %rax;
>>>>> movq __per_cpu_offset(,%rax,8), %rax;
>>>>> cmpb $0, %[offset](%rax);
>>>>> setne %al;
>>>>>
>>>>> ?
>>>> Yes, that looks good to me.
>>>>
>>>> Cheers,
>>>> Longman
>>>>
>>> Sorry, I am going to take it back. The displacement or offset can
>only
>>> be up to 32-bit. So we will still need to use at least one more
>>> register, I think.
>> I don't think that would be a problem, I very much doubt we declare
>more
>> than 4G worth of per-cpu variables in the kernel.
>>
>> In any case, use "e" or "Z" as constraint (I never quite know when to
>> use which). That are s32 and u32 displacement immediates resp. and
>> should fail compile with a semi-sensible failure if the displacement
>is
>> too big.
>>
>It is the address of &steal_time that will exceed the 32-bit limit.
>
>Cheers,
>Longman

That seems odd in the extreme?
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-13 22:34                   ` Waiman Long
@ 2017-02-14  9:39                     ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-14  9:39 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Mon, Feb 13, 2017 at 05:34:01PM -0500, Waiman Long wrote:
> It is the address of &steal_time that will exceed the 32-bit limit.

That seems extremely unlikely. That would mean we have more than 4G
worth of per-cpu variables declared in the kernel.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-14  9:39                     ` Peter Zijlstra
@ 2017-02-14 14:46                       ` Waiman Long
  -1 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-14 14:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On 02/14/2017 04:39 AM, Peter Zijlstra wrote:
> On Mon, Feb 13, 2017 at 05:34:01PM -0500, Waiman Long wrote:
>> It is the address of &steal_time that will exceed the 32-bit limit.
> That seems extremely unlikely. That would mean we have more than 4G
> worth of per-cpu variables declared in the kernel.

I have some doubt about whether the compiler is able to properly use
RIP-relative addressing for this. Anyway, it seems that constraints
aren't allowed for asm() outside of a function context, at least for
the compiler that I am using (4.8.5). So it is a moot point.
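
For illustration, a sketch of that limitation (GCC assumed; the symbol name
is made up):

	/* Accepted: "basic" asm at file scope takes a bare string, no operands. */
	asm(".pushsection .rodata\n"
	    "pv_sketch_tag: .asciz \"file-scope asm\"\n"
	    ".popsection");

	/* Rejected: extended asm with constraints only parses inside a function.
	 *
	 *	asm("cmpb $0, %0" : : "m" (steal_time));
	 */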

Cheers,
Longman

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-14 14:46                       ` Waiman Long
@ 2017-02-14 16:03                         ` Peter Zijlstra
  -1 siblings, 0 replies; 69+ messages in thread
From: Peter Zijlstra @ 2017-02-14 16:03 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Ingo Molnar, Thomas Gleixner, H. Peter Anvin, linux-arch, x86,
	linux-kernel, virtualization, xen-devel, kvm, Pan Xinhui,
	Paolo Bonzini, Radim Krčmář,
	Boris Ostrovsky, Juergen Gross

On Tue, Feb 14, 2017 at 09:46:17AM -0500, Waiman Long wrote:
> On 02/14/2017 04:39 AM, Peter Zijlstra wrote:
> > On Mon, Feb 13, 2017 at 05:34:01PM -0500, Waiman Long wrote:
> >> It is the address of &steal_time that will exceed the 32-bit limit.
> > That seems extremely unlikely. That would mean we have more than 4G
> > worth of per-cpu variables declared in the kernel.
> 
> I have some doubt about if the compiler is able to properly use
> RIP-relative addressing for this.

It's not RIP-relative; &steal_time lives in the .data..percpu section and
is absolute within it.

> Anyway, it seems like constraints
> aren't allowed for asm() when not in the function context, at least for
> the the compiler that I am using (4.8.5). So it is a moot point.

Well kvm_steal_time is (host/guest) ABI anyway, so the offset is fixed
and hard-coding it isn't a problem.
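
For reference, a sketch of the layout that puts 'preempted' at offset 16
(field names follow the kvm_steal_time ABI; offsets assume the usual x86-64
alignment):

	struct kvm_steal_time_sketch {		/* stand-in for struct kvm_steal_time */
		unsigned long long steal;	/* offset  0 */
		unsigned int version;		/* offset  8 */
		unsigned int flags;		/* offset 12 */
		unsigned char preempted;	/* offset 16 <- the byte cmpb tests */
		unsigned char u8_pad[3];
		unsigned int pad[11];		/* pads the struct out to 64 bytes */
	};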

$ readelf -s defconfig-build/vmlinux | grep steal_time
100843: 0000000000017ac0    64 OBJECT  WEAK   DEFAULT   35 steal_time

$ objdump -dr defconfig-build/vmlinux | awk '/[<][^>]*[>]:/ { o=0 } /[<]__raw_callee_save___kvm_vcpu_is_preempted[>]:/ {o=1} { if (o) print $0 }'
ffffffff810b4480 <__raw_callee_save___kvm_vcpu_is_preempted>:
ffffffff810b4480:       55                      push   %rbp
ffffffff810b4481:       48 89 e5                mov    %rsp,%rbp
ffffffff810b4484:       48 8b 04 fd 00 94 46    mov    -0x7db96c00(,%rdi,8),%rax
ffffffff810b448b:       82 
                        ffffffff810b4488: R_X86_64_32S  __per_cpu_offset
ffffffff810b448c:       80 b8 d0 7a 01 00 00    cmpb   $0x0,0x17ad0(%rax)
                        ffffffff810b448e: R_X86_64_32S  steal_time+0x10
ffffffff810b4493:       0f 95 c0                setne  %al
ffffffff810b4496:       5d                      pop    %rbp
ffffffff810b4497:       c3                      retq   


And as you'll note, the displacement is correct and 'small'.

The below relies on the 'extra' cast in PVOP_CALL_ARG1() to extend the
argument to 64bit on the call side of things.

---
 arch/x86/kernel/kvm.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 099fcba..2c854b8 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -589,6 +589,7 @@ static void kvm_wait(u8 *ptr, u8 val)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_X86_32
 __visible bool __kvm_vcpu_is_preempted(int cpu)
 {
 	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
@@ -597,6 +598,26 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
 }
 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 
+#else
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(int cpu);
+
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+FRAME_BEGIN
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, 16+steal_time(%rax);"
+"setne %al;"
+FRAME_END
+"ret;"
+".popsection"
+);
+
+#endif
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */

^ permalink raw reply related	[flat|nested] 69+ messages in thread

* Re: [Xen-devel] [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
  2017-02-14 14:46                       ` Waiman Long
@ 2017-02-14 16:18                       ` Andrew Cooper
  -1 siblings, 0 replies; 69+ messages in thread
From: Andrew Cooper @ 2017-02-14 16:18 UTC (permalink / raw)
  To: Waiman Long, Peter Zijlstra
  Cc: linux-arch, Juergen Gross, Jeremy Fitzhardinge, x86, kvm,
	Radim Krčmář,
	Boris Ostrovsky, Pan Xinhui, Paolo Bonzini, Rusty Russell,
	linux-kernel, virtualization, Chris Wright, Ingo Molnar,
	H. Peter Anvin, xen-devel, Alok Kataria, Thomas Gleixner

On 14/02/17 14:46, Waiman Long wrote:
> On 02/14/2017 04:39 AM, Peter Zijlstra wrote:
>> On Mon, Feb 13, 2017 at 05:34:01PM -0500, Waiman Long wrote:
>>> It is the address of &steal_time that will exceed the 32-bit limit.
>> That seems extremely unlikely. That would mean we have more than 4G
>> worth of per-cpu variables declared in the kernel.
> I have some doubt about whether the compiler is able to properly use
> RIP-relative addressing for this. Anyway, it seems like constraints
> aren't allowed for asm() when not in a function context, at least for
> the compiler that I am using (4.8.5). So it is a moot point.

You can work around the issue of not having parameters in a plain asm()
statement by using an asm-offset, stringizing it, and having C put the
string fragments back together.

"cmpb $0, " STR(STEAL_TIME_preempted) "(%rax);"

~Andrew

^ permalink raw reply	[flat|nested] 69+ messages in thread

* [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function
@ 2017-02-10 15:43 Waiman Long
  0 siblings, 0 replies; 69+ messages in thread
From: Waiman Long @ 2017-02-10 15:43 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Chris Wright, Alok Kataria, Rusty Russell,
	Peter Zijlstra, Ingo Molnar, Thomas Gleixner, H. Peter Anvin
  Cc: linux-arch, Juergen Gross, kvm, Radim Krčmář,
	Pan Xinhui, x86, linux-kernel, virtualization, Waiman Long,
	Paolo Bonzini, xen-devel, Boris Ostrovsky

It was found that, when running a fio sequential write test with an XFS
ramdisk on a VM running on a 2-socket x86-64 system, the %CPU times as
reported by perf were as follows:

 69.75%  0.59%  fio  [k] down_write
 69.15%  0.01%  fio  [k] call_rwsem_down_write_failed
 67.12%  1.12%  fio  [k] rwsem_down_write_failed
 63.48% 52.77%  fio  [k] osq_lock
  9.46%  7.88%  fio  [k] __raw_callee_save___kvm_vcpu_is_preempt
  3.93%  3.93%  fio  [k] __kvm_vcpu_is_preempted

Making vcpu_is_preempted() a callee-save function has a relatively
high cost on x86-64 primarily due to at least one more cacheline of
data access from the saving and restoring of registers (8 of them)
to and from the stack, as well as one more level of function call. As
vcpu_is_preempted() is called within the spinlock, mutex and rwsem
slowpaths, there isn't much to gain by making it callee-save. So it
is now changed to a normal function call instead.
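
For reference, the callee-save thunk being dropped looks roughly like the
following on x86-64 (a sketch of what PV_CALLEE_SAVE_REGS_THUNK() emits,
with the frame-pointer setup omitted, not the verbatim macro output):

__raw_callee_save___kvm_vcpu_is_preempted:
	push %rcx; push %rdx; push %rsi; push %rdi
	push %r8;  push %r9;  push %r10; push %r11
	call __kvm_vcpu_is_preempted
	pop  %r11; pop  %r10; pop  %r9;  pop  %r8
	pop  %rdi; pop  %rsi; pop  %rdx; pop  %rcx
	ret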

With this patch applied on both bare metal and a KVM guest on a 2-socket
16-core 32-thread system with 16 parallel jobs (8 on each socket), the
aggregate bandwidths of the fio test on an XFS ramdisk were as follows:

                       Bare Metal                KVM Guest
   I/O Type      w/o patch    with patch   w/o patch    with patch
   --------      ---------    ----------   ---------    ----------
   random read   8650.5 MB/s  8560.9 MB/s  7602.9 MB/s  8196.1 MB/s  
   seq read      9104.8 MB/s  9397.2 MB/s  8293.7 MB/s  8566.9 MB/s
   random write  1623.8 MB/s  1626.7 MB/s  1590.6 MB/s  1700.7 MB/s
   seq write     1626.4 MB/s  1624.9 MB/s  1604.8 MB/s  1726.3 MB/s

The perf data (on KVM guest) now became:

 70.78%  0.58%  fio  [k] down_write
 70.20%  0.01%  fio  [k] call_rwsem_down_write_failed
 69.70%  1.17%  fio  [k] rwsem_down_write_failed
 59.91% 55.42%  fio  [k] osq_lock
 10.14% 10.14%  fio  [k] __kvm_vcpu_is_preempted

On bare metal, the patch doesn't introduce any performance
regression. On the KVM guest, it produces a noticeable performance
improvement (up to 7%).

Signed-off-by: Waiman Long <longman@redhat.com>
---
 v1->v2:
  - Rerun the fio test on a different system on both bare-metal and a
    KVM guest. Both sockets were utilized in this test.
  - The commit log was updated with new performance numbers, but the
    patch wasn't changed.
  - Drop patch 2.

 arch/x86/include/asm/paravirt.h       | 2 +-
 arch/x86/include/asm/paravirt_types.h | 2 +-
 arch/x86/kernel/kvm.c                 | 7 ++-----
 arch/x86/kernel/paravirt-spinlocks.c  | 6 ++----
 arch/x86/xen/spinlock.c               | 4 +---
 5 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 864f57b..2515885 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -676,7 +676,7 @@ static __always_inline void pv_kick(int cpu)
 
 static __always_inline bool pv_vcpu_is_preempted(int cpu)
 {
-	return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
+	return PVOP_CALL1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
 }
 
 #endif /* SMP && PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index bb2de45..88dc852 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -309,7 +309,7 @@ struct pv_lock_ops {
 	void (*wait)(u8 *ptr, u8 val);
 	void (*kick)(int cpu);
 
-	struct paravirt_callee_save vcpu_is_preempted;
+	bool (*vcpu_is_preempted)(int cpu);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 099fcba..eb3753d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -595,7 +595,6 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
 
 	return !!src->preempted;
 }
-PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
@@ -614,10 +613,8 @@ void __init kvm_spinlock_init(void)
 	pv_lock_ops.wait = kvm_wait;
 	pv_lock_ops.kick = kvm_kick_cpu;
 
-	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
-		pv_lock_ops.vcpu_is_preempted =
-			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
-	}
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+		pv_lock_ops.vcpu_is_preempted = __kvm_vcpu_is_preempted;
 }
 
 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 6259327..da050bc 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -24,12 +24,10 @@ __visible bool __native_vcpu_is_preempted(int cpu)
 {
 	return false;
 }
-PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
 
 bool pv_is_native_vcpu_is_preempted(void)
 {
-	return pv_lock_ops.vcpu_is_preempted.func ==
-		__raw_callee_save___native_vcpu_is_preempted;
+	return pv_lock_ops.vcpu_is_preempted == __native_vcpu_is_preempted;
 }
 
 struct pv_lock_ops pv_lock_ops = {
@@ -38,7 +36,7 @@ struct pv_lock_ops pv_lock_ops = {
 	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
 	.wait = paravirt_nop,
 	.kick = paravirt_nop,
-	.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+	.vcpu_is_preempted = __native_vcpu_is_preempted,
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 25a7c43..c85bb8f 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,8 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
 	per_cpu(irq_name, cpu) = NULL;
 }
 
-PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
-
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -138,7 +136,7 @@ void __init xen_init_spinlocks(void)
 	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 	pv_lock_ops.wait = xen_qlock_wait;
 	pv_lock_ops.kick = xen_qlock_kick;
-	pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
+	pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
 }
 
 static __init int xen_parse_nopvspin(char *arg)
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 69+ messages in thread

end of thread, other threads:[~2017-02-14 16:19 UTC | newest]

Thread overview: 69+ messages
2017-02-10 15:43 [PATCH v2] x86/paravirt: Don't make vcpu_is_preempted() a callee-save function Waiman Long
2017-02-10 15:43 ` Waiman Long
2017-02-10 16:19 ` Peter Zijlstra
2017-02-10 16:19 ` Peter Zijlstra
2017-02-10 16:19   ` Peter Zijlstra
2017-02-10 16:35   ` Waiman Long
2017-02-10 16:35     ` Waiman Long
2017-02-10 17:00     ` Waiman Long
2017-02-10 17:00     ` Waiman Long
2017-02-10 17:00       ` Waiman Long
2017-02-13 10:47       ` Peter Zijlstra
2017-02-13 10:47       ` Peter Zijlstra
2017-02-13 10:47         ` Peter Zijlstra
2017-02-13 10:53         ` Peter Zijlstra
2017-02-13 10:53         ` Peter Zijlstra
2017-02-13 10:53           ` Peter Zijlstra
2017-02-13 19:42           ` Waiman Long
2017-02-13 19:42           ` Waiman Long
2017-02-13 19:42           ` Waiman Long
2017-02-13 20:12             ` Waiman Long
2017-02-13 20:12               ` Waiman Long
2017-02-13 21:52               ` Peter Zijlstra
2017-02-13 21:52               ` Peter Zijlstra
2017-02-13 21:52                 ` Peter Zijlstra
2017-02-13 22:00                 ` hpa
2017-02-13 22:00                 ` hpa
2017-02-13 22:00                   ` hpa
2017-02-13 22:07                 ` hpa
2017-02-13 22:07                   ` hpa
2017-02-13 22:07                 ` hpa
2017-02-13 22:34                 ` Waiman Long
2017-02-13 22:34                 ` Waiman Long
2017-02-13 22:34                   ` Waiman Long
2017-02-13 22:36                   ` hpa
2017-02-13 22:36                     ` hpa
2017-02-13 22:36                   ` hpa
2017-02-14  9:39                   ` Peter Zijlstra
2017-02-14  9:39                   ` Peter Zijlstra
2017-02-14  9:39                     ` Peter Zijlstra
2017-02-14 14:46                     ` Waiman Long
2017-02-14 14:46                     ` Waiman Long
2017-02-14 14:46                       ` Waiman Long
2017-02-14 16:03                       ` Peter Zijlstra
2017-02-14 16:03                       ` Peter Zijlstra
2017-02-14 16:03                         ` Peter Zijlstra
2017-02-14 16:18                       ` [Xen-devel] " Andrew Cooper
2017-02-14 16:18                       ` Andrew Cooper
2017-02-14 16:18                       ` [Xen-devel] " Andrew Cooper
2017-02-13 20:12             ` Waiman Long
2017-02-13 20:06           ` hpa
2017-02-13 20:06             ` hpa
2017-02-13 21:57             ` Peter Zijlstra
2017-02-13 21:57               ` Peter Zijlstra
2017-02-13 21:57             ` Peter Zijlstra
2017-02-13 22:24             ` Waiman Long
2017-02-13 22:24               ` Waiman Long
2017-02-13 22:31               ` Peter Zijlstra
2017-02-13 22:31               ` Peter Zijlstra
2017-02-13 22:31                 ` Peter Zijlstra
2017-02-13 22:24             ` Waiman Long
2017-02-13 20:06           ` hpa
2017-02-13 19:41         ` Waiman Long
2017-02-13 19:41         ` Waiman Long
2017-02-13 19:41         ` Waiman Long
2017-02-10 16:35   ` Waiman Long
2017-02-10 16:22 ` Paolo Bonzini
2017-02-10 16:22 ` Paolo Bonzini
2017-02-10 16:22   ` Paolo Bonzini
  -- strict thread matches above, loose matches on Subject: below --
2017-02-10 15:43 Waiman Long
