* [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
@ 2018-02-02 14:59 David Woodhouse
  2018-02-02 15:43 ` Sironi, Filippo
  2018-02-02 18:50 ` Linus Torvalds
  0 siblings, 2 replies; 12+ messages in thread
From: David Woodhouse @ 2018-02-02 14:59 UTC (permalink / raw)
  To: tglx, karahmed, sironi, x86, kvm, torvalds, pbonzini,
	linux-kernel, bp, peterz

With retpoline, tight loops of "call this function for every XXX" are
very much pessimised by taking a prediction miss *every* time.

This one showed up very high in our early testing, and it only has five
things it'll ever call so make it take an 'op' enum instead of a
function pointer and let's see how that works out...

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
Not sure I like this. Better suggestions welcomed...

In the general case, we have a few things we can do with the calls that
retpoline turns into bottlenecks. This is one of them.

Another option, if there are just one or two "likely" functions, is
something along the lines of

 if (func == likelyfunc)
    likelyfunc();
 else
    (*func)(); // GCC does retpoline for this

For things like kvm_x86_ops we really could just turn *all* of those
into direct calls at runtime, like pvops does.

There are some which land somewhere in the middle, like the default
dma_ops. We probably want something like the 'likelyfunc' version
above, except that we *also* want to flip the likelyfunc between the
Intel and AMD IOMMU ops functions, at early boot. I'll see what I can
come up with...
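
To make the 'likelyfunc' pattern above a bit more concrete, here is a minimal
standalone sketch; every name in it (dummy_dev, intel_map, amd_map, do_map) is
invented for illustration, and the boot-time flip of the "likely" target via
alternatives is deliberately not shown:

#include <stdio.h>

struct dummy_dev { int id; };

/* Two candidate implementations, standing in for e.g. Intel vs. AMD IOMMU ops. */
static int intel_map(struct dummy_dev *dev) { return dev->id + 1; }
static int amd_map(struct dummy_dev *dev)   { return dev->id + 2; }

/* The installed op; usually one of the two above, but callers can't assume it. */
static int (*dev_map_op)(struct dummy_dev *dev) = intel_map;

/*
 * The "likelyfunc" pattern: compare against the expected targets so the
 * common cases become direct calls, and only an unexpected target pays
 * for the retpoline-protected indirect call.
 */
static int do_map(struct dummy_dev *dev)
{
	if (dev_map_op == intel_map)
		return intel_map(dev);		/* direct call */
	if (dev_map_op == amd_map)
		return amd_map(dev);		/* direct call */
	return dev_map_op(dev);			/* indirect call -> retpoline */
}

int main(void)
{
	struct dummy_dev d = { .id = 41 };

	printf("mapped: %d\n", do_map(&d));
	return 0;
}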

 arch/x86/kvm/mmu.c | 72 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2b8eb4d..44f9de7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5055,12 +5055,21 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+enum slot_handler_op {
+	SLOT_RMAP_CLEAR_DIRTY,
+	SLOT_RMAP_SET_DIRTY,
+	SLOT_RMAP_WRITE_PROTECT,
+	SLOT_ZAP_RMAPP,
+	SLOT_ZAP_COLLAPSIBLE_SPTE,
+};
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+					 struct kvm_rmap_head *rmap_head);
 
 /* The caller should hold mmu-lock before calling this function. */
 static bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			slot_level_handler fn, int start_level, int end_level,
+			enum slot_handler_op op, int start_level, int end_level,
 			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
 {
 	struct slot_rmap_walk_iterator iterator;
@@ -5068,8 +5077,29 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
 			end_gfn, &iterator) {
-		if (iterator.rmap)
-			flush |= fn(kvm, iterator.rmap);
+		if (iterator.rmap) {
+			switch (op) {
+			case SLOT_RMAP_CLEAR_DIRTY:
+				flush |= __rmap_clear_dirty(kvm, iterator.rmap);
+				break;
+
+			case SLOT_RMAP_SET_DIRTY:
+				flush |= __rmap_set_dirty(kvm, iterator.rmap);
+				break;
+
+			case SLOT_RMAP_WRITE_PROTECT:
+				flush |= __rmap_write_protect(kvm, iterator.rmap, false);
+				break;
+
+			case SLOT_ZAP_RMAPP:
+				flush |= kvm_zap_rmapp(kvm, iterator.rmap);
+				break;
+
+			case SLOT_ZAP_COLLAPSIBLE_SPTE:
+				flush |= kvm_mmu_zap_collapsible_spte(kvm, iterator.rmap);
+				break;
+			}
+		}
 
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 			if (flush && lock_flush_tlb) {
@@ -5090,10 +5120,10 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 static bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		  slot_level_handler fn, int start_level, int end_level,
+		  enum slot_handler_op op, int start_level, int end_level,
 		  bool lock_flush_tlb)
 {
-	return slot_handle_level_range(kvm, memslot, fn, start_level,
+	return slot_handle_level_range(kvm, memslot, op, start_level,
 			end_level, memslot->base_gfn,
 			memslot->base_gfn + memslot->npages - 1,
 			lock_flush_tlb);
@@ -5101,25 +5131,25 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 static bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		      slot_level_handler fn, bool lock_flush_tlb)
+		      enum slot_handler_op op, bool lock_flush_tlb)
 {
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+	return slot_handle_level(kvm, memslot, op, PT_PAGE_TABLE_LEVEL,
 				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			slot_level_handler fn, bool lock_flush_tlb)
+			enum slot_handler_op op, bool lock_flush_tlb)
 {
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+	return slot_handle_level(kvm, memslot, op, PT_PAGE_TABLE_LEVEL + 1,
 				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		 slot_level_handler fn, bool lock_flush_tlb)
+		 enum slot_handler_op op, bool lock_flush_tlb)
 {
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+	return slot_handle_level(kvm, memslot, op, PT_PAGE_TABLE_LEVEL,
 				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
 }
 
@@ -5140,7 +5170,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 			if (start >= end)
 				continue;
 
-			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+			slot_handle_level_range(kvm, memslot, SLOT_ZAP_RMAPP,
 						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
 						start, end - 1, true);
 		}
@@ -5149,19 +5179,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static bool slot_rmap_write_protect(struct kvm *kvm,
-				    struct kvm_rmap_head *rmap_head)
-{
-	return __rmap_write_protect(kvm, rmap_head, false);
-}
-
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot)
 {
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+	flush = slot_handle_all_level(kvm, memslot, SLOT_RMAP_WRITE_PROTECT,
 				      false);
 	spin_unlock(&kvm->mmu_lock);
 
@@ -5226,7 +5250,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
 	spin_lock(&kvm->mmu_lock);
 	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-			 kvm_mmu_zap_collapsible_spte, true);
+			 SLOT_ZAP_COLLAPSIBLE_SPTE, true);
 	spin_unlock(&kvm->mmu_lock);
 }
 
@@ -5236,7 +5260,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+	flush = slot_handle_leaf(kvm, memslot, SLOT_RMAP_CLEAR_DIRTY, false);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
@@ -5258,7 +5282,7 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+	flush = slot_handle_large_level(kvm, memslot, SLOT_RMAP_WRITE_PROTECT,
 					false);
 	spin_unlock(&kvm->mmu_lock);
 
@@ -5276,7 +5300,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+	flush = slot_handle_all_level(kvm, memslot, SLOT_RMAP_SET_DIRTY, false);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
-- 
2.7.4


* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 14:59 [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range() David Woodhouse
@ 2018-02-02 15:43 ` Sironi, Filippo
  2018-02-02 21:10   ` Paolo Bonzini
  2018-02-02 18:50 ` Linus Torvalds
  1 sibling, 1 reply; 12+ messages in thread
From: Sironi, Filippo @ 2018-02-02 15:43 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: tglx, Raslan, KarimAllah, x86, KVM list, torvalds, pbonzini,
	linux-kernel, bp, peterz


> On 2. Feb 2018, at 15:59, David Woodhouse <dwmw@amazon.co.uk> wrote:
> 
> With retpoline, tight loops of "call this function for every XXX" are
> very much pessimised by taking a prediction miss *every* time.
> 
> This one showed up very high in our early testing, and it only has five
> things it'll ever call so make it take an 'op' enum instead of a
> function pointer and let's see how that works out...
> 
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> [...]

Let's add more context.

vmx_slot_disable_log_dirty() was already one of the bottlenecks on instance
launch (at least with our setup).  With retpoline, it became horribly slow
(roughly twice as slow).

Up to now, we've been using an ugly workaround that works for us but of course
isn't acceptable in the long run.  I'm going to explore the issue further
early next week.

Filippo




* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 14:59 [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range() David Woodhouse
  2018-02-02 15:43 ` Sironi, Filippo
@ 2018-02-02 18:50 ` Linus Torvalds
  2018-02-02 19:10   ` Linus Torvalds
  2018-02-05 15:15   ` Paolo Bonzini
  1 sibling, 2 replies; 12+ messages in thread
From: Linus Torvalds @ 2018-02-02 18:50 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Thomas Gleixner, KarimAllah Ahmed, sironi,
	the arch/x86 maintainers, KVM list, Paolo Bonzini,
	Linux Kernel Mailing List, Borislav Petkov, Peter Zijlstra

On Fri, Feb 2, 2018 at 6:59 AM, David Woodhouse <dwmw@amazon.co.uk> wrote:
> With retpoline, tight loops of "call this function for every XXX" are
> very much pessimised by taking a prediction miss *every* time.
>
> This one showed up very high in our early testing, and it only has five
> things it'll ever call so make it take an 'op' enum instead of a
> function pointer and let's see how that works out...

Umm. May I suggest a different workaround?

Honestly, if this is so performance-critical, the *real* fix is to
actually just mark all those "slot_handle_*()" functions as
"always_inline".

Because that will *really* improve performance, by simply removing the
indirection entirely - since then the functions involved will become
static. You might get other code improvements too, because I suspect
it will end up removing an extra level of function call due to those
trivial wrapper functions. And there's a couple of "bool
lock_flush_tlb" arguments that will simply become constant and
generate much better code that way.

And maybe you don't want to inline all of the slot_handle_*()
functions, and it's only one or two of them that matter because they
loop over a lot of entries, but honestly, most of those
slot_handle_xyz() functions seem to have just a couple of call sites
anyway.

slot_handle_large_level() is probably already inlined by the compiler
because it only has a single call-site.

Will it make for bigger code? Yes. But probably not really all *that*
much bigger, because of how it also will allow the compiler to
simplify some things.  And if this really is so critical that those
non-predicted calls were that noticeable, those other simplifications
probably also matter.

And then you get rid of all run-time conditionals, and all the
indirect jumps entirely. Plus the patch will be smaller and simpler
too.

Hmm?

              Linus


* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 18:50 ` Linus Torvalds
@ 2018-02-02 19:10   ` Linus Torvalds
  2018-02-02 19:17     ` David Woodhouse
  2018-02-05 15:15   ` Paolo Bonzini
  1 sibling, 1 reply; 12+ messages in thread
From: Linus Torvalds @ 2018-02-02 19:10 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Thomas Gleixner, KarimAllah Ahmed, sironi,
	the arch/x86 maintainers, KVM list, Paolo Bonzini,
	Linux Kernel Mailing List, Borislav Petkov, Peter Zijlstra


On Fri, Feb 2, 2018 at 10:50 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Will it make for bigger code? Yes. But probably not really all *that*
> much bigger, because of how it also will allow the compiler to
> simplify some things.

Actually, testing this with my fairly minimal config, it actually
makes for *smaller* code to inline those things.

That may be a quirk of my configuration, or maybe I screwed something
else up, but:

  [torvalds@i7 linux]$ size ~/mmu.o arch/x86/kvm/mmu.o
     text    data     bss     dec     hex filename
    85587    9310     120   95017   17329 /home/torvalds/mmu.o
    85531    9310     120   94961   172f1 arch/x86/kvm/mmu.o

so the attached patch actually shrank things down by about 50 bytes
because of the code simplification.

Of course, I have been known to screw up retpoline testing in the
past, so my numbers are suspect ;). Somebody should double-check me.

                    Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/plain, Size: 1905 bytes --]

 arch/x86/kvm/mmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2b8eb4da4d08..b9f0de6e309b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5058,7 +5058,7 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
 
 /* The caller should hold mmu-lock before calling this function. */
-static bool
+static bool __always_inline
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, int start_level, int end_level,
 			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
@@ -5088,7 +5088,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	return flush;
 }
 
-static bool
+static bool __always_inline
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		  slot_level_handler fn, int start_level, int end_level,
 		  bool lock_flush_tlb)
@@ -5099,7 +5099,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			lock_flush_tlb);
 }
 
-static bool
+static bool __always_inline
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		      slot_level_handler fn, bool lock_flush_tlb)
 {
@@ -5107,7 +5107,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
-static bool
+static bool __always_inline
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, bool lock_flush_tlb)
 {
@@ -5115,7 +5115,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
-static bool
+static bool __always_inline
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		 slot_level_handler fn, bool lock_flush_tlb)
 {


* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 19:10   ` Linus Torvalds
@ 2018-02-02 19:17     ` David Woodhouse
  2018-02-02 21:23       ` Alan Cox
  0 siblings, 1 reply; 12+ messages in thread
From: David Woodhouse @ 2018-02-02 19:17 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, KarimAllah Ahmed, sironi,
	the arch/x86 maintainers, KVM list, Paolo Bonzini,
	Linux Kernel Mailing List, Borislav Petkov, Peter Zijlstra


On Fri, 2018-02-02 at 11:10 -0800, Linus Torvalds wrote:
> On Fri, Feb 2, 2018 at 10:50 AM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > 
> > 
> > Will it make for bigger code? Yes. But probably not really all *that*
> > much bigger, because of how it also will allow the compiler to
> > simplify some things.
>
> Actually, testing this with my fairly minimal config, it actually
> makes for *smaller* code to inline those things.
> 
> That may be a quirk of my configuration, or maybe I screwed something
> else up, but:
> 
>   [torvalds@i7 linux]$ size ~/mmu.o arch/x86/kvm/mmu.o
>      text    data     bss     dec     hex filename
>     85587    9310     120   95017   17329 /home/torvalds/mmu.o
>     85531    9310     120   94961   172f1 arch/x86/kvm/mmu.o
> 
> so the attached patch actually shrank things down by about 50 bytes
> because of the code simplification.
> 
> Of course, I have been known to screw up retpoline testing in the
> past, so my numbers are suspect ;). Somebody should double-check me.

I got this:

   text    data     bss     dec     hex filename
  87167    9310     120   96597   17955 arch/x86/kvm/mmu.o
  88299    9310     120   97729   17dc1 arch/x86/kvm/mmu-inline.o

But then, I'd also done kvm_handle_hva() and kvm_handle_hva_range().

Either way, that does look like a reasonable answer. I had looked at
the various one-line wrappers around slot_handle_level_range() and
thought "hm, those should be inline", but I hadn't made the next step
and pondered putting the whole thing inline. We'll give it a spin and
work out where the next performance bottleneck is. Thanks.



* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 15:43 ` Sironi, Filippo
@ 2018-02-02 21:10   ` Paolo Bonzini
  2018-02-02 21:14     ` David Woodhouse
  0 siblings, 1 reply; 12+ messages in thread
From: Paolo Bonzini @ 2018-02-02 21:10 UTC (permalink / raw)
  To: Filippo Sironi
  Cc: David Woodhouse, tglx, KarimAllah Raslan, x86, KVM list,
	torvalds, linux-kernel, bp, peterz

> > On 2. Feb 2018, at 15:59, David Woodhouse <dwmw@amazon.co.uk> wrote:
> > With retpoline, tight loops of "call this function for every XXX" are
> > very much pessimised by taking a prediction miss *every* time.
> > 
> > This one showed up very high in our early testing, and it only has five
> > things it'll ever call so make it take an 'op' enum instead of a
> > function pointer and let's see how that works out...
> > 
> > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>

What about __always_inline instead?

Thanks,

Paolo



* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 21:10   ` Paolo Bonzini
@ 2018-02-02 21:14     ` David Woodhouse
  0 siblings, 0 replies; 12+ messages in thread
From: David Woodhouse @ 2018-02-02 21:14 UTC (permalink / raw)
  To: Paolo Bonzini, Filippo Sironi
  Cc: tglx, KarimAllah Raslan, x86, KVM list, torvalds, linux-kernel,
	bp, peterz


On Fri, 2018-02-02 at 16:10 -0500, Paolo Bonzini wrote:
> > > On 2. Feb 2018, at 15:59, David Woodhouse <dwmw@amazon.co.uk> wrote:
> > > With retpoline, tight loops of "call this function for every XXX" are
> > > very much pessimised by taking a prediction miss *every* time.
> > > 
> > > This one showed up very high in our early testing, and it only has five
> > > things it'll ever call so make it take an 'op' enum instead of a
> > > function pointer and let's see how that works out...
> > > 
> > > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> 
> What about __always_inline instead?

Yeah, Linus suggested that and it looks like it generates sane (in fact
better) code. Will do some more testing and send it out for real
probably on Monday. Thanks.



* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 19:17     ` David Woodhouse
@ 2018-02-02 21:23       ` Alan Cox
  2018-02-03 14:46         ` David Woodhouse
  0 siblings, 1 reply; 12+ messages in thread
From: Alan Cox @ 2018-02-02 21:23 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Linus Torvalds, Thomas Gleixner, KarimAllah Ahmed, sironi,
	the arch/x86 maintainers, KVM list, Paolo Bonzini,
	Linux Kernel Mailing List, Borislav Petkov, Peter Zijlstra

> Either way, that does look like a reasonable answer. I had looked at
> the various one-line wrappers around slot_handle_level_range() and
> thought "hm, those should be inline", but I hadn't made the next step
> and pondered putting the whole thing inline. We'll give it a spin and
> work out where the next performance bottleneck is. Thanks.

In addition the problem with switch() is that gcc might decide in some
cases that the best way to implement your switch is an indirect call
from a lookup table.

For the simple case how about wrapping the if into

                call_likely(foo->bar, usualfunction, args)

as a companion to 

                 foo->bar(args)

that can resolve to nothing special on architectures that don't need it,
an if/else case on platforms with spectre, and potentially clever
stuff on any platform where you can beat the compiler by knowing
probabilities it can't infer ?
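
A rough sketch of what such a wrapper might look like (the name call_likely,
its exact shape, and the CONFIG_RETPOLINE gating are illustrative guesses, not
an existing interface):

/*
 * Hypothetical helper: call through 'ptr', but turn it into a direct call
 * when it matches the expected target 'usual'.  On kernels built without
 * retpoline it collapses to the plain indirect call.
 */
#ifdef CONFIG_RETPOLINE
#define call_likely(ptr, usual, ...)					\
	((ptr) == (usual) ? (usual)(__VA_ARGS__) : (ptr)(__VA_ARGS__))
#else
#define call_likely(ptr, usual, ...)	((ptr)(__VA_ARGS__))
#endif

/* Usage, instead of foo->bar(args): */
/*	ret = call_likely(foo->bar, usualfunc, args);	*/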

Alan


* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 21:23       ` Alan Cox
@ 2018-02-03 14:46         ` David Woodhouse
  2018-02-05  8:51           ` Peter Zijlstra
  2018-02-05  8:55           ` Peter Zijlstra
  0 siblings, 2 replies; 12+ messages in thread
From: David Woodhouse @ 2018-02-03 14:46 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Thomas Gleixner, KarimAllah Ahmed, sironi,
	the arch/x86 maintainers, KVM list, Paolo Bonzini,
	Linux Kernel Mailing List, Borislav Petkov, Peter Zijlstra


On Fri, 2018-02-02 at 21:23 +0000, Alan Cox wrote:
> In addition the problem with switch() is that gcc might decide in some
> cases that the best way to implement your switch is an indirect call
> from a lookup table.

That's also true of the

  if (ptr == usualfunction)
     usualfunction();
  else
     (*ptr)();

construct. Especially if GCC doesn't take into account the increased
cost of indirect branches with retpoline.

> For the simple case how about wrapping the if into
> 
>                 call_likely(foo->bar, usualfunction, args)
> 
> as a companion to 
> 
>                  foo->bar(args)
> 
> that can resolve to nothing special on architectures that don't need it,
> an if/else case on platforms with spectre, and potentially clever
> stuff on any platform where you can beat the compiler by knowing
> probabilities it can't infer ?

Yeah. I'm keen on being able to use something like alternatives to
*change* 'usualfunction' at runtime though. I suspect it'll be a win
for stuff like dma_ops.

But I'm also keen to actually base such things on real data, not just
go randomly "optimising" stuff just because we can. Let's try to make
sure we fix up the real bottlenecks, and not just go crazy.
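
As an aside, the runtime-retargetable direct call being described here is more
or less what the kernel's later static_call() infrastructure ended up
providing (the call site itself gets patched, much like alternatives).  A
sketch of how a dma_ops-style hook might use it, with invented hook and
function names:

#include <linux/init.h>
#include <linux/static_call.h>

static int generic_map_page(void *dev, unsigned long addr) { return 0; }
static int intel_map_page(void *dev, unsigned long addr)   { return 1; }

/* Default target; call sites are emitted as direct calls to it. */
DEFINE_STATIC_CALL(my_map_page, generic_map_page);

/* Callers get a direct call, no retpoline: */
static int do_map(void *dev, unsigned long addr)
{
	return static_call(my_map_page)(dev, addr);
}

/* At early boot, patch the call sites to whichever implementation won: */
static int __init pick_iommu_impl(void)
{
	static_call_update(my_map_page, &intel_map_page);
	return 0;
}
early_initcall(pick_iommu_impl);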



* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-03 14:46         ` David Woodhouse
@ 2018-02-05  8:51           ` Peter Zijlstra
  2018-02-05  8:55           ` Peter Zijlstra
  1 sibling, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2018-02-05  8:51 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Alan Cox, Linus Torvalds, Thomas Gleixner, KarimAllah Ahmed,
	sironi, the arch/x86 maintainers, KVM list, Paolo Bonzini,
	Linux Kernel Mailing List, Borislav Petkov, Paul Turner

On Sat, Feb 03, 2018 at 02:46:47PM +0000, David Woodhouse wrote:
> > For the simple case how about wrapping the if into
> > 
> >                 call_likely(foo->bar, usualfunction, args)
> > 
> > as a companion to 
> > 
> >                  foo->bar(args)
> > 
> > that can resolve to nothing special on architectures that don't need it,
> > an if/else case on platforms with spectre, and potentially clever
> > stuff on any platform where you can beat the compiler by knowing
> > probabilities it can't infer ?
> 
> Yeah. I'm keen on being able to use something like alternatives to
> *change* 'usualfunction' at runtime though. I suspect it'll be a win
> for stuff like dma_ops.
> 
> But I'm also keen to actually base such things on real data, not just
> go randomly "optimising" stuff just because we can. Let's try to make
> sure we fix up the real bottlenecks, and not just go crazy.

Google has a fairly long history of using feedback driven optimization
compiles for the kernel. They were also the ones that developed perf
autofdo tooling IIRC.

https://gcc.gnu.org/wiki/AutoFDO/Tutorial

One of the things pjt promised was a series of patches doing the
proposed optimization for the scheduler code based on their results.


* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-03 14:46         ` David Woodhouse
  2018-02-05  8:51           ` Peter Zijlstra
@ 2018-02-05  8:55           ` Peter Zijlstra
  1 sibling, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2018-02-05  8:55 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Alan Cox, Linus Torvalds, Thomas Gleixner, KarimAllah Ahmed,
	sironi, the arch/x86 maintainers, KVM list, Paolo Bonzini,
	Linux Kernel Mailing List, Borislav Petkov

On Sat, Feb 03, 2018 at 02:46:47PM +0000, David Woodhouse wrote:
> Yeah. I'm keen on being able to use something like alternatives to
> *change* 'usualfunction' at runtime though. I suspect it'll be a win
> for stuff like dma_ops.

That shouldn't be too hard to implement.


* Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()
  2018-02-02 18:50 ` Linus Torvalds
  2018-02-02 19:10   ` Linus Torvalds
@ 2018-02-05 15:15   ` Paolo Bonzini
  1 sibling, 0 replies; 12+ messages in thread
From: Paolo Bonzini @ 2018-02-05 15:15 UTC (permalink / raw)
  To: Linus Torvalds, David Woodhouse
  Cc: Thomas Gleixner, KarimAllah Ahmed, sironi,
	the arch/x86 maintainers, KVM list, Linux Kernel Mailing List,
	Borislav Petkov, Peter Zijlstra

On 02/02/2018 19:50, Linus Torvalds wrote:
> On Fri, Feb 2, 2018 at 6:59 AM, David Woodhouse <dwmw@amazon.co.uk> wrote:
>> With retpoline, tight loops of "call this function for every XXX" are
>> very much pessimised by taking a prediction miss *every* time.
>>
>> This one showed up very high in our early testing, and it only has five
>> things it'll ever call so make it take an 'op' enum instead of a
>> function pointer and let's see how that works out...
> Umm. May I suggest a different workaround?
> 
> Honestly, if this is so performance-critical, the *real* fix is to
> actually just mark all those "slot_handle_*()" functions as
> "always_inline".

I replied quickly from the phone before reading the rest of the
thread---yeah, always_inline is the way to go.  I see the same
differences as Linus and David (slight improvement for slot_handle_*,
+1k if you add kvm_handle_hva and kvm_handle_hva_range).

At least for slot_handle_* it's a no-brainer.  The others are basically
the MMU notifier implementation; in the perfect case it should actually
never be called (or at least it ought to be very rare), so I think we
can keep the indirect calls for now.

Paolo


