Re: [PATCH 6/7] x86/hyper-v: use hypercall for remove TLB flush

From: Vitaly Kuznetsov <vkuznets@redhat.com>
To: KY Srinivasan <kys@microsoft.com>
Cc: "devel\@linuxdriverproject.org" <devel@linuxdriverproject.org>,
	"x86\@kernel.org" <x86@kernel.org>,
	"linux-kernel\@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"Haiyang Zhang" <haiyangz@microsoft.com>,
	Stephen Hemminger <sthemmin@microsoft.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, "H. Peter Anvin" <hpa@zytor.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	"Jork Loeser" <Jork.Loeser@microsoft.com>
Subject: Re: [PATCH 6/7] x86/hyper-v: use hypercall for remove TLB flush
Date: Mon, 10 Apr 2017 16:44:03 +0200	[thread overview]
Message-ID: <87r310qq4s.fsf@vitty.brq.redhat.com> (raw)
In-Reply-To: <BN6PR03MB24818610B89DBF4B91099892A00F0@BN6PR03MB2481.namprd03.prod.outlook.com> (KY Srinivasan's message of "Sat, 8 Apr 2017 16:47:27 +0000")

KY Srinivasan <kys@microsoft.com> writes:

>> -----Original Message-----
>> From: Vitaly Kuznetsov [mailto:vkuznets@redhat.com]
>> Sent: Friday, April 7, 2017 4:27 AM
>> To: devel@linuxdriverproject.org; x86@kernel.org
>> Cc: linux-kernel@vger.kernel.org; KY Srinivasan <kys@microsoft.com>;
>> Haiyang Zhang <haiyangz@microsoft.com>; Stephen Hemminger
>> <sthemmin@microsoft.com>; Thomas Gleixner <tglx@linutronix.de>; Ingo
>> Molnar <mingo@redhat.com>; H. Peter Anvin <hpa@zytor.com>; Steven
>> Rostedt <rostedt@goodmis.org>; Jork Loeser <Jork.Loeser@microsoft.com>
>> Subject: [PATCH 6/7] x86/hyper-v: use hypercall for remove TLB flush
>> 
>> Hyper-V host can suggest us to use hypercall for doing remote TLB flush,
>> this is supposed to work faster than IPIs.
>> 
>> Implementation details: to do HvFlushVirtualAddress{Space,List} hypercalls
>> we need to put the input somewhere in memory and we don't really want to
>> have memory allocation on each call so we pre-allocate per cpu memory
>> areas
>> on boot. These areas are of fixes size, limit them with an arbitrary number
>> of 16 (16 gvas are able to specify 16 * 4096 pages).
>> 
>> pv_ops patching is happening very early so we need to separate
>> hyperv_setup_mmu_ops() and hyper_alloc_mmu().
>> 
>> It is possible and easy to implement local TLB flushing too and there is
>> even a hint for that. However, I don't see a room for optimization on the
>> host side as both hypercall and native tlb flush will result in vmexit. The
>> hint is also not set on modern Hyper-V versions.
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> ---
>>  arch/x86/hyperv/Makefile           |   2 +-
>>  arch/x86/hyperv/hv_init.c          |   2 +
>>  arch/x86/hyperv/mmu.c              | 128
>> +++++++++++++++++++++++++++++++++++++
>>  arch/x86/include/asm/mshyperv.h    |   2 +
>>  arch/x86/include/uapi/asm/hyperv.h |   7 ++
>>  arch/x86/kernel/cpu/mshyperv.c     |   1 +
>>  6 files changed, 141 insertions(+), 1 deletion(-)
>>  create mode 100644 arch/x86/hyperv/mmu.c
>> 
>> diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
>> index 171ae09..367a820 100644
>> --- a/arch/x86/hyperv/Makefile
>> +++ b/arch/x86/hyperv/Makefile
>> @@ -1 +1 @@
>> -obj-y		:= hv_init.o
>> +obj-y		:= hv_init.o mmu.o
>> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
>> index 1c14088..2cf8a98 100644
>> --- a/arch/x86/hyperv/hv_init.c
>> +++ b/arch/x86/hyperv/hv_init.c
>> @@ -163,6 +163,8 @@ void hyperv_init(void)
>>  	hypercall_msr.guest_physical_address =
>> vmalloc_to_pfn(hv_hypercall_pg);
>>  	wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
>> 
>> +	hyper_alloc_mmu();
>> +
>>  	/*
>>  	 * Register Hyper-V specific clocksource.
>>  	 */
>> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
>> new file mode 100644
>> index 0000000..fb487cb
>> --- /dev/null
>> +++ b/arch/x86/hyperv/mmu.c
>> @@ -0,0 +1,128 @@
>> +#include <linux/types.h>
>> +#include <linux/hyperv.h>
>> +#include <linux/slab.h>
>> +#include <asm/mshyperv.h>
>> +#include <asm/tlbflush.h>
>> +#include <asm/msr.h>
>> +#include <asm/fpu/api.h>
>> +
>> +/*
>> + * Arbitrary number; we need to pre-allocate per-cpu struct for doing TLB
>> + * flush hypercalls and we need to pick a size. '16' means we'll be able
>> + * to flush 16 * 4096 pages (256MB) with one hypercall.
>> + */
>> +#define HV_MMU_MAX_GVAS 16
>
> Did you experiment with different sizes here.

Actually, I was never able to see kernel trying to flush more than 4096
pages so we can get away with HV_MMU_MAX_GVAS=1. I went through the code
and didn't see any 'limit' for the number of pages we can ask to flush
so it can be a coincidence. Each addition gva_list item requires 8 bytes
only so I put and arbitrary '16' here.

>> +
>> +/* HvFlushVirtualAddressSpace*, HvFlushVirtualAddressList hypercalls */
>> +struct hv_flush_pcpu {
>> +	struct {
>> +		__u64 address_space;
>> +		__u64 flags;
>> +		__u64 processor_mask;
>> +		__u64 gva_list[HV_MMU_MAX_GVAS];
>> +	} flush;
>> +
>> +	spinlock_t lock;
>> +};
>> +
> We may be supporting more than 64 CPUs in this hypercall. I am going to inquire with
> the Windows folks and get back to you.

Thanks! It is even specified in the specification:
"Future versions of the hypervisor may support more than 64 virtual processors per partition. In that
case, a new field will be added to the flags value that allows the caller to define the “processor bank” to
which the processor mask applies."

We, however, need to know where to put this in flags.

>
>> +static struct hv_flush_pcpu __percpu *pcpu_flush;
>> +
>> +static void hyperv_flush_tlb_others(const struct cpumask *cpus,
>> +				    struct mm_struct *mm, unsigned long
>> start,
>> +				    unsigned long end)
>> +{
>> +	struct hv_flush_pcpu *flush;
>> +	unsigned long cur, flags;
>> +	u64 status = -1ULL;
>> +	int cpu, vcpu, gva_n;
>> +
>> +	if (!pcpu_flush || !hv_hypercall_pg)
>> +		goto do_native;
>> +
>> +	if (cpumask_empty(cpus))
>> +		return;
>> +
>> +	flush = this_cpu_ptr(pcpu_flush);
>> +	spin_lock_irqsave(&flush->lock, flags);
>> +
>> +	flush->flush.address_space = virt_to_phys(mm->pgd);
>> +	flush->flush.processor_mask = 0;
>> +	if (cpumask_equal(cpus, cpu_present_mask)) {
>> +		flush->flush.flags = HV_FLUSH_ALL_PROCESSORS;
>> +	} else {
>> +		flush->flush.flags = 0;
>> +		for_each_cpu(cpu, cpus) {
>> +			vcpu = vmbus_cpu_number_to_vp_number(cpu);
>> +			if (vcpu != -1 && vcpu < 64)
>> +				flush->flush.processor_mask |= 1 << vcpu;
>> +			else
>> +				goto unlock_do_native;
>> +		}
>> +	}
>> +
>> +	if (end == TLB_FLUSH_ALL) {
>> +		flush->flush.flags =
>> HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
>> +		status =
>> hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
>> +					 &flush->flush, NULL);
>> +	} else {
>> +		cur = start;
>> +more_gvas:
>> +		gva_n = 0;
>> +
>> +		do {
>> +			flush->flush.gva_list[gva_n] = cur & PAGE_MASK;
>> +			/*
>> +			 * Lower 12 bits encode the number of additional
>> +			 * pages to flush (in addition to the 'cur' page).
>> +			 */
>> +			if (end >= cur + PAGE_SIZE * PAGE_SIZE)
>> +				flush->flush.gva_list[gva_n] |=
>> ~PAGE_MASK;
>> +			else if (end > cur)
>> +				flush->flush.gva_list[gva_n] |=
>> +					(end - cur - 1) >> PAGE_SHIFT;
>> +
>> +			cur += PAGE_SIZE * PAGE_SIZE;
>> +			++gva_n;
>> +
>> +		} while (cur < end && gva_n < HV_MMU_MAX_GVAS);
>> +
>> +		status =
>> hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
>> +					     gva_n, &flush->flush, NULL);
>> +
>> +		if (!(status & 0xffff) && cur < end)
>> +			goto more_gvas;
>> +	}
>> +
>> +unlock_do_native:
>> +	spin_unlock_irqrestore(&flush->lock, flags);
>> +
>> +	if (!(status & 0xffff))
>> +		return;
>> +do_native:
>> +	native_flush_tlb_others(cpus, mm, start, end);
>> +}
>> +
>> +void hyperv_setup_mmu_ops(void)
>> +{
>> +	if (ms_hyperv.hints &
>> HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
>> +		pr_info("Hyper-V: Using hypercall for remote TLB flush\n");
>> +		pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
>> +	}
>> +}
>> +
>> +void hyper_alloc_mmu(void)
>> +{
>> +	int cpu;
>> +	struct hv_flush_pcpu *flush;
>> +
>> +	if (ms_hyperv.hints &
>> HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
>> +		pcpu_flush = alloc_percpu(struct hv_flush_pcpu);
>> +		if (!pcpu_flush)
>> +			return;
>> +
>> +		for_each_possible_cpu(cpu) {
>> +			flush = per_cpu_ptr(pcpu_flush, cpu);
>> +			spin_lock_init(&flush->lock);
>> +		}
>> +	}
>> +}
>> diff --git a/arch/x86/include/asm/mshyperv.h
>> b/arch/x86/include/asm/mshyperv.h
>> index 1293c84..a5041c3 100644
>> --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -301,6 +301,8 @@ static inline int
>> vmbus_cpu_number_to_vp_number(int cpu_number)
>>  }
>> 
>>  void hyperv_init(void);
>> +void hyperv_setup_mmu_ops(void);
>> +void hyper_alloc_mmu(void);
>>  void hyperv_report_panic(struct pt_regs *regs);
>>  bool hv_is_hypercall_page_setup(void);
>>  void hyperv_cleanup(void);
>> diff --git a/arch/x86/include/uapi/asm/hyperv.h
>> b/arch/x86/include/uapi/asm/hyperv.h
>> index c87e900..3d44036 100644
>> --- a/arch/x86/include/uapi/asm/hyperv.h
>> +++ b/arch/x86/include/uapi/asm/hyperv.h
>> @@ -239,6 +239,8 @@
>>  		(~((1ull <<
>> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
>> 
>>  /* Declare the various hypercall operations. */
>> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE	0x0002
>> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST	0x0003
>>  #define HVCALL_NOTIFY_LONG_SPIN_WAIT		0x0008
>>  #define HVCALL_POST_MESSAGE			0x005c
>>  #define HVCALL_SIGNAL_EVENT			0x005d
>> @@ -256,6 +258,11 @@
>>  #define HV_PROCESSOR_POWER_STATE_C2		2
>>  #define HV_PROCESSOR_POWER_STATE_C3		3
>> 
>> +#define HV_FLUSH_ALL_PROCESSORS			0x00000001
>> +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES	0x00000002
>> +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY	0x00000004
>> +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT	0x00000008
>> +
>>  /* Hypercall interface */
>>  union hv_hypercall_input {
>>  	u64 as_uint64;
>> diff --git a/arch/x86/kernel/cpu/mshyperv.c
>> b/arch/x86/kernel/cpu/mshyperv.c
>> index 04cb8d3..fc228d8 100644
>> --- a/arch/x86/kernel/cpu/mshyperv.c
>> +++ b/arch/x86/kernel/cpu/mshyperv.c
>> @@ -233,6 +233,7 @@ static void __init ms_hyperv_init_platform(void)
>>  	 * Setup the hook to get control post apic initialization.
>>  	 */
>>  	x86_platform.apic_post_init = hyperv_init;
>> +	hyperv_setup_mmu_ops();
>>  #endif
>>  }
>> 
>> --
>> 2.9.3

-- 
  Vitaly