Re: [Patch 2/4] KVM:SVM: Introduce set_spte_notify support

From: Eric van Tassell <evantass@amd.com>
To: Sean Christopherson <sean.j.christopherson@intel.com>,
	eric van tassell <Eric.VanTassell@amd.com>,
	Tom Lendacky <thomas.lendacky@amd.com>,
	"Singh, Brijesh" <Brijesh.Singh@amd.com>,
	"Grimm, Jon" <Jon.Grimm@amd.com>
Cc: kvm@vger.kernel.org, bp@alien8.de, hpa@zytor.com,
	mingo@redhat.com, jmattson@google.com, joro@8bytes.org,
	pbonzini@redhat.com, tglx@linutronix.de, vkuznets@redhat.com,
	wanpengli@tencent.com, x86@kernel.org
Subject: Re: [Patch 2/4] KVM:SVM: Introduce set_spte_notify support
Date: Sun, 2 Aug 2020 15:53:54 -0500	[thread overview]
Message-ID: <3dbf468e-2573-be5b-9160-9bb51d56882c@amd.com> (raw)
In-Reply-To: <20200731202502.GG31451@linux.intel.com>


On 7/31/20 3:25 PM, Sean Christopherson wrote:
> On Fri, Jul 24, 2020 at 06:54:46PM -0500, eric van tassell wrote:
>> Improve SEV guest startup time from O(n) to a constant by deferring
>> guest page pinning until the pages are used to satisfy nested page faults.
>>
>> Implement the code to do the pinning (sev_get_page) and the notifier
>> sev_set_spte_notify().
>>
>> Track the pinned pages with xarray so they can be released during guest
>> termination.
> 
> I like that SEV is trying to be a better citizen, but this is trading one
> hack for another.
> 
>    - KVM goes through a lot of effort to ensure page faults don't need to
>      allocate memory, and this throws all that effort out the window.
> 
can you elaborate on that?
>    - Tracking all gfns in a separate database (from the MMU) is wasteful.
> 
>    - Having to wait to free pinned memory until the VM is destroyed is less
>      than ideal.
> 
> More thoughts in the next patch.
> 
>> Signed-off-by: eric van tassell <Eric.VanTassell@amd.com>
>> ---
>>   arch/x86/kvm/svm/sev.c | 71 ++++++++++++++++++++++++++++++++++++++++++
>>   arch/x86/kvm/svm/svm.c |  2 ++
>>   arch/x86/kvm/svm/svm.h |  3 ++
>>   3 files changed, 76 insertions(+)
>>
>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>> index f7f1f4ecf08e..040ae4aa7c5a 100644
>> --- a/arch/x86/kvm/svm/sev.c
>> +++ b/arch/x86/kvm/svm/sev.c
>> @@ -184,6 +184,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>   	sev->asid = asid;
>>   	INIT_LIST_HEAD(&sev->regions_list);
>>   
>> +	xa_init(&sev->pages_xarray);
>> +
>>   	return 0;
>>   
>>   e_free:
>> @@ -415,6 +417,42 @@ static unsigned long get_num_contig_pages(unsigned long idx,
>>   	return pages;
>>   }
>>   
>> +static int sev_get_page(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn)
>> +{
>> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> +	struct xarray *xa = &sev->pages_xarray;
>> +	struct page *page = pfn_to_page(pfn);
>> +	int ret;
>> +
>> +	/* store page at index = gfn */
>> +	ret = xa_insert(xa, gfn, page, GFP_ATOMIC);
>> +	if (ret == -EBUSY) {
>> +		/*
>> +		 * If xa_insert returned -EBUSY, the  gfn was already associated
>> +		 * with a struct page *.
>> +		 */
>> +		struct page *cur_page;
>> +
>> +		cur_page = xa_load(xa, gfn);
>> +		/* If cur_page == page, no change is needed, so return 0 */
>> +		if (cur_page == page)
>> +			return 0;
>> +
>> +		/* Release the page that was stored at index = gfn */
>> +		put_page(cur_page);
>> +
>> +		/* Return result of attempting to store page at index = gfn */
>> +		ret = xa_err(xa_store(xa, gfn, page, GFP_ATOMIC));
>> +	}
>> +
>> +	if (ret)
>> +		return ret;
>> +
>> +	get_page(page);
>> +
>> +	return 0;
>> +}
>> +
>>   static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>   {
>>   	unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
>> @@ -1085,6 +1123,8 @@ void sev_vm_destroy(struct kvm *kvm)
>>   	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>   	struct list_head *head = &sev->regions_list;
>>   	struct list_head *pos, *q;
>> +	XA_STATE(xas, &sev->pages_xarray, 0);
>> +	struct page *xa_page;
>>   
>>   	if (!sev_guest(kvm))
>>   		return;
>> @@ -1109,6 +1149,12 @@ void sev_vm_destroy(struct kvm *kvm)
>>   		}
>>   	}
>>   
>> +	/* Release each pinned page that SEV tracked in sev->pages_xarray. */
>> +	xas_for_each(&xas, xa_page, ULONG_MAX) {
>> +		put_page(xa_page);
>> +	}
>> +	xa_destroy(&sev->pages_xarray);
>> +
>>   	mutex_unlock(&kvm->lock);
>>   
>>   	sev_unbind_asid(kvm, sev->handle);
>> @@ -1193,3 +1239,28 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
>>   	svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
>>   	vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
>>   }
>> +
>> +int sev_set_spte_notify(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
>> +			int level, bool mmio, u64 *spte)
>> +{
>> +	int rc;
>> +
>> +	if (!sev_guest(vcpu->kvm))
>> +		return 0;
>> +
>> +	/* MMIO page contains the unencrypted data, no need to lock this page */
>> +	if (mmio)
> 
> Rather than make this a generic set_spte() notify hook, I think it makes
> more sense to specifying have it be a "pin_spte" style hook.  That way the
> caller can skip mmio PFNs as well as flows that can't possibly be relevant
> to SEV, e.g. the sync_page() flow.
Not sure i understand. We do ignore mmio here. Can you detail a bit more 
what you see as problematic with the sync_page() flow?
> 
>> +		return 0;
>> +
>> +	rc = sev_get_page(vcpu->kvm, gfn, pfn);
>> +	if (rc)
>> +		return rc;
>> +
>> +	/*
>> +	 * Flush any cached lines of the page being added since "ownership" of
>> +	 * it will be transferred from the host to an encrypted guest.
>> +	 */
>> +	clflush_cache_range(__va(pfn << PAGE_SHIFT), page_level_size(level));
>> +
>> +	return 0;
>> +}
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 535ad311ad02..9b304c761a99 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -4130,6 +4130,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>>   	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
>>   
>>   	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
>> +
>> +	.set_spte_notify = sev_set_spte_notify,
>>   };
>>   
>>   static struct kvm_x86_init_ops svm_init_ops __initdata = {
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index 121b198b51e9..8a5c01516c89 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -65,6 +65,7 @@ struct kvm_sev_info {
>>   	int fd;			/* SEV device fd */
>>   	unsigned long pages_locked; /* Number of pages locked */
>>   	struct list_head regions_list;  /* List of registered regions */
>> +	struct xarray pages_xarray; /* List of PFN locked */
>>   };
>>   
>>   struct kvm_svm {
>> @@ -488,5 +489,7 @@ int svm_unregister_enc_region(struct kvm *kvm,
>>   void pre_sev_run(struct vcpu_svm *svm, int cpu);
>>   int __init sev_hardware_setup(void);
>>   void sev_hardware_teardown(void);
>> +int sev_set_spte_notify(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
>> +			int level, bool mmio, u64 *spte);
>>   
>>   #endif
>> -- 
>> 2.17.1
>>