All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chao Peng <chao.p.peng@linux.intel.com>
To: Sean Christopherson <seanjc@google.com>, kirill.shutemov@linux.intel.com
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org, linux-doc@vger.kernel.org,
	qemu-devel@nongnu.org, Paolo Bonzini <pbonzini@redhat.com>,
	Jonathan Corbet <corbet@lwn.net>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	x86@kernel.org, "H . Peter Anvin" <hpa@zytor.com>,
	Hugh Dickins <hughd@google.com>, Jeff Layton <jlayton@kernel.org>,
	"J . Bruce Fields" <bfields@fieldses.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mike Rapoport <rppt@kernel.org>,
	Steven Price <steven.price@arm.com>,
	"Maciej S . Szmigiero" <mail@maciej.szmigiero.name>,
	Vlastimil Babka <vbabka@suse.cz>,
	Vishal Annapurve <vannapurve@google.com>,
	Yu Zhang <yu.c.zhang@linux.intel.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	luto@kernel.org, jun.nakajima@intel.com, dave.hansen@intel.com,
	ak@linux.intel.com, david@redhat.com, aarcange@redhat.com,
	ddutile@redhat.com, dhildenb@redhat.com,
	Quentin Perret <qperret@google.com>,
	Michael Roth <michael.roth@amd.com>,
	mhocko@suse.com
Subject: Re: [PATCH v6 6/8] KVM: Handle page fault for private memory
Date: Mon, 20 Jun 2022 22:16:47 +0800	[thread overview]
Message-ID: <20220620141647.GC2016793@chaop.bj.intel.com> (raw)
In-Reply-To: <YqzyjZnflCMPo8b/@google.com>

On Fri, Jun 17, 2022 at 09:30:53PM +0000, Sean Christopherson wrote:
> On Thu, May 19, 2022, Chao Peng wrote:
> > @@ -4028,8 +4081,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
> >  	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
> >  		return true;
> >  
> > -	return fault->slot &&
> > -	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> > +	if (fault->is_private)
> > +		return mmu_notifier_retry(vcpu->kvm, mmu_seq);
> 
> Hmm, this is somewhat undesirable, because faulting in private pfns will be blocked
> by unrelated mmu_notifier updates.  The issue is mitigated to some degree by bumping
> the sequence count if and only if overlap with a memslot is detected, e.g. mapping
> changes that affects only userspace won't block the guest.
> 
> It probably won't be an issue, but at the same time it's easy to solve, and I don't
> like piggybacking mmu_notifier_seq as private mappings shouldn't be subject to the
> mmu_notifier.
> 
> That would also fix a theoretical bug in this patch where mmu_notifier_retry()
> wouldn't be defined if CONFIG_MEMFILE_NOTIFIER=y && CONFIG_MMU_NOTIFIER=n.a

Agreed, Thanks.

> 
> ---
>  arch/x86/kvm/mmu/mmu.c   | 11 ++++++-----
>  include/linux/kvm_host.h | 16 +++++++++++-----
>  virt/kvm/kvm_main.c      |  2 +-
>  3 files changed, 18 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0b455c16ec64..a4cbd29433e7 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4100,10 +4100,10 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
>  		return true;
> 
>  	if (fault->is_private)
> -		return mmu_notifier_retry(vcpu->kvm, mmu_seq);
> -	else
> -		return fault->slot &&
> -			mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> +		return memfile_notifier_retry(vcpu->kvm, mmu_seq);
> +
> +	return fault->slot &&
> +	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
>  }
> 
>  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> @@ -4127,7 +4127,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	if (r)
>  		return r;
> 
> -	mmu_seq = vcpu->kvm->mmu_notifier_seq;
> +	mmu_seq = fault->is_private ? vcpu->kvm->memfile_notifier_seq :
> +				      vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
>  	r = kvm_faultin_pfn(vcpu, fault);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 92afa5bddbc5..31f704c83099 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -773,16 +773,15 @@ struct kvm {
>  	struct hlist_head irq_ack_notifier_list;
>  #endif
> 
> -#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) ||\
> -	defined(CONFIG_MEMFILE_NOTIFIER)
> +#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER))
>  	unsigned long mmu_notifier_seq;
> -#endif
> -
> -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  	struct mmu_notifier mmu_notifier;
>  	long mmu_notifier_count;
>  	unsigned long mmu_notifier_range_start;
>  	unsigned long mmu_notifier_range_end;
> +#endif
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +	unsigned long memfile_notifier_seq;
>  #endif
>  	struct list_head devices;
>  	u64 manual_dirty_log_protect;
> @@ -1964,6 +1963,13 @@ static inline int mmu_notifier_retry_hva(struct kvm *kvm,
>  }
>  #endif
> 
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +static inline bool memfile_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> +{
> +	return kvm->memfile_notifier_seq != mmu_seq;
> +}
> +#endif
> +
>  #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> 
>  #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2b416d3bd60e..e6d34c964d51 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -898,7 +898,7 @@ static void kvm_private_mem_notifier_handler(struct memfile_notifier *notifier,
>  	KVM_MMU_LOCK(kvm);
>  	if (kvm_unmap_gfn_range(kvm, &gfn_range))
>  		kvm_flush_remote_tlbs(kvm);
> -	kvm->mmu_notifier_seq++;
> +	kvm->memfile_notifier_seq++;
>  	KVM_MMU_UNLOCK(kvm);
>  	srcu_read_unlock(&kvm->srcu, idx);
>  }
> 
> base-commit: 333ef501c7f6c6d4ef2b7678905cad0f8ef3e271
> --
> 
> > +	else
> > +		return fault->slot &&
> > +			mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> >  }
> >  
> >  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >  		read_unlock(&vcpu->kvm->mmu_lock);
> >  	else
> >  		write_unlock(&vcpu->kvm->mmu_lock);
> > -	kvm_release_pfn_clean(fault->pfn);
> > +
> > +	if (fault->is_private)
> > +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> 
> Why does the shmem path lock the page, and then unlock it here?

Initially this is to prevent race between SLPT population and
truncate/punch on the fd. Without this, a gfn may become stale before
the page is populated in SLPT. However, with memfile_notifier_retry
mechanism, this sounds not needed.

> 
> Same question for why this path marks it dirty?  The guest has the page mapped
> so the dirty flag is immediately stale.

I believe so.

> 
> In other words, why does KVM need to do something different for private pfns?

These two are inherited from Kirill's previous code. See if he has any
comment.

> 
> > +	else
> > +		kvm_release_pfn_clean(fault->pfn);
> > +
> >  	return r;
> >  }
> >  
> 
> ...
> 
> > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> > index 7f8f1c8dbed2..1d857919a947 100644
> > --- a/arch/x86/kvm/mmu/paging_tmpl.h
> > +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> > @@ -878,7 +878,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >  
> >  out_unlock:
> >  	write_unlock(&vcpu->kvm->mmu_lock);
> > -	kvm_release_pfn_clean(fault->pfn);
> > +	if (fault->is_private)
> 
> Indirect MMUs can't support private faults, i.e. this is unnecessary.

Okay.

> 
> > +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> > +	else
> > +		kvm_release_pfn_clean(fault->pfn);
> >  	return r;
> >  }
> >  
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 3fd168972ecd..b0a7910505ed 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -2241,4 +2241,26 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
> >  /* Max number of entries allowed for each kvm dirty ring */
> >  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
> >  
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
> > +					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
> > +{
> > +	int ret;
> > +	pfn_t pfnt;
> > +	pgoff_t index = gfn - slot->base_gfn +
> > +			(slot->private_offset >> PAGE_SHIFT);
> > +
> > +	ret = slot->notifier.bs->get_lock_pfn(slot->private_file, index, &pfnt,
> > +						order);
> > +	*pfn = pfn_t_to_pfn(pfnt);
> > +	return ret;
> > +}
> > +
> > +static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
> > +					   kvm_pfn_t pfn)
> > +{
> > +	slot->notifier.bs->put_unlock_pfn(pfn_to_pfn_t(pfn));
> > +}
> > +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
> > +
> >  #endif
> > -- 
> > 2.25.1
> > 

  reply	other threads:[~2022-06-20 14:59 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-19 15:37 [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory Chao Peng
2022-05-19 15:37 ` [PATCH v6 1/8] mm: Introduce memfile_notifier Chao Peng
2022-05-19 15:37 ` [PATCH v6 2/8] mm/shmem: Support memfile_notifier Chao Peng
2022-05-19 15:37 ` [PATCH v6 3/8] mm/memfd: Introduce MFD_INACCESSIBLE flag Chao Peng
2022-05-31 19:15   ` Vishal Annapurve
2022-06-01 10:17     ` Chao Peng
2022-06-01 12:11       ` Gupta, Pankaj
2022-06-02 10:07         ` Chao Peng
2022-06-14 20:23           ` Sean Christopherson
2022-06-15  8:53             ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 4/8] KVM: Extend the memslot to support fd-based private memory Chao Peng
2022-05-20 17:57   ` Andy Lutomirski
2022-05-20 18:31     ` Sean Christopherson
2022-05-22  4:03       ` Andy Lutomirski
2022-05-23 13:21       ` Chao Peng
2022-05-23 15:22         ` Sean Christopherson
2022-05-30 13:26           ` Chao Peng
2022-06-10 16:14             ` Sean Christopherson
2022-06-14  6:45               ` Chao Peng
2022-06-23 22:59       ` Michael Roth
2022-06-24  8:54         ` Chao Peng
2022-06-24 13:01           ` Michael Roth
2022-06-17 20:52   ` Sean Christopherson
2022-06-17 21:27     ` Sean Christopherson
2022-06-20 14:09       ` Chao Peng
2022-06-20 14:08     ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 5/8] KVM: Add KVM_EXIT_MEMORY_FAULT exit Chao Peng
2022-05-19 15:37 ` [PATCH v6 6/8] KVM: Handle page fault for private memory Chao Peng
2022-06-17 21:30   ` Sean Christopherson
2022-06-20 14:16     ` Chao Peng [this message]
2022-08-19  0:40     ` Kirill A. Shutemov
2022-08-25 23:43       ` Sean Christopherson
2022-06-24  3:58   ` Nikunj A. Dadhania
2022-06-24  9:02     ` Chao Peng
2022-06-30 19:14       ` Vishal Annapurve
2022-06-30 22:21         ` Michael Roth
2022-07-01  1:21           ` Xiaoyao Li
2022-07-07 20:08             ` Sean Christopherson
2022-07-08  3:29               ` Xiaoyao Li
2022-07-20 23:08                 ` Vishal Annapurve
2022-07-21  9:45                   ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 7/8] KVM: Enable and expose KVM_MEM_PRIVATE Chao Peng
2022-06-23 22:07   ` Michael Roth
2022-06-24  8:43     ` Chao Peng
2022-05-19 15:37 ` [PATCH v6 8/8] memfd_create.2: Describe MFD_INACCESSIBLE flag Chao Peng
2022-06-06 20:09 ` [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory Vishal Annapurve
2022-06-07  6:57   ` Chao Peng
2022-06-08  0:55     ` Marc Orr
2022-06-08  2:18       ` Chao Peng
2022-06-08 19:37         ` Vishal Annapurve
2022-06-09 20:29           ` Sean Christopherson
2022-06-14  7:28             ` Chao Peng
2022-06-14 17:37               ` Andy Lutomirski
2022-06-14 19:08                 ` Sean Christopherson
2022-06-14 20:59                   ` Andy Lutomirski
2022-06-15  9:17                     ` Chao Peng
2022-06-15 14:29                       ` Sean Christopherson
2022-06-10  0:11         ` Marc Orr

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220620141647.GC2016793@chaop.bj.intel.com \
    --to=chao.p.peng@linux.intel.com \
    --cc=aarcange@redhat.com \
    --cc=ak@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=bfields@fieldses.org \
    --cc=bp@alien8.de \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@intel.com \
    --cc=david@redhat.com \
    --cc=ddutile@redhat.com \
    --cc=dhildenb@redhat.com \
    --cc=hpa@zytor.com \
    --cc=hughd@google.com \
    --cc=jlayton@kernel.org \
    --cc=jmattson@google.com \
    --cc=joro@8bytes.org \
    --cc=jun.nakajima@intel.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=mail@maciej.szmigiero.name \
    --cc=mhocko@suse.com \
    --cc=michael.roth@amd.com \
    --cc=mingo@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=qperret@google.com \
    --cc=rppt@kernel.org \
    --cc=seanjc@google.com \
    --cc=steven.price@arm.com \
    --cc=tglx@linutronix.de \
    --cc=vannapurve@google.com \
    --cc=vbabka@suse.cz \
    --cc=vkuznets@redhat.com \
    --cc=wanpengli@tencent.com \
    --cc=x86@kernel.org \
    --cc=yu.c.zhang@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.