All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: kvm-ppc@vger.kernel.org, David Gibson <david@gibson.dropbear.id.au>
Subject: ignore this Re: [PATCH kernel v2] KVM: PPC: Allocate guest TCEs on demand too
Date: Fri, 1 Mar 2019 15:35:24 +1100	[thread overview]
Message-ID: <f54c602f-cfcf-91fc-cc09-bde698740efc@ozlabs.ru> (raw)
In-Reply-To: <20190301043411.89935-1-aik@ozlabs.ru>

Ignore this as I forgot to change v2 to v3 so I reposted this.


On 01/03/2019 15:34, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can, now it is a turn of the KVM TCE tables.
> Thankfully these are allocated already in 2 levels.
> 
> This moves the table's last level allocation from the creating helper to
> kvmppc_tce_put() and kvm_spapr_tce_fault().
> 
> This adds kvmppc_rm_ioba_validate() to do an additional test if
> the consequent kvmppc_tce_put() needs a page which has not been allocated;
> if this is the case, we bail out to virtual mode handlers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v3:
> * fixed alignments in kvmppc_rm_ioba_validate
> 
> v2:
> * added kvm mutex around alloc_page to prevent races; in both place we
> test the pointer, if NULL, then take a lock and check again so on a fast
> path we do not take a lock at all
> 
> 
> ---
> For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
> system RAM the difference is gigabytes of RAM.
> ---
>  arch/powerpc/kvm/book3s_64_vio.c    | 29 ++++++------
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 71 ++++++++++++++++++++++++++---
>  2 files changed, 81 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b04973710..7eed8c90ea3d 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
>  	unsigned long i, npages = kvmppc_tce_pages(stt->size);
>  
>  	for (i = 0; i < npages; i++)
> -		__free_page(stt->pages[i]);
> +		if (stt->pages[i])
> +			__free_page(stt->pages[i]);
>  
>  	kfree(stt);
>  }
> @@ -242,6 +243,20 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>  
>  	page = stt->pages[vmf->pgoff];
> +	if (!page) {
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[vmf->pgoff];
> +		if (!page) {
> +			page  = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (!page) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return VM_FAULT_OOM;
> +			}
> +			stt->pages[vmf->pgoff] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
> +
>  	get_page(page);
>  	vmf->page = page;
>  	return 0;
> @@ -296,7 +311,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	struct kvmppc_spapr_tce_table *siter;
>  	unsigned long npages, size = args->size;
>  	int ret = -ENOMEM;
> -	int i;
>  
>  	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
>  		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +334,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	stt->kvm = kvm;
>  	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>  
> -	for (i = 0; i < npages; i++) {
> -		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -		if (!stt->pages[i])
> -			goto fail;
> -	}
> -
>  	mutex_lock(&kvm->lock);
>  
>  	/* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +360,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	if (ret >= 0)
>  		return ret;
>  
> - fail:
> -	for (i = 0; i < npages; i++)
> -		if (stt->pages[i])
> -			__free_page(stt->pages[i]);
> -
>  	kfree(stt);
>   fail_acct:
>  	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc729b9a..1cd9373f8bdc 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,78 @@ static u64 *kvmppc_page_address(struct page *page)
>  	return (u64 *) page_address(page);
>  }
>  
> +/*
> + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is
> + * allocated or not required (when clearing a tce entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> +		unsigned long ioba, unsigned long npages, bool clearing)
> +{
> +	unsigned long i, idx, sttpage, sttpages;
> +	unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> +	if (ret)
> +		return ret;
> +	/*
> +	 * clearing==true says kvmppc_tce_put won't be allocating pages
> +	 * for empty tces.
> +	 */
> +	if (clearing)
> +		return H_SUCCESS;
> +
> +	idx = (ioba >> stt->page_shift) - stt->offset;
> +	sttpage = idx / TCES_PER_PAGE;
> +	sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
> +			TCES_PER_PAGE;
> +	for (i = sttpage; i < sttpage + sttpages; ++i)
> +		if (!stt->pages[i])
> +			return H_TOO_HARD;
> +
> +	return H_SUCCESS;
> +}
> +
>  /*
>   * Handles TCE requests for emulated devices.
>   * Puts guest TCE values to the table and expects user space to convert them.
>   * Called in both real and virtual modes.
>   * Cannot fail so kvmppc_tce_validate must be called before it.
>   *
> - * WARNING: This will be called in real-mode on HV KVM and virtual
> - *          mode on PR KVM
> + * WARNING: This will be called in real-mode on HV HPT KVM and virtual
> + *          mode on PR KVM or HV radix KVM
>   */
>  void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
>  		unsigned long idx, unsigned long tce)
>  {
>  	struct page *page;
>  	u64 *tbl;
> +	unsigned long sttpage;
>  
>  	idx -= stt->offset;
> -	page = stt->pages[idx / TCES_PER_PAGE];
> +	sttpage = idx / TCES_PER_PAGE;
> +	page = stt->pages[sttpage];
> +
> +	if (!page) {
> +		/* We allow any TCE, not just with read|write permissions */
> +		if (!tce)
> +			return;
> +		/*
> +		 * We must not end up here in real mode,
> +		 * kvmppc_rm_ioba_validate() takes care of this.
> +		 */
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[sttpage];
> +		if (!page) {
> +			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (WARN_ON_ONCE(!page)) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return;
> +			}
> +			stt->pages[sttpage] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
>  	tbl = kvmppc_page_address(page);
>  
>  	tbl[idx % TCES_PER_PAGE] = tce;
> @@ -381,7 +436,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  	if (!stt)
>  		return H_TOO_HARD;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, 1);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -480,7 +535,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	if (tce_list & (SZ_4K - 1))
>  		return H_PARAMETER;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -583,7 +638,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
>  	if (!stt)
>  		return H_TOO_HARD;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -635,6 +690,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  
>  	idx = (ioba >> stt->page_shift) - stt->offset;
>  	page = stt->pages[idx / TCES_PER_PAGE];
> +	if (!page) {
> +		vcpu->arch.regs.gpr[4] = 0;
> +		return H_SUCCESS;
> +	}
>  	tbl = (u64 *)page_address(page);
>  
>  	vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
> 

-- 
Alexey

WARNING: multiple messages have this Message-ID (diff)
From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: kvm-ppc@vger.kernel.org, David Gibson <david@gibson.dropbear.id.au>
Subject: ignore this Re: [PATCH kernel v2] KVM: PPC: Allocate guest TCEs on demand too
Date: Fri, 01 Mar 2019 04:35:24 +0000	[thread overview]
Message-ID: <f54c602f-cfcf-91fc-cc09-bde698740efc@ozlabs.ru> (raw)
In-Reply-To: <20190301043411.89935-1-aik@ozlabs.ru>

Ignore this as I forgot to change v2 to v3 so I reposted this.


On 01/03/2019 15:34, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can, now it is a turn of the KVM TCE tables.
> Thankfully these are allocated already in 2 levels.
> 
> This moves the table's last level allocation from the creating helper to
> kvmppc_tce_put() and kvm_spapr_tce_fault().
> 
> This adds kvmppc_rm_ioba_validate() to do an additional test if
> the consequent kvmppc_tce_put() needs a page which has not been allocated;
> if this is the case, we bail out to virtual mode handlers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v3:
> * fixed alignments in kvmppc_rm_ioba_validate
> 
> v2:
> * added kvm mutex around alloc_page to prevent races; in both place we
> test the pointer, if NULL, then take a lock and check again so on a fast
> path we do not take a lock at all
> 
> 
> ---
> For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
> system RAM the difference is gigabytes of RAM.
> ---
>  arch/powerpc/kvm/book3s_64_vio.c    | 29 ++++++------
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 71 ++++++++++++++++++++++++++---
>  2 files changed, 81 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b04973710..7eed8c90ea3d 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
>  	unsigned long i, npages = kvmppc_tce_pages(stt->size);
>  
>  	for (i = 0; i < npages; i++)
> -		__free_page(stt->pages[i]);
> +		if (stt->pages[i])
> +			__free_page(stt->pages[i]);
>  
>  	kfree(stt);
>  }
> @@ -242,6 +243,20 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>  
>  	page = stt->pages[vmf->pgoff];
> +	if (!page) {
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[vmf->pgoff];
> +		if (!page) {
> +			page  = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (!page) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return VM_FAULT_OOM;
> +			}
> +			stt->pages[vmf->pgoff] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
> +
>  	get_page(page);
>  	vmf->page = page;
>  	return 0;
> @@ -296,7 +311,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	struct kvmppc_spapr_tce_table *siter;
>  	unsigned long npages, size = args->size;
>  	int ret = -ENOMEM;
> -	int i;
>  
>  	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
>  		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +334,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	stt->kvm = kvm;
>  	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>  
> -	for (i = 0; i < npages; i++) {
> -		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -		if (!stt->pages[i])
> -			goto fail;
> -	}
> -
>  	mutex_lock(&kvm->lock);
>  
>  	/* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +360,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	if (ret >= 0)
>  		return ret;
>  
> - fail:
> -	for (i = 0; i < npages; i++)
> -		if (stt->pages[i])
> -			__free_page(stt->pages[i]);
> -
>  	kfree(stt);
>   fail_acct:
>  	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc729b9a..1cd9373f8bdc 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,78 @@ static u64 *kvmppc_page_address(struct page *page)
>  	return (u64 *) page_address(page);
>  }
>  
> +/*
> + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is
> + * allocated or not required (when clearing a tce entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> +		unsigned long ioba, unsigned long npages, bool clearing)
> +{
> +	unsigned long i, idx, sttpage, sttpages;
> +	unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> +	if (ret)
> +		return ret;
> +	/*
> +	 * clearing==true says kvmppc_tce_put won't be allocating pages
> +	 * for empty tces.
> +	 */
> +	if (clearing)
> +		return H_SUCCESS;
> +
> +	idx = (ioba >> stt->page_shift) - stt->offset;
> +	sttpage = idx / TCES_PER_PAGE;
> +	sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
> +			TCES_PER_PAGE;
> +	for (i = sttpage; i < sttpage + sttpages; ++i)
> +		if (!stt->pages[i])
> +			return H_TOO_HARD;
> +
> +	return H_SUCCESS;
> +}
> +
>  /*
>   * Handles TCE requests for emulated devices.
>   * Puts guest TCE values to the table and expects user space to convert them.
>   * Called in both real and virtual modes.
>   * Cannot fail so kvmppc_tce_validate must be called before it.
>   *
> - * WARNING: This will be called in real-mode on HV KVM and virtual
> - *          mode on PR KVM
> + * WARNING: This will be called in real-mode on HV HPT KVM and virtual
> + *          mode on PR KVM or HV radix KVM
>   */
>  void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
>  		unsigned long idx, unsigned long tce)
>  {
>  	struct page *page;
>  	u64 *tbl;
> +	unsigned long sttpage;
>  
>  	idx -= stt->offset;
> -	page = stt->pages[idx / TCES_PER_PAGE];
> +	sttpage = idx / TCES_PER_PAGE;
> +	page = stt->pages[sttpage];
> +
> +	if (!page) {
> +		/* We allow any TCE, not just with read|write permissions */
> +		if (!tce)
> +			return;
> +		/*
> +		 * We must not end up here in real mode,
> +		 * kvmppc_rm_ioba_validate() takes care of this.
> +		 */
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[sttpage];
> +		if (!page) {
> +			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (WARN_ON_ONCE(!page)) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return;
> +			}
> +			stt->pages[sttpage] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
>  	tbl = kvmppc_page_address(page);
>  
>  	tbl[idx % TCES_PER_PAGE] = tce;
> @@ -381,7 +436,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  	if (!stt)
>  		return H_TOO_HARD;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, 1);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce = 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -480,7 +535,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	if (tce_list & (SZ_4K - 1))
>  		return H_PARAMETER;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -583,7 +638,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
>  	if (!stt)
>  		return H_TOO_HARD;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -635,6 +690,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  
>  	idx = (ioba >> stt->page_shift) - stt->offset;
>  	page = stt->pages[idx / TCES_PER_PAGE];
> +	if (!page) {
> +		vcpu->arch.regs.gpr[4] = 0;
> +		return H_SUCCESS;
> +	}
>  	tbl = (u64 *)page_address(page);
>  
>  	vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
> 

-- 
Alexey

  reply	other threads:[~2019-03-01  4:39 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-03-01  4:34 [PATCH kernel v2] KVM: PPC: Allocate guest TCEs on demand too Alexey Kardashevskiy
2019-03-01  4:34 ` Alexey Kardashevskiy
2019-03-01  4:35 ` Alexey Kardashevskiy [this message]
2019-03-01  4:35   ` ignore this " Alexey Kardashevskiy
  -- strict thread matches above, loose matches on Subject: below --
2019-03-01  1:38 Alexey Kardashevskiy
2019-03-01  1:38 ` Alexey Kardashevskiy
2019-03-01  3:04 ` Alexey Kardashevskiy
2019-03-01  3:04   ` Alexey Kardashevskiy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f54c602f-cfcf-91fc-cc09-bde698740efc@ozlabs.ru \
    --to=aik@ozlabs.ru \
    --cc=david@gibson.dropbear.id.au \
    --cc=kvm-ppc@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.