All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: kvm-ppc@vger.kernel.org, David Gibson <david@gibson.dropbear.id.au>
Subject: Re: [PATCH kernel v2] KVM: PPC: Allocate guest TCEs on demand too
Date: Fri, 1 Mar 2019 14:04:54 +1100	[thread overview]
Message-ID: <3ac28ad1-c3a3-63ef-c2e4-1f165bc95f92@ozlabs.ru> (raw)
In-Reply-To: <20190301013827.30504-1-aik@ozlabs.ru>



On 01/03/2019 12:38, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can, now it is the turn of the KVM TCE tables.
> Thankfully these are allocated already in 2 levels.
> 
> This moves the table's last level allocation from the creating helper to
> kvmppc_tce_put() and kvm_spapr_tce_fault().
> 
> This adds kvmppc_rm_ioba_validate() to do an additional test if
> the consequent kvmppc_tce_put() needs a page which has not been allocated;
> if this is the case, we bail out to virtual mode handlers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v2:
> * added kvm mutex around alloc_page to prevent races; in both places we
> test the pointer, if NULL, then take a lock and check again so on a fast
> path we do not take a lock at all
> 
> 
> ---
> For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
> system RAM the difference is gigabytes of RAM.
> ---
>  arch/powerpc/kvm/book3s_64_vio.c    | 29 ++++++------
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 69 ++++++++++++++++++++++++++---
>  2 files changed, 79 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b049..7eed8c9 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
>  	unsigned long i, npages = kvmppc_tce_pages(stt->size);
>  
>  	for (i = 0; i < npages; i++)
> -		__free_page(stt->pages[i]);
> +		if (stt->pages[i])
> +			__free_page(stt->pages[i]);
>  
>  	kfree(stt);
>  }
> @@ -242,6 +243,20 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>  
>  	page = stt->pages[vmf->pgoff];
> +	if (!page) {
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[vmf->pgoff];
> +		if (!page) {
> +			page  = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (!page) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return VM_FAULT_OOM;
> +			}
> +			stt->pages[vmf->pgoff] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
> +
>  	get_page(page);
>  	vmf->page = page;
>  	return 0;
> @@ -296,7 +311,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	struct kvmppc_spapr_tce_table *siter;
>  	unsigned long npages, size = args->size;
>  	int ret = -ENOMEM;
> -	int i;
>  
>  	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
>  		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +334,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	stt->kvm = kvm;
>  	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>  
> -	for (i = 0; i < npages; i++) {
> -		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -		if (!stt->pages[i])
> -			goto fail;
> -	}
> -
>  	mutex_lock(&kvm->lock);
>  
>  	/* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +360,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	if (ret >= 0)
>  		return ret;
>  
> - fail:
> -	for (i = 0; i < npages; i++)
> -		if (stt->pages[i])
> -			__free_page(stt->pages[i]);
> -
>  	kfree(stt);
>   fail_acct:
>  	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc7..a0912d5 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,76 @@ static u64 *kvmppc_page_address(struct page *page)
>  	return (u64 *) page_address(page);
>  }
>  
> +/*
> + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is
> + * allocated or not required (when clearing a tce entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> +		unsigned long ioba, unsigned long npages, bool clearing)
> +{
> +	unsigned long i, sttpage, sttpages;
> +	unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> +	if (ret)
> +		return ret;
> +	/*
> +	 * clearing==true says kvmppc_tce_put won't be allocating pages
> +	 * for empty tces.
> +	 */
> +	if (clearing)
> +		return H_SUCCESS;
> +
> +	sttpage = ((ioba >> stt->page_shift) - stt->offset) / TCES_PER_PAGE;
> +	sttpages = (npages + TCES_PER_PAGE - 1) / TCES_PER_PAGE;


This is wrong, v3 is coming.


-- 
Alexey

WARNING: multiple messages have this Message-ID (diff)
From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: kvm-ppc@vger.kernel.org, David Gibson <david@gibson.dropbear.id.au>
Subject: Re: [PATCH kernel v2] KVM: PPC: Allocate guest TCEs on demand too
Date: Fri, 01 Mar 2019 03:04:54 +0000	[thread overview]
Message-ID: <3ac28ad1-c3a3-63ef-c2e4-1f165bc95f92@ozlabs.ru> (raw)
In-Reply-To: <20190301013827.30504-1-aik@ozlabs.ru>



On 01/03/2019 12:38, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can, now it is the turn of the KVM TCE tables.
> Thankfully these are allocated already in 2 levels.
> 
> This moves the table's last level allocation from the creating helper to
> kvmppc_tce_put() and kvm_spapr_tce_fault().
> 
> This adds kvmppc_rm_ioba_validate() to do an additional test if
> the consequent kvmppc_tce_put() needs a page which has not been allocated;
> if this is the case, we bail out to virtual mode handlers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v2:
> * added kvm mutex around alloc_page to prevent races; in both places we
> test the pointer, if NULL, then take a lock and check again so on a fast
> path we do not take a lock at all
> 
> 
> ---
> For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
> system RAM the difference is gigabytes of RAM.
> ---
>  arch/powerpc/kvm/book3s_64_vio.c    | 29 ++++++------
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 69 ++++++++++++++++++++++++++---
>  2 files changed, 79 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b049..7eed8c9 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
>  	unsigned long i, npages = kvmppc_tce_pages(stt->size);
>  
>  	for (i = 0; i < npages; i++)
> -		__free_page(stt->pages[i]);
> +		if (stt->pages[i])
> +			__free_page(stt->pages[i]);
>  
>  	kfree(stt);
>  }
> @@ -242,6 +243,20 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>  
>  	page = stt->pages[vmf->pgoff];
> +	if (!page) {
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[vmf->pgoff];
> +		if (!page) {
> +			page  = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (!page) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return VM_FAULT_OOM;
> +			}
> +			stt->pages[vmf->pgoff] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
> +
>  	get_page(page);
>  	vmf->page = page;
>  	return 0;
> @@ -296,7 +311,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	struct kvmppc_spapr_tce_table *siter;
>  	unsigned long npages, size = args->size;
>  	int ret = -ENOMEM;
> -	int i;
>  
>  	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
>  		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +334,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	stt->kvm = kvm;
>  	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>  
> -	for (i = 0; i < npages; i++) {
> -		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -		if (!stt->pages[i])
> -			goto fail;
> -	}
> -
>  	mutex_lock(&kvm->lock);
>  
>  	/* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +360,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	if (ret >= 0)
>  		return ret;
>  
> - fail:
> -	for (i = 0; i < npages; i++)
> -		if (stt->pages[i])
> -			__free_page(stt->pages[i]);
> -
>  	kfree(stt);
>   fail_acct:
>  	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc7..a0912d5 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,76 @@ static u64 *kvmppc_page_address(struct page *page)
>  	return (u64 *) page_address(page);
>  }
>  
> +/*
> + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is
> + * allocated or not required (when clearing a tce entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> +		unsigned long ioba, unsigned long npages, bool clearing)
> +{
> +	unsigned long i, sttpage, sttpages;
> +	unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> +	if (ret)
> +		return ret;
> +	/*
> +	 * clearing=true says kvmppc_tce_put won't be allocating pages
> +	 * for empty tces.
> +	 */
> +	if (clearing)
> +		return H_SUCCESS;
> +
> +	sttpage = ((ioba >> stt->page_shift) - stt->offset) / TCES_PER_PAGE;
> +	sttpages = (npages + TCES_PER_PAGE - 1) / TCES_PER_PAGE;


This is wrong, v3 is coming.


-- 
Alexey

  reply	other threads:[~2019-03-01  3:06 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-03-01  1:38 [PATCH kernel v2] KVM: PPC: Allocate guest TCEs on demand too Alexey Kardashevskiy
2019-03-01  1:38 ` Alexey Kardashevskiy
2019-03-01  3:04 ` Alexey Kardashevskiy [this message]
2019-03-01  3:04   ` Alexey Kardashevskiy
2019-03-01  4:34 Alexey Kardashevskiy
2019-03-01  4:34 ` Alexey Kardashevskiy
2019-03-01  4:35 ` ignore this " Alexey Kardashevskiy
2019-03-01  4:35   ` Alexey Kardashevskiy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3ac28ad1-c3a3-63ef-c2e4-1f165bc95f92@ozlabs.ru \
    --to=aik@ozlabs.ru \
    --cc=david@gibson.dropbear.id.au \
    --cc=kvm-ppc@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.