From: Christoffer Dall <christoffer.dall@linaro.org>
To: Mario Smarduch <m.smarduch@samsung.com>
Cc: marc.zyngier@arm.com, pbonzini@redhat.com,
	kvmarm@lists.cs.columbia.edu, kvm@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org
Subject: Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
Date: Sun, 11 Jan 2015 15:00:03 +0100
Message-ID: <20150111140003.GA2722@cbox>
In-Reply-To: <1420863441-32592-1-git-send-email-m.smarduch@samsung.com>

On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
> This patch adds support for 2nd stage page fault handling while dirty page
> logging. On huge page faults, huge pages are dissolved to normal pages, and
> rebuilding of 2nd stage huge pages is blocked. In case migration is 
> canceled this restriction is removed and huge pages may be rebuilt again.
> 
> This patch applies cleanly on top of patch series posted Dec. 15'th:
> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

In the future such information should also go under the ---
separator.

> 
> Patch #11 has been dropped, and should not be applied.

this should go under the '---' separator too.
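
(For reference: git-am throws away everything between the first '---'
line and the diff, so that is the natural home for notes that should
not end up in the commit message.  Roughly:

	Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
	---
	This patch applies cleanly on top of the series posted Dec. 15'th.
	Patch #11 has been dropped, and should not be applied.

	Change Log since last RESEND v1 --> v2:
	...

followed by the diffstat and the diff itself.)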

> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
> 
> Change Log since last RESEND v1 --> v2:
> - Disallow dirty page logging of IO region - fail for initial write protect
>   and disable logging code in 2nd stage page fault handler.
> - Fixed auto spell correction errors
> 
> Change Log RESEND v0 --> v1:
> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>   writable, prevent write protection of pte on read fault
> - Removed marking entire huge page dirty on initial access
> - don't dissolve huge pages of non-writable regions
> - Made updates based on Christoffer's comments
>   - renamed logging status function to memslot_is_logging()
>   - changed few values to bool from longs
>   - streamlined user_mem_abort() to eliminate extra conditional checks
> ---
>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 105 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 73d506f..b878236 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>  #define kvm_pud_huge(_x)	pud_huge(_x)
>  
> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
> +
> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> +{
> +#ifdef CONFIG_ARM
> +	return !!memslot->dirty_bitmap;
> +#else
> +	return false;
> +#endif
> +}
> +
>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  {
>  	/*
> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>  }
>  
> +/**
> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
> + * @kvm:	pointer to kvm structure.
> + * @addr:	IPA
> + * @pmd:	pmd pointer for IPA
> + *
> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
> + * pages in the range dirty.
> + */
> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
> +{
> +	if (!kvm_pmd_huge(*pmd))
> +		return;
> +
> +	pmd_clear(pmd);
> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
> +	put_page(virt_to_page(pmd));
> +}
> +
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>  				  int min, int max)
>  {
> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  }
>  
>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> +			  phys_addr_t addr, const pte_t *new_pte,
> +			  unsigned long flags)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, old_pte;
> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>  
>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>  	pmd = stage2_get_pmd(kvm, cache, addr);
> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>  		return 0;
>  	}
>  
> +	/*
> +	 * While dirty page logging - dissolve huge PMD, then continue on to
> +	 * allocate page.
> +	 */
> +	if (logging_active)
> +		stage2_dissolve_pmd(kvm, addr, pmd);
> +
>  	/* Create stage-2 page mappings - Level 2 */
>  	if (pmd_none(*pmd)) {
>  		if (!cache)
> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  		if (ret)
>  			goto out;
>  		spin_lock(&kvm->mmu_lock);
> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
> +						KVM_S2PTE_FLAG_IS_IOMAP);
>  		spin_unlock(&kvm->mmu_lock);
>  		if (ret)
>  			goto out;
> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	pfn_t pfn;
>  	pgprot_t mem_type = PAGE_S2;
>  	bool fault_ipa_uncached;
> +	bool can_set_pte_rw = true;
> +	unsigned long set_pte_flags = 0;
>  
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> +

stray whitespace change?

>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>  	down_read(&current->mm->mmap_sem);
>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> -	if (kvm_is_device_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn)) {
>  		mem_type = PAGE_S2_DEVICE;
> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> +	}
>  
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> +
> +	/*
> +	 * When logging is enabled general page fault handling changes:
> +	 * -  Writable huge pages are dissolved on a read or write fault.

why dissolve huge pages on a read fault?

> +	 * -  pte's are not allowed write permission on a read fault to
> +	 *    writable region so future writes can be marked dirty

missing a blank line here

> +	 * Access to non-writable region is unchanged, and logging of IO
> +	 * regions is not allowed.
> +	 */
> +	if (memslot_is_logging(memslot) && writable) {
> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
> +		if (hugetlb) {
> +			gfn += pte_index(fault_ipa);
> +			pfn += pte_index(fault_ipa);
> +			hugetlb = false;
> +		}
> +		force_pte = true;

uh, no, this is not what I meant; see my example (untested, partial)
patch at the end of this mail.

> +		if (!write_fault)
> +			can_set_pte_rw = false;
> +	}
> +
>  	if (!hugetlb && !force_pte)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> -		if (writable) {
> +
> +		/*
> +		 * Don't set write permission, for non-writable region, and
> +		 * for read fault to writable region while logging.
> +		 */
> +		if (writable && can_set_pte_rw) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> +							set_pte_flags);
>  	}
>  
> +	if (write_fault)
> +		mark_page_dirty(kvm, gfn);
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>  {
>  	pte_t *pte = (pte_t *)data;
>  
> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> +	/*
> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
> +	 * through this calling path.
> +	 */
> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>  }
>  
>  
> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>  	int ret = 0;
>  
> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
> +	/*
> +	 * Let - enable of dirty page logging through, later check if it's for
> +	 * an IO region and fail.
> +	 */

I don't understand this comment or find it helpful.

> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
> +		change == KVM_MR_FLAGS_ONLY &&
> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))

this looks wrong: the new 'change == KVM_MR_FLAGS_ONLY' clause already
implies the two 'change !=' checks, so they are redundant, and the
combined condition no longer returns early for KVM_MR_DELETE.

I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
and then after the 'return 0' check the subconditions for change ==
KVM_MR_FLAGS_ONLY.
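
Roughly like this (untested):

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * A flags-only change matters here only when it enables dirty
	 * page logging; everything else can still return early.
	 */
	if (change == KVM_MR_FLAGS_ONLY &&
	    !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
		return 0;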

>  		return 0;
>  
>  	/*
> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>  					 vm_start - vma->vm_start;
>  
> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
> +			if (change != KVM_MR_FLAGS_ONLY)
> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>  						    vm_end - vm_start,
>  						    writable);
> +			else
> +				/* IO region dirty page logging not allowed */
> +				return -EINVAL;
> +

this whole thing also looks weird.  I think you just need to add a check
before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
return an error in that case (you've identified a user attempting to set
dirty page logging on something that points to device memory; it doesn't
matter at this point through which 'change' it is done).
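
Something along these lines (untested):

	if (vma->vm_flags & VM_PFNMAP) {
		/* IO region dirty page logging not allowed */
		if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
			return -EINVAL;

		ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
					    vm_end - vm_start,
					    writable);
		if (ret)
			break;
	}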

>  			if (ret)
>  				break;
>  		}
>  		hva = vm_end;
>  	} while (hva < reg_end);
>  
> +	/* Anything after here doesn't apply to memslot flag changes */
> +	if (change == KVM_MR_FLAGS_ONLY)
> +		return ret;
> +
>  	spin_lock(&kvm->mmu_lock);
>  	if (ret)
>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
> -- 


What I meant last time around concerning user_mem_abort was more
something like this:

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1dc9778..38ea58e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	/*
+	 * Writes to pages in a memslot with logging enabled are always logged
+	 * on a single page-by-page basis.
+	 */
+	if (memslot_is_logging(memslot) && write_fault)
+		force_pte = true;
+
+	if (is_vm_hugetlb_page(vma) && !force_pte) {
 		hugetlb = true;
 		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
 	} else {
@@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
+	if (memslot_is_logging(memslot) && !write_fault)
+		writable = false;
+
 	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;
 
@@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 					  fault_ipa_uncached);
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
+		unsigned long flags = 0;
 		pte_t new_pte = pfn_pte(pfn, mem_type);
+
 		if (writable) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+
+		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
+			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+
+		if (memslot_is_logging(memslot))
+			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}
 
 

Thanks,
-Christoffer
