All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs
@ 2022-04-27 15:53 Sasha Levin
  2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
                   ` (6 more replies)
  0 siblings, 7 replies; 14+ messages in thread
From: Sasha Levin @ 2022-04-27 15:53 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Paolo Bonzini, Nana Liu, Peter Xu, Sasha Levin, shuah, nathan,
	ndesaulniers, jmattson, oupton, seanjc, ricarkol, yang.zhong,
	aaronlewis, wei.w.wang, kvm, linux-kselftest, llvm

From: Paolo Bonzini <pbonzini@redhat.com>

[ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]

Red Hat's QE team reported test failure on access_tracking_perf_test:

Testing guest mode: PA-bits:ANY, VA-bits:48,  4K pages
guest physical test memory offset: 0x3fffbffff000

Populating memory             : 0.684014577s
Writing to populated memory   : 0.006230175s
Reading from populated memory : 0.004557805s
==== Test Assertion Failure ====
  lib/kvm_util.c:1411: false
  pid=125806 tid=125809 errno=4 - Interrupted system call
     1  0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411
     2   (inlined by) addr_gpa2hva at kvm_util.c:1405
     3  0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98
     4   (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152
     5   (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232
     6  0x00007fefe9ff81ce: ?? ??:0
     7  0x00007fefe9c64d82: ?? ??:0
  No vm physical memory at 0xffbffff000

I can easily reproduce it with a Intel(R) Xeon(R) CPU E5-2630 with 46 bits
PA.

It turns out that the address translation for clearing idle page tracking
returned a wrong result; addr_gva2gpa()'s last step, which is based on
"pte[index[0]].pfn", did the calculation with 40 bits length and the
high 12 bits got truncated.  In above case the GPA address to be returned
should be 0x3fffbffff000 for GVA 0xc0000000, but it got truncated into
0xffbffff000 and the subsequent gpa2hva lookup failed.

The width of operations on bit fields greater than 32-bit is
implementation defined, and differs between GCC (which uses the bitfield
precision) and clang (which uses 64-bit arithmetic), so this is a
potential minefield.  Remove the bit fields and using manual masking
instead.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036
Reported-by: Nana Liu <nanliu@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Tested-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 .../selftests/kvm/include/x86_64/processor.h  |  15 ++
 .../selftests/kvm/lib/x86_64/processor.c      | 192 +++++++-----------
 2 files changed, 92 insertions(+), 115 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 8a470da7b71a..15a2875698b5 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -60,6 +60,21 @@
 /* CPUID.0x8000_0001.EDX */
 #define CPUID_GBPAGES		(1ul << 26)
 
+/* Page table bitfield declarations */
+#define PTE_PRESENT_MASK        BIT_ULL(0)
+#define PTE_WRITABLE_MASK       BIT_ULL(1)
+#define PTE_USER_MASK           BIT_ULL(2)
+#define PTE_ACCESSED_MASK       BIT_ULL(5)
+#define PTE_DIRTY_MASK          BIT_ULL(6)
+#define PTE_LARGE_MASK          BIT_ULL(7)
+#define PTE_GLOBAL_MASK         BIT_ULL(8)
+#define PTE_NX_MASK             BIT_ULL(63)
+
+#define PAGE_SHIFT		12
+
+#define PHYSICAL_PAGE_MASK      GENMASK_ULL(51, 12)
+#define PTE_GET_PFN(pte)        (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+
 /* General Registers in 64-Bit Mode */
 struct gpr64_regs {
 	u64 rax;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 9f000dfb5594..0dd442c26015 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -19,38 +19,6 @@
 
 vm_vaddr_t exception_handlers;
 
-/* Virtual translation table structure declarations */
-struct pageUpperEntry {
-	uint64_t present:1;
-	uint64_t writable:1;
-	uint64_t user:1;
-	uint64_t write_through:1;
-	uint64_t cache_disable:1;
-	uint64_t accessed:1;
-	uint64_t ignored_06:1;
-	uint64_t page_size:1;
-	uint64_t ignored_11_08:4;
-	uint64_t pfn:40;
-	uint64_t ignored_62_52:11;
-	uint64_t execute_disable:1;
-};
-
-struct pageTableEntry {
-	uint64_t present:1;
-	uint64_t writable:1;
-	uint64_t user:1;
-	uint64_t write_through:1;
-	uint64_t cache_disable:1;
-	uint64_t accessed:1;
-	uint64_t dirty:1;
-	uint64_t reserved_07:1;
-	uint64_t global:1;
-	uint64_t ignored_11_09:3;
-	uint64_t pfn:40;
-	uint64_t ignored_62_52:11;
-	uint64_t execute_disable:1;
-};
-
 void regs_dump(FILE *stream, struct kvm_regs *regs,
 	       uint8_t indent)
 {
@@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
 	return &page_table[index];
 }
 
-static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
-						    uint64_t pt_pfn,
-						    uint64_t vaddr,
-						    uint64_t paddr,
-						    int level,
-						    enum x86_page_size page_size)
+static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
+				       uint64_t pt_pfn,
+				       uint64_t vaddr,
+				       uint64_t paddr,
+				       int level,
+				       enum x86_page_size page_size)
 {
-	struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
-
-	if (!pte->present) {
-		pte->writable = true;
-		pte->present = true;
-		pte->page_size = (level == page_size);
-		if (pte->page_size)
-			pte->pfn = paddr >> vm->page_shift;
+	uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
+
+	if (!(*pte & PTE_PRESENT_MASK)) {
+		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
+		if (level == page_size)
+			*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 		else
-			pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
+			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
 	} else {
 		/*
 		 * Entry already present.  Assert that the caller doesn't want
@@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
 		TEST_ASSERT(level != page_size,
 			    "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
 			    page_size, vaddr);
-		TEST_ASSERT(!pte->page_size,
+		TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
 			    "Cannot create page table at level: %u, vaddr: 0x%lx\n",
 			    level, vaddr);
 	}
@@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		   enum x86_page_size page_size)
 {
 	const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
 		    "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 	 */
 	pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
 				      vaddr, paddr, 3, page_size);
-	if (pml4e->page_size)
+	if (*pml4e & PTE_LARGE_MASK)
 		return;
 
-	pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
-	if (pdpe->page_size)
+	pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
+	if (*pdpe & PTE_LARGE_MASK)
 		return;
 
-	pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
-	if (pde->page_size)
+	pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
+	if (*pde & PTE_LARGE_MASK)
 		return;
 
 	/* Fill in page table entry. */
-	pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
-	TEST_ASSERT(!pte->present,
+	pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
+	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
 		    "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
-	pte->pfn = paddr >> vm->page_shift;
-	pte->writable = true;
-	pte->present = 1;
+	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 }
 
 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 	__virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
 }
 
-static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
+static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
 						       uint64_t vaddr)
 {
 	uint16_t index[4];
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 	struct kvm_cpuid_entry2 *entry;
 	struct kvm_sregs sregs;
 	int max_phy_addr;
@@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
 	index[3] = (vaddr >> 39) & 0x1ffu;
 
 	pml4e = addr_gpa2hva(vm, vm->pgd);
-	TEST_ASSERT(pml4e[index[3]].present,
+	TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
 		"Expected pml4e to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
-		(rsvd_mask | (1ull << 7))) == 0,
+	TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
 		"Unexpected reserved bits set.");
 
-	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
-	TEST_ASSERT(pdpe[index[2]].present,
+	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+	TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
 		"Expected pdpe to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT(pdpe[index[2]].page_size == 0,
+	TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
 		"Expected pdpe to map a pde not a 1-GByte page.");
-	TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
+	TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
 		"Unexpected reserved bits set.");
 
-	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
-	TEST_ASSERT(pde[index[1]].present,
+	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+	TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
 		"Expected pde to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT(pde[index[1]].page_size == 0,
+	TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
 		"Expected pde to map a pte not a 2-MByte page.");
-	TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
+	TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
 		"Unexpected reserved bits set.");
 
-	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
-	TEST_ASSERT(pte[index[0]].present,
+	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+	TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
 		"Expected pte to be present for gva: 0x%08lx", vaddr);
 
 	return &pte[index[0]];
@@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
 
 uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
 {
-	struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
+	uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 
 	return *(uint64_t *)pte;
 }
@@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
 void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
 			     uint64_t pte)
 {
-	struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
-								  vaddr);
+	uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 
 	*(uint64_t *)new_pte = pte;
 }
 
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
-	struct pageUpperEntry *pml4e, *pml4e_start;
-	struct pageUpperEntry *pdpe, *pdpe_start;
-	struct pageUpperEntry *pde, *pde_start;
-	struct pageTableEntry *pte, *pte_start;
+	uint64_t *pml4e, *pml4e_start;
+	uint64_t *pdpe, *pdpe_start;
+	uint64_t *pde, *pde_start;
+	uint64_t *pte, *pte_start;
 
 	if (!vm->pgd_created)
 		return;
@@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	fprintf(stream, "%*s      index hvaddr         gpaddr         "
 		"addr         w exec dirty\n",
 		indent, "");
-	pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
+	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
 	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
 		pml4e = &pml4e_start[n1];
-		if (!pml4e->present)
+		if (!(*pml4e & PTE_PRESENT_MASK))
 			continue;
-		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
+		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
 			" %u\n",
 			indent, "",
 			pml4e - pml4e_start, pml4e,
-			addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
-			pml4e->writable, pml4e->execute_disable);
+			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
+			!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
 
-		pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
+		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
 		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
 			pdpe = &pdpe_start[n2];
-			if (!pdpe->present)
+			if (!(*pdpe & PTE_PRESENT_MASK))
 				continue;
-			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10lx "
+			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
 				"%u  %u\n",
 				indent, "",
 				pdpe - pdpe_start, pdpe,
 				addr_hva2gpa(vm, pdpe),
-				(uint64_t) pdpe->pfn, pdpe->writable,
-				pdpe->execute_disable);
+				PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
+				!!(*pdpe & PTE_NX_MASK));
 
-			pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
+			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
 			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
 				pde = &pde_start[n3];
-				if (!pde->present)
+				if (!(*pde & PTE_PRESENT_MASK))
 					continue;
 				fprintf(stream, "%*spde   0x%-3zx %p "
-					"0x%-12lx 0x%-10lx %u  %u\n",
+					"0x%-12lx 0x%-10llx %u  %u\n",
 					indent, "", pde - pde_start, pde,
 					addr_hva2gpa(vm, pde),
-					(uint64_t) pde->pfn, pde->writable,
-					pde->execute_disable);
+					PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
+					!!(*pde & PTE_NX_MASK));
 
-				pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
+				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
 				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
 					pte = &pte_start[n4];
-					if (!pte->present)
+					if (!(*pte & PTE_PRESENT_MASK))
 						continue;
 					fprintf(stream, "%*spte   0x%-3zx %p "
-						"0x%-12lx 0x%-10lx %u  %u "
+						"0x%-12lx 0x%-10llx %u  %u "
 						"    %u    0x%-10lx\n",
 						indent, "",
 						pte - pte_start, pte,
 						addr_hva2gpa(vm, pte),
-						(uint64_t) pte->pfn,
-						pte->writable,
-						pte->execute_disable,
-						pte->dirty,
+						PTE_GET_PFN(*pte),
+						!!(*pte & PTE_WRITABLE_MASK),
+						!!(*pte & PTE_NX_MASK),
+						!!(*pte & PTE_DIRTY_MASK),
 						((uint64_t) n1 << 27)
 							| ((uint64_t) n2 << 18)
 							| ((uint64_t) n3 << 9)
@@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
 vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 {
 	uint16_t index[4];
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 	if (!vm->pgd_created)
 		goto unmapped_gva;
 	pml4e = addr_gpa2hva(vm, vm->pgd);
-	if (!pml4e[index[3]].present)
+	if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
-	if (!pdpe[index[2]].present)
+	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+	if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
-	if (!pde[index[1]].present)
+	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+	if (!(pde[index[1]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
-	if (!pte[index[0]].present)
+	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+	if (!(pte[index[0]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
+	return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu);
 
 unmapped_gva:
 	TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test
  2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
@ 2022-04-27 15:53 ` Sasha Levin
  2022-04-27 16:16   ` Paolo Bonzini
  2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 14+ messages in thread
From: Sasha Levin @ 2022-04-27 15:53 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Thomas Huth, Claudio Imbrenda, Paolo Bonzini, Sasha Levin, shuah,
	kvm, linux-kselftest

From: Thomas Huth <thuth@redhat.com>

[ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]

When compiling kvm_page_table_test.c, I get this compiler warning
with gcc 11.2:

kvm_page_table_test.c: In function 'pre_init_before_test':
../../../../tools/include/linux/kernel.h:44:24: warning: comparison of
 distinct pointer types lacks a cast
   44 |         (void) (&_max1 == &_max2);              \
      |                        ^~
kvm_page_table_test.c:281:21: note: in expansion of macro 'max'
  281 |         alignment = max(0x100000, alignment);
      |                     ^~~

Fix it by adjusting the type of the absolute value.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Message-Id: <20220414103031.565037-1-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index ba1fdc3dcf4a..2c4a7563a4f8 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
 	else
 		guest_test_phys_mem = p->phys_offset;
 #ifdef __s390x__
-	alignment = max(0x100000, alignment);
+	alignment = max(0x100000UL, alignment);
 #endif
 	guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
 
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH MANUALSEL 5.17 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
  2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
  2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
@ 2022-04-27 15:53 ` Sasha Levin
  2022-04-27 16:16   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 4/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 14+ messages in thread
From: Sasha Levin @ 2022-04-27 15:53 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Wanpeng Li, Marcelo Tosatti, Paolo Bonzini, Sasha Levin, tglx,
	mingo, bp, dave.hansen, x86, kvm

From: Wanpeng Li <wanpengli@tencent.com>

[ Upstream commit 0361bdfddca20c8855ea3bdbbbc9c999912b10ff ]

MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to
host-side polling after suspend/resume.  Non-bootstrap CPUs are
restored correctly by the haltpoll driver because they are hot-unplugged
during suspend and hot-plugged during resume; however, the BSP
is not hotpluggable and remains in host-sde polling mode after
the guest resume.  The makes the guest pay for the cost of vmexits
every time the guest enters idle.

Fix it by recording BSP's haltpoll state and resuming it during guest
resume.

Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1650267752-46796-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kernel/kvm.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index ed8a13ac4ab2..4c2a158bb6c4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
 static int has_steal_clock = 0;
 
+static int has_guest_poll = 0;
 /*
  * No need for any "IO delay" on KVM
  */
@@ -706,14 +707,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 
 static int kvm_suspend(void)
 {
+	u64 val = 0;
+
 	kvm_guest_cpu_offline(false);
 
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+		rdmsrl(MSR_KVM_POLL_CONTROL, val);
+	has_guest_poll = !(val & 1);
+#endif
 	return 0;
 }
 
 static void kvm_resume(void)
 {
 	kvm_cpu_online(raw_smp_processor_id());
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+#endif
 }
 
 static struct syscore_ops kvm_syscore_ops = {
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH MANUALSEL 5.17 4/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI
  2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
  2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
  2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:16   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 14+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Paolo Bonzini, Chao Gao, Sean Christopherson, Sasha Levin, tglx,
	mingo, bp, dave.hansen, x86, kvm

From: Paolo Bonzini <pbonzini@redhat.com>

[ Upstream commit d22a81b304a27fca6124174a8e842e826c193466 ]

Emulating writes to SELF_IPI with a write to ICR has an unwanted side effect:
the value of ICR in vAPIC page gets changed.  The lists SELF_IPI as write-only,
with no associated MMIO offset, so any write should have no visible side
effect in the vAPIC page.

Reported-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/lapic.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2a10d0033c96..6b6f9359d29e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2125,10 +2125,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 		break;
 
 	case APIC_SELF_IPI:
-		if (apic_x2apic_mode(apic)) {
-			kvm_lapic_reg_write(apic, APIC_ICR,
-					    APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
-		} else
+		if (apic_x2apic_mode(apic))
+			kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
+		else
 			ret = 1;
 		break;
 	default:
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs
  2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (2 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 4/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:16   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors Sasha Levin
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 14+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Paolo Bonzini, Sean Christopherson, Sasha Levin, tglx, mingo, bp,
	dave.hansen, x86, kvm

From: Paolo Bonzini <pbonzini@redhat.com>

[ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]

WARN and bail if KVM attempts to free a root that isn't backed by a shadow
page.  KVM allocates a bare page for "special" roots, e.g. when using PAE
paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa
will be valid but won't be backed by a shadow page.  It's all too easy to
blindly call mmu_free_root_page() on root_hpa, be nice and WARN instead of
crashing KVM and possibly the kernel.

Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/mmu/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7f009ebb319a..e7cd16e1e0a0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3239,6 +3239,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 		return;
 
 	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+	if (WARN_ON(!sp))
+		return;
 
 	if (is_tdp_mmu_page(sp))
 		kvm_tdp_mmu_put_root(kvm, sp, false);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors
  2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (3 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:04   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
  2022-04-27 16:16 ` [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Paolo Bonzini
  6 siblings, 1 reply; 14+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Peter Gonda, Sean Christopherson, Marc Orr, Paolo Bonzini, kvm,
	Sasha Levin, tglx, mingo, bp, dave.hansen, x86, shuah,
	linux-kselftest

From: Peter Gonda <pgonda@google.com>

[ Upstream commit b2125513dfc0dd0ec5a9605138a3c356592cfb73 ]

For SEV-ES VMs with mirrors to be intra-host migrated they need to be
able to migrate with the mirror. This is due to that fact that all VMSAs
need to be added into the VM with LAUNCH_UPDATE_VMSA before
lAUNCH_FINISH. Allowing migration with mirrors allows users of SEV-ES to
keep the mirror VMs VMSAs during migration.

Adds a list of mirror VMs for the original VM iterate through during its
migration. During the iteration the owner pointers can be updated from
the source to the destination. This fixes the ASID leaking issue which
caused the blocking of migration of VMs with mirrors.

Signed-off-by: Peter Gonda <pgonda@google.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Marc Orr <marcorr@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Message-Id: <20220211193634.3183388-1-pgonda@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/svm/sev.c                        | 57 ++++++++++++-------
 arch/x86/kvm/svm/svm.h                        |  3 +-
 .../selftests/kvm/x86_64/sev_migrate_tests.c  | 47 ++++++++++-----
 3 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index fef975852582..553d6dbf19a2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 		goto e_free;
 
 	INIT_LIST_HEAD(&sev->regions_list);
+	INIT_LIST_HEAD(&sev->mirror_vms);
 
 	return 0;
 
@@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
 	}
 }
 
-static void sev_migrate_from(struct kvm_sev_info *dst,
-			      struct kvm_sev_info *src)
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
 {
+	struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+	struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+	struct kvm_sev_info *mirror;
+
 	dst->active = true;
 	dst->asid = src->asid;
 	dst->handle = src->handle;
@@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
 	src->enc_context_owner = NULL;
 
 	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+	/*
+	 * If this VM has mirrors, "transfer" each mirror's refcount of the
+	 * source to the destination (this KVM).  The caller holds a reference
+	 * to the source, so there's no danger of use-after-free.
+	 */
+	list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+	list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+		kvm_get_kvm(dst_kvm);
+		kvm_put_kvm(src_kvm);
+		mirror->enc_context_owner = dst_kvm;
+	}
+
+	/*
+	 * If this VM is a mirror, remove the old mirror from the owners list
+	 * and add the new mirror to the list.
+	 */
+	if (is_mirroring_enc_context(dst_kvm)) {
+		struct kvm_sev_info *owner_sev_info =
+			&to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+		list_del(&src->mirror_entry);
+		list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+	}
 }
 
 static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
 
 	src_sev = &to_kvm_svm(source_kvm)->sev_info;
 
-	/*
-	 * VMs mirroring src's encryption context rely on it to keep the
-	 * ASID allocated, but below we are clearing src_sev->asid.
-	 */
-	if (src_sev->num_mirrored_vms) {
-		ret = -EBUSY;
-		goto out_unlock;
-	}
-
 	dst_sev->misc_cg = get_current_misc_cg();
 	cg_cleanup_sev = dst_sev;
 	if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
 		if (ret)
 			goto out_source_vcpu;
 	}
-	sev_migrate_from(dst_sev, src_sev);
+
+	sev_migrate_from(kvm, source_kvm);
 	kvm_vm_dead(source_kvm);
 	cg_cleanup_sev = src_sev;
 	ret = 0;
@@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
 	 */
 	source_sev = &to_kvm_svm(source_kvm)->sev_info;
 	kvm_get_kvm(source_kvm);
-	source_sev->num_mirrored_vms++;
+	mirror_sev = &to_kvm_svm(kvm)->sev_info;
+	list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
 
 	/* Set enc_context_owner and copy its encryption context over */
-	mirror_sev = &to_kvm_svm(kvm)->sev_info;
 	mirror_sev->enc_context_owner = source_kvm;
 	mirror_sev->active = true;
 	mirror_sev->asid = source_sev->asid;
@@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
 	mirror_sev->es_active = source_sev->es_active;
 	mirror_sev->handle = source_sev->handle;
 	INIT_LIST_HEAD(&mirror_sev->regions_list);
+	INIT_LIST_HEAD(&mirror_sev->mirror_vms);
 	ret = 0;
 
 	/*
@@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm)
 	struct list_head *head = &sev->regions_list;
 	struct list_head *pos, *q;
 
-	WARN_ON(sev->num_mirrored_vms);
-
 	if (!sev_guest(kvm))
 		return;
 
+	WARN_ON(!list_empty(&sev->mirror_vms));
+
 	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
 	if (is_mirroring_enc_context(kvm)) {
 		struct kvm *owner_kvm = sev->enc_context_owner;
-		struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
 
 		mutex_lock(&owner_kvm->lock);
-		if (!WARN_ON(!owner_sev->num_mirrored_vms))
-			owner_sev->num_mirrored_vms--;
+		list_del(&sev->mirror_entry);
 		mutex_unlock(&owner_kvm->lock);
 		kvm_put_kvm(owner_kvm);
 		return;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 86bcfed6599e..831bca1fdc29 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -81,7 +81,8 @@ struct kvm_sev_info {
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
 	struct kvm *enc_context_owner; /* Owner of copied encryption context */
-	unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
+	struct list_head mirror_vms; /* List of VMs mirroring */
+	struct list_head mirror_entry; /* Use as a list entry of mirrors */
 	struct misc_cg *misc_cg; /* For misc cgroup accounting */
 	atomic_t migration_in_progress;
 };
diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 80056bbbb003..2e5a42cb470b 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -341,35 +341,54 @@ static void test_sev_mirror_parameters(void)
 
 static void test_sev_move_copy(void)
 {
-	struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm;
-	int ret;
+	struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm,
+		      *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm;
 
 	sev_vm = sev_vm_create(/* es= */ false);
 	dst_vm = aux_vm_create(true);
+	dst2_vm = aux_vm_create(true);
+	dst3_vm = aux_vm_create(true);
 	mirror_vm = aux_vm_create(false);
 	dst_mirror_vm = aux_vm_create(false);
+	dst2_mirror_vm = aux_vm_create(false);
+	dst3_mirror_vm = aux_vm_create(false);
 
 	sev_mirror_create(mirror_vm->fd, sev_vm->fd);
-	ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
-	TEST_ASSERT(ret == -1 && errno == EBUSY,
-		    "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
-		    errno);
 
-	/* The mirror itself can be migrated.  */
 	sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
-	ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
-	TEST_ASSERT(ret == -1 && errno == EBUSY,
-		    "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
-		    errno);
+	sev_migrate_from(dst_vm->fd, sev_vm->fd);
+
+	sev_migrate_from(dst2_vm->fd, dst_vm->fd);
+	sev_migrate_from(dst2_mirror_vm->fd, dst_mirror_vm->fd);
+
+	sev_migrate_from(dst3_mirror_vm->fd, dst2_mirror_vm->fd);
+	sev_migrate_from(dst3_vm->fd, dst2_vm->fd);
+
+	kvm_vm_free(dst_vm);
+	kvm_vm_free(sev_vm);
+	kvm_vm_free(dst2_vm);
+	kvm_vm_free(dst3_vm);
+	kvm_vm_free(mirror_vm);
+	kvm_vm_free(dst_mirror_vm);
+	kvm_vm_free(dst2_mirror_vm);
+	kvm_vm_free(dst3_mirror_vm);
 
 	/*
-	 * mirror_vm is not a mirror anymore, dst_mirror_vm is.  Thus,
-	 * the owner can be copied as soon as dst_mirror_vm is gone.
+	 * Run similar test be destroy mirrors before mirrored VMs to ensure
+	 * destruction is done safely.
 	 */
-	kvm_vm_free(dst_mirror_vm);
+	sev_vm = sev_vm_create(/* es= */ false);
+	dst_vm = aux_vm_create(true);
+	mirror_vm = aux_vm_create(false);
+	dst_mirror_vm = aux_vm_create(false);
+
+	sev_mirror_create(mirror_vm->fd, sev_vm->fd);
+
+	sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
 	sev_migrate_from(dst_vm->fd, sev_vm->fd);
 
 	kvm_vm_free(mirror_vm);
+	kvm_vm_free(dst_mirror_vm);
 	kvm_vm_free(dst_vm);
 	kvm_vm_free(sev_vm);
 }
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised
  2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (4 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:17   ` Paolo Bonzini
  2022-04-27 16:16 ` [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Paolo Bonzini
  6 siblings, 1 reply; 14+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Wanpeng Li, Aili Yao, Sean Christopherson, Paolo Bonzini,
	Sasha Levin, tglx, mingo, bp, dave.hansen, x86, kvm

From: Wanpeng Li <wanpengli@tencent.com>

[ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]

As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted
interrupt") mentioned that the host admin should well tune the guest
setup, so that vCPUs are placed on isolated pCPUs, and with several pCPUs
surplus for *busy* housekeeping.  In this setup, it is preferrable to
disable mwait/hlt/pause vmexits to keep the vCPUs in non-root mode.

However, if only some guests isolated and others not, they would not
have any benefit from posted timer interrupts, and at the same time lose
VMX preemption timer fast paths because kvm_can_post_timer_interrupt()
returns true and therefore forces kvm_can_use_hv_timer() to false.

By guaranteeing that posted-interrupt timer is only used if MWAIT or
HLT are done without vmexit, KVM can make a better choice and use the
VMX preemption timer and the corresponding fast paths.

Reported-by: Aili Yao <yaoaili@kingsoft.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Cc: Aili Yao <yaoaili@kingsoft.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1643112538-36743-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/lapic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6b6f9359d29e..970d5c740b00 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 
 static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
 {
-	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+		(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
 }
 
 bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors Sasha Levin
@ 2022-04-27 16:04   ` Paolo Bonzini
  0 siblings, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:04 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Peter Gonda, Sean Christopherson, Marc Orr, kvm, tglx, mingo, bp,
	dave.hansen, x86, shuah, linux-kselftest

On 4/27/22 17:54, Sasha Levin wrote:
> From: Peter Gonda <pgonda@google.com>
> 
> [ Upstream commit b2125513dfc0dd0ec5a9605138a3c356592cfb73 ]
> 
> For SEV-ES VMs with mirrors to be intra-host migrated they need to be
> able to migrate with the mirror. This is due to that fact that all VMSAs
> need to be added into the VM with LAUNCH_UPDATE_VMSA before
> lAUNCH_FINISH. Allowing migration with mirrors allows users of SEV-ES to
> keep the mirror VMs VMSAs during migration.
> 
> Adds a list of mirror VMs for the original VM iterate through during its
> migration. During the iteration the owner pointers can be updated from
> the source to the destination. This fixes the ASID leaking issue which
> caused the blocking of migration of VMs with mirrors.
> 
> Signed-off-by: Peter Gonda <pgonda@google.com>
> Cc: Sean Christopherson <seanjc@google.com>
> Cc: Marc Orr <marcorr@google.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: kvm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Message-Id: <20220211193634.3183388-1-pgonda@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>

Too scary,

NACK

Paolo

> ---
>   arch/x86/kvm/svm/sev.c                        | 57 ++++++++++++-------
>   arch/x86/kvm/svm/svm.h                        |  3 +-
>   .../selftests/kvm/x86_64/sev_migrate_tests.c  | 47 ++++++++++-----
>   3 files changed, 73 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index fef975852582..553d6dbf19a2 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>   		goto e_free;
>   
>   	INIT_LIST_HEAD(&sev->regions_list);
> +	INIT_LIST_HEAD(&sev->mirror_vms);
>   
>   	return 0;
>   
> @@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
>   	}
>   }
>   
> -static void sev_migrate_from(struct kvm_sev_info *dst,
> -			      struct kvm_sev_info *src)
> +static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
>   {
> +	struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
> +	struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
> +	struct kvm_sev_info *mirror;
> +
>   	dst->active = true;
>   	dst->asid = src->asid;
>   	dst->handle = src->handle;
> @@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>   	src->enc_context_owner = NULL;
>   
>   	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
> +
> +	/*
> +	 * If this VM has mirrors, "transfer" each mirror's refcount of the
> +	 * source to the destination (this KVM).  The caller holds a reference
> +	 * to the source, so there's no danger of use-after-free.
> +	 */
> +	list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
> +	list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
> +		kvm_get_kvm(dst_kvm);
> +		kvm_put_kvm(src_kvm);
> +		mirror->enc_context_owner = dst_kvm;
> +	}
> +
> +	/*
> +	 * If this VM is a mirror, remove the old mirror from the owners list
> +	 * and add the new mirror to the list.
> +	 */
> +	if (is_mirroring_enc_context(dst_kvm)) {
> +		struct kvm_sev_info *owner_sev_info =
> +			&to_kvm_svm(dst->enc_context_owner)->sev_info;
> +
> +		list_del(&src->mirror_entry);
> +		list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
> +	}
>   }
>   
>   static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
> @@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
>   
>   	src_sev = &to_kvm_svm(source_kvm)->sev_info;
>   
> -	/*
> -	 * VMs mirroring src's encryption context rely on it to keep the
> -	 * ASID allocated, but below we are clearing src_sev->asid.
> -	 */
> -	if (src_sev->num_mirrored_vms) {
> -		ret = -EBUSY;
> -		goto out_unlock;
> -	}
> -
>   	dst_sev->misc_cg = get_current_misc_cg();
>   	cg_cleanup_sev = dst_sev;
>   	if (dst_sev->misc_cg != src_sev->misc_cg) {
> @@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
>   		if (ret)
>   			goto out_source_vcpu;
>   	}
> -	sev_migrate_from(dst_sev, src_sev);
> +
> +	sev_migrate_from(kvm, source_kvm);
>   	kvm_vm_dead(source_kvm);
>   	cg_cleanup_sev = src_sev;
>   	ret = 0;
> @@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>   	 */
>   	source_sev = &to_kvm_svm(source_kvm)->sev_info;
>   	kvm_get_kvm(source_kvm);
> -	source_sev->num_mirrored_vms++;
> +	mirror_sev = &to_kvm_svm(kvm)->sev_info;
> +	list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
>   
>   	/* Set enc_context_owner and copy its encryption context over */
> -	mirror_sev = &to_kvm_svm(kvm)->sev_info;
>   	mirror_sev->enc_context_owner = source_kvm;
>   	mirror_sev->active = true;
>   	mirror_sev->asid = source_sev->asid;
> @@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>   	mirror_sev->es_active = source_sev->es_active;
>   	mirror_sev->handle = source_sev->handle;
>   	INIT_LIST_HEAD(&mirror_sev->regions_list);
> +	INIT_LIST_HEAD(&mirror_sev->mirror_vms);
>   	ret = 0;
>   
>   	/*
> @@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm)
>   	struct list_head *head = &sev->regions_list;
>   	struct list_head *pos, *q;
>   
> -	WARN_ON(sev->num_mirrored_vms);
> -
>   	if (!sev_guest(kvm))
>   		return;
>   
> +	WARN_ON(!list_empty(&sev->mirror_vms));
> +
>   	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
>   	if (is_mirroring_enc_context(kvm)) {
>   		struct kvm *owner_kvm = sev->enc_context_owner;
> -		struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
>   
>   		mutex_lock(&owner_kvm->lock);
> -		if (!WARN_ON(!owner_sev->num_mirrored_vms))
> -			owner_sev->num_mirrored_vms--;
> +		list_del(&sev->mirror_entry);
>   		mutex_unlock(&owner_kvm->lock);
>   		kvm_put_kvm(owner_kvm);
>   		return;
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 86bcfed6599e..831bca1fdc29 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -81,7 +81,8 @@ struct kvm_sev_info {
>   	struct list_head regions_list;  /* List of registered regions */
>   	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
>   	struct kvm *enc_context_owner; /* Owner of copied encryption context */
> -	unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
> +	struct list_head mirror_vms; /* List of VMs mirroring */
> +	struct list_head mirror_entry; /* Use as a list entry of mirrors */
>   	struct misc_cg *misc_cg; /* For misc cgroup accounting */
>   	atomic_t migration_in_progress;
>   };
> diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
> index 80056bbbb003..2e5a42cb470b 100644
> --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
> +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
> @@ -341,35 +341,54 @@ static void test_sev_mirror_parameters(void)
>   
>   static void test_sev_move_copy(void)
>   {
> -	struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm;
> -	int ret;
> +	struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm,
> +		      *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm;
>   
>   	sev_vm = sev_vm_create(/* es= */ false);
>   	dst_vm = aux_vm_create(true);
> +	dst2_vm = aux_vm_create(true);
> +	dst3_vm = aux_vm_create(true);
>   	mirror_vm = aux_vm_create(false);
>   	dst_mirror_vm = aux_vm_create(false);
> +	dst2_mirror_vm = aux_vm_create(false);
> +	dst3_mirror_vm = aux_vm_create(false);
>   
>   	sev_mirror_create(mirror_vm->fd, sev_vm->fd);
> -	ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
> -	TEST_ASSERT(ret == -1 && errno == EBUSY,
> -		    "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
> -		    errno);
>   
> -	/* The mirror itself can be migrated.  */
>   	sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
> -	ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
> -	TEST_ASSERT(ret == -1 && errno == EBUSY,
> -		    "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
> -		    errno);
> +	sev_migrate_from(dst_vm->fd, sev_vm->fd);
> +
> +	sev_migrate_from(dst2_vm->fd, dst_vm->fd);
> +	sev_migrate_from(dst2_mirror_vm->fd, dst_mirror_vm->fd);
> +
> +	sev_migrate_from(dst3_mirror_vm->fd, dst2_mirror_vm->fd);
> +	sev_migrate_from(dst3_vm->fd, dst2_vm->fd);
> +
> +	kvm_vm_free(dst_vm);
> +	kvm_vm_free(sev_vm);
> +	kvm_vm_free(dst2_vm);
> +	kvm_vm_free(dst3_vm);
> +	kvm_vm_free(mirror_vm);
> +	kvm_vm_free(dst_mirror_vm);
> +	kvm_vm_free(dst2_mirror_vm);
> +	kvm_vm_free(dst3_mirror_vm);
>   
>   	/*
> -	 * mirror_vm is not a mirror anymore, dst_mirror_vm is.  Thus,
> -	 * the owner can be copied as soon as dst_mirror_vm is gone.
> +	 * Run similar test be destroy mirrors before mirrored VMs to ensure
> +	 * destruction is done safely.
>   	 */
> -	kvm_vm_free(dst_mirror_vm);
> +	sev_vm = sev_vm_create(/* es= */ false);
> +	dst_vm = aux_vm_create(true);
> +	mirror_vm = aux_vm_create(false);
> +	dst_mirror_vm = aux_vm_create(false);
> +
> +	sev_mirror_create(mirror_vm->fd, sev_vm->fd);
> +
> +	sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
>   	sev_migrate_from(dst_vm->fd, sev_vm->fd);
>   
>   	kvm_vm_free(mirror_vm);
> +	kvm_vm_free(dst_mirror_vm);
>   	kvm_vm_free(dst_vm);
>   	kvm_vm_free(sev_vm);
>   }


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs
  2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (5 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
@ 2022-04-27 16:16 ` Paolo Bonzini
  6 siblings, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:16 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Nana Liu, Peter Xu, shuah, nathan, ndesaulniers, jmattson,
	oupton, seanjc, ricarkol, yang.zhong, aaronlewis, wei.w.wang,
	kvm, linux-kselftest, llvm

On 4/27/22 17:53, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]
> 
> Red Hat's QE team reported test failure on access_tracking_perf_test:
> 
> Testing guest mode: PA-bits:ANY, VA-bits:48,  4K pages
> guest physical test memory offset: 0x3fffbffff000
> 
> Populating memory             : 0.684014577s
> Writing to populated memory   : 0.006230175s
> Reading from populated memory : 0.004557805s
> ==== Test Assertion Failure ====
>    lib/kvm_util.c:1411: false
>    pid=125806 tid=125809 errno=4 - Interrupted system call
>       1  0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411
>       2   (inlined by) addr_gpa2hva at kvm_util.c:1405
>       3  0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98
>       4   (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152
>       5   (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232
>       6  0x00007fefe9ff81ce: ?? ??:0
>       7  0x00007fefe9c64d82: ?? ??:0
>    No vm physical memory at 0xffbffff000
> 
> I can easily reproduce it with a Intel(R) Xeon(R) CPU E5-2630 with 46 bits
> PA.
> 
> It turns out that the address translation for clearing idle page tracking
> returned a wrong result; addr_gva2gpa()'s last step, which is based on
> "pte[index[0]].pfn", did the calculation with 40 bits length and the
> high 12 bits got truncated.  In above case the GPA address to be returned
> should be 0x3fffbffff000 for GVA 0xc0000000, but it got truncated into
> 0xffbffff000 and the subsequent gpa2hva lookup failed.
> 
> The width of operations on bit fields greater than 32-bit is
> implementation defined, and differs between GCC (which uses the bitfield
> precision) and clang (which uses 64-bit arithmetic), so this is a
> potential minefield.  Remove the bit fields and using manual masking
> instead.
> 
> Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036
> Reported-by: Nana Liu <nanliu@redhat.com>
> Reviewed-by: Peter Xu <peterx@redhat.com>
> Tested-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   .../selftests/kvm/include/x86_64/processor.h  |  15 ++
>   .../selftests/kvm/lib/x86_64/processor.c      | 192 +++++++-----------
>   2 files changed, 92 insertions(+), 115 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
> index 8a470da7b71a..15a2875698b5 100644
> --- a/tools/testing/selftests/kvm/include/x86_64/processor.h
> +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
> @@ -60,6 +60,21 @@
>   /* CPUID.0x8000_0001.EDX */
>   #define CPUID_GBPAGES		(1ul << 26)
>   
> +/* Page table bitfield declarations */
> +#define PTE_PRESENT_MASK        BIT_ULL(0)
> +#define PTE_WRITABLE_MASK       BIT_ULL(1)
> +#define PTE_USER_MASK           BIT_ULL(2)
> +#define PTE_ACCESSED_MASK       BIT_ULL(5)
> +#define PTE_DIRTY_MASK          BIT_ULL(6)
> +#define PTE_LARGE_MASK          BIT_ULL(7)
> +#define PTE_GLOBAL_MASK         BIT_ULL(8)
> +#define PTE_NX_MASK             BIT_ULL(63)
> +
> +#define PAGE_SHIFT		12
> +
> +#define PHYSICAL_PAGE_MASK      GENMASK_ULL(51, 12)
> +#define PTE_GET_PFN(pte)        (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
> +
>   /* General Registers in 64-Bit Mode */
>   struct gpr64_regs {
>   	u64 rax;
> diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
> index 9f000dfb5594..0dd442c26015 100644
> --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
> +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
> @@ -19,38 +19,6 @@
>   
>   vm_vaddr_t exception_handlers;
>   
> -/* Virtual translation table structure declarations */
> -struct pageUpperEntry {
> -	uint64_t present:1;
> -	uint64_t writable:1;
> -	uint64_t user:1;
> -	uint64_t write_through:1;
> -	uint64_t cache_disable:1;
> -	uint64_t accessed:1;
> -	uint64_t ignored_06:1;
> -	uint64_t page_size:1;
> -	uint64_t ignored_11_08:4;
> -	uint64_t pfn:40;
> -	uint64_t ignored_62_52:11;
> -	uint64_t execute_disable:1;
> -};
> -
> -struct pageTableEntry {
> -	uint64_t present:1;
> -	uint64_t writable:1;
> -	uint64_t user:1;
> -	uint64_t write_through:1;
> -	uint64_t cache_disable:1;
> -	uint64_t accessed:1;
> -	uint64_t dirty:1;
> -	uint64_t reserved_07:1;
> -	uint64_t global:1;
> -	uint64_t ignored_11_09:3;
> -	uint64_t pfn:40;
> -	uint64_t ignored_62_52:11;
> -	uint64_t execute_disable:1;
> -};
> -
>   void regs_dump(FILE *stream, struct kvm_regs *regs,
>   	       uint8_t indent)
>   {
> @@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
>   	return &page_table[index];
>   }
>   
> -static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
> -						    uint64_t pt_pfn,
> -						    uint64_t vaddr,
> -						    uint64_t paddr,
> -						    int level,
> -						    enum x86_page_size page_size)
> +static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
> +				       uint64_t pt_pfn,
> +				       uint64_t vaddr,
> +				       uint64_t paddr,
> +				       int level,
> +				       enum x86_page_size page_size)
>   {
> -	struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
> -
> -	if (!pte->present) {
> -		pte->writable = true;
> -		pte->present = true;
> -		pte->page_size = (level == page_size);
> -		if (pte->page_size)
> -			pte->pfn = paddr >> vm->page_shift;
> +	uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
> +
> +	if (!(*pte & PTE_PRESENT_MASK)) {
> +		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
> +		if (level == page_size)
> +			*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
>   		else
> -			pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
> +			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
>   	} else {
>   		/*
>   		 * Entry already present.  Assert that the caller doesn't want
> @@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
>   		TEST_ASSERT(level != page_size,
>   			    "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
>   			    page_size, vaddr);
> -		TEST_ASSERT(!pte->page_size,
> +		TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
>   			    "Cannot create page table at level: %u, vaddr: 0x%lx\n",
>   			    level, vaddr);
>   	}
> @@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
>   		   enum x86_page_size page_size)
>   {
>   	const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
> -	struct pageUpperEntry *pml4e, *pdpe, *pde;
> -	struct pageTableEntry *pte;
> +	uint64_t *pml4e, *pdpe, *pde;
> +	uint64_t *pte;
>   
>   	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
>   		    "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
> @@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
>   	 */
>   	pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
>   				      vaddr, paddr, 3, page_size);
> -	if (pml4e->page_size)
> +	if (*pml4e & PTE_LARGE_MASK)
>   		return;
>   
> -	pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
> -	if (pdpe->page_size)
> +	pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
> +	if (*pdpe & PTE_LARGE_MASK)
>   		return;
>   
> -	pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
> -	if (pde->page_size)
> +	pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
> +	if (*pde & PTE_LARGE_MASK)
>   		return;
>   
>   	/* Fill in page table entry. */
> -	pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
> -	TEST_ASSERT(!pte->present,
> +	pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
> +	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
>   		    "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
> -	pte->pfn = paddr >> vm->page_shift;
> -	pte->writable = true;
> -	pte->present = 1;
> +	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
>   }
>   
>   void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
> @@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
>   	__virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
>   }
>   
> -static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
> +static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
>   						       uint64_t vaddr)
>   {
>   	uint16_t index[4];
> -	struct pageUpperEntry *pml4e, *pdpe, *pde;
> -	struct pageTableEntry *pte;
> +	uint64_t *pml4e, *pdpe, *pde;
> +	uint64_t *pte;
>   	struct kvm_cpuid_entry2 *entry;
>   	struct kvm_sregs sregs;
>   	int max_phy_addr;
> @@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
>   	index[3] = (vaddr >> 39) & 0x1ffu;
>   
>   	pml4e = addr_gpa2hva(vm, vm->pgd);
> -	TEST_ASSERT(pml4e[index[3]].present,
> +	TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
>   		"Expected pml4e to be present for gva: 0x%08lx", vaddr);
> -	TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
> -		(rsvd_mask | (1ull << 7))) == 0,
> +	TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
>   		"Unexpected reserved bits set.");
>   
> -	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
> -	TEST_ASSERT(pdpe[index[2]].present,
> +	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
> +	TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
>   		"Expected pdpe to be present for gva: 0x%08lx", vaddr);
> -	TEST_ASSERT(pdpe[index[2]].page_size == 0,
> +	TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
>   		"Expected pdpe to map a pde not a 1-GByte page.");
> -	TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
> +	TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
>   		"Unexpected reserved bits set.");
>   
> -	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
> -	TEST_ASSERT(pde[index[1]].present,
> +	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
> +	TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
>   		"Expected pde to be present for gva: 0x%08lx", vaddr);
> -	TEST_ASSERT(pde[index[1]].page_size == 0,
> +	TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
>   		"Expected pde to map a pte not a 2-MByte page.");
> -	TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
> +	TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
>   		"Unexpected reserved bits set.");
>   
> -	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
> -	TEST_ASSERT(pte[index[0]].present,
> +	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
> +	TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
>   		"Expected pte to be present for gva: 0x%08lx", vaddr);
>   
>   	return &pte[index[0]];
> @@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
>   
>   uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
>   {
> -	struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
> +	uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
>   
>   	return *(uint64_t *)pte;
>   }
> @@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
>   void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
>   			     uint64_t pte)
>   {
> -	struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
> -								  vaddr);
> +	uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
>   
>   	*(uint64_t *)new_pte = pte;
>   }
>   
>   void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
>   {
> -	struct pageUpperEntry *pml4e, *pml4e_start;
> -	struct pageUpperEntry *pdpe, *pdpe_start;
> -	struct pageUpperEntry *pde, *pde_start;
> -	struct pageTableEntry *pte, *pte_start;
> +	uint64_t *pml4e, *pml4e_start;
> +	uint64_t *pdpe, *pdpe_start;
> +	uint64_t *pde, *pde_start;
> +	uint64_t *pte, *pte_start;
>   
>   	if (!vm->pgd_created)
>   		return;
> @@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
>   	fprintf(stream, "%*s      index hvaddr         gpaddr         "
>   		"addr         w exec dirty\n",
>   		indent, "");
> -	pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
> +	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
>   	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
>   		pml4e = &pml4e_start[n1];
> -		if (!pml4e->present)
> +		if (!(*pml4e & PTE_PRESENT_MASK))
>   			continue;
> -		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
> +		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
>   			" %u\n",
>   			indent, "",
>   			pml4e - pml4e_start, pml4e,
> -			addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
> -			pml4e->writable, pml4e->execute_disable);
> +			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
> +			!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
>   
> -		pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
> +		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
>   		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
>   			pdpe = &pdpe_start[n2];
> -			if (!pdpe->present)
> +			if (!(*pdpe & PTE_PRESENT_MASK))
>   				continue;
> -			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10lx "
> +			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
>   				"%u  %u\n",
>   				indent, "",
>   				pdpe - pdpe_start, pdpe,
>   				addr_hva2gpa(vm, pdpe),
> -				(uint64_t) pdpe->pfn, pdpe->writable,
> -				pdpe->execute_disable);
> +				PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
> +				!!(*pdpe & PTE_NX_MASK));
>   
> -			pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
> +			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
>   			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
>   				pde = &pde_start[n3];
> -				if (!pde->present)
> +				if (!(*pde & PTE_PRESENT_MASK))
>   					continue;
>   				fprintf(stream, "%*spde   0x%-3zx %p "
> -					"0x%-12lx 0x%-10lx %u  %u\n",
> +					"0x%-12lx 0x%-10llx %u  %u\n",
>   					indent, "", pde - pde_start, pde,
>   					addr_hva2gpa(vm, pde),
> -					(uint64_t) pde->pfn, pde->writable,
> -					pde->execute_disable);
> +					PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
> +					!!(*pde & PTE_NX_MASK));
>   
> -				pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
> +				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
>   				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
>   					pte = &pte_start[n4];
> -					if (!pte->present)
> +					if (!(*pte & PTE_PRESENT_MASK))
>   						continue;
>   					fprintf(stream, "%*spte   0x%-3zx %p "
> -						"0x%-12lx 0x%-10lx %u  %u "
> +						"0x%-12lx 0x%-10llx %u  %u "
>   						"    %u    0x%-10lx\n",
>   						indent, "",
>   						pte - pte_start, pte,
>   						addr_hva2gpa(vm, pte),
> -						(uint64_t) pte->pfn,
> -						pte->writable,
> -						pte->execute_disable,
> -						pte->dirty,
> +						PTE_GET_PFN(*pte),
> +						!!(*pte & PTE_WRITABLE_MASK),
> +						!!(*pte & PTE_NX_MASK),
> +						!!(*pte & PTE_DIRTY_MASK),
>   						((uint64_t) n1 << 27)
>   							| ((uint64_t) n2 << 18)
>   							| ((uint64_t) n3 << 9)
> @@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
>   vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
>   {
>   	uint16_t index[4];
> -	struct pageUpperEntry *pml4e, *pdpe, *pde;
> -	struct pageTableEntry *pte;
> +	uint64_t *pml4e, *pdpe, *pde;
> +	uint64_t *pte;
>   
>   	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
>   		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
> @@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
>   	if (!vm->pgd_created)
>   		goto unmapped_gva;
>   	pml4e = addr_gpa2hva(vm, vm->pgd);
> -	if (!pml4e[index[3]].present)
> +	if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
> -	if (!pdpe[index[2]].present)
> +	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
> +	if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
> -	if (!pde[index[1]].present)
> +	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
> +	if (!(pde[index[1]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
> -	if (!pte[index[0]].present)
> +	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
> +	if (!(pte[index[0]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
> +	return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu);
>   
>   unmapped_gva:
>   	TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test
  2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
@ 2022-04-27 16:16   ` Paolo Bonzini
  0 siblings, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:16 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Thomas Huth, Claudio Imbrenda, shuah, kvm, linux-kselftest

On 4/27/22 17:53, Sasha Levin wrote:
> From: Thomas Huth <thuth@redhat.com>
> 
> [ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]
> 
> When compiling kvm_page_table_test.c, I get this compiler warning
> with gcc 11.2:
> 
> kvm_page_table_test.c: In function 'pre_init_before_test':
> ../../../../tools/include/linux/kernel.h:44:24: warning: comparison of
>   distinct pointer types lacks a cast
>     44 |         (void) (&_max1 == &_max2);              \
>        |                        ^~
> kvm_page_table_test.c:281:21: note: in expansion of macro 'max'
>    281 |         alignment = max(0x100000, alignment);
>        |                     ^~~
> 
> Fix it by adjusting the type of the absolute value.
> 
> Signed-off-by: Thomas Huth <thuth@redhat.com>
> Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Message-Id: <20220414103031.565037-1-thuth@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
> index ba1fdc3dcf4a..2c4a7563a4f8 100644
> --- a/tools/testing/selftests/kvm/kvm_page_table_test.c
> +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
> @@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
>   	else
>   		guest_test_phys_mem = p->phys_offset;
>   #ifdef __s390x__
> -	alignment = max(0x100000, alignment);
> +	alignment = max(0x100000UL, alignment);
>   #endif
>   	guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
>   

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH MANUALSEL 5.17 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
  2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
@ 2022-04-27 16:16   ` Paolo Bonzini
  0 siblings, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:16 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Wanpeng Li, Marcelo Tosatti, tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:53, Sasha Levin wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> [ Upstream commit 0361bdfddca20c8855ea3bdbbbc9c999912b10ff ]
> 
> MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to
> host-side polling after suspend/resume.  Non-bootstrap CPUs are
> restored correctly by the haltpoll driver because they are hot-unplugged
> during suspend and hot-plugged during resume; however, the BSP
> is not hotpluggable and remains in host-sde polling mode after
> the guest resume.  The makes the guest pay for the cost of vmexits
> every time the guest enters idle.
> 
> Fix it by recording BSP's haltpoll state and resuming it during guest
> resume.
> 
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> Message-Id: <1650267752-46796-1-git-send-email-wanpengli@tencent.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kernel/kvm.c | 13 +++++++++++++
>   1 file changed, 13 insertions(+)
> 
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index ed8a13ac4ab2..4c2a158bb6c4 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
>   DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
>   static int has_steal_clock = 0;
>   
> +static int has_guest_poll = 0;
>   /*
>    * No need for any "IO delay" on KVM
>    */
> @@ -706,14 +707,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
>   
>   static int kvm_suspend(void)
>   {
> +	u64 val = 0;
> +
>   	kvm_guest_cpu_offline(false);
>   
> +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
> +	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
> +		rdmsrl(MSR_KVM_POLL_CONTROL, val);
> +	has_guest_poll = !(val & 1);
> +#endif
>   	return 0;
>   }
>   
>   static void kvm_resume(void)
>   {
>   	kvm_cpu_online(raw_smp_processor_id());
> +
> +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
> +	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
> +		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
> +#endif
>   }
>   
>   static struct syscore_ops kvm_syscore_ops = {

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH MANUALSEL 5.17 4/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 4/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
@ 2022-04-27 16:16   ` Paolo Bonzini
  0 siblings, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:16 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Chao Gao, Sean Christopherson, tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit d22a81b304a27fca6124174a8e842e826c193466 ]
> 
> Emulating writes to SELF_IPI with a write to ICR has an unwanted side effect:
> the value of ICR in vAPIC page gets changed.  The lists SELF_IPI as write-only,
> with no associated MMIO offset, so any write should have no visible side
> effect in the vAPIC page.
> 
> Reported-by: Chao Gao <chao.gao@intel.com>
> Reviewed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/lapic.c | 7 +++----
>   1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 2a10d0033c96..6b6f9359d29e 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2125,10 +2125,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>   		break;
>   
>   	case APIC_SELF_IPI:
> -		if (apic_x2apic_mode(apic)) {
> -			kvm_lapic_reg_write(apic, APIC_ICR,
> -					    APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
> -		} else
> +		if (apic_x2apic_mode(apic))
> +			kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
> +		else
>   			ret = 1;
>   		break;
>   	default:

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
@ 2022-04-27 16:16   ` Paolo Bonzini
  0 siblings, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:16 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Sean Christopherson, tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]
> 
> WARN and bail if KVM attempts to free a root that isn't backed by a shadow
> page.  KVM allocates a bare page for "special" roots, e.g. when using PAE
> paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa
> will be valid but won't be backed by a shadow page.  It's all too easy to
> blindly call mmu_free_root_page() on root_hpa, be nice and WARN instead of
> crashing KVM and possibly the kernel.
> 
> Reviewed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/mmu/mmu.c | 2 ++
>   1 file changed, 2 insertions(+)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 7f009ebb319a..e7cd16e1e0a0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3239,6 +3239,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>   		return;
>   
>   	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> +	if (WARN_ON(!sp))
> +		return;
>   
>   	if (is_tdp_mmu_page(sp))
>   		kvm_tdp_mmu_put_root(kvm, sp, false);

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
@ 2022-04-27 16:17   ` Paolo Bonzini
  0 siblings, 0 replies; 14+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:17 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Wanpeng Li, Aili Yao, Sean Christopherson, tglx, mingo, bp,
	dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> [ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]
> 
> As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted
> interrupt") mentioned that the host admin should well tune the guest
> setup, so that vCPUs are placed on isolated pCPUs, and with several pCPUs
> surplus for *busy* housekeeping.  In this setup, it is preferrable to
> disable mwait/hlt/pause vmexits to keep the vCPUs in non-root mode.
> 
> However, if only some guests isolated and others not, they would not
> have any benefit from posted timer interrupts, and at the same time lose
> VMX preemption timer fast paths because kvm_can_post_timer_interrupt()
> returns true and therefore forces kvm_can_use_hv_timer() to false.
> 
> By guaranteeing that posted-interrupt timer is only used if MWAIT or
> HLT are done without vmexit, KVM can make a better choice and use the
> VMX preemption timer and the corresponding fast paths.
> 
> Reported-by: Aili Yao <yaoaili@kingsoft.com>
> Reviewed-by: Sean Christopherson <seanjc@google.com>
> Cc: Aili Yao <yaoaili@kingsoft.com>
> Cc: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> Message-Id: <1643112538-36743-1-git-send-email-wanpengli@tencent.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/lapic.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 6b6f9359d29e..970d5c740b00 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
>   
>   static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
>   {
> -	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
> +	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> +		(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
>   }
>   
>   bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-04-27 16:19 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-27 15:53 [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
2022-04-27 16:16   ` Paolo Bonzini
2022-04-27 15:53 ` [PATCH MANUALSEL 5.17 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
2022-04-27 16:16   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 4/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
2022-04-27 16:16   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 5/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
2022-04-27 16:16   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 6/7] KVM: SEV: Allow SEV intra-host migration of VM with mirrors Sasha Levin
2022-04-27 16:04   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.17 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
2022-04-27 16:17   ` Paolo Bonzini
2022-04-27 16:16 ` [PATCH MANUALSEL 5.17 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.