All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs
@ 2022-04-27 15:54 Sasha Levin
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
                   ` (6 more replies)
  0 siblings, 7 replies; 15+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Paolo Bonzini, Nana Liu, Peter Xu, Sasha Levin, shuah, nathan,
	ndesaulniers, jmattson, oupton, seanjc, ricarkol, yang.zhong,
	aaronlewis, wei.w.wang, kvm, linux-kselftest, llvm

From: Paolo Bonzini <pbonzini@redhat.com>

[ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]

Red Hat's QE team reported test failure on access_tracking_perf_test:

Testing guest mode: PA-bits:ANY, VA-bits:48,  4K pages
guest physical test memory offset: 0x3fffbffff000

Populating memory             : 0.684014577s
Writing to populated memory   : 0.006230175s
Reading from populated memory : 0.004557805s
==== Test Assertion Failure ====
  lib/kvm_util.c:1411: false
  pid=125806 tid=125809 errno=4 - Interrupted system call
     1  0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411
     2   (inlined by) addr_gpa2hva at kvm_util.c:1405
     3  0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98
     4   (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152
     5   (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232
     6  0x00007fefe9ff81ce: ?? ??:0
     7  0x00007fefe9c64d82: ?? ??:0
  No vm physical memory at 0xffbffff000

I can easily reproduce it with a Intel(R) Xeon(R) CPU E5-2630 with 46 bits
PA.

It turns out that the address translation for clearing idle page tracking
returned a wrong result; addr_gva2gpa()'s last step, which is based on
"pte[index[0]].pfn", did the calculation with 40 bits length and the
high 12 bits got truncated.  In above case the GPA address to be returned
should be 0x3fffbffff000 for GVA 0xc0000000, but it got truncated into
0xffbffff000 and the subsequent gpa2hva lookup failed.

The width of operations on bit fields greater than 32-bit is
implementation defined, and differs between GCC (which uses the bitfield
precision) and clang (which uses 64-bit arithmetic), so this is a
potential minefield.  Remove the bit fields and using manual masking
instead.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036
Reported-by: Nana Liu <nanliu@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Tested-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 .../selftests/kvm/include/x86_64/processor.h  |  15 ++
 .../selftests/kvm/lib/x86_64/processor.c      | 192 +++++++-----------
 2 files changed, 92 insertions(+), 115 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 05e65ca1c30c..23861c8faa61 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -58,6 +58,21 @@
 /* CPUID.0x8000_0001.EDX */
 #define CPUID_GBPAGES		(1ul << 26)
 
+/* Page table bitfield declarations */
+#define PTE_PRESENT_MASK        BIT_ULL(0)
+#define PTE_WRITABLE_MASK       BIT_ULL(1)
+#define PTE_USER_MASK           BIT_ULL(2)
+#define PTE_ACCESSED_MASK       BIT_ULL(5)
+#define PTE_DIRTY_MASK          BIT_ULL(6)
+#define PTE_LARGE_MASK          BIT_ULL(7)
+#define PTE_GLOBAL_MASK         BIT_ULL(8)
+#define PTE_NX_MASK             BIT_ULL(63)
+
+#define PAGE_SHIFT		12
+
+#define PHYSICAL_PAGE_MASK      GENMASK_ULL(51, 12)
+#define PTE_GET_PFN(pte)        (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+
 /* General Registers in 64-Bit Mode */
 struct gpr64_regs {
 	u64 rax;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index da73b97e1e6d..46057079d8bb 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -19,38 +19,6 @@
 
 vm_vaddr_t exception_handlers;
 
-/* Virtual translation table structure declarations */
-struct pageUpperEntry {
-	uint64_t present:1;
-	uint64_t writable:1;
-	uint64_t user:1;
-	uint64_t write_through:1;
-	uint64_t cache_disable:1;
-	uint64_t accessed:1;
-	uint64_t ignored_06:1;
-	uint64_t page_size:1;
-	uint64_t ignored_11_08:4;
-	uint64_t pfn:40;
-	uint64_t ignored_62_52:11;
-	uint64_t execute_disable:1;
-};
-
-struct pageTableEntry {
-	uint64_t present:1;
-	uint64_t writable:1;
-	uint64_t user:1;
-	uint64_t write_through:1;
-	uint64_t cache_disable:1;
-	uint64_t accessed:1;
-	uint64_t dirty:1;
-	uint64_t reserved_07:1;
-	uint64_t global:1;
-	uint64_t ignored_11_09:3;
-	uint64_t pfn:40;
-	uint64_t ignored_62_52:11;
-	uint64_t execute_disable:1;
-};
-
 void regs_dump(FILE *stream, struct kvm_regs *regs,
 	       uint8_t indent)
 {
@@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
 	return &page_table[index];
 }
 
-static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
-						    uint64_t pt_pfn,
-						    uint64_t vaddr,
-						    uint64_t paddr,
-						    int level,
-						    enum x86_page_size page_size)
+static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
+				       uint64_t pt_pfn,
+				       uint64_t vaddr,
+				       uint64_t paddr,
+				       int level,
+				       enum x86_page_size page_size)
 {
-	struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
-
-	if (!pte->present) {
-		pte->writable = true;
-		pte->present = true;
-		pte->page_size = (level == page_size);
-		if (pte->page_size)
-			pte->pfn = paddr >> vm->page_shift;
+	uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
+
+	if (!(*pte & PTE_PRESENT_MASK)) {
+		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
+		if (level == page_size)
+			*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 		else
-			pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
+			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
 	} else {
 		/*
 		 * Entry already present.  Assert that the caller doesn't want
@@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
 		TEST_ASSERT(level != page_size,
 			    "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
 			    page_size, vaddr);
-		TEST_ASSERT(!pte->page_size,
+		TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
 			    "Cannot create page table at level: %u, vaddr: 0x%lx\n",
 			    level, vaddr);
 	}
@@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		   enum x86_page_size page_size)
 {
 	const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
 		    "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 	 */
 	pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
 				      vaddr, paddr, 3, page_size);
-	if (pml4e->page_size)
+	if (*pml4e & PTE_LARGE_MASK)
 		return;
 
-	pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
-	if (pdpe->page_size)
+	pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
+	if (*pdpe & PTE_LARGE_MASK)
 		return;
 
-	pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
-	if (pde->page_size)
+	pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
+	if (*pde & PTE_LARGE_MASK)
 		return;
 
 	/* Fill in page table entry. */
-	pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
-	TEST_ASSERT(!pte->present,
+	pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
+	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
 		    "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
-	pte->pfn = paddr >> vm->page_shift;
-	pte->writable = true;
-	pte->present = 1;
+	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 }
 
 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 	__virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
 }
 
-static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
+static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
 						       uint64_t vaddr)
 {
 	uint16_t index[4];
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 	struct kvm_cpuid_entry2 *entry;
 	struct kvm_sregs sregs;
 	int max_phy_addr;
@@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
 	index[3] = (vaddr >> 39) & 0x1ffu;
 
 	pml4e = addr_gpa2hva(vm, vm->pgd);
-	TEST_ASSERT(pml4e[index[3]].present,
+	TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
 		"Expected pml4e to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
-		(rsvd_mask | (1ull << 7))) == 0,
+	TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
 		"Unexpected reserved bits set.");
 
-	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
-	TEST_ASSERT(pdpe[index[2]].present,
+	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+	TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
 		"Expected pdpe to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT(pdpe[index[2]].page_size == 0,
+	TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
 		"Expected pdpe to map a pde not a 1-GByte page.");
-	TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
+	TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
 		"Unexpected reserved bits set.");
 
-	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
-	TEST_ASSERT(pde[index[1]].present,
+	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+	TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
 		"Expected pde to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT(pde[index[1]].page_size == 0,
+	TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
 		"Expected pde to map a pte not a 2-MByte page.");
-	TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
+	TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
 		"Unexpected reserved bits set.");
 
-	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
-	TEST_ASSERT(pte[index[0]].present,
+	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+	TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
 		"Expected pte to be present for gva: 0x%08lx", vaddr);
 
 	return &pte[index[0]];
@@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
 
 uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
 {
-	struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
+	uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 
 	return *(uint64_t *)pte;
 }
@@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
 void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
 			     uint64_t pte)
 {
-	struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
-								  vaddr);
+	uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 
 	*(uint64_t *)new_pte = pte;
 }
 
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
-	struct pageUpperEntry *pml4e, *pml4e_start;
-	struct pageUpperEntry *pdpe, *pdpe_start;
-	struct pageUpperEntry *pde, *pde_start;
-	struct pageTableEntry *pte, *pte_start;
+	uint64_t *pml4e, *pml4e_start;
+	uint64_t *pdpe, *pdpe_start;
+	uint64_t *pde, *pde_start;
+	uint64_t *pte, *pte_start;
 
 	if (!vm->pgd_created)
 		return;
@@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	fprintf(stream, "%*s      index hvaddr         gpaddr         "
 		"addr         w exec dirty\n",
 		indent, "");
-	pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
+	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
 	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
 		pml4e = &pml4e_start[n1];
-		if (!pml4e->present)
+		if (!(*pml4e & PTE_PRESENT_MASK))
 			continue;
-		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
+		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
 			" %u\n",
 			indent, "",
 			pml4e - pml4e_start, pml4e,
-			addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
-			pml4e->writable, pml4e->execute_disable);
+			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
+			!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
 
-		pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
+		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
 		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
 			pdpe = &pdpe_start[n2];
-			if (!pdpe->present)
+			if (!(*pdpe & PTE_PRESENT_MASK))
 				continue;
-			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10lx "
+			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
 				"%u  %u\n",
 				indent, "",
 				pdpe - pdpe_start, pdpe,
 				addr_hva2gpa(vm, pdpe),
-				(uint64_t) pdpe->pfn, pdpe->writable,
-				pdpe->execute_disable);
+				PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
+				!!(*pdpe & PTE_NX_MASK));
 
-			pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
+			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
 			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
 				pde = &pde_start[n3];
-				if (!pde->present)
+				if (!(*pde & PTE_PRESENT_MASK))
 					continue;
 				fprintf(stream, "%*spde   0x%-3zx %p "
-					"0x%-12lx 0x%-10lx %u  %u\n",
+					"0x%-12lx 0x%-10llx %u  %u\n",
 					indent, "", pde - pde_start, pde,
 					addr_hva2gpa(vm, pde),
-					(uint64_t) pde->pfn, pde->writable,
-					pde->execute_disable);
+					PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
+					!!(*pde & PTE_NX_MASK));
 
-				pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
+				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
 				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
 					pte = &pte_start[n4];
-					if (!pte->present)
+					if (!(*pte & PTE_PRESENT_MASK))
 						continue;
 					fprintf(stream, "%*spte   0x%-3zx %p "
-						"0x%-12lx 0x%-10lx %u  %u "
+						"0x%-12lx 0x%-10llx %u  %u "
 						"    %u    0x%-10lx\n",
 						indent, "",
 						pte - pte_start, pte,
 						addr_hva2gpa(vm, pte),
-						(uint64_t) pte->pfn,
-						pte->writable,
-						pte->execute_disable,
-						pte->dirty,
+						PTE_GET_PFN(*pte),
+						!!(*pte & PTE_WRITABLE_MASK),
+						!!(*pte & PTE_NX_MASK),
+						!!(*pte & PTE_DIRTY_MASK),
 						((uint64_t) n1 << 27)
 							| ((uint64_t) n2 << 18)
 							| ((uint64_t) n3 << 9)
@@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
 vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 {
 	uint16_t index[4];
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 	if (!vm->pgd_created)
 		goto unmapped_gva;
 	pml4e = addr_gpa2hva(vm, vm->pgd);
-	if (!pml4e[index[3]].present)
+	if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
-	if (!pdpe[index[2]].present)
+	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+	if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
-	if (!pde[index[1]].present)
+	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+	if (!(pde[index[1]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
-	if (!pte[index[0]].present)
+	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+	if (!(pte[index[0]] & PTE_PRESENT_MASK))
 		goto unmapped_gva;
 
-	return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
+	return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu);
 
 unmapped_gva:
 	TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH MANUALSEL 5.15 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test
  2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:19   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 15+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Thomas Huth, Claudio Imbrenda, Paolo Bonzini, Sasha Levin, shuah,
	kvm, linux-kselftest

From: Thomas Huth <thuth@redhat.com>

[ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]

When compiling kvm_page_table_test.c, I get this compiler warning
with gcc 11.2:

kvm_page_table_test.c: In function 'pre_init_before_test':
../../../../tools/include/linux/kernel.h:44:24: warning: comparison of
 distinct pointer types lacks a cast
   44 |         (void) (&_max1 == &_max2);              \
      |                        ^~
kvm_page_table_test.c:281:21: note: in expansion of macro 'max'
  281 |         alignment = max(0x100000, alignment);
      |                     ^~~

Fix it by adjusting the type of the absolute value.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Message-Id: <20220414103031.565037-1-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index 36407cb0ec85..f1ddfe4c4a03 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
 	else
 		guest_test_phys_mem = p->phys_offset;
 #ifdef __s390x__
-	alignment = max(0x100000, alignment);
+	alignment = max(0x100000UL, alignment);
 #endif
 	guest_test_phys_mem &= ~(alignment - 1);
 
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH MANUALSEL 5.15 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
  2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:19   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots Sasha Levin
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 15+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Wanpeng Li, Marcelo Tosatti, Paolo Bonzini, Sasha Levin, tglx,
	mingo, bp, dave.hansen, x86, kvm

From: Wanpeng Li <wanpengli@tencent.com>

[ Upstream commit 0361bdfddca20c8855ea3bdbbbc9c999912b10ff ]

MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to
host-side polling after suspend/resume.  Non-bootstrap CPUs are
restored correctly by the haltpoll driver because they are hot-unplugged
during suspend and hot-plugged during resume; however, the BSP
is not hotpluggable and remains in host-sde polling mode after
the guest resume.  The makes the guest pay for the cost of vmexits
every time the guest enters idle.

Fix it by recording BSP's haltpoll state and resuming it during guest
resume.

Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1650267752-46796-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kernel/kvm.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index bd7b65081eb0..d36b58e705b6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -66,6 +66,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
 static int has_steal_clock = 0;
 
+static int has_guest_poll = 0;
 /*
  * No need for any "IO delay" on KVM
  */
@@ -650,14 +651,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 
 static int kvm_suspend(void)
 {
+	u64 val = 0;
+
 	kvm_guest_cpu_offline(false);
 
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+		rdmsrl(MSR_KVM_POLL_CONTROL, val);
+	has_guest_poll = !(val & 1);
+#endif
 	return 0;
 }
 
 static void kvm_resume(void)
 {
 	kvm_cpu_online(raw_smp_processor_id());
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+#endif
 }
 
 static struct syscore_ops kvm_syscore_ops = {
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots
  2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:19   ` Paolo Bonzini
  2022-04-27 16:20   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 5/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 15+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Paolo Bonzini, Sasha Levin, tglx, mingo, bp, dave.hansen, x86, kvm

From: Paolo Bonzini <pbonzini@redhat.com>

[ Upstream commit 614f6970aa70242a3f8a8051b01244c029f77b2a ]

Remove the "shared" argument of for_each_tdp_mmu_root_yield_safe, thus ensuring
that readers do not ever acquire a reference to an invalid root.  After this
patch, all readers except kvm_tdp_mmu_zap_invalidated_roots() treat
refcount=0/valid, refcount=0/invalid and refcount=1/invalid in exactly the
same way.  kvm_tdp_mmu_zap_invalidated_roots() is different but it also
does not acquire a reference to the invalid root, and it cannot see
refcount=0/invalid because it is guaranteed to run after
kvm_tdp_mmu_invalidate_all_roots().

Opportunistically add a lockdep assertion to the yield-safe iterator.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 853780eb033b..7e854313ec3b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -155,14 +155,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
 	     _root;								\
 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
-		if (kvm_mmu_page_as_id(_root) != _as_id) {			\
+		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
+		    kvm_mmu_page_as_id(_root) != _as_id) {			\
 		} else
 
 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
 
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)		\
-	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
+	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
 
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
@@ -828,7 +829,7 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
 {
 	struct kvm_mmu_page *root;
 
-	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
 				      false);
 
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH MANUALSEL 5.15 5/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI
  2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (2 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:19   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 6/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 15+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Paolo Bonzini, Chao Gao, Sean Christopherson, Sasha Levin, tglx,
	mingo, bp, dave.hansen, x86, kvm

From: Paolo Bonzini <pbonzini@redhat.com>

[ Upstream commit d22a81b304a27fca6124174a8e842e826c193466 ]

Emulating writes to SELF_IPI with a write to ICR has an unwanted side effect:
the value of ICR in vAPIC page gets changed.  The lists SELF_IPI as write-only,
with no associated MMIO offset, so any write should have no visible side
effect in the vAPIC page.

Reported-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/lapic.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4d92fb4fdf69..83d1743a1dd0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2125,10 +2125,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 		break;
 
 	case APIC_SELF_IPI:
-		if (apic_x2apic_mode(apic)) {
-			kvm_lapic_reg_write(apic, APIC_ICR,
-					    APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
-		} else
+		if (apic_x2apic_mode(apic))
+			kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
+		else
 			ret = 1;
 		break;
 	default:
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH MANUALSEL 5.15 6/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs
  2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (3 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 5/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:20   ` Paolo Bonzini
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
  2022-04-27 16:19 ` [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Paolo Bonzini
  6 siblings, 1 reply; 15+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Paolo Bonzini, Sean Christopherson, Sasha Levin, tglx, mingo, bp,
	dave.hansen, x86, kvm

From: Paolo Bonzini <pbonzini@redhat.com>

[ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]

WARN and bail if KVM attempts to free a root that isn't backed by a shadow
page.  KVM allocates a bare page for "special" roots, e.g. when using PAE
paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa
will be valid but won't be backed by a shadow page.  It's all too easy to
blindly call mmu_free_root_page() on root_hpa, be nice and WARN instead of
crashing KVM and possibly the kernel.

Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/mmu/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 34e828badc51..806f9d42bcce 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3314,6 +3314,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 		return;
 
 	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+	if (WARN_ON(!sp))
+		return;
 
 	if (is_tdp_mmu_page(sp))
 		kvm_tdp_mmu_put_root(kvm, sp, false);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH MANUALSEL 5.15 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised
  2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (4 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 6/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
@ 2022-04-27 15:54 ` Sasha Levin
  2022-04-27 16:20   ` Paolo Bonzini
  2022-04-27 16:19 ` [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Paolo Bonzini
  6 siblings, 1 reply; 15+ messages in thread
From: Sasha Levin @ 2022-04-27 15:54 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Wanpeng Li, Aili Yao, Sean Christopherson, Paolo Bonzini,
	Sasha Levin, tglx, mingo, bp, dave.hansen, x86, kvm

From: Wanpeng Li <wanpengli@tencent.com>

[ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]

As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted
interrupt") mentioned that the host admin should well tune the guest
setup, so that vCPUs are placed on isolated pCPUs, and with several pCPUs
surplus for *busy* housekeeping.  In this setup, it is preferrable to
disable mwait/hlt/pause vmexits to keep the vCPUs in non-root mode.

However, if only some guests isolated and others not, they would not
have any benefit from posted timer interrupts, and at the same time lose
VMX preemption timer fast paths because kvm_can_post_timer_interrupt()
returns true and therefore forces kvm_can_use_hv_timer() to false.

By guaranteeing that posted-interrupt timer is only used if MWAIT or
HLT are done without vmexit, KVM can make a better choice and use the
VMX preemption timer and the corresponding fast paths.

Reported-by: Aili Yao <yaoaili@kingsoft.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Cc: Aili Yao <yaoaili@kingsoft.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1643112538-36743-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/lapic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 83d1743a1dd0..493d636e6231 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 
 static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
 {
-	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+		(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
 }
 
 bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs
  2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
                   ` (5 preceding siblings ...)
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
@ 2022-04-27 16:19 ` Paolo Bonzini
  6 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:19 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Nana Liu, Peter Xu, shuah, nathan, ndesaulniers, jmattson,
	oupton, seanjc, ricarkol, yang.zhong, aaronlewis, wei.w.wang,
	kvm, linux-kselftest, llvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]
> 
> Red Hat's QE team reported test failure on access_tracking_perf_test:
> 
> Testing guest mode: PA-bits:ANY, VA-bits:48,  4K pages
> guest physical test memory offset: 0x3fffbffff000
> 
> Populating memory             : 0.684014577s
> Writing to populated memory   : 0.006230175s
> Reading from populated memory : 0.004557805s
> ==== Test Assertion Failure ====
>    lib/kvm_util.c:1411: false
>    pid=125806 tid=125809 errno=4 - Interrupted system call
>       1  0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411
>       2   (inlined by) addr_gpa2hva at kvm_util.c:1405
>       3  0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98
>       4   (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152
>       5   (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232
>       6  0x00007fefe9ff81ce: ?? ??:0
>       7  0x00007fefe9c64d82: ?? ??:0
>    No vm physical memory at 0xffbffff000
> 
> I can easily reproduce it with a Intel(R) Xeon(R) CPU E5-2630 with 46 bits
> PA.
> 
> It turns out that the address translation for clearing idle page tracking
> returned a wrong result; addr_gva2gpa()'s last step, which is based on
> "pte[index[0]].pfn", did the calculation with 40 bits length and the
> high 12 bits got truncated.  In above case the GPA address to be returned
> should be 0x3fffbffff000 for GVA 0xc0000000, but it got truncated into
> 0xffbffff000 and the subsequent gpa2hva lookup failed.
> 
> The width of operations on bit fields greater than 32-bit is
> implementation defined, and differs between GCC (which uses the bitfield
> precision) and clang (which uses 64-bit arithmetic), so this is a
> potential minefield.  Remove the bit fields and using manual masking
> instead.
> 
> Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036
> Reported-by: Nana Liu <nanliu@redhat.com>
> Reviewed-by: Peter Xu <peterx@redhat.com>
> Tested-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   .../selftests/kvm/include/x86_64/processor.h  |  15 ++
>   .../selftests/kvm/lib/x86_64/processor.c      | 192 +++++++-----------
>   2 files changed, 92 insertions(+), 115 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
> index 05e65ca1c30c..23861c8faa61 100644
> --- a/tools/testing/selftests/kvm/include/x86_64/processor.h
> +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
> @@ -58,6 +58,21 @@
>   /* CPUID.0x8000_0001.EDX */
>   #define CPUID_GBPAGES		(1ul << 26)
>   
> +/* Page table bitfield declarations */
> +#define PTE_PRESENT_MASK        BIT_ULL(0)
> +#define PTE_WRITABLE_MASK       BIT_ULL(1)
> +#define PTE_USER_MASK           BIT_ULL(2)
> +#define PTE_ACCESSED_MASK       BIT_ULL(5)
> +#define PTE_DIRTY_MASK          BIT_ULL(6)
> +#define PTE_LARGE_MASK          BIT_ULL(7)
> +#define PTE_GLOBAL_MASK         BIT_ULL(8)
> +#define PTE_NX_MASK             BIT_ULL(63)
> +
> +#define PAGE_SHIFT		12
> +
> +#define PHYSICAL_PAGE_MASK      GENMASK_ULL(51, 12)
> +#define PTE_GET_PFN(pte)        (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
> +
>   /* General Registers in 64-Bit Mode */
>   struct gpr64_regs {
>   	u64 rax;
> diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
> index da73b97e1e6d..46057079d8bb 100644
> --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
> +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
> @@ -19,38 +19,6 @@
>   
>   vm_vaddr_t exception_handlers;
>   
> -/* Virtual translation table structure declarations */
> -struct pageUpperEntry {
> -	uint64_t present:1;
> -	uint64_t writable:1;
> -	uint64_t user:1;
> -	uint64_t write_through:1;
> -	uint64_t cache_disable:1;
> -	uint64_t accessed:1;
> -	uint64_t ignored_06:1;
> -	uint64_t page_size:1;
> -	uint64_t ignored_11_08:4;
> -	uint64_t pfn:40;
> -	uint64_t ignored_62_52:11;
> -	uint64_t execute_disable:1;
> -};
> -
> -struct pageTableEntry {
> -	uint64_t present:1;
> -	uint64_t writable:1;
> -	uint64_t user:1;
> -	uint64_t write_through:1;
> -	uint64_t cache_disable:1;
> -	uint64_t accessed:1;
> -	uint64_t dirty:1;
> -	uint64_t reserved_07:1;
> -	uint64_t global:1;
> -	uint64_t ignored_11_09:3;
> -	uint64_t pfn:40;
> -	uint64_t ignored_62_52:11;
> -	uint64_t execute_disable:1;
> -};
> -
>   void regs_dump(FILE *stream, struct kvm_regs *regs,
>   	       uint8_t indent)
>   {
> @@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
>   	return &page_table[index];
>   }
>   
> -static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
> -						    uint64_t pt_pfn,
> -						    uint64_t vaddr,
> -						    uint64_t paddr,
> -						    int level,
> -						    enum x86_page_size page_size)
> +static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
> +				       uint64_t pt_pfn,
> +				       uint64_t vaddr,
> +				       uint64_t paddr,
> +				       int level,
> +				       enum x86_page_size page_size)
>   {
> -	struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
> -
> -	if (!pte->present) {
> -		pte->writable = true;
> -		pte->present = true;
> -		pte->page_size = (level == page_size);
> -		if (pte->page_size)
> -			pte->pfn = paddr >> vm->page_shift;
> +	uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
> +
> +	if (!(*pte & PTE_PRESENT_MASK)) {
> +		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
> +		if (level == page_size)
> +			*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
>   		else
> -			pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
> +			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
>   	} else {
>   		/*
>   		 * Entry already present.  Assert that the caller doesn't want
> @@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
>   		TEST_ASSERT(level != page_size,
>   			    "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
>   			    page_size, vaddr);
> -		TEST_ASSERT(!pte->page_size,
> +		TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
>   			    "Cannot create page table at level: %u, vaddr: 0x%lx\n",
>   			    level, vaddr);
>   	}
> @@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
>   		   enum x86_page_size page_size)
>   {
>   	const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
> -	struct pageUpperEntry *pml4e, *pdpe, *pde;
> -	struct pageTableEntry *pte;
> +	uint64_t *pml4e, *pdpe, *pde;
> +	uint64_t *pte;
>   
>   	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
>   		    "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
> @@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
>   	 */
>   	pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
>   				      vaddr, paddr, 3, page_size);
> -	if (pml4e->page_size)
> +	if (*pml4e & PTE_LARGE_MASK)
>   		return;
>   
> -	pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
> -	if (pdpe->page_size)
> +	pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
> +	if (*pdpe & PTE_LARGE_MASK)
>   		return;
>   
> -	pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
> -	if (pde->page_size)
> +	pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
> +	if (*pde & PTE_LARGE_MASK)
>   		return;
>   
>   	/* Fill in page table entry. */
> -	pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
> -	TEST_ASSERT(!pte->present,
> +	pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
> +	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
>   		    "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
> -	pte->pfn = paddr >> vm->page_shift;
> -	pte->writable = true;
> -	pte->present = 1;
> +	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
>   }
>   
>   void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
> @@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
>   	__virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
>   }
>   
> -static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
> +static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
>   						       uint64_t vaddr)
>   {
>   	uint16_t index[4];
> -	struct pageUpperEntry *pml4e, *pdpe, *pde;
> -	struct pageTableEntry *pte;
> +	uint64_t *pml4e, *pdpe, *pde;
> +	uint64_t *pte;
>   	struct kvm_cpuid_entry2 *entry;
>   	struct kvm_sregs sregs;
>   	int max_phy_addr;
> @@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
>   	index[3] = (vaddr >> 39) & 0x1ffu;
>   
>   	pml4e = addr_gpa2hva(vm, vm->pgd);
> -	TEST_ASSERT(pml4e[index[3]].present,
> +	TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
>   		"Expected pml4e to be present for gva: 0x%08lx", vaddr);
> -	TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
> -		(rsvd_mask | (1ull << 7))) == 0,
> +	TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
>   		"Unexpected reserved bits set.");
>   
> -	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
> -	TEST_ASSERT(pdpe[index[2]].present,
> +	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
> +	TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
>   		"Expected pdpe to be present for gva: 0x%08lx", vaddr);
> -	TEST_ASSERT(pdpe[index[2]].page_size == 0,
> +	TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
>   		"Expected pdpe to map a pde not a 1-GByte page.");
> -	TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
> +	TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
>   		"Unexpected reserved bits set.");
>   
> -	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
> -	TEST_ASSERT(pde[index[1]].present,
> +	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
> +	TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
>   		"Expected pde to be present for gva: 0x%08lx", vaddr);
> -	TEST_ASSERT(pde[index[1]].page_size == 0,
> +	TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
>   		"Expected pde to map a pte not a 2-MByte page.");
> -	TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
> +	TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
>   		"Unexpected reserved bits set.");
>   
> -	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
> -	TEST_ASSERT(pte[index[0]].present,
> +	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
> +	TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
>   		"Expected pte to be present for gva: 0x%08lx", vaddr);
>   
>   	return &pte[index[0]];
> @@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
>   
>   uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
>   {
> -	struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
> +	uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
>   
>   	return *(uint64_t *)pte;
>   }
> @@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
>   void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
>   			     uint64_t pte)
>   {
> -	struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
> -								  vaddr);
> +	uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
>   
>   	*(uint64_t *)new_pte = pte;
>   }
>   
>   void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
>   {
> -	struct pageUpperEntry *pml4e, *pml4e_start;
> -	struct pageUpperEntry *pdpe, *pdpe_start;
> -	struct pageUpperEntry *pde, *pde_start;
> -	struct pageTableEntry *pte, *pte_start;
> +	uint64_t *pml4e, *pml4e_start;
> +	uint64_t *pdpe, *pdpe_start;
> +	uint64_t *pde, *pde_start;
> +	uint64_t *pte, *pte_start;
>   
>   	if (!vm->pgd_created)
>   		return;
> @@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
>   	fprintf(stream, "%*s      index hvaddr         gpaddr         "
>   		"addr         w exec dirty\n",
>   		indent, "");
> -	pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
> +	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
>   	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
>   		pml4e = &pml4e_start[n1];
> -		if (!pml4e->present)
> +		if (!(*pml4e & PTE_PRESENT_MASK))
>   			continue;
> -		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
> +		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
>   			" %u\n",
>   			indent, "",
>   			pml4e - pml4e_start, pml4e,
> -			addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
> -			pml4e->writable, pml4e->execute_disable);
> +			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
> +			!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
>   
> -		pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
> +		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
>   		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
>   			pdpe = &pdpe_start[n2];
> -			if (!pdpe->present)
> +			if (!(*pdpe & PTE_PRESENT_MASK))
>   				continue;
> -			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10lx "
> +			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
>   				"%u  %u\n",
>   				indent, "",
>   				pdpe - pdpe_start, pdpe,
>   				addr_hva2gpa(vm, pdpe),
> -				(uint64_t) pdpe->pfn, pdpe->writable,
> -				pdpe->execute_disable);
> +				PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
> +				!!(*pdpe & PTE_NX_MASK));
>   
> -			pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
> +			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
>   			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
>   				pde = &pde_start[n3];
> -				if (!pde->present)
> +				if (!(*pde & PTE_PRESENT_MASK))
>   					continue;
>   				fprintf(stream, "%*spde   0x%-3zx %p "
> -					"0x%-12lx 0x%-10lx %u  %u\n",
> +					"0x%-12lx 0x%-10llx %u  %u\n",
>   					indent, "", pde - pde_start, pde,
>   					addr_hva2gpa(vm, pde),
> -					(uint64_t) pde->pfn, pde->writable,
> -					pde->execute_disable);
> +					PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
> +					!!(*pde & PTE_NX_MASK));
>   
> -				pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
> +				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
>   				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
>   					pte = &pte_start[n4];
> -					if (!pte->present)
> +					if (!(*pte & PTE_PRESENT_MASK))
>   						continue;
>   					fprintf(stream, "%*spte   0x%-3zx %p "
> -						"0x%-12lx 0x%-10lx %u  %u "
> +						"0x%-12lx 0x%-10llx %u  %u "
>   						"    %u    0x%-10lx\n",
>   						indent, "",
>   						pte - pte_start, pte,
>   						addr_hva2gpa(vm, pte),
> -						(uint64_t) pte->pfn,
> -						pte->writable,
> -						pte->execute_disable,
> -						pte->dirty,
> +						PTE_GET_PFN(*pte),
> +						!!(*pte & PTE_WRITABLE_MASK),
> +						!!(*pte & PTE_NX_MASK),
> +						!!(*pte & PTE_DIRTY_MASK),
>   						((uint64_t) n1 << 27)
>   							| ((uint64_t) n2 << 18)
>   							| ((uint64_t) n3 << 9)
> @@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
>   vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
>   {
>   	uint16_t index[4];
> -	struct pageUpperEntry *pml4e, *pdpe, *pde;
> -	struct pageTableEntry *pte;
> +	uint64_t *pml4e, *pdpe, *pde;
> +	uint64_t *pte;
>   
>   	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
>   		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
> @@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
>   	if (!vm->pgd_created)
>   		goto unmapped_gva;
>   	pml4e = addr_gpa2hva(vm, vm->pgd);
> -	if (!pml4e[index[3]].present)
> +	if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
> -	if (!pdpe[index[2]].present)
> +	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
> +	if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
> -	if (!pde[index[1]].present)
> +	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
> +	if (!(pde[index[1]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
> -	if (!pte[index[0]].present)
> +	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
> +	if (!(pte[index[0]] & PTE_PRESENT_MASK))
>   		goto unmapped_gva;
>   
> -	return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
> +	return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu);
>   
>   unmapped_gva:
>   	TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
@ 2022-04-27 16:19   ` Paolo Bonzini
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:19 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Thomas Huth, Claudio Imbrenda, shuah, kvm, linux-kselftest

On 4/27/22 17:54, Sasha Levin wrote:
> From: Thomas Huth <thuth@redhat.com>
> 
> [ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]
> 
> When compiling kvm_page_table_test.c, I get this compiler warning
> with gcc 11.2:
> 
> kvm_page_table_test.c: In function 'pre_init_before_test':
> ../../../../tools/include/linux/kernel.h:44:24: warning: comparison of
>   distinct pointer types lacks a cast
>     44 |         (void) (&_max1 == &_max2);              \
>        |                        ^~
> kvm_page_table_test.c:281:21: note: in expansion of macro 'max'
>    281 |         alignment = max(0x100000, alignment);
>        |                     ^~~
> 
> Fix it by adjusting the type of the absolute value.
> 
> Signed-off-by: Thomas Huth <thuth@redhat.com>
> Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Message-Id: <20220414103031.565037-1-thuth@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
> index 36407cb0ec85..f1ddfe4c4a03 100644
> --- a/tools/testing/selftests/kvm/kvm_page_table_test.c
> +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
> @@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
>   	else
>   		guest_test_phys_mem = p->phys_offset;
>   #ifdef __s390x__
> -	alignment = max(0x100000, alignment);
> +	alignment = max(0x100000UL, alignment);
>   #endif
>   	guest_test_phys_mem &= ~(alignment - 1);
>   

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
@ 2022-04-27 16:19   ` Paolo Bonzini
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:19 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Wanpeng Li, Marcelo Tosatti, tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> [ Upstream commit 0361bdfddca20c8855ea3bdbbbc9c999912b10ff ]
> 
> MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to
> host-side polling after suspend/resume.  Non-bootstrap CPUs are
> restored correctly by the haltpoll driver because they are hot-unplugged
> during suspend and hot-plugged during resume; however, the BSP
> is not hotpluggable and remains in host-sde polling mode after
> the guest resume.  The makes the guest pay for the cost of vmexits
> every time the guest enters idle.
> 
> Fix it by recording BSP's haltpoll state and resuming it during guest
> resume.
> 
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> Message-Id: <1650267752-46796-1-git-send-email-wanpengli@tencent.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kernel/kvm.c | 13 +++++++++++++
>   1 file changed, 13 insertions(+)
> 
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index bd7b65081eb0..d36b58e705b6 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -66,6 +66,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
>   DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
>   static int has_steal_clock = 0;
>   
> +static int has_guest_poll = 0;
>   /*
>    * No need for any "IO delay" on KVM
>    */
> @@ -650,14 +651,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
>   
>   static int kvm_suspend(void)
>   {
> +	u64 val = 0;
> +
>   	kvm_guest_cpu_offline(false);
>   
> +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
> +	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
> +		rdmsrl(MSR_KVM_POLL_CONTROL, val);
> +	has_guest_poll = !(val & 1);
> +#endif
>   	return 0;
>   }
>   
>   static void kvm_resume(void)
>   {
>   	kvm_cpu_online(raw_smp_processor_id());
> +
> +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
> +	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
> +		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
> +#endif
>   }
>   
>   static struct syscore_ops kvm_syscore_ops = {

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots Sasha Levin
@ 2022-04-27 16:19   ` Paolo Bonzini
  2022-04-27 16:20   ` Paolo Bonzini
  1 sibling, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:19 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable; +Cc: tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit 614f6970aa70242a3f8a8051b01244c029f77b2a ]
> 
> Remove the "shared" argument of for_each_tdp_mmu_root_yield_safe, thus ensuring
> that readers do not ever acquire a reference to an invalid root.  After this
> patch, all readers except kvm_tdp_mmu_zap_invalidated_roots() treat
> refcount=0/valid, refcount=0/invalid and refcount=1/invalid in exactly the
> same way.  kvm_tdp_mmu_zap_invalidated_roots() is different but it also
> does not acquire a reference to the invalid root, and it cannot see
> refcount=0/invalid because it is guaranteed to run after
> kvm_tdp_mmu_invalidate_all_roots().
> 
> Opportunistically add a lockdep assertion to the yield-safe iterator.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/mmu/tdp_mmu.c | 9 +++++----
>   1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 853780eb033b..7e854313ec3b 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -155,14 +155,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
>   	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
>   	     _root;								\
>   	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
> -		if (kvm_mmu_page_as_id(_root) != _as_id) {			\
> +		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
> +		    kvm_mmu_page_as_id(_root) != _as_id) {			\
>   		} else
>   
>   #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
>   	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
>   
> -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)		\
> -	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
> +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
> +	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
>   
>   #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
>   	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
> @@ -828,7 +829,7 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
>   {
>   	struct kvm_mmu_page *root;
>   
> -	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
> +	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
>   		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
>   				      false);
>   

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 5/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 5/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
@ 2022-04-27 16:19   ` Paolo Bonzini
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:19 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Chao Gao, Sean Christopherson, tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit d22a81b304a27fca6124174a8e842e826c193466 ]
> 
> Emulating writes to SELF_IPI with a write to ICR has an unwanted side effect:
> the value of ICR in vAPIC page gets changed.  The lists SELF_IPI as write-only,
> with no associated MMIO offset, so any write should have no visible side
> effect in the vAPIC page.
> 
> Reported-by: Chao Gao <chao.gao@intel.com>
> Reviewed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/lapic.c | 7 +++----
>   1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 4d92fb4fdf69..83d1743a1dd0 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2125,10 +2125,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>   		break;
>   
>   	case APIC_SELF_IPI:
> -		if (apic_x2apic_mode(apic)) {
> -			kvm_lapic_reg_write(apic, APIC_ICR,
> -					    APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
> -		} else
> +		if (apic_x2apic_mode(apic))
> +			kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
> +		else
>   			ret = 1;
>   		break;
>   	default:

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots Sasha Levin
  2022-04-27 16:19   ` Paolo Bonzini
@ 2022-04-27 16:20   ` Paolo Bonzini
  1 sibling, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:20 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable; +Cc: tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit 614f6970aa70242a3f8a8051b01244c029f77b2a ]
> 
> Remove the "shared" argument of for_each_tdp_mmu_root_yield_safe, thus ensuring
> that readers do not ever acquire a reference to an invalid root.  After this
> patch, all readers except kvm_tdp_mmu_zap_invalidated_roots() treat
> refcount=0/valid, refcount=0/invalid and refcount=1/invalid in exactly the
> same way.  kvm_tdp_mmu_zap_invalidated_roots() is different but it also
> does not acquire a reference to the invalid root, and it cannot see
> refcount=0/invalid because it is guaranteed to run after
> kvm_tdp_mmu_invalidate_all_roots().
> 
> Opportunistically add a lockdep assertion to the yield-safe iterator.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/mmu/tdp_mmu.c | 9 +++++----
>   1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 853780eb033b..7e854313ec3b 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -155,14 +155,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
>   	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
>   	     _root;								\
>   	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
> -		if (kvm_mmu_page_as_id(_root) != _as_id) {			\
> +		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
> +		    kvm_mmu_page_as_id(_root) != _as_id) {			\
>   		} else
>   
>   #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
>   	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
>   
> -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)		\
> -	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
> +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
> +	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
>   
>   #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
>   	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
> @@ -828,7 +829,7 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
>   {
>   	struct kvm_mmu_page *root;
>   
> -	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
> +	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
>   		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
>   				      false);
>   

Sorry no, this is a NACK.

Paolo


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 6/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 6/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
@ 2022-04-27 16:20   ` Paolo Bonzini
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:20 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Sean Christopherson, tglx, mingo, bp, dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> [ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]
> 
> WARN and bail if KVM attempts to free a root that isn't backed by a shadow
> page.  KVM allocates a bare page for "special" roots, e.g. when using PAE
> paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa
> will be valid but won't be backed by a shadow page.  It's all too easy to
> blindly call mmu_free_root_page() on root_hpa, be nice and WARN instead of
> crashing KVM and possibly the kernel.
> 
> Reviewed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/mmu/mmu.c | 2 ++
>   1 file changed, 2 insertions(+)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 34e828badc51..806f9d42bcce 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3314,6 +3314,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>   		return;
>   
>   	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> +	if (WARN_ON(!sp))
> +		return;
>   
>   	if (is_tdp_mmu_page(sp))
>   		kvm_tdp_mmu_put_root(kvm, sp, false);

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH MANUALSEL 5.15 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised
  2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
@ 2022-04-27 16:20   ` Paolo Bonzini
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-27 16:20 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Wanpeng Li, Aili Yao, Sean Christopherson, tglx, mingo, bp,
	dave.hansen, x86, kvm

On 4/27/22 17:54, Sasha Levin wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> [ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]
> 
> As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted
> interrupt") mentioned that the host admin should well tune the guest
> setup, so that vCPUs are placed on isolated pCPUs, and with several pCPUs
> surplus for *busy* housekeeping.  In this setup, it is preferrable to
> disable mwait/hlt/pause vmexits to keep the vCPUs in non-root mode.
> 
> However, if only some guests isolated and others not, they would not
> have any benefit from posted timer interrupts, and at the same time lose
> VMX preemption timer fast paths because kvm_can_post_timer_interrupt()
> returns true and therefore forces kvm_can_use_hv_timer() to false.
> 
> By guaranteeing that posted-interrupt timer is only used if MWAIT or
> HLT are done without vmexit, KVM can make a better choice and use the
> VMX preemption timer and the corresponding fast paths.
> 
> Reported-by: Aili Yao <yaoaili@kingsoft.com>
> Reviewed-by: Sean Christopherson <seanjc@google.com>
> Cc: Aili Yao <yaoaili@kingsoft.com>
> Cc: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> Message-Id: <1643112538-36743-1-git-send-email-wanpengli@tencent.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>   arch/x86/kvm/lapic.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 83d1743a1dd0..493d636e6231 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
>   
>   static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
>   {
> -	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
> +	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
> +		(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
>   }
>   
>   bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)

Acked-by: Paolo Bonzini <pbonzini@redhat.com>


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-04-27 16:27 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-27 15:54 [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Sasha Levin
2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 2/7] KVM: selftests: Silence compiler warning in the kvm_page_table_test Sasha Levin
2022-04-27 16:19   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 3/7] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume Sasha Levin
2022-04-27 16:19   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 4/7] KVM: x86/mmu: do not allow readers to acquire references to invalid roots Sasha Levin
2022-04-27 16:19   ` Paolo Bonzini
2022-04-27 16:20   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 5/7] KVM: x86: Do not change ICR on write to APIC_SELF_IPI Sasha Levin
2022-04-27 16:19   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 6/7] KVM: x86/mmu: avoid NULL-pointer dereference on page freeing bugs Sasha Levin
2022-04-27 16:20   ` Paolo Bonzini
2022-04-27 15:54 ` [PATCH MANUALSEL 5.15 7/7] KVM: LAPIC: Enable timer posted-interrupt only when mwait/hlt is advertised Sasha Levin
2022-04-27 16:20   ` Paolo Bonzini
2022-04-27 16:19 ` [PATCH MANUALSEL 5.15 1/7] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.