* [kvm PATCH v3 1/1] kvm: vmx: use vmalloc() to allocate vcpus
@ 2018-10-24 19:39 Marc Orr
2018-10-24 19:46 ` Matthew Wilcox
2018-10-25 0:08 ` kbuild test robot
0 siblings, 2 replies; 3+ messages in thread
From: Marc Orr @ 2018-10-24 19:39 UTC (permalink / raw)
To: kvm, jmattson, rientjes, konrad.wilk, linux-mm, akpm, pbonzini,
rkrcmar, willy
Cc: Marc Orr
Previously, vcpus were allocated through the kmem_cache_zalloc() API,
which requires the underlying physical memory to be contiguous.
Because the x86 vcpu struct, struct vcpu_vmx, is relatively large
(e.g., currently 47680 bytes on my setup), it can become hard to find
contiguous memory.
At the same time, the comments in the code indicate that the primary
reason for using the kmem_cache_zalloc() API is to align the memory
rather than to provide physical contiguity.
Thus, this patch updates the vcpu allocation logic for vmx to use the
vmalloc() API.
Note, rather than calling __vmalloc_node_range() directly, this patch
adds a vzalloc_account() helper to mm/vmalloc.c (built on
__vmalloc_node_flags() with __GFP_ACCOUNT), declares it in
include/linux/vmalloc.h, and exports it for use by the kvm module.
Signed-off-by: Marc Orr <marcorr@google.com>
---
arch/x86/kvm/vmx.c | 89 +++++++++++++++++++++++++++++++++++++----
include/linux/vmalloc.h | 1 +
mm/vmalloc.c | 7 ++++
virt/kvm/kvm_main.c | 28 +++++++------
4 files changed, 105 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index abeeb45d1c33..62fcc0d63585 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -898,7 +898,14 @@ struct nested_vmx {
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
-/* Posted-Interrupt Descriptor */
+/*
+ * Posted-Interrupt Descriptor
+ *
+ * Note, the physical address of this structure is used by VMX. Furthermore, the
+ * translation code assumes that the entire pi_desc struct resides within a
+ * single page, which will be true because the struct is 64 bytes and 64-byte
+ * aligned.
+ */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
union {
@@ -970,8 +977,25 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
struct vmx_msrs {
unsigned int nr;
- struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+ struct vmx_msr_entry *val;
};
+struct kmem_cache *vmx_msr_entry_cache;
+
+/*
+ * To prevent vmx_msr_entry array from crossing a page boundary, require:
+ * sizeof(*vmx_msrs.vmx_msr_entry.val) to be a power of two. This is guaranteed
+ * through compile-time asserts that:
+ * - NR_AUTOLOAD_MSRS * sizeof(struct vmx_msr_entry) is a power of two
+ * - NR_AUTOLOAD_MSRS * sizeof(struct vmx_msr_entry) <= PAGE_SIZE
+ * - The allocation of vmx_msrs.vmx_msr_entry.val is aligned to its size.
+ */
+#define CHECK_POWER_OF_TWO(val) \
+ BUILD_BUG_ON_MSG(!((val) && !((val) & ((val) - 1))), \
+ #val " is not a power of two.")
+#define CHECK_INTRA_PAGE(val) do { \
+ CHECK_POWER_OF_TWO(val); \
+ BUILD_BUG_ON(!(val <= PAGE_SIZE)); \
+ } while (0)
struct vcpu_vmx {
struct kvm_vcpu vcpu;
@@ -6616,6 +6640,14 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
}
if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ /*
+ * Note, pi_desc is contained within a single
+ * page because the struct is 64 bytes and 64-byte aligned.
+ */
+ phys_addr_t pi_desc_phys =
+ page_to_phys(vmalloc_to_page(&vmx->pi_desc)) +
+ (u64)&vmx->pi_desc % PAGE_SIZE;
+
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -6624,7 +6656,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR, pi_desc_phys);
}
if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
@@ -11476,19 +11508,34 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
+ kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.guest.val);
+ kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.host.val);
+ vfree(vmx);
}
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
- struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ struct vcpu_vmx *vmx = vzalloc_account(sizeof(struct vcpu_vmx));
unsigned long *msr_bitmap;
int cpu;
if (!vmx)
return ERR_PTR(-ENOMEM);
+ vmx->msr_autoload.guest.val =
+ kmem_cache_zalloc(vmx_msr_entry_cache, GFP_KERNEL);
+ if (!vmx->msr_autoload.guest.val) {
+ err = -ENOMEM;
+ goto free_vmx;
+ }
+ vmx->msr_autoload.host.val =
+ kmem_cache_zalloc(vmx_msr_entry_cache, GFP_KERNEL);
+ if (!vmx->msr_autoload.host.val) {
+ err = -ENOMEM;
+ goto free_msr_autoload_guest;
+ }
+
vmx->vpid = allocate_vpid();
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
@@ -11576,7 +11623,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
free_vpid(vmx->vpid);
- kmem_cache_free(kvm_vcpu_cache, vmx);
+ kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.host.val);
+free_msr_autoload_guest:
+ kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.guest.val);
+free_vmx:
+ vfree(vmx);
return ERR_PTR(err);
}
@@ -15153,6 +15204,10 @@ module_exit(vmx_exit);
static int __init vmx_init(void)
{
int r;
+ size_t vmx_msr_entry_size =
+ sizeof(struct vmx_msr_entry) * NR_AUTOLOAD_MSRS;
+
+ CHECK_INTRA_PAGE(vmx_msr_entry_size);
#if IS_ENABLED(CONFIG_HYPERV)
/*
@@ -15183,10 +15238,25 @@ static int __init vmx_init(void)
}
#endif
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ /*
+ * Disable kmem cache; vmalloc will be used instead
+ * to avoid OOM'ing when memory is available but not contiguous.
+ */
+ r = kvm_init(&vmx_x86_ops, 0, 0, THIS_MODULE);
if (r)
return r;
+ /*
+ * A vmx_msr_entry array resides exclusively within the kernel. Thus,
+ * use kmem_cache_create_usercopy(), with the usersize argument set to
+ * ZERO, to blacklist copying vmx_msr_entry to/from user space.
+ */
+ vmx_msr_entry_cache =
+ kmem_cache_create_usercopy("vmx_msr_entry", vmx_msr_entry_size,
+ vmx_msr_entry_size, SLAB_ACCOUNT, 0, 0, NULL);
+ if (!vmx_msr_entry_cache) {
+ r = -ENOMEM;
+ goto out;
+ }
/*
* Must be called after kvm_init() so enable_ept is properly set
@@ -15210,5 +15280,8 @@ static int __init vmx_init(void)
vmx_check_vmcs12_offsets();
return 0;
+out:
+ kvm_exit();
+ return r;
}
module_init(vmx_init);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 398e9c95cd61..47ae6e19ea72 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -71,6 +71,7 @@ static inline void vmalloc_init(void)
extern void *vmalloc(unsigned long size);
extern void *vzalloc(unsigned long size);
+extern void *vzalloc_account(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node);
extern void *vzalloc_node(unsigned long size, int node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a728fc492557..20adc04d9558 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1846,6 +1846,13 @@ void *vzalloc(unsigned long size)
}
EXPORT_SYMBOL(vzalloc);
+void *vzalloc_account(unsigned long size)
+{
+ return __vmalloc_node_flags(size, NUMA_NO_NODE,
+ GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT);
+}
+EXPORT_SYMBOL(vzalloc_account);
+
/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 786ade1843a2..8b979e7c3ecd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4038,18 +4038,22 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
goto out_free_2;
register_reboot_notifier(&kvm_reboot_notifier);
- /* A kmem cache lets us meet the alignment requirements of fx_save. */
- if (!vcpu_align)
- vcpu_align = __alignof__(struct kvm_vcpu);
- kvm_vcpu_cache =
- kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
- SLAB_ACCOUNT,
- offsetof(struct kvm_vcpu, arch),
- sizeof_field(struct kvm_vcpu, arch),
- NULL);
- if (!kvm_vcpu_cache) {
- r = -ENOMEM;
- goto out_free_3;
+ /*
+ * When vcpu_size is zero,
+ * architecture-specific code manages its own vcpu allocation.
+ */
+ kvm_vcpu_cache = NULL;
+ if (vcpu_size) {
+ if (!vcpu_align)
+ vcpu_align = __alignof__(struct kvm_vcpu);
+ kvm_vcpu_cache = kmem_cache_create_usercopy(
+ "kvm_vcpu", vcpu_size, vcpu_align, SLAB_ACCOUNT,
+ offsetof(struct kvm_vcpu, arch),
+ sizeof_field(struct kvm_vcpu, arch), NULL);
+ if (!kvm_vcpu_cache) {
+ r = -ENOMEM;
+ goto out_free_3;
+ }
}
r = kvm_async_pf_init();
--
2.19.1.568.g152ad8e336-goog
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [kvm PATCH v3 1/1] kvm: vmx: use vmalloc() to allocate vcpus
2018-10-24 19:39 [kvm PATCH v3 1/1] kvm: vmx: use vmalloc() to allocate vcpus Marc Orr
@ 2018-10-24 19:46 ` Matthew Wilcox
2018-10-25 0:08 ` kbuild test robot
1 sibling, 0 replies; 3+ messages in thread
From: Matthew Wilcox @ 2018-10-24 19:46 UTC (permalink / raw)
To: Marc Orr
Cc: kvm, jmattson, rientjes, konrad.wilk, linux-mm, akpm, pbonzini, rkrcmar
On Wed, Oct 24, 2018 at 12:39:12PM -0700, Marc Orr wrote:
> Previously, vcpus were allocated through the kmem_cache_zalloc() API,
> which requires the underlying physical memory to be contiguous.
> Because the x86 vcpu struct, struct vcpu_vmx, is relatively large
> (e.g., currently 47680 bytes on my setup), it can become hard to find
> contiguous memory.
>
> At the same time, the comments in the code indicate that the primary
> reason for using the kmem_cache_zalloc() API is to align the memory
> rather than to provide physical contiguity.
>
> Thus, this patch updates the vcpu allocation logic for vmx to use the
> vmalloc() API.
>
> Note, this patch uses the __vmalloc_node_range() API, which is in the
> include/linux/vmalloc.h file. To use __vmalloc_node_range(), this patch
> exports the API.
Oops ;-)
> +void *vzalloc_account(unsigned long size)
> +{
> + return __vmalloc_node_flags(size, NUMA_NO_NODE,
> + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT);
> +}
> +EXPORT_SYMBOL(vzalloc_account);
For the mm parts:
Reviewed-by: Matthew Wilcox <willy@infradead.org>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [kvm PATCH v3 1/1] kvm: vmx: use vmalloc() to allocate vcpus
2018-10-24 19:39 [kvm PATCH v3 1/1] kvm: vmx: use vmalloc() to allocate vcpus Marc Orr
2018-10-24 19:46 ` Matthew Wilcox
@ 2018-10-25 0:08 ` kbuild test robot
1 sibling, 0 replies; 3+ messages in thread
From: kbuild test robot @ 2018-10-25 0:08 UTC (permalink / raw)
To: Marc Orr
Cc: kbuild-all, kvm, jmattson, rientjes, konrad.wilk, linux-mm, akpm,
pbonzini, rkrcmar, willy
[-- Attachment #1: Type: text/plain, Size: 5360 bytes --]
Hi Marc,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on kvm/master]
url: https://github.com/0day-ci/linux/commits/Marc-Orr/kvm-vmx-use-vmalloc-to-allocate-vcpus/20181025-045750
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git master
config: i386-allmodconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All warnings (new ones prefixed by >>):
arch/x86/kvm/vmx.c: In function 'vmx_vcpu_setup':
>> arch/x86/kvm/vmx.c:6629:4: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
(u64)&vmx->pi_desc % PAGE_SIZE;
^
vim +6629 arch/x86/kvm/vmx.c
6586
6587 #define VMX_XSS_EXIT_BITMAP 0
6588 /*
6589 * Sets up the vmcs for emulated real mode.
6590 */
6591 static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
6592 {
6593 int i;
6594
6595 if (enable_shadow_vmcs) {
6596 /*
6597 * At vCPU creation, "VMWRITE to any supported field
6598 * in the VMCS" is supported, so use the more
6599 * permissive vmx_vmread_bitmap to specify both read
6600 * and write permissions for the shadow VMCS.
6601 */
6602 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6603 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
6604 }
6605 if (cpu_has_vmx_msr_bitmap())
6606 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
6607
6608 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
6609
6610 /* Control */
6611 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
6612 vmx->hv_deadline_tsc = -1;
6613
6614 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
6615
6616 if (cpu_has_secondary_exec_ctrls()) {
6617 vmx_compute_secondary_exec_control(vmx);
6618 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6619 vmx->secondary_exec_control);
6620 }
6621
6622 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
6623 /*
6624 * Note, pi_desc is contained within a single
6625 * page because the struct is 64 bytes and 64-byte aligned.
6626 */
6627 phys_addr_t pi_desc_phys =
6628 page_to_phys(vmalloc_to_page(&vmx->pi_desc)) +
> 6629 (u64)&vmx->pi_desc % PAGE_SIZE;
6630
6631 vmcs_write64(EOI_EXIT_BITMAP0, 0);
6632 vmcs_write64(EOI_EXIT_BITMAP1, 0);
6633 vmcs_write64(EOI_EXIT_BITMAP2, 0);
6634 vmcs_write64(EOI_EXIT_BITMAP3, 0);
6635
6636 vmcs_write16(GUEST_INTR_STATUS, 0);
6637
6638 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
6639 vmcs_write64(POSTED_INTR_DESC_ADDR, pi_desc_phys);
6640 }
6641
6642 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
6643 vmcs_write32(PLE_GAP, ple_gap);
6644 vmx->ple_window = ple_window;
6645 vmx->ple_window_dirty = true;
6646 }
6647
6648 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
6649 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6650 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
6651
6652 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
6653 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
6654 vmx_set_constant_host_state(vmx);
6655 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
6656 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6657
6658 if (cpu_has_vmx_vmfunc())
6659 vmcs_write64(VM_FUNCTION_CONTROL, 0);
6660
6661 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
6662 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
6663 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
6664 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
6665 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6666
6667 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6668 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
6669
6670 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
6671 u32 index = vmx_msr_index[i];
6672 u32 data_low, data_high;
6673 int j = vmx->nmsrs;
6674
6675 if (rdmsr_safe(index, &data_low, &data_high) < 0)
6676 continue;
6677 if (wrmsr_safe(index, data_low, data_high) < 0)
6678 continue;
6679 vmx->guest_msrs[j].index = i;
6680 vmx->guest_msrs[j].data = 0;
6681 vmx->guest_msrs[j].mask = -1ull;
6682 ++vmx->nmsrs;
6683 }
6684
6685 vmx->arch_capabilities = kvm_get_arch_capabilities();
6686
6687 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
6688
6689 /* 22.2.1, 20.8.1 */
6690 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
6691
6692 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
6693 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
6694
6695 set_cr4_guest_host_mask(vmx);
6696
6697 if (vmx_xsaves_supported())
6698 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
6699
6700 if (enable_pml) {
6701 ASSERT(vmx->pml_pg);
6702 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
6703 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6704 }
6705
6706 if (cpu_has_vmx_encls_vmexit())
6707 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
6708 }
6709
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 65202 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2018-10-25 0:08 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-24 19:39 [kvm PATCH v3 1/1] kvm: vmx: use vmalloc() to allocate vcpus Marc Orr
2018-10-24 19:46 ` Matthew Wilcox
2018-10-25 0:08 ` kbuild test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).