[v3,64/75] x86/sev-es: Cache CPUID results for improved performance
diff mbox series

Message ID 20200428151725.31091-65-joro@8bytes.org
State New
Headers show
Series
  • x86: SEV-ES Guest Support
Related show

Commit Message

Joerg Roedel April 28, 2020, 3:17 p.m. UTC
From: Mike Stunes <mstunes@vmware.com>

To avoid a future VMEXIT for a subsequent CPUID function, cache the
results returned by CPUID into an xarray.

 [tl: coding standard changes, register zero extension]

Signed-off-by: Mike Stunes <mstunes@vmware.com>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
[ jroedel@suse.de: - Wrapped cache handling into vc_handle_cpuid_cached()
                   - Used lower_32_bits() where applicable
		   - Moved cache_index out of struct es_em_ctxt ]
Co-developed-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/kernel/sev-es-shared.c |  12 ++--
 arch/x86/kernel/sev-es.c        | 119 +++++++++++++++++++++++++++++++-
 2 files changed, 124 insertions(+), 7 deletions(-)

Comments

Mike Stunes May 6, 2020, 6:08 p.m. UTC | #1
> On Apr 28, 2020, at 8:17 AM, Joerg Roedel <joro@8bytes.org> wrote:
> 
> From: Mike Stunes <mstunes@vmware.com>
> 
> To avoid a future VMEXIT for a subsequent CPUID function, cache the
> results returned by CPUID into an xarray.
> 
> [tl: coding standard changes, register zero extension]
> 
> Signed-off-by: Mike Stunes <mstunes@vmware.com>
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> [ jroedel@suse.de: - Wrapped cache handling into vc_handle_cpuid_cached()
>                   - Used lower_32_bits() where applicable
> 		   - Moved cache_index out of struct es_em_ctxt ]
> Co-developed-by: Joerg Roedel <jroedel@suse.de>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
> arch/x86/kernel/sev-es-shared.c |  12 ++--
> arch/x86/kernel/sev-es.c        | 119 +++++++++++++++++++++++++++++++-
> 2 files changed, 124 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
> index 03095bc7b563..0303834d4811 100644
> --- a/arch/x86/kernel/sev-es.c
> +++ b/arch/x86/kernel/sev-es.c
> @@ -744,6 +758,91 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb,
> 	return ret;
> }
> 
> +static unsigned long sev_es_get_cpuid_cache_index(struct es_em_ctxt *ctxt)
> +{
> +	unsigned long hi, lo;
> +
> +	/* Don't attempt to cache until the xarray is initialized */
> +	if (!sev_es_cpuid_cache_initialized)
> +		return ULONG_MAX;
> +
> +	lo = lower_32_bits(ctxt->regs->ax);
> +
> +	/*
> +	 * CPUID 0x0000000d requires both RCX and XCR0, so it can't be
> +	 * cached.
> +	 */
> +	if (lo == 0x0000000d)
> +		return ULONG_MAX;
> +
> +	/*
> +	 * Some callers of CPUID don't always set RCX to zero for CPUID
> +	 * functions that don't require RCX, which can result in excessive
> +	 * cached values, so RCX needs to be manually zeroed for use as part
> +	 * of the cache index. Future CPUID values may need RCX, but since
> +	 * they can't be known, they must not be cached.
> +	 */
> +	if (lo > 0x80000020)
> +		return ULONG_MAX;

If the cache is shared across CPUs, do we also need to exclude function 0x1 because it contains the LAPIC ID? (Or is the cache per-CPU?)
Tom Lendacky May 6, 2020, 11:02 p.m. UTC | #2
On 5/6/20 1:08 PM, Mike Stunes wrote:
> 
> 
>> On Apr 28, 2020, at 8:17 AM, Joerg Roedel <joro@8bytes.org> wrote:
>>
>> From: Mike Stunes <mstunes@vmware.com>
>>
>> To avoid a future VMEXIT for a subsequent CPUID function, cache the
>> results returned by CPUID into an xarray.
>>
>> [tl: coding standard changes, register zero extension]
>>
>> Signed-off-by: Mike Stunes <mstunes@vmware.com>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> [ jroedel@suse.de: - Wrapped cache handling into vc_handle_cpuid_cached()
>>                    - Used lower_32_bits() where applicable
>> 		   - Moved cache_index out of struct es_em_ctxt ]
>> Co-developed-by: Joerg Roedel <jroedel@suse.de>
>> Signed-off-by: Joerg Roedel <jroedel@suse.de>
>> ---
>> arch/x86/kernel/sev-es-shared.c |  12 ++--
>> arch/x86/kernel/sev-es.c        | 119 +++++++++++++++++++++++++++++++-
>> 2 files changed, 124 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
>> index 03095bc7b563..0303834d4811 100644
>> --- a/arch/x86/kernel/sev-es.c
>> +++ b/arch/x86/kernel/sev-es.c
>> @@ -744,6 +758,91 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb,
>> 	return ret;
>> }
>>
>> +static unsigned long sev_es_get_cpuid_cache_index(struct es_em_ctxt *ctxt)
>> +{
>> +	unsigned long hi, lo;
>> +
>> +	/* Don't attempt to cache until the xarray is initialized */
>> +	if (!sev_es_cpuid_cache_initialized)
>> +		return ULONG_MAX;
>> +
>> +	lo = lower_32_bits(ctxt->regs->ax);
>> +
>> +	/*
>> +	 * CPUID 0x0000000d requires both RCX and XCR0, so it can't be
>> +	 * cached.
>> +	 */
>> +	if (lo == 0x0000000d)
>> +		return ULONG_MAX;
>> +
>> +	/*
>> +	 * Some callers of CPUID don't always set RCX to zero for CPUID
>> +	 * functions that don't require RCX, which can result in excessive
>> +	 * cached values, so RCX needs to be manually zeroed for use as part
>> +	 * of the cache index. Future CPUID values may need RCX, but since
>> +	 * they can't be known, they must not be cached.
>> +	 */
>> +	if (lo > 0x80000020)
>> +		return ULONG_MAX;
> 
> If the cache is shared across CPUs, do we also need to exclude function 0x1 because it contains the LAPIC ID? (Or is the cache per-CPU?)

It's currently not a per-CPU cache, but given what you pointed out, it 
should be if we're going to keep function 0x1 in there. The question will 
be how often is that CPUID issued, as that would determine if (not) 
caching it matters. Or even how often CPUID is issued and whether the 
xarray lock could be under contention if the cache is not per-CPU.

Thanks,
Tom

>
Sean Christopherson May 20, 2020, 5:16 a.m. UTC | #3
On Tue, Apr 28, 2020 at 05:17:14PM +0200, Joerg Roedel wrote:
> From: Mike Stunes <mstunes@vmware.com>
> 
> To avoid a future VMEXIT for a subsequent CPUID function, cache the
> results returned by CPUID into an xarray.
> 
>  [tl: coding standard changes, register zero extension]
> 
> Signed-off-by: Mike Stunes <mstunes@vmware.com>
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> [ jroedel@suse.de: - Wrapped cache handling into vc_handle_cpuid_cached()
>                    - Used lower_32_bits() where applicable
> 		   - Moved cache_index out of struct es_em_ctxt ]
> Co-developed-by: Joerg Roedel <jroedel@suse.de>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---

...

> +struct sev_es_cpuid_cache_entry {
> +	unsigned long eax;
> +	unsigned long ebx;
> +	unsigned long ecx;
> +	unsigned long edx;

Why are these unsigned longs?  CPUID returns 32-bit values, this wastes 16
bytes per entry.

> +};
> +
> +static struct xarray sev_es_cpuid_cache;
> +static bool __ro_after_init sev_es_cpuid_cache_initialized;
> +
>  /* For early boot hypervisor communication in SEV-ES enabled guests */
>  static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
>  
> @@ -463,6 +474,9 @@ void __init sev_es_init_vc_handling(void)
>  		sev_es_setup_vc_stack(cpu);
>  	}
>  
> +	xa_init_flags(&sev_es_cpuid_cache, XA_FLAGS_LOCK_IRQ);
> +	sev_es_cpuid_cache_initialized = true;
> +
>  	init_vc_stack_names();
>  }
>  
> @@ -744,6 +758,91 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb,
>  	return ret;
>  }
>  
> +static unsigned long sev_es_get_cpuid_cache_index(struct es_em_ctxt *ctxt)
> +{
> +	unsigned long hi, lo;
> +
> +	/* Don't attempt to cache until the xarray is initialized */
> +	if (!sev_es_cpuid_cache_initialized)
> +		return ULONG_MAX;
> +
> +	lo = lower_32_bits(ctxt->regs->ax);
> +
> +	/*
> +	 * CPUID 0x0000000d requires both RCX and XCR0, so it can't be
> +	 * cached.
> +	 */
> +	if (lo == 0x0000000d)
> +		return ULONG_MAX;
> +
> +	/*
> +	 * Some callers of CPUID don't always set RCX to zero for CPUID
> +	 * functions that don't require RCX, which can result in excessive
> +	 * cached values, so RCX needs to be manually zeroed for use as part
> +	 * of the cache index. Future CPUID values may need RCX, but since
> +	 * they can't be known, they must not be cached.
> +	 */
> +	if (lo > 0x80000020)
> +		return ULONG_MAX;
> +
> +	switch (lo) {
> +	case 0x00000007:

OSPKE may or may not be cached correctly depending on when
sev_es_cpuid_cache_initialized is set.

> +	case 0x0000000b:
> +	case 0x0000000f:
> +	case 0x00000010:
> +	case 0x8000001d:
> +	case 0x80000020:
> +		hi = ctxt->regs->cx << 32;
> +		break;
> +	default:
> +		hi = 0;
> +	}
> +
> +	return hi | lo;

This needs to be way more restrictive on what is cached.  Unless I've
overlooked something, this lets userspace trigger arbitrary, unaccounted
kernel memory allocations.  E.g.

	for (i = 0; i <= 0x80000020; i++) {
		for (j = 0; j <= 0xffffffff; j++) {
			cpuid(i, j);
			if (i != 7 || i != 0xb || i != 0xf || i != 0x10 ||
			    i != 0x8000001d || i != 0x80000020)
				break;
		}
	}

The whole cache on-demand approach seems like overkill.  The number of CPUID
leaves that are invoked after boot with any regularity can probably be counted
on one hand.   IIRC glibc invokes CPUID to gather TLB/cache info, XCR0-based
features, and one or two other leafs.  A statically sized global array that's
arbitrarily index a la x86_capability would be just as simple and more
performant.  It would also allow fancier things like emulating CPUID 0xD in
the guest if you want to go down that road.
Borislav Petkov May 26, 2020, 9:19 a.m. UTC | #4
On Tue, May 19, 2020 at 10:16:37PM -0700, Sean Christopherson wrote:
> The whole cache on-demand approach seems like overkill.  The number of CPUID
> leaves that are invoked after boot with any regularity can probably be counted
> on one hand.   IIRC glibc invokes CPUID to gather TLB/cache info, XCR0-based
> features, and one or two other leafs.  A statically sized global array that's
> arbitrarily index a la x86_capability would be just as simple and more
> performant.  It would also allow fancier things like emulating CPUID 0xD in
> the guest if you want to go down that road.

And before we do any of that "caching" or whatnot, I'd like to see
numbers justifying its existence. Because if it is only a couple of
CPUID invocations and the boot delay is immeasurable, then it's not
worth the effort.
Tom Lendacky May 27, 2020, 3:34 p.m. UTC | #5
On 4/28/20 10:17 AM, Joerg Roedel wrote:
> From: Mike Stunes <mstunes@vmware.com>
> 
> To avoid a future VMEXIT for a subsequent CPUID function, cache the
> results returned by CPUID into an xarray.
> 
>   [tl: coding standard changes, register zero extension]
> 
> Signed-off-by: Mike Stunes <mstunes@vmware.com>
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> [ jroedel@suse.de: - Wrapped cache handling into vc_handle_cpuid_cached()
>                     - Used lower_32_bits() where applicable
> 		   - Moved cache_index out of struct es_em_ctxt ]
> Co-developed-by: Joerg Roedel <jroedel@suse.de>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
>   arch/x86/kernel/sev-es-shared.c |  12 ++--
>   arch/x86/kernel/sev-es.c        | 119 +++++++++++++++++++++++++++++++-
>   2 files changed, 124 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
> index 5bfc1f3030d4..cfdafe12da4f 100644
> --- a/arch/x86/kernel/sev-es-shared.c
> +++ b/arch/x86/kernel/sev-es-shared.c
> @@ -427,8 +427,8 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
>   	u32 cr4 = native_read_cr4();
>   	enum es_result ret;
>   
> -	ghcb_set_rax(ghcb, regs->ax);
> -	ghcb_set_rcx(ghcb, regs->cx);
> +	ghcb_set_rax(ghcb, lower_32_bits(regs->ax));
> +	ghcb_set_rcx(ghcb, lower_32_bits(regs->cx));
>   
>   	if (cr4 & X86_CR4_OSXSAVE)
>   		/* Safe to read xcr0 */
> @@ -447,10 +447,10 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
>   	      ghcb_is_valid_rdx(ghcb)))
>   		return ES_VMM_ERROR;
>   
> -	regs->ax = ghcb->save.rax;
> -	regs->bx = ghcb->save.rbx;
> -	regs->cx = ghcb->save.rcx;
> -	regs->dx = ghcb->save.rdx;
> +	regs->ax = lower_32_bits(ghcb->save.rax);
> +	regs->bx = lower_32_bits(ghcb->save.rbx);
> +	regs->cx = lower_32_bits(ghcb->save.rcx);
> +	regs->dx = lower_32_bits(ghcb->save.rdx);
>   
>   	return ES_OK;
>   }
> diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
> index 03095bc7b563..0303834d4811 100644
> --- a/arch/x86/kernel/sev-es.c
> +++ b/arch/x86/kernel/sev-es.c
> @@ -19,6 +19,7 @@
>   #include <linux/memblock.h>
>   #include <linux/kernel.h>
>   #include <linux/mm.h>
> +#include <linux/xarray.h>
>   
>   #include <generated/asm-offsets.h>
>   #include <asm/cpu_entry_area.h>
> @@ -33,6 +34,16 @@
>   
>   #define DR7_RESET_VALUE        0x400
>   
> +struct sev_es_cpuid_cache_entry {
> +	unsigned long eax;
> +	unsigned long ebx;
> +	unsigned long ecx;
> +	unsigned long edx;
> +};
> +
> +static struct xarray sev_es_cpuid_cache;
> +static bool __ro_after_init sev_es_cpuid_cache_initialized;
> +
>   /* For early boot hypervisor communication in SEV-ES enabled guests */
>   static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
>   
> @@ -463,6 +474,9 @@ void __init sev_es_init_vc_handling(void)
>   		sev_es_setup_vc_stack(cpu);
>   	}
>   
> +	xa_init_flags(&sev_es_cpuid_cache, XA_FLAGS_LOCK_IRQ);
> +	sev_es_cpuid_cache_initialized = true;
> +
>   	init_vc_stack_names();
>   }
>   
> @@ -744,6 +758,91 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb,
>   	return ret;
>   }
>   
> +static unsigned long sev_es_get_cpuid_cache_index(struct es_em_ctxt *ctxt)
> +{
> +	unsigned long hi, lo;
> +
> +	/* Don't attempt to cache until the xarray is initialized */
> +	if (!sev_es_cpuid_cache_initialized)
> +		return ULONG_MAX;
> +
> +	lo = lower_32_bits(ctxt->regs->ax);
> +
> +	/*
> +	 * CPUID 0x0000000d requires both RCX and XCR0, so it can't be
> +	 * cached.
> +	 */
> +	if (lo == 0x0000000d)
> +		return ULONG_MAX;
> +
> +	/*
> +	 * Some callers of CPUID don't always set RCX to zero for CPUID
> +	 * functions that don't require RCX, which can result in excessive
> +	 * cached values, so RCX needs to be manually zeroed for use as part
> +	 * of the cache index. Future CPUID values may need RCX, but since
> +	 * they can't be known, they must not be cached.
> +	 */
> +	if (lo > 0x80000020)
> +		return ULONG_MAX;
> +
> +	switch (lo) {
> +	case 0x00000007:
> +	case 0x0000000b:
> +	case 0x0000000f:
> +	case 0x00000010:
> +	case 0x8000001d:
> +	case 0x80000020:
> +		hi = ctxt->regs->cx << 32;
> +		break;
> +	default:
> +		hi = 0;
> +	}
> +
> +	return hi | lo;
> +}
> +
> +static bool sev_es_check_cpuid_cache(struct es_em_ctxt *ctxt,
> +				     unsigned long cache_index)
> +{
> +	struct sev_es_cpuid_cache_entry *cache_entry;
> +
> +	if (cache_index == ULONG_MAX)
> +		return false;
> +
> +	cache_entry = xa_load(&sev_es_cpuid_cache, cache_index);
> +	if (!cache_entry)
> +		return false;
> +
> +	ctxt->regs->ax = cache_entry->eax;
> +	ctxt->regs->bx = cache_entry->ebx;
> +	ctxt->regs->cx = cache_entry->ecx;
> +	ctxt->regs->dx = cache_entry->edx;
> +
> +	return true;
> +}
> +
> +static void sev_es_add_cpuid_cache(struct es_em_ctxt *ctxt,
> +				   unsigned long cache_index)
> +{
> +	struct sev_es_cpuid_cache_entry *cache_entry;
> +	int ret;
> +
> +	if (cache_index == ULONG_MAX)
> +		return;
> +
> +	cache_entry = kzalloc(sizeof(*cache_entry), GFP_ATOMIC);
> +	if (cache_entry) {
> +		cache_entry->eax = ctxt->regs->ax;
> +		cache_entry->ebx = ctxt->regs->bx;
> +		cache_entry->ecx = ctxt->regs->cx;
> +		cache_entry->edx = ctxt->regs->dx;
> +
> +		/* Ignore insertion errors */
> +		ret = xa_insert(&sev_es_cpuid_cache, cache_index,
> +				cache_entry, GFP_ATOMIC);

Just realized, that on error, the cache_entry should be freed.

Thanks,
Tom

> +	}
> +}
> +
>   static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
>   					  struct es_em_ctxt *ctxt)
>   {
> @@ -895,6 +994,24 @@ static enum es_result vc_handle_trap_db(struct ghcb *ghcb,
>   	return ES_EXCEPTION;
>   }
>   
> +static enum es_result vc_handle_cpuid_cached(struct ghcb *ghcb,
> +					     struct es_em_ctxt *ctxt)
> +{
> +	unsigned long cache_index;
> +	enum es_result result;
> +
> +	cache_index = sev_es_get_cpuid_cache_index(ctxt);
> +
> +	if (sev_es_check_cpuid_cache(ctxt, cache_index))
> +		return ES_OK;
> +
> +	result = vc_handle_cpuid(ghcb, ctxt);
> +	if (result == ES_OK)
> +		sev_es_add_cpuid_cache(ctxt, cache_index);
> +
> +	return result;
> +}
> +
>   static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
>   					 struct ghcb *ghcb,
>   					 unsigned long exit_code)
> @@ -926,7 +1043,7 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
>   		result = ES_UNSUPPORTED;
>   		break;
>   	case SVM_EXIT_CPUID:
> -		result = vc_handle_cpuid(ghcb, ctxt);
> +		result = vc_handle_cpuid_cached(ghcb, ctxt);
>   		break;
>   	case SVM_EXIT_IOIO:
>   		result = vc_handle_ioio(ghcb, ctxt);
>
Tom Lendacky May 27, 2020, 5:49 p.m. UTC | #6
On 5/26/20 4:19 AM, Borislav Petkov wrote:
> On Tue, May 19, 2020 at 10:16:37PM -0700, Sean Christopherson wrote:
>> The whole cache on-demand approach seems like overkill.  The number of CPUID
>> leaves that are invoked after boot with any regularity can probably be counted
>> on one hand.   IIRC glibc invokes CPUID to gather TLB/cache info, XCR0-based
>> features, and one or two other leafs.  A statically sized global array that's
>> arbitrarily index a la x86_capability would be just as simple and more
>> performant.  It would also allow fancier things like emulating CPUID 0xD in
>> the guest if you want to go down that road.
> 
> And before we do any of that "caching" or whatnot, I'd like to see
> numbers justifying its existence. Because if it is only a couple of
> CPUID invocations and the boot delay is immeasurable, then it's not
> worth the effort.

I added some rudimentary stats code to see how many times there was a 
CPUID cache hit on a 64-vCPU guest during a kernel build (make clean 
followed by make with -j 64):

   SEV-ES CPUID cache statistics
     0x00000000/0x00000000: 220,384
     0x00000007/0x00000000: 213,306
     0x80000000/0x00000000: 1,054,642
     0x80000001/0x00000000: 213,306
     0x80000005/0x00000000: 210,334
     0x80000006/0x00000000: 420,668
     0x80000007/0x00000000: 210,334
     0x80000008/0x00000000: 420,684

     2,963,658 cache hits

So it is significant in quantity, but I'm not sure what the overall 
performance difference is. If I can find some more time I'll try to 
compare the kernel builds with and without the caching to see if it is 
noticeable.

Thanks,
Tom

>
Joerg Roedel June 12, 2020, 9:12 a.m. UTC | #7
On Tue, Apr 28, 2020 at 05:17:14PM +0200, Joerg Roedel wrote:
> From: Mike Stunes <mstunes@vmware.com>
> 
> To avoid a future VMEXIT for a subsequent CPUID function, cache the
> results returned by CPUID into an xarray.
> 
>  [tl: coding standard changes, register zero extension]
> 
> Signed-off-by: Mike Stunes <mstunes@vmware.com>
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> [ jroedel@suse.de: - Wrapped cache handling into vc_handle_cpuid_cached()
>                    - Used lower_32_bits() where applicable
> 		   - Moved cache_index out of struct es_em_ctxt ]
> Co-developed-by: Joerg Roedel <jroedel@suse.de>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
>  arch/x86/kernel/sev-es-shared.c |  12 ++--
>  arch/x86/kernel/sev-es.c        | 119 +++++++++++++++++++++++++++++++-
>  2 files changed, 124 insertions(+), 7 deletions(-)

Okay, following the discussion, I am dropping this patch for now. We can
revisit CPUID caching later when we have a better justification.

Regards,

	Joerg

Patch
diff mbox series

diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
index 5bfc1f3030d4..cfdafe12da4f 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -427,8 +427,8 @@  static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
 	u32 cr4 = native_read_cr4();
 	enum es_result ret;
 
-	ghcb_set_rax(ghcb, regs->ax);
-	ghcb_set_rcx(ghcb, regs->cx);
+	ghcb_set_rax(ghcb, lower_32_bits(regs->ax));
+	ghcb_set_rcx(ghcb, lower_32_bits(regs->cx));
 
 	if (cr4 & X86_CR4_OSXSAVE)
 		/* Safe to read xcr0 */
@@ -447,10 +447,10 @@  static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
 	      ghcb_is_valid_rdx(ghcb)))
 		return ES_VMM_ERROR;
 
-	regs->ax = ghcb->save.rax;
-	regs->bx = ghcb->save.rbx;
-	regs->cx = ghcb->save.rcx;
-	regs->dx = ghcb->save.rdx;
+	regs->ax = lower_32_bits(ghcb->save.rax);
+	regs->bx = lower_32_bits(ghcb->save.rbx);
+	regs->cx = lower_32_bits(ghcb->save.rcx);
+	regs->dx = lower_32_bits(ghcb->save.rdx);
 
 	return ES_OK;
 }
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 03095bc7b563..0303834d4811 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -19,6 +19,7 @@ 
 #include <linux/memblock.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/xarray.h>
 
 #include <generated/asm-offsets.h>
 #include <asm/cpu_entry_area.h>
@@ -33,6 +34,16 @@ 
 
 #define DR7_RESET_VALUE        0x400
 
+struct sev_es_cpuid_cache_entry {
+	unsigned long eax;
+	unsigned long ebx;
+	unsigned long ecx;
+	unsigned long edx;
+};
+
+static struct xarray sev_es_cpuid_cache;
+static bool __ro_after_init sev_es_cpuid_cache_initialized;
+
 /* For early boot hypervisor communication in SEV-ES enabled guests */
 static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
 
@@ -463,6 +474,9 @@  void __init sev_es_init_vc_handling(void)
 		sev_es_setup_vc_stack(cpu);
 	}
 
+	xa_init_flags(&sev_es_cpuid_cache, XA_FLAGS_LOCK_IRQ);
+	sev_es_cpuid_cache_initialized = true;
+
 	init_vc_stack_names();
 }
 
@@ -744,6 +758,91 @@  static enum es_result vc_handle_mmio(struct ghcb *ghcb,
 	return ret;
 }
 
+static unsigned long sev_es_get_cpuid_cache_index(struct es_em_ctxt *ctxt)
+{
+	unsigned long hi, lo;
+
+	/* Don't attempt to cache until the xarray is initialized */
+	if (!sev_es_cpuid_cache_initialized)
+		return ULONG_MAX;
+
+	lo = lower_32_bits(ctxt->regs->ax);
+
+	/*
+	 * CPUID 0x0000000d requires both RCX and XCR0, so it can't be
+	 * cached.
+	 */
+	if (lo == 0x0000000d)
+		return ULONG_MAX;
+
+	/*
+	 * Some callers of CPUID don't always set RCX to zero for CPUID
+	 * functions that don't require RCX, which can result in excessive
+	 * cached values, so RCX needs to be manually zeroed for use as part
+	 * of the cache index. Future CPUID values may need RCX, but since
+	 * they can't be known, they must not be cached.
+	 */
+	if (lo > 0x80000020)
+		return ULONG_MAX;
+
+	switch (lo) {
+	case 0x00000007:
+	case 0x0000000b:
+	case 0x0000000f:
+	case 0x00000010:
+	case 0x8000001d:
+	case 0x80000020:
+		hi = ctxt->regs->cx << 32;
+		break;
+	default:
+		hi = 0;
+	}
+
+	return hi | lo;
+}
+
+static bool sev_es_check_cpuid_cache(struct es_em_ctxt *ctxt,
+				     unsigned long cache_index)
+{
+	struct sev_es_cpuid_cache_entry *cache_entry;
+
+	if (cache_index == ULONG_MAX)
+		return false;
+
+	cache_entry = xa_load(&sev_es_cpuid_cache, cache_index);
+	if (!cache_entry)
+		return false;
+
+	ctxt->regs->ax = cache_entry->eax;
+	ctxt->regs->bx = cache_entry->ebx;
+	ctxt->regs->cx = cache_entry->ecx;
+	ctxt->regs->dx = cache_entry->edx;
+
+	return true;
+}
+
+static void sev_es_add_cpuid_cache(struct es_em_ctxt *ctxt,
+				   unsigned long cache_index)
+{
+	struct sev_es_cpuid_cache_entry *cache_entry;
+	int ret;
+
+	if (cache_index == ULONG_MAX)
+		return;
+
+	cache_entry = kzalloc(sizeof(*cache_entry), GFP_ATOMIC);
+	if (cache_entry) {
+		cache_entry->eax = ctxt->regs->ax;
+		cache_entry->ebx = ctxt->regs->bx;
+		cache_entry->ecx = ctxt->regs->cx;
+		cache_entry->edx = ctxt->regs->dx;
+
+		/* Ignore insertion errors */
+		ret = xa_insert(&sev_es_cpuid_cache, cache_index,
+				cache_entry, GFP_ATOMIC);
+	}
+}
+
 static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
 					  struct es_em_ctxt *ctxt)
 {
@@ -895,6 +994,24 @@  static enum es_result vc_handle_trap_db(struct ghcb *ghcb,
 	return ES_EXCEPTION;
 }
 
+static enum es_result vc_handle_cpuid_cached(struct ghcb *ghcb,
+					     struct es_em_ctxt *ctxt)
+{
+	unsigned long cache_index;
+	enum es_result result;
+
+	cache_index = sev_es_get_cpuid_cache_index(ctxt);
+
+	if (sev_es_check_cpuid_cache(ctxt, cache_index))
+		return ES_OK;
+
+	result = vc_handle_cpuid(ghcb, ctxt);
+	if (result == ES_OK)
+		sev_es_add_cpuid_cache(ctxt, cache_index);
+
+	return result;
+}
+
 static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
 					 struct ghcb *ghcb,
 					 unsigned long exit_code)
@@ -926,7 +1043,7 @@  static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
 		result = ES_UNSUPPORTED;
 		break;
 	case SVM_EXIT_CPUID:
-		result = vc_handle_cpuid(ghcb, ctxt);
+		result = vc_handle_cpuid_cached(ghcb, ctxt);
 		break;
 	case SVM_EXIT_IOIO:
 		result = vc_handle_ioio(ghcb, ctxt);