linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces
@ 2021-07-28 12:07 Like Xu
  2021-07-29 12:58 ` Peter Zijlstra
  2021-08-02 15:46 ` Paolo Bonzini
  0 siblings, 2 replies; 4+ messages in thread
From: Like Xu @ 2021-07-28 12:07 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Thomas Gleixner, kvm, x86, linux-kernel

From: Like Xu <likexu@tencent.com>

Based on our observations, after any vm-exit associated with vPMU, there
are at least two or more perf interfaces to be called for guest counter
emulation, such as perf_event_{pause, read_value, period}(), and each one
will {lock, unlock} the same perf_event_ctx. The frequency of calls becomes
more severe when guest use counters in a multiplexed manner.

Holding a lock once and completing the KVM request operations in the perf
context would introduce a set of impractical new interfaces. So we can
further optimize the vPMU implementation by avoiding repeated calls to
these interfaces in the KVM context for at least one pattern:

After we call perf_event_pause() once, the event will be disabled and its
internal count will be reset to 0. So there is no need to pause it again
or read its value. Once the event is paused, event period will not be
updated until the next time it's resumed or reprogrammed. And there is
also no need to call perf_event_period twice for a non-running counter,
considering the perf_event for a running counter is never paused.

Based on this implementation, for the following common usage of
sampling 4 events using perf on a 4u8g guest:

  echo 0 > /proc/sys/kernel/watchdog
  echo 25 > /proc/sys/kernel/perf_cpu_time_max_percent
  echo 10000 > /proc/sys/kernel/perf_event_max_sample_rate
  echo 0 > /proc/sys/kernel/perf_cpu_time_max_percent
  for i in `seq 1 1 10`
  do
  taskset -c 0 perf record \
  -e cpu-cycles -e instructions -e branch-instructions -e cache-misses \
  /root/br_instr a
  done

the average latency of the guest NMI handler is reduced from
37646.7 ns to 32929.3 ns (~1.14x speed up) on the Intel ICX server.
Also, in addition to collecting more samples, no loss of sampling
accuracy was observed compared to before the optimization.

Signed-off-by: Like Xu <likexu@tencent.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/pmu.c              | 5 ++++-
 arch/x86/kvm/pmu.h              | 2 +-
 arch/x86/kvm/vmx/pmu_intel.c    | 4 ++--
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 99f37781a6fc..a079880d4cd5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -482,6 +482,7 @@ struct kvm_pmc {
 	 * ctrl value for fixed counters.
 	 */
 	u64 current_config;
+	bool is_paused;
 };
 
 struct kvm_pmu {
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 827886c12c16..0772bad9165c 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 	pmc->perf_event = event;
 	pmc_to_pmu(pmc)->event_count++;
 	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+	pmc->is_paused = false;
 }
 
 static void pmc_pause_counter(struct kvm_pmc *pmc)
 {
 	u64 counter = pmc->counter;
 
-	if (!pmc->perf_event)
+	if (!pmc->perf_event || pmc->is_paused)
 		return;
 
 	/* update counter, reset event value to avoid redundant accumulation */
 	counter += perf_event_pause(pmc->perf_event, true);
 	pmc->counter = counter & pmc_bitmask(pmc);
+	pmc->is_paused = true;
 }
 
 static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
 
 	/* reuse perf_event to serve as pmc_reprogram_counter() does*/
 	perf_event_enable(pmc->perf_event);
+	pmc->is_paused = false;
 
 	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
 	return true;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 67e753edfa22..0e4f2b1fa9fb 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
 	u64 counter, enabled, running;
 
 	counter = pmc->counter;
-	if (pmc->perf_event)
+	if (pmc->perf_event && !pmc->is_paused)
 		counter += perf_event_read_value(pmc->perf_event,
 						 &enabled, &running);
 	/* FIXME: Scaling needed? */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9efc1a6b8693..10cc4f65c4ef 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			    !(msr & MSR_PMC_FULL_WIDTH_BIT))
 				data = (s64)(s32)data;
 			pmc->counter += data - pmc_read_counter(pmc);
-			if (pmc->perf_event)
+			if (pmc->perf_event && !pmc->is_paused)
 				perf_event_period(pmc->perf_event,
 						  get_sample_period(pmc, data));
 			return 0;
 		} else if ((pmc = get_fixed_pmc(pmu, msr))) {
 			pmc->counter += data - pmc_read_counter(pmc);
-			if (pmc->perf_event)
+			if (pmc->perf_event && !pmc->is_paused)
 				perf_event_period(pmc->perf_event,
 						  get_sample_period(pmc, data));
 			return 0;
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces
  2021-07-28 12:07 [PATCH] KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces Like Xu
@ 2021-07-29 12:58 ` Peter Zijlstra
  2021-07-29 13:46   ` Like Xu
  2021-08-02 15:46 ` Paolo Bonzini
  1 sibling, 1 reply; 4+ messages in thread
From: Peter Zijlstra @ 2021-07-29 12:58 UTC (permalink / raw)
  To: Like Xu
  Cc: Paolo Bonzini, Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li,
	Jim Mattson, Joerg Roedel, Thomas Gleixner, kvm, x86,
	linux-kernel

On Wed, Jul 28, 2021 at 08:07:05PM +0800, Like Xu wrote:
> From: Like Xu <likexu@tencent.com>
> 
> Based on our observations, after any vm-exit associated with vPMU, there
> are at least two or more perf interfaces to be called for guest counter
> emulation, such as perf_event_{pause, read_value, period}(), and each one
> will {lock, unlock} the same perf_event_ctx. The frequency of calls becomes
> more severe when guest use counters in a multiplexed manner.
> 
> Holding a lock once and completing the KVM request operations in the perf
> context would introduce a set of impractical new interfaces. So we can
> further optimize the vPMU implementation by avoiding repeated calls to
> these interfaces in the KVM context for at least one pattern:
> 
> After we call perf_event_pause() once, the event will be disabled and its
> internal count will be reset to 0. So there is no need to pause it again
> or read its value. Once the event is paused, event period will not be
> updated until the next time it's resumed or reprogrammed. And there is
> also no need to call perf_event_period twice for a non-running counter,
> considering the perf_event for a running counter is never paused.
> 
> Based on this implementation, for the following common usage of
> sampling 4 events using perf on a 4u8g guest:
> 
>   echo 0 > /proc/sys/kernel/watchdog
>   echo 25 > /proc/sys/kernel/perf_cpu_time_max_percent
>   echo 10000 > /proc/sys/kernel/perf_event_max_sample_rate
>   echo 0 > /proc/sys/kernel/perf_cpu_time_max_percent
>   for i in `seq 1 1 10`
>   do
>   taskset -c 0 perf record \
>   -e cpu-cycles -e instructions -e branch-instructions -e cache-misses \
>   /root/br_instr a
>   done
> 
> the average latency of the guest NMI handler is reduced from
> 37646.7 ns to 32929.3 ns (~1.14x speed up) on the Intel ICX server.
> Also, in addition to collecting more samples, no loss of sampling
> accuracy was observed compared to before the optimization.
> 
> Signed-off-by: Like Xu <likexu@tencent.com>

Looks sane I suppose.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

What kinds of VM-exits are the most common?

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces
  2021-07-29 12:58 ` Peter Zijlstra
@ 2021-07-29 13:46   ` Like Xu
  0 siblings, 0 replies; 4+ messages in thread
From: Like Xu @ 2021-07-29 13:46 UTC (permalink / raw)
  To: Peter Zijlstra, Paolo Bonzini
  Cc: Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Thomas Gleixner, kvm, x86, linux-kernel

On 29/7/2021 8:58 pm, Peter Zijlstra wrote:
> On Wed, Jul 28, 2021 at 08:07:05PM +0800, Like Xu wrote:
>> From: Like Xu <likexu@tencent.com>
>>
>> Based on our observations, after any vm-exit associated with vPMU, there
>> are at least two or more perf interfaces to be called for guest counter
>> emulation, such as perf_event_{pause, read_value, period}(), and each one
>> will {lock, unlock} the same perf_event_ctx. The frequency of calls becomes
>> more severe when guest use counters in a multiplexed manner.
>>
>> Holding a lock once and completing the KVM request operations in the perf
>> context would introduce a set of impractical new interfaces. So we can
>> further optimize the vPMU implementation by avoiding repeated calls to
>> these interfaces in the KVM context for at least one pattern:
>>
>> After we call perf_event_pause() once, the event will be disabled and its
>> internal count will be reset to 0. So there is no need to pause it again
>> or read its value. Once the event is paused, event period will not be
>> updated until the next time it's resumed or reprogrammed. And there is
>> also no need to call perf_event_period twice for a non-running counter,
>> considering the perf_event for a running counter is never paused.
>>
>> Based on this implementation, for the following common usage of
>> sampling 4 events using perf on a 4u8g guest:
>>
>>    echo 0 > /proc/sys/kernel/watchdog
>>    echo 25 > /proc/sys/kernel/perf_cpu_time_max_percent
>>    echo 10000 > /proc/sys/kernel/perf_event_max_sample_rate
>>    echo 0 > /proc/sys/kernel/perf_cpu_time_max_percent
>>    for i in `seq 1 1 10`
>>    do
>>    taskset -c 0 perf record \
>>    -e cpu-cycles -e instructions -e branch-instructions -e cache-misses \
>>    /root/br_instr a
>>    done
>>
>> the average latency of the guest NMI handler is reduced from
>> 37646.7 ns to 32929.3 ns (~1.14x speed up) on the Intel ICX server.
>> Also, in addition to collecting more samples, no loss of sampling
>> accuracy was observed compared to before the optimization.
>>
>> Signed-off-by: Like Xu <likexu@tencent.com>
> 
> Looks sane I suppose.
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> What kinds of VM-exits are the most common?
> 

A typical vm-exit trace is as follows:

  146820 EXTERNAL_INTERRUPT
  126301 MSR_WRITE
   17009 MSR_READ
    9710 RDPMC
    7295 EXCEPTION_NMI
    2493 EPT_VIOLATION
    1357 EPT_MISCONFIG
     567 CPUID
     107 NMI_WINDOW
      59 IO_INSTRUCTION
       2 VMCALL

including the following kvm_msr trace:

   15822 msr_write, MSR_CORE_PERF_GLOBAL_CTRL
   14558 msr_read, MSR_CORE_PERF_GLOBAL_STATUS
    7315 msr_write, IA32_X2APIC_LVT_PMI
    7250 msr_write, MSR_CORE_PERF_GLOBAL_OVF_CTRL
    2922 msr_write, MSR_IA32_PMC0
    2912 msr_write, MSR_CORE_PERF_FIXED_CTR0
    2904 msr_write, MSR_CORE_PERF_FIXED_CTR1
    2390 msr_write, MSR_CORE_PERF_FIXED_CTR_CTRL
    2390 msr_read, MSR_CORE_PERF_FIXED_CTR_CTRL
    1195 msr_write, MSR_P6_EVNTSEL1
    1195 msr_write, MSR_P6_EVNTSEL0
     976 msr_write, MSR_IA32_PMC1
     618 msr_write, IA32_X2APIC_ICR

Due to the presence of a large number of msr accesses, the latency of
the guest PMI handler is still far from that of the host handler.

I have two rough ideas that could drastically reduce the vPMU overhead
for the third time:

- Add a new paravirtualized pmu guest driver that saves all msr latest
values to the static physical memory of each vcpu to achieve a reduced
number of vm-exits and also facilitate kvm emulation access;

- Bypass the host perf_event PMI callback injection path and inject
guest PMI directly after the EXCEPTION_N/PMI vm-exit; (For TDX guest)


Any negative comments or help with additional details are welcome.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces
  2021-07-28 12:07 [PATCH] KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces Like Xu
  2021-07-29 12:58 ` Peter Zijlstra
@ 2021-08-02 15:46 ` Paolo Bonzini
  1 sibling, 0 replies; 4+ messages in thread
From: Paolo Bonzini @ 2021-08-02 15:46 UTC (permalink / raw)
  To: Like Xu
  Cc: Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Thomas Gleixner, kvm, x86, linux-kernel

On 28/07/21 14:07, Like Xu wrote:
> From: Like Xu <likexu@tencent.com>
> 
> Based on our observations, after any vm-exit associated with vPMU, there
> are at least two or more perf interfaces to be called for guest counter
> emulation, such as perf_event_{pause, read_value, period}(), and each one
> will {lock, unlock} the same perf_event_ctx. The frequency of calls becomes
> more severe when guest use counters in a multiplexed manner.
> 
> Holding a lock once and completing the KVM request operations in the perf
> context would introduce a set of impractical new interfaces. So we can
> further optimize the vPMU implementation by avoiding repeated calls to
> these interfaces in the KVM context for at least one pattern:
> 
> After we call perf_event_pause() once, the event will be disabled and its
> internal count will be reset to 0. So there is no need to pause it again
> or read its value. Once the event is paused, event period will not be
> updated until the next time it's resumed or reprogrammed. And there is
> also no need to call perf_event_period twice for a non-running counter,
> considering the perf_event for a running counter is never paused.
> 
> Based on this implementation, for the following common usage of
> sampling 4 events using perf on a 4u8g guest:
> 
>    echo 0 > /proc/sys/kernel/watchdog
>    echo 25 > /proc/sys/kernel/perf_cpu_time_max_percent
>    echo 10000 > /proc/sys/kernel/perf_event_max_sample_rate
>    echo 0 > /proc/sys/kernel/perf_cpu_time_max_percent
>    for i in `seq 1 1 10`
>    do
>    taskset -c 0 perf record \
>    -e cpu-cycles -e instructions -e branch-instructions -e cache-misses \
>    /root/br_instr a
>    done
> 
> the average latency of the guest NMI handler is reduced from
> 37646.7 ns to 32929.3 ns (~1.14x speed up) on the Intel ICX server.
> Also, in addition to collecting more samples, no loss of sampling
> accuracy was observed compared to before the optimization.
> 
> Signed-off-by: Like Xu <likexu@tencent.com>
> ---
>   arch/x86/include/asm/kvm_host.h | 1 +
>   arch/x86/kvm/pmu.c              | 5 ++++-
>   arch/x86/kvm/pmu.h              | 2 +-
>   arch/x86/kvm/vmx/pmu_intel.c    | 4 ++--
>   4 files changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 99f37781a6fc..a079880d4cd5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -482,6 +482,7 @@ struct kvm_pmc {
>   	 * ctrl value for fixed counters.
>   	 */
>   	u64 current_config;
> +	bool is_paused;
>   };
>   
>   struct kvm_pmu {
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 827886c12c16..0772bad9165c 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>   	pmc->perf_event = event;
>   	pmc_to_pmu(pmc)->event_count++;
>   	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
> +	pmc->is_paused = false;
>   }
>   
>   static void pmc_pause_counter(struct kvm_pmc *pmc)
>   {
>   	u64 counter = pmc->counter;
>   
> -	if (!pmc->perf_event)
> +	if (!pmc->perf_event || pmc->is_paused)
>   		return;
>   
>   	/* update counter, reset event value to avoid redundant accumulation */
>   	counter += perf_event_pause(pmc->perf_event, true);
>   	pmc->counter = counter & pmc_bitmask(pmc);
> +	pmc->is_paused = true;
>   }
>   
>   static bool pmc_resume_counter(struct kvm_pmc *pmc)
> @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
>   
>   	/* reuse perf_event to serve as pmc_reprogram_counter() does*/
>   	perf_event_enable(pmc->perf_event);
> +	pmc->is_paused = false;
>   
>   	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
>   	return true;
> diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
> index 67e753edfa22..0e4f2b1fa9fb 100644
> --- a/arch/x86/kvm/pmu.h
> +++ b/arch/x86/kvm/pmu.h
> @@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
>   	u64 counter, enabled, running;
>   
>   	counter = pmc->counter;
> -	if (pmc->perf_event)
> +	if (pmc->perf_event && !pmc->is_paused)
>   		counter += perf_event_read_value(pmc->perf_event,
>   						 &enabled, &running);
>   	/* FIXME: Scaling needed? */
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 9efc1a6b8693..10cc4f65c4ef 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>   			    !(msr & MSR_PMC_FULL_WIDTH_BIT))
>   				data = (s64)(s32)data;
>   			pmc->counter += data - pmc_read_counter(pmc);
> -			if (pmc->perf_event)
> +			if (pmc->perf_event && !pmc->is_paused)
>   				perf_event_period(pmc->perf_event,
>   						  get_sample_period(pmc, data));
>   			return 0;
>   		} else if ((pmc = get_fixed_pmc(pmu, msr))) {
>   			pmc->counter += data - pmc_read_counter(pmc);
> -			if (pmc->perf_event)
> +			if (pmc->perf_event && !pmc->is_paused)
>   				perf_event_period(pmc->perf_event,
>   						  get_sample_period(pmc, data));
>   			return 0;
> 

Queued, thanks.

Paolo


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-08-02 15:46 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-28 12:07 [PATCH] KVM: x86/pmu: Introduce pmc->is_paused to reduce the call time of perf interfaces Like Xu
2021-07-29 12:58 ` Peter Zijlstra
2021-07-29 13:46   ` Like Xu
2021-08-02 15:46 ` Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).