* [PATCH v2 0/4] KVM: x86: Add support of CMCI signaling and UCNA
@ 2022-04-12 22:31 Jue Wang
  2022-04-12 22:31 ` [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic Jue Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Jue Wang @ 2022-04-12 22:31 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li,
	Jim Mattson, Joerg Roedel
  Cc: Tony Luck, kvm, Jue Wang

This patch series adds support for Corrected Machine Check Interrupt (CMCI)
signaling and UnCorrectable No Action required (UCNA) error injection to
KVM.

UCNA errors signaled via CMCI allow a guest to be notified as soon as
uncorrectable memory errors are detected by background threads on the host,
e.g., threads that migrate guest memory across hosts.

Upon receiving UCNAs, the guest kernel immediately isolates the poisoned
pages, preventing future accesses that could result in fatal Machine Check
Exceptions.
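
For context, a hedged userspace sketch of how a VMM might inject such a
UCNA into a vCPU (assuming the existing KVM_X86_SET_MCE vCPU ioctl and a
vCPU already configured via KVM_X86_SETUP_MCE; vcpu_fd, the bank number and
the status encoding are illustrative only, not part of this series):

  #include <linux/kvm.h>
  #include <sys/ioctl.h>
  #include <stdint.h>

  /* Inject a UCNA: valid + uncorrected, but no PCC/S/AR and no MCE in progress. */
  static int inject_ucna(int vcpu_fd, uint8_t bank, uint64_t addr)
  {
  	struct kvm_x86_mce mce = {
  		.status = (1ULL << 63) |	/* MCI_STATUS_VAL */
  			  (1ULL << 61) |	/* MCI_STATUS_UC */
  			  (1ULL << 58),		/* MCI_STATUS_ADDRV */
  		.addr = addr,
  		.mcg_status = 0,		/* UCNA: no machine check in progress */
  		.bank = bank,
  	};

  	return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
  }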

Jue Wang (4):
  KVM: x86: Clean up KVM APIC LVT logic.
  KVM: x86: Add LVTCMCI support.
  KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs.
  KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs.

 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/lapic.c            | 56 ++++++++++++++------
 arch/x86/kvm/lapic.h            | 24 ++++++++-
 arch/x86/kvm/vmx/vmx.c          |  1 +
 arch/x86/kvm/x86.c              | 94 +++++++++++++++++++++++++++++----
 5 files changed, 149 insertions(+), 27 deletions(-)

-- 
2.35.1.1178.g4f1659d476-goog



* [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic.
  2022-04-12 22:31 [PATCH v2 0/4] KVM: x86: Add support of CMCI signaling and UCNA Jue Wang
@ 2022-04-12 22:31 ` Jue Wang
  2022-05-11 17:38   ` Sean Christopherson
  2022-04-12 22:31 ` [PATCH v2 2/4] KVM: x86: Add LVTCMCI support Jue Wang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Jue Wang @ 2022-04-12 22:31 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li,
	Jim Mattson, Joerg Roedel
  Cc: Tony Luck, kvm, Jue Wang

This is in preparation to add APIC_LVTCMCI support.

Signed-off-by: Jue Wang <juew@google.com>
---
 arch/x86/kvm/lapic.c | 33 +++++++++++++++++++--------------
 arch/x86/kvm/lapic.h | 19 ++++++++++++++++++-
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9322e6340a74..2c770e4c0e6c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -54,7 +54,7 @@
 #define PRIo64 "o"
 
 /* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION			(0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
+#define APIC_VERSION			0x14UL
 #define LAPIC_MMIO_LENGTH		(1 << 12)
 /* followed define is not in apicdef.h */
 #define MAX_APIC_VECTOR			256
@@ -364,10 +364,15 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
 	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
 }
 
+static inline int kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
+{
+	return KVM_APIC_MAX_NR_LVT_ENTRIES;
+}
+
 void kvm_apic_set_version(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	u32 v = APIC_VERSION;
+	u32 v = APIC_VERSION | ((kvm_apic_get_nr_lvt_entries(vcpu) - 1) << 16);
 
 	if (!lapic_in_kernel(vcpu))
 		return;
@@ -385,12 +390,13 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
 	kvm_lapic_set_reg(apic, APIC_LVR, v);
 }
 
-static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
-	LVT_MASK ,      /* part LVTT mask, timer mode mask added at runtime */
-	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
-	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
-	LINT_MASK, LINT_MASK,	/* LVT0-1 */
-	LVT_MASK		/* LVTERR */
+static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
+	[LVT_TIMER] = LVT_MASK,      /* timer mode mask added at runtime */
+	[LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
+	[LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
+	[LVT_LINT0] = LINT_MASK,
+	[LVT_LINT1] = LINT_MASK,
+	[LVT_ERROR] = LVT_MASK
 };
 
 static int find_highest_vector(void *bitmap)
@@ -2039,10 +2045,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 			int i;
 			u32 lvt_val;
 
-			for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
-				lvt_val = kvm_lapic_get_reg(apic,
-						       APIC_LVTT + 0x10 * i);
-				kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
+			for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
+				lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
+				kvm_lapic_set_reg(apic, APIC_LVTx(i),
 					     lvt_val | APIC_LVT_MASKED);
 			}
 			apic_update_lvtt(apic);
@@ -2341,8 +2346,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
 	kvm_apic_set_version(apic->vcpu);
 
-	for (i = 0; i < KVM_APIC_LVT_NUM; i++)
-		kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+	for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++)
+		kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
 	apic_update_lvtt(apic);
 	if (kvm_vcpu_is_reset_bsp(vcpu) &&
 	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2b44e533fc8d..5666441d5d1b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -10,7 +10,6 @@
 
 #define KVM_APIC_INIT		0
 #define KVM_APIC_SIPI		1
-#define KVM_APIC_LVT_NUM	6
 
 #define APIC_SHORT_MASK			0xc0000
 #define APIC_DEST_NOSHORT		0x0
@@ -29,6 +28,24 @@ enum lapic_mode {
 	LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
 };
 
+enum lapic_lvt_entry {
+	LVT_TIMER,
+	LVT_THERMAL_MONITOR,
+	LVT_PERFORMANCE_COUNTER,
+	LVT_LINT0,
+	LVT_LINT1,
+	LVT_ERROR,
+
+	KVM_APIC_MAX_NR_LVT_ENTRIES,
+};
+
+
+#define APIC_LVTx(x)                                                    \
+({                                                                      \
+	int __apic_reg = APIC_LVTT + 0x10 * (x);                        \
+	__apic_reg;                                                     \
+})
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
-- 
2.35.1.1178.g4f1659d476-goog



* [PATCH v2 2/4] KVM: x86: Add LVTCMCI support.
  2022-04-12 22:31 [PATCH v2 0/4] KVM: x86: Add support of CMCI signaling and UCNA Jue Wang
  2022-04-12 22:31 ` [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic Jue Wang
@ 2022-04-12 22:31 ` Jue Wang
  2022-05-11 17:55   ` Sean Christopherson
  2022-04-12 22:31 ` [PATCH v2 3/4] KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs Jue Wang
  2022-04-12 22:31 ` [PATCH v2 4/4] KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs Jue Wang
  3 siblings, 1 reply; 13+ messages in thread
From: Jue Wang @ 2022-04-12 22:31 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li,
	Jim Mattson, Joerg Roedel
  Cc: Tony Luck, kvm, Jue Wang

This feature is only enabled when the vCPU has opted in to enable
MCG_CMCI_P.
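
As background, a hedged sketch of the opt-in flow from the userspace side
(assuming the existing KVM_X86_GET_MCE_CAP_SUPPORTED and KVM_X86_SETUP_MCE
ioctls; kvm_fd and vcpu_fd are assumed to already exist, and error handling
is omitted):

  #include <linux/kvm.h>
  #include <sys/ioctl.h>
  #include <stdint.h>

  /* Opt a vCPU in to CMCI by keeping MCG_CMCI_P (IA32_MCG_CAP bit 10) in its mcg_cap. */
  static void setup_vcpu_mce(int kvm_fd, int vcpu_fd)
  {
  	uint64_t mcg_cap = 0;

  	ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap);
  	if (mcg_cap & (1ULL << 10))	/* MCG_CMCI_P */
  		ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
  }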

Signed-off-by: Jue Wang <juew@google.com>
---
 arch/x86/kvm/lapic.c | 33 ++++++++++++++++++++++++++-------
 arch/x86/kvm/lapic.h |  7 ++++++-
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2c770e4c0e6c..0b370ccd11a1 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -27,6 +27,7 @@
 #include <linux/math64.h>
 #include <linux/slab.h>
 #include <asm/processor.h>
+#include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/page.h>
 #include <asm/current.h>
@@ -364,9 +365,14 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
 	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
 }
 
+static inline bool kvm_is_cmci_supported(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mcg_cap & MCG_CMCI_P;
+}
+
 static inline int kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
 {
-	return KVM_APIC_MAX_NR_LVT_ENTRIES;
+	return KVM_APIC_MAX_NR_LVT_ENTRIES - !kvm_is_cmci_supported(vcpu);
 }
 
 void kvm_apic_set_version(struct kvm_vcpu *vcpu)
@@ -396,7 +402,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
 	[LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
 	[LVT_LINT0] = LINT_MASK,
 	[LVT_LINT1] = LINT_MASK,
-	[LVT_ERROR] = LVT_MASK
+	[LVT_ERROR] = LVT_MASK,
+	[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
 };
 
 static int find_highest_vector(void *bitmap)
@@ -1411,6 +1418,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 		APIC_REG_MASK(APIC_TMCCT) |
 		APIC_REG_MASK(APIC_TDCR);
 
+	if (kvm_is_cmci_supported(apic->vcpu))
+		valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
+
 	/* ARBPRI is not valid on x2APIC */
 	if (!apic_x2apic_mode(apic))
 		valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
@@ -2043,12 +2053,10 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 		apic_set_spiv(apic, val & mask);
 		if (!(val & APIC_SPIV_APIC_ENABLED)) {
 			int i;
-			u32 lvt_val;
 
-			for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
-				lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
+			for (i = 0; i < kvm_apic_get_nr_lvt_entries(apic->vcpu); i++) {
 				kvm_lapic_set_reg(apic, APIC_LVTx(i),
-					     lvt_val | APIC_LVT_MASKED);
+					kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
 			}
 			apic_update_lvtt(apic);
 			atomic_set(&apic->lapic_timer.pending, 0);
@@ -2098,6 +2106,17 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 		apic_update_lvtt(apic);
 		break;
 
+	case APIC_LVTCMCI:
+		if (!kvm_is_cmci_supported(apic->vcpu)) {
+			ret = 1;
+			break;
+		}
+		if (!kvm_apic_sw_enabled(apic))
+			val |= APIC_LVT_MASKED;
+		val &= apic_lvt_mask[LVT_CMCI];
+		kvm_lapic_set_reg(apic, APIC_LVTCMCI, val);
+		break;
+
 	case APIC_TMICT:
 		if (apic_lvtt_tscdeadline(apic))
 			break;
@@ -2346,7 +2365,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
 	kvm_apic_set_version(apic->vcpu);
 
-	for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++)
+	for (i = 0; i < kvm_apic_get_nr_lvt_entries(vcpu); i++)
 		kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
 	apic_update_lvtt(apic);
 	if (kvm_vcpu_is_reset_bsp(vcpu) &&
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 5666441d5d1b..9f9f74b17918 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -35,6 +35,7 @@ enum lapic_lvt_entry {
 	LVT_LINT0,
 	LVT_LINT1,
 	LVT_ERROR,
+	LVT_CMCI,
 
 	KVM_APIC_MAX_NR_LVT_ENTRIES,
 };
@@ -42,7 +43,11 @@ enum lapic_lvt_entry {
 
 #define APIC_LVTx(x)                                                    \
 ({                                                                      \
-	int __apic_reg = APIC_LVTT + 0x10 * (x);                        \
+	int __apic_reg;                                                 \
+	if ((x) == LVT_CMCI)                                            \
+		__apic_reg = APIC_LVTCMCI;				\
+	else								\
+		__apic_reg = APIC_LVTT + 0x10 * (x);                    \
 	__apic_reg;                                                     \
 })
 
-- 
2.35.1.1178.g4f1659d476-goog



* [PATCH v2 3/4] KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs.
  2022-04-12 22:31 [PATCH v2 0/4] KVM: x86: Add support of CMCI signaling and UCNA Jue Wang
  2022-04-12 22:31 ` [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic Jue Wang
  2022-04-12 22:31 ` [PATCH v2 2/4] KVM: x86: Add LVTCMCI support Jue Wang
@ 2022-04-12 22:31 ` Jue Wang
  2022-05-11 19:00   ` Sean Christopherson
  2022-04-12 22:31 ` [PATCH v2 4/4] KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs Jue Wang
  3 siblings, 1 reply; 13+ messages in thread
From: Jue Wang @ 2022-04-12 22:31 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li,
	Jim Mattson, Joerg Roedel
  Cc: Tony Luck, kvm, Jue Wang

Note the support of CMCI (MCG_CMCI_P) is not enabled in
kvm_mce_cap_supported yet.
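
For background, a hedged guest-side sketch of the semantics being emulated
here (loosely modeled on Linux's cmci_discover(); assumes <asm/mce.h> and
<asm/msr.h> for the MSR and bit definitions, and guest_enable_cmci() is a
made-up name):

  /* Enable CMCI on one bank: set MCI_CTL2_CMCI_EN and a threshold of 1. */
  static void guest_enable_cmci(unsigned int bank)
  {
  	u64 val;

  	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
  	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
  	val |= MCI_CTL2_CMCI_EN | 1;
  	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
  }

Bits outside MCI_CTL2_CMCI_EN and the threshold field are reserved, which is
what the #GP check added to set_msr_mce() below enforces.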

Signed-off-by: Jue Wang <juew@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 50 +++++++++++++++++++++++++--------
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ec9830d2aabf..639ef92d01d1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -800,6 +800,7 @@ struct kvm_vcpu_arch {
 	u64 mcg_ctl;
 	u64 mcg_ext_ctl;
 	u64 *mce_banks;
+	u64 *mci_ctl2_banks;
 
 	/* Cache MMIO info */
 	u64 mmio_gva;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eb4029660bd9..73c64d2b9e60 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3167,6 +3167,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	unsigned bank_num = mcg_cap & 0xff;
 	u32 msr = msr_info->index;
 	u64 data = msr_info->data;
+	u32 offset;
 
 	switch (msr) {
 	case MSR_IA32_MCG_STATUS:
@@ -3180,10 +3181,22 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vcpu->arch.mcg_ctl = data;
 		break;
+	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+		if (!(mcg_cap & MCG_CMCI_P) &&
+				(data || !msr_info->host_initiated))
+			return 1;
+		/* An attempt to write a 1 to a reserved bit raises #GP */
+		if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
+			return 1;
+		offset = array_index_nospec(
+				msr - MSR_IA32_MC0_CTL2,
+				MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
+		vcpu->arch.mci_ctl2_banks[offset] = data;
+		break;
 	default:
 		if (msr >= MSR_IA32_MC0_CTL &&
 		    msr < MSR_IA32_MCx_CTL(bank_num)) {
-			u32 offset = array_index_nospec(
+			offset = array_index_nospec(
 				msr - MSR_IA32_MC0_CTL,
 				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
 
@@ -3489,7 +3502,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		}
 		break;
-	case 0x200 ... 0x2ff:
+	case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+	case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
 		return kvm_mtrr_set_msr(vcpu, msr, data);
 	case MSR_IA32_APICBASE:
 		return kvm_set_apic_base(vcpu, msr_info);
@@ -3646,6 +3660,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
 		return set_msr_mce(vcpu, msr_info);
 
 	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
@@ -3750,6 +3765,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 	u64 data;
 	u64 mcg_cap = vcpu->arch.mcg_cap;
 	unsigned bank_num = mcg_cap & 0xff;
+	u32 offset;
 
 	switch (msr) {
 	case MSR_IA32_P5_MC_ADDR:
@@ -3767,10 +3783,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 	case MSR_IA32_MCG_STATUS:
 		data = vcpu->arch.mcg_status;
 		break;
+	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+		if (!(mcg_cap & MCG_CMCI_P) && !host)
+			return 1;
+		offset = array_index_nospec(
+				msr - MSR_IA32_MC0_CTL2,
+				MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
+		data = vcpu->arch.mci_ctl2_banks[offset];
+		break;
 	default:
 		if (msr >= MSR_IA32_MC0_CTL &&
 		    msr < MSR_IA32_MCx_CTL(bank_num)) {
-			u32 offset = array_index_nospec(
+			offset = array_index_nospec(
 				msr - MSR_IA32_MC0_CTL,
 				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
 
@@ -3873,7 +3897,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	}
 	case MSR_MTRRcap:
-	case 0x200 ... 0x2ff:
+	case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+	case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
 		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
 	case 0xcd: /* fsb frequency */
 		msr_info->data = 3;
@@ -3989,6 +4014,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
 		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
 				   msr_info->host_initiated);
 	case MSR_IA32_XSS:
@@ -4737,12 +4763,12 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 		goto out;
 	r = 0;
 	vcpu->arch.mcg_cap = mcg_cap;
-	/* Init IA32_MCG_CTL to all 1s */
-	if (mcg_cap & MCG_CTL_P)
-		vcpu->arch.mcg_ctl = ~(u64)0;
-	/* Init IA32_MCi_CTL to all 1s */
-	for (bank = 0; bank < bank_num; bank++)
+	/* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
+	for (bank = 0; bank < bank_num; bank++) {
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+		if (mcg_cap & MCG_CMCI_P)
+			vcpu->arch.mci_ctl2_banks[bank] = 0;
+	}
 
 	static_call(kvm_x86_setup_mce)(vcpu);
 out:
@@ -11126,9 +11152,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 		goto fail_free_lapic;
 	vcpu->arch.pio_data = page_address(page);
 
-	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
+				       GFP_KERNEL_ACCOUNT);
+	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
 				       GFP_KERNEL_ACCOUNT);
-	if (!vcpu->arch.mce_banks)
+	if (!vcpu->arch.mce_banks | !vcpu->arch.mci_ctl2_banks)
 		goto fail_free_pio_data;
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
-- 
2.35.1.1178.g4f1659d476-goog



* [PATCH v2 4/4] KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs.
  2022-04-12 22:31 [PATCH v2 0/4] KVM: x86: Add support of CMCI signaling and UCNA Jue Wang
                   ` (2 preceding siblings ...)
  2022-04-12 22:31 ` [PATCH v2 3/4] KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs Jue Wang
@ 2022-04-12 22:31 ` Jue Wang
  2022-05-11 19:20   ` Sean Christopherson
  3 siblings, 1 reply; 13+ messages in thread
From: Jue Wang @ 2022-04-12 22:31 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li,
	Jim Mattson, Joerg Roedel
  Cc: Tony Luck, kvm, Jue Wang

Note prior to this patch, the UCNA type of signaling can already be
processed by kvm_vcpu_ioctl_x86_set_mce and does not result in correct
CMCI signaling semantic.
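
For readers unfamiliar with the MCA encoding, a hedged illustration of what
this series treats as a UCNA in kvm_vcpu_ioctl_x86_set_mce (following the
SDM's UCNA definition; bit numbers refer to IA32_MCi_STATUS):

  /*
   * A UCNA record is valid and uncorrected but is not signaled as a
   * machine check:
   *
   *   mce->mcg_status          == 0   no machine check in progress
   *   mce->status  VAL (63)    == 1   record is valid
   *                UC  (61)    == 1   error was not corrected
   *                PCC (57)    == 0   processor context not corrupted
   *                S   (56)    == 0   not signaled via #MC
   *                AR  (55)    == 0   no action required
   */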

Signed-off-by: Jue Wang <juew@google.com>
---
 arch/x86/kvm/vmx/vmx.c |  1 +
 arch/x86/kvm/x86.c     | 48 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b730d799c26e..63aa2b3d30ca 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
 	}
 
 	kvm_mce_cap_supported |= MCG_LMCE_P;
+	kvm_mce_cap_supported |= MCG_CMCI_P;
 
 	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
 		return -EINVAL;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 73c64d2b9e60..eb6058ca1e70 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4775,6 +4775,50 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	return r;
 }
 
+static bool is_ucna(u64 mcg_status, u64 mci_status)
+{
+	return !mcg_status &&
+		!(mci_status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR));
+}
+
+static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
+		struct kvm_x86_mce *mce)
+{
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned int bank_num = mcg_cap & 0xff;
+	u64 *banks = vcpu->arch.mce_banks;
+
+	/* Check for legal bank number in guest */
+	if (mce->bank >= bank_num)
+		return -EINVAL;
+
+	/*
+	 * UCNA signals should not set bits that are only used for machine check
+	 * exceptions.
+	 */
+	if (mce->mcg_status ||
+		(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)))
+		return -EINVAL;
+
+	/* UCNA must have VAL and UC bits set */
+	if (!(mce->status & MCI_STATUS_VAL) || !(mce->status & MCI_STATUS_UC))
+		return -EINVAL;
+
+	banks += 4 * mce->bank;
+	banks[1] = mce->status;
+	banks[2] = mce->addr;
+	banks[3] = mce->misc;
+	vcpu->arch.mcg_status = mce->mcg_status;
+
+	if (!(mcg_cap & MCG_CMCI_P) || !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
+		return 0;
+
+	if (lapic_in_kernel(vcpu))
+		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
+
+	return 0;
+}
+
 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 				      struct kvm_x86_mce *mce)
 {
@@ -4784,6 +4828,10 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 
 	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
 		return -EINVAL;
+
+	if (is_ucna(mce->mcg_status, mce->status))
+		return kvm_vcpu_x86_set_ucna(vcpu, mce);
+
 	/*
 	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
 	 * reporting is disabled
-- 
2.35.1.1178.g4f1659d476-goog



* Re: [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic.
  2022-04-12 22:31 ` [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic Jue Wang
@ 2022-05-11 17:38   ` Sean Christopherson
  2022-05-12 17:57     ` Jue Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Sean Christopherson @ 2022-05-11 17:38 UTC (permalink / raw)
  To: Jue Wang
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

On Tue, Apr 12, 2022, Jue Wang wrote:
> This is in preparation to add APIC_LVTCMCI support.

There is not nearly enough information in this changelog.  Same goes for all other
patches in the series.  And when you start writing changelogs to explain what is
being done and why, I suspect you'll find that this should be further broken up
into multiple patches.

 1. Make APIC_VERSION capture only the magic 0x14UL
 2. Fill apic_lvt_mask with enums / explicit entries.
 3. Add APIC_LVTx() macro

And proper upstream etiquette would be to add

  Suggested-by: Sean Christopherson <seanjc@google.com>

for #2 and #3.  I don't care much about the attribution (though that's nice too),
but more importantly it provides a bit of context for others that get involved
later in the series (sometimes unwillingly).  E.g. if someone encounters a bug
with a patch, the Suggested-by gives them one more person to loop into the
discussion.  Ditto for other reviewers, e.g. if someone starts reviewing the
series at v3 or whatever, it provides some background on how the series got to
v3 without them having to actually look at v1 or v2.

> Signed-off-by: Jue Wang <juew@google.com>
> ---
>  arch/x86/kvm/lapic.c | 33 +++++++++++++++++++--------------
>  arch/x86/kvm/lapic.h | 19 ++++++++++++++++++-
>  2 files changed, 37 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9322e6340a74..2c770e4c0e6c 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -54,7 +54,7 @@
>  #define PRIo64 "o"
>  
>  /* 14 is the version for Xeon and Pentium 8.4.8*/
> -#define APIC_VERSION			(0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
> +#define APIC_VERSION			0x14UL
>  #define LAPIC_MMIO_LENGTH		(1 << 12)
>  /* followed define is not in apicdef.h */
>  #define MAX_APIC_VECTOR			256
> @@ -364,10 +364,15 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
>  	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
>  }
>  
> +static inline int kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
> +{
> +	return KVM_APIC_MAX_NR_LVT_ENTRIES;
> +}

I think it makes sense to introduce this helper with the CMCI patch.  Until then,
requiring @vcpu to get the max number of entries is misleading and unnecessary.

Case in point, this patch is broken in that the APIC_SPIV path in kvm_lapic_reg_write()
uses the #define directly, which necessitates fixup in the CMCI patch to use this
helper.

> +
>  void kvm_apic_set_version(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_lapic *apic = vcpu->arch.apic;
> -	u32 v = APIC_VERSION;
> +	u32 v = APIC_VERSION | ((kvm_apic_get_nr_lvt_entries(vcpu) - 1) << 16);
>  
>  	if (!lapic_in_kernel(vcpu))
>  		return;
> @@ -385,12 +390,13 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
>  	kvm_lapic_set_reg(apic, APIC_LVR, v);
>  }
>  
> -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
> -	LVT_MASK ,      /* part LVTT mask, timer mode mask added at runtime */
> -	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
> -	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
> -	LINT_MASK, LINT_MASK,	/* LVT0-1 */
> -	LVT_MASK		/* LVTERR */
> +static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
> +	[LVT_TIMER] = LVT_MASK,      /* timer mode mask added at runtime */
> +	[LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
> +	[LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
> +	[LVT_LINT0] = LINT_MASK,
> +	[LVT_LINT1] = LINT_MASK,
> +	[LVT_ERROR] = LVT_MASK
>  };
>  
>  static int find_highest_vector(void *bitmap)
> @@ -2039,10 +2045,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>  			int i;
>  			u32 lvt_val;
>  
> -			for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
> -				lvt_val = kvm_lapic_get_reg(apic,
> -						       APIC_LVTT + 0x10 * i);
> -				kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
> +			for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
> +				lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
> +				kvm_lapic_set_reg(apic, APIC_LVTx(i),
>  					     lvt_val | APIC_LVT_MASKED);
>  			}
>  			apic_update_lvtt(apic);
> @@ -2341,8 +2346,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>  		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
>  	kvm_apic_set_version(apic->vcpu);
>  
> -	for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> -		kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> +	for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++)
> +		kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
>  	apic_update_lvtt(apic);
>  	if (kvm_vcpu_is_reset_bsp(vcpu) &&
>  	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 2b44e533fc8d..5666441d5d1b 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -10,7 +10,6 @@
>  
>  #define KVM_APIC_INIT		0
>  #define KVM_APIC_SIPI		1
> -#define KVM_APIC_LVT_NUM	6
>  
>  #define APIC_SHORT_MASK			0xc0000
>  #define APIC_DEST_NOSHORT		0x0
> @@ -29,6 +28,24 @@ enum lapic_mode {
>  	LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
>  };
>  
> +enum lapic_lvt_entry {
> +	LVT_TIMER,
> +	LVT_THERMAL_MONITOR,
> +	LVT_PERFORMANCE_COUNTER,
> +	LVT_LINT0,
> +	LVT_LINT1,
> +	LVT_ERROR,
> +
> +	KVM_APIC_MAX_NR_LVT_ENTRIES,
> +};
> +
> +
> +#define APIC_LVTx(x)                                                    \
> +({                                                                      \
> +	int __apic_reg = APIC_LVTT + 0x10 * (x);                        \

An intermediate variable is completely unnecessary.  This should do just fine.

  #define APIC_LVTx(x) (APIC_LVTT + 0x10 * (x))

Yes, the macro _may_ eventually become a multi-line beast with a variable when
CMCI support is added, but again that belongs in the CMCI patch.  That way this
patch doesn't need to change if we decide that even the CMCI-aware version can
just be:

  #define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))


> +	__apic_reg;                                                     \
> +})
> +
>  struct kvm_timer {
>  	struct hrtimer timer;
>  	s64 period; 				/* unit: ns */
> -- 
> 2.35.1.1178.g4f1659d476-goog
> 


* Re: [PATCH v2 2/4] KVM: x86: Add LVTCMCI support.
  2022-04-12 22:31 ` [PATCH v2 2/4] KVM: x86: Add LVTCMCI support Jue Wang
@ 2022-05-11 17:55   ` Sean Christopherson
  2022-05-12 18:01     ` Jue Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Sean Christopherson @ 2022-05-11 17:55 UTC (permalink / raw)
  To: Jue Wang
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

On Tue, Apr 12, 2022, Jue Wang wrote:
> This feature is only enabled when the vCPU has opted in to enable
> MCG_CMCI_P.

Again, waaaay too terse.  What is CMCI?  What does "support" mean since an astute
reader will notice that it's impossible for MCG_CMCI_P to be set.  How/when will
the vCPU (which is wrong no?  doesn't userspace do the write?) be able to opt-in?

> Signed-off-by: Jue Wang <juew@google.com>
> ---
>  arch/x86/kvm/lapic.c | 33 ++++++++++++++++++++++++++-------
>  arch/x86/kvm/lapic.h |  7 ++++++-
>  2 files changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 2c770e4c0e6c..0b370ccd11a1 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -27,6 +27,7 @@
>  #include <linux/math64.h>
>  #include <linux/slab.h>
>  #include <asm/processor.h>
> +#include <asm/mce.h>
>  #include <asm/msr.h>
>  #include <asm/page.h>
>  #include <asm/current.h>
> @@ -364,9 +365,14 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
>  	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
>  }
>  
> +static inline bool kvm_is_cmci_supported(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.mcg_cap & MCG_CMCI_P;
> +}
> +
>  static inline int kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)

I think it makes sense to take @apic here, not @vcpu, since this is an APIC-specific
helper.  kvm_apic_set_version() will need to be modified to not call
kvm_apic_get_nr_lvt_entries() until after it has verified the local APIC is in-kernel,
but IMO that's a good thing.

>  {
> -	return KVM_APIC_MAX_NR_LVT_ENTRIES;
> +	return KVM_APIC_MAX_NR_LVT_ENTRIES - !kvm_is_cmci_supported(vcpu);
>  }
>  
>  void kvm_apic_set_version(struct kvm_vcpu *vcpu)


* Re: [PATCH v2 3/4] KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs.
  2022-04-12 22:31 ` [PATCH v2 3/4] KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs Jue Wang
@ 2022-05-11 19:00   ` Sean Christopherson
  2022-05-13  4:45     ` Jue Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Sean Christopherson @ 2022-05-11 19:00 UTC (permalink / raw)
  To: Jue Wang
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

On Tue, Apr 12, 2022, Jue Wang wrote:
> Note the support of CMCI (MCG_CMCI_P) is not enabled in
> kvm_mce_cap_supported yet.

You can probably guess what I would say here :-)  A reader should not have to go
wade through Intel's SDM to understand the relationship between CTL2 and CMCI.

> Signed-off-by: Jue Wang <juew@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/x86.c              | 50 +++++++++++++++++++++++++--------
>  2 files changed, 40 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ec9830d2aabf..639ef92d01d1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -800,6 +800,7 @@ struct kvm_vcpu_arch {
>  	u64 mcg_ctl;
>  	u64 mcg_ext_ctl;
>  	u64 *mce_banks;
> +	u64 *mci_ctl2_banks;
>  
>  	/* Cache MMIO info */
>  	u64 mmio_gva;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eb4029660bd9..73c64d2b9e60 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3167,6 +3167,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	unsigned bank_num = mcg_cap & 0xff;
>  	u32 msr = msr_info->index;
>  	u64 data = msr_info->data;
> +	u32 offset;
>  
>  	switch (msr) {
>  	case MSR_IA32_MCG_STATUS:
> @@ -3180,10 +3181,22 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  			return 1;
>  		vcpu->arch.mcg_ctl = data;
>  		break;
> +	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:

This is wrong.  Well, incomplete.  It will allow writes to MSRs that do not exist,
i.e. if bank_num <= offset < KVM_MAX_MCE_BANKS.

I believe this will suffice and is also the simplest?

		if (msr >= MSR_IA32_MCx_CTL2(bank_num))
			return 1;

Actually, tying in with the array_index_nospec() comments below, if this captures
the last MSR in a variable, then the overrun is much more reasonable (sample idea
at the bottom).

> +		if (!(mcg_cap & MCG_CMCI_P) &&
> +				(data || !msr_info->host_initiated))

Funky indentation.  Should be:

		if (!(mcg_cap & MCG_CMCI_P) &&
		    (data || !msr_info->host_initiated))
			return 1;

> +			return 1;
> +		/* An attempt to write a 1 to a reserved bit raises #GP */
> +		if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
> +			return 1;
> +		offset = array_index_nospec(
> +				msr - MSR_IA32_MC0_CTL2,
> +				MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);

My preference would be to let this run over (by a fair amount)

		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
					    MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);

But if we use a local variable, there's no overrun:

	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
		last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
		if (msr > last_msr)
			return 1;

		if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
			return 1;
		/* An attempt to write a 1 to a reserved bit raises #GP */
		if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
			return 1;
		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
					    last_msr + 1 - MSR_IA32_MC0_CTL2);
		vcpu->arch.mci_ctl2_banks[offset] = data;
		break;


And if we go that route, in a follow-up patch at the end of the series, clean up
the "default" path to hoist the if-statement into a proper case statement (unless
I've misread the code), e.g. with some opportunistic cleanup:

	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
		last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
		if (msr > last_msr)
			return 1;

		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
					    last_msr + 1 - MSR_IA32_MC0_CTL);

		/*
		 * Only 0 or all 1s can be written to IA32_MCi_CTL, though some
		 * Linux kernels clear bit 10 in bank 4 to workaround a BIOS/GART
		 * TLB issue on AMD K8s, ignore this to avoid an uncatched #GP in
		 * the guest.
		 */
		if ((offset & 0x3) == 0 &&
		    data != 0 && (data | (1 << 10)) != ~(u64)0)
			return -1;

		/* MCi_STATUS */
		if (!msr_info->host_initiated && (offset & 0x3) == 1 &&
		    data != 0 && !can_set_mci_status(vcpu))
			return -1;

		vcpu->arch.mce_banks[offset] = data;
		break;

Actually, rereading that, isn't "return -1" wrong?  That will cause kvm_emulate_wrmsr()
to exit to userspace, not inject a #GP.

*sigh*  Yep, indeed, the -1 gets interpreted as -EPERM and kills the guest.

Scratch the idea of doing the above on top, I'll send separate patches and a KUT
testcase.  I'll Cc you on the patches, it'd save Paolo a merge conflict (or you a
rebase) if this series is based on top of that bugfix + cleanup.

> +		vcpu->arch.mci_ctl2_banks[offset] = data;
> +		break;
>  	default:
>  		if (msr >= MSR_IA32_MC0_CTL &&
>  		    msr < MSR_IA32_MCx_CTL(bank_num)) {
> -			u32 offset = array_index_nospec(
> +			offset = array_index_nospec(
>  				msr - MSR_IA32_MC0_CTL,
>  				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
>  
> @@ -3489,7 +3502,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  			return 1;
>  		}
>  		break;
> -	case 0x200 ... 0x2ff:
> +	case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> +	case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
>  		return kvm_mtrr_set_msr(vcpu, msr, data);
>  	case MSR_IA32_APICBASE:
>  		return kvm_set_apic_base(vcpu, msr_info);
> @@ -3646,6 +3660,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	case MSR_IA32_MCG_CTL:
>  	case MSR_IA32_MCG_STATUS:
>  	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>  		return set_msr_mce(vcpu, msr_info);
>  
>  	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> @@ -3750,6 +3765,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>  	u64 data;
>  	u64 mcg_cap = vcpu->arch.mcg_cap;
>  	unsigned bank_num = mcg_cap & 0xff;
> +	u32 offset;
>  
>  	switch (msr) {
>  	case MSR_IA32_P5_MC_ADDR:
> @@ -3767,10 +3783,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>  	case MSR_IA32_MCG_STATUS:
>  		data = vcpu->arch.mcg_status;
>  		break;
> +	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:

Same comments as the WRMSR path.  I'll also handle the "default" case in my cleanup.

> +		if (!(mcg_cap & MCG_CMCI_P) && !host)
> +			return 1;
> +		offset = array_index_nospec(
> +				msr - MSR_IA32_MC0_CTL2,
> +				MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> +		data = vcpu->arch.mci_ctl2_banks[offset];
> +		break;
>  	default:
>  		if (msr >= MSR_IA32_MC0_CTL &&
>  		    msr < MSR_IA32_MCx_CTL(bank_num)) {
> -			u32 offset = array_index_nospec(
> +			offset = array_index_nospec(
>  				msr - MSR_IA32_MC0_CTL,
>  				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
>  

...

> @@ -11126,9 +11152,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  		goto fail_free_lapic;
>  	vcpu->arch.pio_data = page_address(page);
>  
> -	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
> +	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
> +				       GFP_KERNEL_ACCOUNT);

Switching to kcalloc() should be a separate patch.

> +	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
>  				       GFP_KERNEL_ACCOUNT);
> -	if (!vcpu->arch.mce_banks)
> +	if (!vcpu->arch.mce_banks | !vcpu->arch.mci_ctl2_banks)

This wants to be a logical-OR, not a bitwise-OR.

Oh, and vcpu->arch.mci_ctl2_banks needs to be freed if a later step fails.

>  		goto fail_free_pio_data;
>  	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
>  
> -- 
> 2.35.1.1178.g4f1659d476-goog
> 


* Re: [PATCH v2 4/4] KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs.
  2022-04-12 22:31 ` [PATCH v2 4/4] KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs Jue Wang
@ 2022-05-11 19:20   ` Sean Christopherson
  2022-05-13  4:54     ` Jue Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Sean Christopherson @ 2022-05-11 19:20 UTC (permalink / raw)
  To: Jue Wang
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

On Tue, Apr 12, 2022, Jue Wang wrote:
> Note prior to this patch, the UCNA type of signaling can already be
> processed by kvm_vcpu_ioctl_x86_set_mce and does not result in correct
> CMCI signaling semantic.

Same as before...

UCNA should be spelled out at least once.

> 
> Signed-off-by: Jue Wang <juew@google.com>
> ---
>  arch/x86/kvm/vmx/vmx.c |  1 +
>  arch/x86/kvm/x86.c     | 48 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 49 insertions(+)
> 
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b730d799c26e..63aa2b3d30ca 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
>  	}
>  
>  	kvm_mce_cap_supported |= MCG_LMCE_P;
> +	kvm_mce_cap_supported |= MCG_CMCI_P;

Is there really no hardware dependency on CMCI?  Honest question.  If not, that
should be explicitly called out in the changelog.

>  	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
>  		return -EINVAL;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 73c64d2b9e60..eb6058ca1e70 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4775,6 +4775,50 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
>  	return r;
>  }
>  
> +static bool is_ucna(u64 mcg_status, u64 mci_status)

Any reason not to take 'struct kvm_x86_mce *mce'?

> +{
> +	return !mcg_status &&
> +		!(mci_status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR));

As someone who knows nothing about MCI encoding, can you add a function comment
explaining, in detail, what's going on here?

Also, my preference is to align the indentation on multi-line returns.  Paolo scoffs
at this nit of mine, but he's obviously wrong ;-)

	return !mcg_status &&
	       !(mci_status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR));

> +}
> +
> +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> +		struct kvm_x86_mce *mce)

Please align the params.  Actually, just let it run over, it's a single char.

static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce)

> +{
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned int bank_num = mcg_cap & 0xff;
> +	u64 *banks = vcpu->arch.mce_banks;
> +
> +	/* Check for legal bank number in guest */

Eh, don't think this warrants a comment.

> +	if (mce->bank >= bank_num)
> +		return -EINVAL;
> +
> +	/*
> +	 * UCNA signals should not set bits that are only used for machine check
> +	 * exceptions.
> +	 */
> +	if (mce->mcg_status ||
> +		(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)))

Unless mine eyes deceive me, this is the same as:

	if (!is_ucna(mce->mcg_status, mce->status))

> +		return -EINVAL;
> +
> +	/* UCNA must have VAL and UC bits set */
> +	if (!(mce->status & MCI_STATUS_VAL) || !(mce->status & MCI_STATUS_UC))
> +		return -EINVAL;
> +
> +	banks += 4 * mce->bank;
> +	banks[1] = mce->status;
> +	banks[2] = mce->addr;
> +	banks[3] = mce->misc;
> +	vcpu->arch.mcg_status = mce->mcg_status;
> +
> +	if (!(mcg_cap & MCG_CMCI_P) || !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))

This one's worth wrapping, that's quite a long line, and there's a natural split point:

	if (!(mcg_cap & MCG_CMCI_P) ||
	    !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
		return 0;


> +		return 0;
> +
> +	if (lapic_in_kernel(vcpu))
> +		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> +
> +	return 0;
> +}
> +
>  static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
>  				      struct kvm_x86_mce *mce)
>  {
> @@ -4784,6 +4828,10 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
>  
>  	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
>  		return -EINVAL;
> +
> +	if (is_ucna(mce->mcg_status, mce->status))
> +		return kvm_vcpu_x86_set_ucna(vcpu, mce);
> +
>  	/*
>  	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
>  	 * reporting is disabled
> -- 
> 2.35.1.1178.g4f1659d476-goog
> 


* Re: [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic.
  2022-05-11 17:38   ` Sean Christopherson
@ 2022-05-12 17:57     ` Jue Wang
  0 siblings, 0 replies; 13+ messages in thread
From: Jue Wang @ 2022-05-12 17:57 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

Thanks a lot, Sean!


On Wed, May 11, 2022 at 10:38 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Apr 12, 2022, Jue Wang wrote:
> > This is in preparation to add APIC_LVTCMCI support.
>
> There is not nearly enough information in this changelog.  Same goes for all other
> patches in the series.  And when you start writing changelogs to explain what is
> being done and why, I suspect you'll find that this should be further broken up
> into multiple patches.
I realized the changelogs generally lack sufficient information and
background, and I am rewriting them with the necessary context.

>
>  1. Make APIC_VERSION capture only the magic 0x14UL
>  2. Fill apic_lvt_mask with enums / explicit entries.
>  3. Add APIC_LVTx() macro
>
> And proper upstream etiquette would be to add
>
>   Suggested-by: Sean Christopherson <seanjc@google.com>
>
> for #2 and #3.  I don't care much about the attribution (though that's nice too),
> but more importantly it provides a bit of context for others that get involved
> later in the series (sometimes unwillingly).  E.g. if someone encounters a bug
> with a patch, the Suggested-by gives them one more person to loop into the
> discussion.  Ditto for other reviewers, e.g. if someone starts reviewing the
> series at v3 or whatever, it provides some background on how the series got to
> v3 without them having to actually look at v1 or v2.

Thanks for walking me through this process and these practices. I am
breaking this patch into three and adding "Suggested-by: Sean
Christopherson ..." to them. Is it OK if I also add you as
"Suggested-by" to the later patches in this series?


>
> > Signed-off-by: Jue Wang <juew@google.com>
> > ---
> >  arch/x86/kvm/lapic.c | 33 +++++++++++++++++++--------------
> >  arch/x86/kvm/lapic.h | 19 ++++++++++++++++++-
> >  2 files changed, 37 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 9322e6340a74..2c770e4c0e6c 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -54,7 +54,7 @@
> >  #define PRIo64 "o"
> >
> >  /* 14 is the version for Xeon and Pentium 8.4.8*/
> > -#define APIC_VERSION                 (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
> > +#define APIC_VERSION                 0x14UL
> >  #define LAPIC_MMIO_LENGTH            (1 << 12)
> >  /* followed define is not in apicdef.h */
> >  #define MAX_APIC_VECTOR                      256
> > @@ -364,10 +364,15 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
> >       return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
> >  }
> >
> > +static inline int kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
> > +{
> > +     return KVM_APIC_MAX_NR_LVT_ENTRIES;
> > +}
>
> I think it makes sense to introduce this helper with the CMCI patch.  Until then,
> requiring @vcpu to get the max number of entries is misleading and unnecessary.
>
> Case in point, this patch is broken in that the APIC_SPIV path in kvm_lapic_reg_write()
> uses the #define directly, which necessitates fixup in the CMCI patch to use this
> helper.
>
Ack, will incorporate this and the comments below into V3.
> > +
> >  void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> >  {
> >       struct kvm_lapic *apic = vcpu->arch.apic;
> > -     u32 v = APIC_VERSION;
> > +     u32 v = APIC_VERSION | ((kvm_apic_get_nr_lvt_entries(vcpu) - 1) << 16);
> >
> >       if (!lapic_in_kernel(vcpu))
> >               return;
> > @@ -385,12 +390,13 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> >       kvm_lapic_set_reg(apic, APIC_LVR, v);
> >  }
> >
> > -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
> > -     LVT_MASK ,      /* part LVTT mask, timer mode mask added at runtime */
> > -     LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
> > -     LVT_MASK | APIC_MODE_MASK,      /* LVTPC */
> > -     LINT_MASK, LINT_MASK,   /* LVT0-1 */
> > -     LVT_MASK                /* LVTERR */
> > +static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
> > +     [LVT_TIMER] = LVT_MASK,      /* timer mode mask added at runtime */
> > +     [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
> > +     [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
> > +     [LVT_LINT0] = LINT_MASK,
> > +     [LVT_LINT1] = LINT_MASK,
> > +     [LVT_ERROR] = LVT_MASK
> >  };
> >
> >  static int find_highest_vector(void *bitmap)
> > @@ -2039,10 +2045,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> >                       int i;
> >                       u32 lvt_val;
> >
> > -                     for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
> > -                             lvt_val = kvm_lapic_get_reg(apic,
> > -                                                    APIC_LVTT + 0x10 * i);
> > -                             kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
> > +                     for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
> > +                             lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
> > +                             kvm_lapic_set_reg(apic, APIC_LVTx(i),
> >                                            lvt_val | APIC_LVT_MASKED);
> >                       }
> >                       apic_update_lvtt(apic);
> > @@ -2341,8 +2346,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> >               kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
> >       kvm_apic_set_version(apic->vcpu);
> >
> > -     for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> > -             kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> > +     for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++)
> > +             kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
> >       apic_update_lvtt(apic);
> >       if (kvm_vcpu_is_reset_bsp(vcpu) &&
> >           kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index 2b44e533fc8d..5666441d5d1b 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -10,7 +10,6 @@
> >
> >  #define KVM_APIC_INIT                0
> >  #define KVM_APIC_SIPI                1
> > -#define KVM_APIC_LVT_NUM     6
> >
> >  #define APIC_SHORT_MASK                      0xc0000
> >  #define APIC_DEST_NOSHORT            0x0
> > @@ -29,6 +28,24 @@ enum lapic_mode {
> >       LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
> >  };
> >
> > +enum lapic_lvt_entry {
> > +     LVT_TIMER,
> > +     LVT_THERMAL_MONITOR,
> > +     LVT_PERFORMANCE_COUNTER,
> > +     LVT_LINT0,
> > +     LVT_LINT1,
> > +     LVT_ERROR,
> > +
> > +     KVM_APIC_MAX_NR_LVT_ENTRIES,
> > +};
> > +
> > +
> > +#define APIC_LVTx(x)                                                    \
> > +({                                                                      \
> > +     int __apic_reg = APIC_LVTT + 0x10 * (x);                        \
>
> An intermediate variable is completely unnecessary.  This should do just fine.
>
>   #define APIC_LVTx(x) (APIC_LVTT + 0x10 * (x))
>
> Yes, the macro _may_ eventually become a multi-line beast with a variable when
> CMCI support is added, but again that belongs in the CMCI patch.  That way this
> patch doesn't need to change if we decide that even the CMCI-aware version can
> just be:
>
>   #define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))
>
>
> > +     __apic_reg;                                                     \
> > +})
> > +
> >  struct kvm_timer {
> >       struct hrtimer timer;
> >       s64 period;                             /* unit: ns */
> > --
> > 2.35.1.1178.g4f1659d476-goog
> >


* Re: [PATCH v2 2/4] KVM: x86: Add LVTCMCI support.
  2022-05-11 17:55   ` Sean Christopherson
@ 2022-05-12 18:01     ` Jue Wang
  0 siblings, 0 replies; 13+ messages in thread
From: Jue Wang @ 2022-05-12 18:01 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

On Wed, May 11, 2022 at 10:55 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Apr 12, 2022, Jue Wang wrote:
> > This feature is only enabled when the vCPU has opted in to enable
> > MCG_CMCI_P.
>
> Again, waaaay too terse.  What is CMCI?  What does "support" mean since an astute
> reader will notice that it's impossible for MCG_CMCI_P to be set.  How/when will
> the vCPU (which is wrong no?  doesn't userspace do the write?) be able to opt-in?
I am rewriting the change log for every patch in this series and will
send an updated V3.
>
> > Signed-off-by: Jue Wang <juew@google.com>
> > ---
> >  arch/x86/kvm/lapic.c | 33 ++++++++++++++++++++++++++-------
> >  arch/x86/kvm/lapic.h |  7 ++++++-
> >  2 files changed, 32 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 2c770e4c0e6c..0b370ccd11a1 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -27,6 +27,7 @@
> >  #include <linux/math64.h>
> >  #include <linux/slab.h>
> >  #include <asm/processor.h>
> > +#include <asm/mce.h>
> >  #include <asm/msr.h>
> >  #include <asm/page.h>
> >  #include <asm/current.h>
> > @@ -364,9 +365,14 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
> >       return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
> >  }
> >
> > +static inline bool kvm_is_cmci_supported(struct kvm_vcpu *vcpu)
> > +{
> > +     return vcpu->arch.mcg_cap & MCG_CMCI_P;
> > +}
> > +
> >  static inline int kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
>
> I think it makes sense to take @apic here, not @vcpu, since this is an APIC-specific
> helper.  kvm_apic_set_version() will need to be modified to not call
> kvm_apic_get_nr_lvt_entries() until after it has verified the local APIC is in-kernel,
> but IMO that's a good thing.
Ack.
>
> >  {
> > -     return KVM_APIC_MAX_NR_LVT_ENTRIES;
> > +     return KVM_APIC_MAX_NR_LVT_ENTRIES - !kvm_is_cmci_supported(vcpu);
> >  }
> >
> >  void kvm_apic_set_version(struct kvm_vcpu *vcpu)


* Re: [PATCH v2 3/4] KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs.
  2022-05-11 19:00   ` Sean Christopherson
@ 2022-05-13  4:45     ` Jue Wang
  0 siblings, 0 replies; 13+ messages in thread
From: Jue Wang @ 2022-05-13  4:45 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

Thanks a lot, Sean.

On Wed, May 11, 2022 at 12:00 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Apr 12, 2022, Jue Wang wrote:
> > Note the support of CMCI (MCG_CMCI_P) is not enabled in
> > kvm_mce_cap_supported yet.
>
> You can probably guess what I would say here :-)  A reader should not have to go
> wade through Intel's SDM to understand the relationship between CTL2 and CMCI.

I included details in the V3 changelog about the CTL2 MSRs and the
MCG_CMCI_P capability bit. I also rewrote the changelogs so they spell out
UCNA and CMCI in full and provide the complete context of this patch series.


>
> > Signed-off-by: Jue Wang <juew@google.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  1 +
> >  arch/x86/kvm/x86.c              | 50 +++++++++++++++++++++++++--------
> >  2 files changed, 40 insertions(+), 11 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index ec9830d2aabf..639ef92d01d1 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -800,6 +800,7 @@ struct kvm_vcpu_arch {
> >       u64 mcg_ctl;
> >       u64 mcg_ext_ctl;
> >       u64 *mce_banks;
> > +     u64 *mci_ctl2_banks;
> >
> >       /* Cache MMIO info */
> >       u64 mmio_gva;
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index eb4029660bd9..73c64d2b9e60 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3167,6 +3167,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >       unsigned bank_num = mcg_cap & 0xff;
> >       u32 msr = msr_info->index;
> >       u64 data = msr_info->data;
> > +     u32 offset;
> >
> >       switch (msr) {
> >       case MSR_IA32_MCG_STATUS:
> > @@ -3180,10 +3181,22 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >                       return 1;
> >               vcpu->arch.mcg_ctl = data;
> >               break;
> > +     case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>
> This is wrong.  Well, incomplete.  It will allow writes to MSRs that do not exist,
> i.e. if bank_num <= offset < KVM_MAX_MCE_BANKS.
>
> I believe this will suffice and is also the simplest?
>
>                 if (msr >= MSR_IA32_MCx_CTL2(bank_num))
>                         return 1;
>
> Actually, tying in with the array_index_nospec() comments below, if this captures
> the last MSR in a variable, then the overrun is much more reasonable (sample idea
> at the bottom).
>
> > +             if (!(mcg_cap & MCG_CMCI_P) &&
> > +                             (data || !msr_info->host_initiated))
>
> Funky indentation.  Should be:
>
>                 if (!(mcg_cap & MCG_CMCI_P) &&
>                     (data || !msr_info->host_initiated))
>                         return 1;

Updated; I need to further fix my VIM configs.

>
> > +                     return 1;
> > +             /* An attempt to write a 1 to a reserved bit raises #GP */
> > +             if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
> > +                     return 1;
> > +             offset = array_index_nospec(
> > +                             msr - MSR_IA32_MC0_CTL2,
> > +                             MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
>
> My preference would be to let this run over (by a fair amount)
>
>                 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
>                                             MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
>
> But if we use a local variable, there's no overrun:
>
>         case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>                 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
>                 if (msr > last_msr)
>                         return 1;
>
>                 if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
>                         return 1;
>                 /* An attempt to write a 1 to a reserved bit raises #GP */
>                 if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
>                         return 1;
>                 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
>                                             last_msr + 1 - MSR_IA32_MC0_CTL2);
>                 vcpu->arch.mci_ctl2_banks[offset] = data;
>                 break;
>
Updated to use this version.
>
> And if we go that route, in a follow-up patch at the end of the series, clean up
> the "default" path to hoist the if-statement into a proper case statement (unless
> I've misread the code), e.g. with some opportunistic cleanup:
>
>         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
>                 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
>                 if (msr > last_msr)
>                         return 1;
>
>                 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
>                                             last_msr + 1 - MSR_IA32_MC0_CTL);
>
>                 /*
>                  * Only 0 or all 1s can be written to IA32_MCi_CTL, though some
>                  * Linux kernels clear bit 10 in bank 4 to work around a BIOS/GART
>                  * TLB issue on AMD K8s, ignore this to avoid an uncaught #GP in
>                  * the guest.
>                  */
>                 if ((offset & 0x3) == 0 &&
>                     data != 0 && (data | (1 << 10)) != ~(u64)0)
>                         return -1;
>
>                 /* MCi_STATUS */
>                 if (!msr_info->host_initiated && (offset & 0x3) == 1 &&
>                     data != 0 && !can_set_mci_status(vcpu))
>                         return -1;
>
>                 vcpu->arch.mce_banks[offset] = data;
>                 break;
>
> Actually, rereading that, isn't "return -1" wrong?  That will cause kvm_emulate_wrmsr()
> to exit to userspace, not inject a #GP.
>
> *sigh*  Yep, indeed, the -1 gets interpreted as -EPERM and kills the guest.
>
> Scratch the idea of doing the above on top, I'll send separate patches and a KUT
> testcase.  I'll Cc you on the patches, it'd save Paolo a merge conflict (or you a
> rebase) if this series is based on top of that bugfix + cleanup.

I will base this series on top of the bugfix + cleanup series.
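
To make the 1 vs. -1 distinction above concrete, the convention amounts
to the sketch below; the enum and helper are made up for illustration,
only the convention itself is what the MSR emulation paths implement:

	/*
	 * Illustration only, not KVM code: how a WRMSR emulation helper's
	 * return value is interpreted by its caller.
	 */
	enum wrmsr_outcome {
		WRMSR_DONE,		/*  0: write emulated successfully */
		WRMSR_INJECT_GP,	/* >0, e.g. "return 1": guest gets #GP(0) */
		WRMSR_EXIT_TO_USER,	/* <0, e.g. "return -1" == -EPERM: error to userspace */
	};

	static enum wrmsr_outcome classify_wrmsr_result(int r)
	{
		if (r == 0)
			return WRMSR_DONE;
		if (r > 0)
			return WRMSR_INJECT_GP;
		return WRMSR_EXIT_TO_USER;
	}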

>
> > +             vcpu->arch.mci_ctl2_banks[offset] = data;
> > +             break;
> >       default:
> >               if (msr >= MSR_IA32_MC0_CTL &&
> >                   msr < MSR_IA32_MCx_CTL(bank_num)) {
> > -                     u32 offset = array_index_nospec(
> > +                     offset = array_index_nospec(
> >                               msr - MSR_IA32_MC0_CTL,
> >                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
> >
> > @@ -3489,7 +3502,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >                       return 1;
> >               }
> >               break;
> > -     case 0x200 ... 0x2ff:
> > +     case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> > +     case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
> >               return kvm_mtrr_set_msr(vcpu, msr, data);
> >       case MSR_IA32_APICBASE:
> >               return kvm_set_apic_base(vcpu, msr_info);
> > @@ -3646,6 +3660,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >       case MSR_IA32_MCG_CTL:
> >       case MSR_IA32_MCG_STATUS:
> >       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> > +     case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> >               return set_msr_mce(vcpu, msr_info);
> >
> >       case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> > @@ -3750,6 +3765,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> >       u64 data;
> >       u64 mcg_cap = vcpu->arch.mcg_cap;
> >       unsigned bank_num = mcg_cap & 0xff;
> > +     u32 offset;
> >
> >       switch (msr) {
> >       case MSR_IA32_P5_MC_ADDR:
> > @@ -3767,10 +3783,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> >       case MSR_IA32_MCG_STATUS:
> >               data = vcpu->arch.mcg_status;
> >               break;
> > +     case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>

> Same comments as the WRMSR path.  I'll also handle the "default" case in my cleanup.

Updated similarly to the WRMSR path. Thanks again; I will base V3 on
the cleanup + fix series.
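
For reference, the RDMSR side will presumably mirror the WRMSR snippet
above, roughly (a sketch, not the final V3 patch):

	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
		last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
		if (msr > last_msr)
			return 1;

		if (!(mcg_cap & MCG_CMCI_P) && !host)
			return 1;
		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
					    last_msr + 1 - MSR_IA32_MC0_CTL2);
		data = vcpu->arch.mci_ctl2_banks[offset];
		break;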

>
> > +             if (!(mcg_cap & MCG_CMCI_P) && !host)
> > +                     return 1;
> > +             offset = array_index_nospec(
> > +                             msr - MSR_IA32_MC0_CTL2,
> > +                             MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> > +             data = vcpu->arch.mci_ctl2_banks[offset];
> > +             break;
> >       default:
> >               if (msr >= MSR_IA32_MC0_CTL &&
> >                   msr < MSR_IA32_MCx_CTL(bank_num)) {
> > -                     u32 offset = array_index_nospec(
> > +                     offset = array_index_nospec(
> >                               msr - MSR_IA32_MC0_CTL,
> >                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
> >
>
> ...
>
> > @@ -11126,9 +11152,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> >               goto fail_free_lapic;
> >       vcpu->arch.pio_data = page_address(page);
> >
> > -     vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
> > +     vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
> > +                                    GFP_KERNEL_ACCOUNT);
>
> Switching to kcalloc() should be a separate patch.

Done.

>
> > +     vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
> >                                      GFP_KERNEL_ACCOUNT);
> > -     if (!vcpu->arch.mce_banks)
> > +     if (!vcpu->arch.mce_banks | !vcpu->arch.mci_ctl2_banks)
>
> This wants to be a logical-OR, not a bitwise-OR.
>
> Oh, and vcpu->arch.mci_ctl2_banks needs to be freed if a later step fails.

Thanks for the catch, fixed in V3.
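
Concretely, the allocation and error handling should end up along these
lines (a sketch of the intended fix, not the final V3 patch; it assumes
the existing fail_free_mce_banks label is extended to also free the new
array, which is safe because kfree(NULL) is a no-op):

	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
				       GFP_KERNEL_ACCOUNT);
	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
					    GFP_KERNEL_ACCOUNT);
	if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
		goto fail_free_mce_banks;
	...
fail_free_mce_banks:
	kfree(vcpu->arch.mce_banks);
	kfree(vcpu->arch.mci_ctl2_banks);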

>
> >               goto fail_free_pio_data;
> >       vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
> >
> > --
> > 2.35.1.1178.g4f1659d476-goog
> >

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 4/4] KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs.
  2022-05-11 19:20   ` Sean Christopherson
@ 2022-05-13  4:54     ` Jue Wang
  0 siblings, 0 replies; 13+ messages in thread
From: Jue Wang @ 2022-05-13  4:54 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson,
	Joerg Roedel, Tony Luck, kvm

On Wed, May 11, 2022 at 12:20 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Apr 12, 2022, Jue Wang wrote:
> > Note prior to this patch, the UCNA type of signaling can already be
> > processed by kvm_vcpu_ioctl_x86_set_mce and does not result in correct
> > CMCI signaling semantic.
>
> Same as before...
>
> UCNA should be spelled out at least once.

Rewrote the changelog to include the full context of this change and
the full-text definitions of UCNA and CMCI.

>
> >
> > Signed-off-by: Jue Wang <juew@google.com>
> > ---
> >  arch/x86/kvm/vmx/vmx.c |  1 +
> >  arch/x86/kvm/x86.c     | 48 ++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 49 insertions(+)
> >
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index b730d799c26e..63aa2b3d30ca 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
> >       }
> >
> >       kvm_mce_cap_supported |= MCG_LMCE_P;
> > +     kvm_mce_cap_supported |= MCG_CMCI_P;
>
> Is there really no hardware dependency on CMCI?  Honest question.  If not, that
> should be explicitly called out in the changelog.

CMCI emulation does not depend on hardware; it only depends on the
vCPU's LAPIC being available.

Updated the change log.
>
> >       if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
> >               return -EINVAL;
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 73c64d2b9e60..eb6058ca1e70 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -4775,6 +4775,50 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> >       return r;
> >  }
> >
> > +static bool is_ucna(u64 mcg_status, u64 mci_status)
>
> Any reason not to take 'struct kvm_x86_mce *mce'?

Updated.

>
> > +{
> > +     return !mcg_status &&
> > +             !(mci_status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR));
>
> As someone who knows nothing about MCI encoding, can you add a function comment
> explaining, in detail, what's going on here?
>
> Also, my preference is to align the indentation on multi-line returns.  Paolo scoffs
> at this nit of mine, but he's obviously wrong ;-)

Added function comments in V3 and updated alignment.
>
>         return !mcg_status &&
>                !(mci_status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR));
>
> > +}
> > +
> > +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> > +             struct kvm_x86_mce *mce)
>
> Please align the params.  Actually, just let it run over, it's a single char.

Done.
>
> static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce)
>
> > +{
> > +     u64 mcg_cap = vcpu->arch.mcg_cap;
> > +     unsigned int bank_num = mcg_cap & 0xff;
> > +     u64 *banks = vcpu->arch.mce_banks;
> > +
> > +     /* Check for legal bank number in guest */
>
> Eh, don't think this warrants a comment.

Removed.
>
> > +     if (mce->bank >= bank_num)
> > +             return -EINVAL;
> > +
> > +     /*
> > +      * UCNA signals should not set bits that are only used for machine check
> > +      * exceptions.
> > +      */
> > +     if (mce->mcg_status ||
> > +             (mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)))
>
> Unless mine eyes deceive me, this is the same as:
>
>         if (!is_ucna(mce->mcg_status, mce->status))
>

Good catch, also folded the MCI_STATUS_VAL and MCI_STATUS_UC checks
into is_ucna.
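
For reference, the folded-in helper will presumably look something like
this (a sketch, not the final V3 code; the comment paraphrases the SDM's
definition of a UCNA):

	/*
	 * A UCNA (UnCorrectable No Action required) error is an uncorrected
	 * error (VAL=1, UC=1) that is not signaled via #MC and requires no
	 * recovery action, i.e. the "signaled" (S), "action required" (AR)
	 * and "processor context corrupt" (PCC) bits are all clear and
	 * MCG_STATUS carries no exception state.
	 */
	static bool is_ucna(struct kvm_x86_mce *mce)
	{
		return	!mce->mcg_status &&
			!(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
			(mce->status & MCI_STATUS_VAL) &&
			(mce->status & MCI_STATUS_UC);
	}
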
> > +             return -EINVAL;
> > +
> > +     /* UCNA must have VAL and UC bits set */
> > +     if (!(mce->status & MCI_STATUS_VAL) || !(mce->status & MCI_STATUS_UC))
> > +             return -EINVAL;
> > +
> > +     banks += 4 * mce->bank;
> > +     banks[1] = mce->status;
> > +     banks[2] = mce->addr;
> > +     banks[3] = mce->misc;
> > +     vcpu->arch.mcg_status = mce->mcg_status;
> > +
> > +     if (!(mcg_cap & MCG_CMCI_P) || !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
>
> This one's worth wrapping, that's quite a long line, and there's a natural split point:
>
>         if (!(mcg_cap & MCG_CMCI_P) ||
>             !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
>                 return 0;
>
>
Done.
> > +             return 0;
> > +
> > +     if (lapic_in_kernel(vcpu))
> > +             kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> > +
> > +     return 0;
> > +}
> > +
> >  static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> >                                     struct kvm_x86_mce *mce)
> >  {
> > @@ -4784,6 +4828,10 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> >
> >       if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
> >               return -EINVAL;
> > +
> > +     if (is_ucna(mce->mcg_status, mce->status))
> > +             return kvm_vcpu_x86_set_ucna(vcpu, mce);
> > +
> >       /*
> >        * if IA32_MCG_CTL is not all 1s, the uncorrected error
> >        * reporting is disabled
> > --
> > 2.35.1.1178.g4f1659d476-goog
> >

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-05-13  4:54 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-12 22:31 [PATCH v2 0/4] KVM: x86: Add support of CMCI signaling and UCNA Jue Wang
2022-04-12 22:31 ` [PATCH v2 1/4] KVM: x86: Clean up KVM APIC LVT logic Jue Wang
2022-05-11 17:38   ` Sean Christopherson
2022-05-12 17:57     ` Jue Wang
2022-04-12 22:31 ` [PATCH v2 2/4] KVM: x86: Add LVTCMCI support Jue Wang
2022-05-11 17:55   ` Sean Christopherson
2022-05-12 18:01     ` Jue Wang
2022-04-12 22:31 ` [PATCH v2 3/4] KVM: x86: Add support for MSR_IA32_MCx_CTL2 MSRs Jue Wang
2022-05-11 19:00   ` Sean Christopherson
2022-05-13  4:45     ` Jue Wang
2022-04-12 22:31 ` [PATCH v2 4/4] KVM: x86: Add support for MCG_CMCI_P and handling of injected UCNAs Jue Wang
2022-05-11 19:20   ` Sean Christopherson
2022-05-13  4:54     ` Jue Wang
