Re: [PATCH] KVM: x86: Fix lost interrupt on irr_pending race

From: "Radim Krčmář" <rkrcmar@redhat.com>
To: Paolo Bonzini <pbonzini@redhat.com>
Cc: Nadav Amit <namit@cs.technion.ac.il>,
	kvm@vger.kernel.org, wanpeng.li@linux.intel.com,
	nadav.amit@gmail.com
Subject: Re: [PATCH] KVM: x86: Fix lost interrupt on irr_pending race
Date: Tue, 18 Nov 2014 22:31:36 +0100	[thread overview]
Message-ID: <20141118213136.GB22235@potion.brq.redhat.com> (raw)
In-Reply-To: <546BA328.9010009@redhat.com>

2014-11-18 20:51+0100, Paolo Bonzini:
> On 16/11/2014 22:49, Nadav Amit wrote:
> > @@ -374,13 +378,15 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
> > +		apic->irr_pending = false;
> > +		apic_clear_vector(vec, apic->regs + APIC_IRR);
> > +		if (apic_search_irr(apic) != -1)
> > +			apic->irr_pending = true;
> >  	}
> >  }
> 
> This is even more tricky than it looks like. :)
> 
> No one can concurrently look at apic->irr_pending while it is false, in
> particular apic_sync_pv_eoi_to_guest cannot enable PV EOI by mistake
> just because it sees a false irr_pending.  So it's okay if it is first
> set to false and then to true.

Yeah, using 'atomic_t irr_count' instead seems less error-prone to me;
and it would usually end in temporary unpublished-patches tree, but you
can take a look at the idea:

---8<---
KVM: x86: convert irr_pending to atomic irr_count

We've already had a buggy race with it ... atomic_t is simple to grasp
and harder to misuse, so we can switch without losing much performance.
(Read is normal and clear_irr does not parse APIC_IRR in exchange.
 We inflate lapic_struct by 3 bytes.)

Only two races remain now, which is the minimum without a proper lock,
atomic_t also makes their existence obvious on every use.

/__apic_test_and.*()/ aren't atomic, so we have to introduce new ones.
---
 arch/x86/kvm/lapic.c | 48 ++++++++++++++++++++++++++++--------------------
 arch/x86/kvm/lapic.h |  4 ++--
 2 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e0e5642..e3169c5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -102,6 +102,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)
 	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+static inline int apic_test_and_set_vector(int vec, void *bitmap)
+{
+	return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int apic_test_and_clear_vector(int vec, void *bitmap)
+{
+	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static inline int __apic_test_and_set_vector(int vec, void *bitmap)
 {
 	return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -341,12 +351,11 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
 static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
 {
-	apic_set_vector(vec, apic->regs + APIC_IRR);
-	/*
-	 * irr_pending must be true if any interrupt is pending; set it after
-	 * APIC_IRR to avoid race with apic_clear_irr
-	 */
-	apic->irr_pending = true;
+	if (apic_test_and_set_vector(vec, apic->regs + APIC_IRR))
+		return;
+
+	if (likely(!kvm_apic_vid_enabled(vcpu->kvm)))
+		atomic_inc(&apic->irr_count);
 }
 
 static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -359,10 +368,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 	int result;
 
 	/*
-	 * Note that irr_pending is just a hint. It will be always
-	 * true with virtual interrupt delivery enabled.
+	 * Note that irr_count is just a hint. It will be always
+	 * nonzero with virtual interrupt delivery enabled.
 	 */
-	if (!apic->irr_pending)
+	if (!atomic_read(&apic->irr_count))
 		return -1;
 
 	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
@@ -376,18 +385,16 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
 {
 	struct kvm_vcpu *vcpu;
 
+	if(!apic_test_and_clear_vector(vec, apic->regs + APIC_IRR))
+		return;
+
 	vcpu = apic->vcpu;
 
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
+	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
 		/* try to update RVI */
-		apic_clear_vector(vec, apic->regs + APIC_IRR);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
-	} else {
-		apic->irr_pending = false;
-		apic_clear_vector(vec, apic->regs + APIC_IRR);
-		if (apic_search_irr(apic) != -1)
-			apic->irr_pending = true;
-	}
+	else
+		atomic_dec(&apic->irr_count);
 }
 
 static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
@@ -1522,7 +1529,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
-	apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+	atomic_set(&apic->irr_count, kvm_apic_vid_enabled(vcpu->kvm));
 	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
 	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
@@ -1732,7 +1739,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	hrtimer_cancel(&apic->lapic_timer.timer);
 	update_divide_count(apic);
 	start_apic_timer(apic);
-	apic->irr_pending = true;
+	atomic_set(&apic->irr_count, kvm_apic_vid_enabled(vcpu->kvm) ?
+				1 : count_vectors(apic->regs + APIC_IRR));
 	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
 				1 : count_vectors(apic->regs + APIC_ISR);
 	apic->highest_isr_cache = -1;
@@ -1820,7 +1828,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
 {
 	if (!pv_eoi_enabled(vcpu) ||
 	    /* IRR set or many bits in ISR: could be nested. */
-	    apic->irr_pending ||
+	    atomic_read(&apic->irr_count) ||
 	    /* Cache not set: could be safe but we don't bother. */
 	    apic->highest_isr_cache == -1 ||
 	    /* Need EOI to update ioapic. */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index d4365f2..a3f533b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -24,8 +24,8 @@ struct kvm_lapic {
 	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	bool sw_enabled;
-	bool irr_pending;
-	/* Number of bits set in ISR. */
+	/* Number of bits set in IRR and ISR. */
+	atomic_t irr_count;
 	s16 isr_count;
 	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
 	int highest_isr_cache;
-- 
2.1.0