All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
@ 2015-05-13  1:47 Steve Rutherford
  2015-05-13  1:47 ` [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs Steve Rutherford
                   ` (3 more replies)
  0 siblings, 4 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13  1:47 UTC (permalink / raw)
  To: kvm; +Cc: ahonig

First patch in a series which enables the relocation of the
PIC/IOAPIC/PIT to userspace.

Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.

KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
of the irqchip.

Compile tested for x86.

Signed-off-by: Steve Rutherford <srutherford@google.com>
Suggested-by: Andrew Honig <ahonig@google.com>
---
 Documentation/virtual/kvm/api.txt | 15 ++++++++++++
 arch/powerpc/kvm/irq.h            |  5 ++++
 arch/s390/kvm/irq.h               |  4 ++++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/assigned-dev.c       |  4 ++--
 arch/x86/kvm/irq.c                |  6 ++---
 arch/x86/kvm/irq.h                | 11 +++++++++
 arch/x86/kvm/irq_comm.c           |  7 ++++++
 arch/x86/kvm/lapic.c              | 13 +++++++----
 arch/x86/kvm/mmu.c                |  2 +-
 arch/x86/kvm/svm.c                |  4 ++--
 arch/x86/kvm/vmx.c                | 12 +++++-----
 arch/x86/kvm/x86.c                | 49 +++++++++++++++++++++++++++------------
 include/kvm/arm_vgic.h            |  1 +
 include/linux/kvm_host.h          |  1 +
 include/uapi/linux/kvm.h          |  3 +++
 virt/kvm/irqchip.c                |  2 +-
 17 files changed, 106 insertions(+), 35 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6955444..0744b4e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2979,6 +2979,21 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
 which is the maximum number of possibly pending cpu-local interrupts.
 
+4.96 KVM_SPLIT_IRQCHIP
+
+Capability: KVM_CAP_SPLIT_IRQCHIP
+Architectures: x86
+Type:  VM ioctl
+Parameters: None
+Returns: 0 on success, -1 on error
+
+Create a local apic for each processor in the kernel.  This differs from
+KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates neither
+the ioapic nor the pic in the kernel. Also, enables in kernel routing of
+interrupt requests. Fails if VCPU has already been created, or if the irqchip is
+already in the kernel.
+
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
index 5a9a10b..5e6fa06 100644
--- a/arch/powerpc/kvm/irq.h
+++ b/arch/powerpc/kvm/irq.h
@@ -17,4 +17,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 	return ret;
 }
 
+static inline int lapic_in_kernel(struct kvm *kvm)
+{
+	return irqchip_in_kernel(kvm);
+}
+
 #endif
diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
index d98e415..db876c3 100644
--- a/arch/s390/kvm/irq.h
+++ b/arch/s390/kvm/irq.h
@@ -19,4 +19,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 	return 1;
 }
 
+static inline int lapic_in_kernel(struct kvm *kvm)
+{
+	return irqchip_in_kernel(kvm);
+}
 #endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bbb8f4e..3ddc134 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -638,6 +638,8 @@ struct kvm_arch {
 	bool boot_vcpu_runs_old_kvmclock;
 
 	u64 disabled_quirks;
+
+	bool irqchip_split;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index d090ecf..1237e92 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -291,7 +291,7 @@ static int kvm_deassign_irq(struct kvm *kvm,
 {
 	unsigned long guest_irq_type, host_irq_type;
 
-	if (!irqchip_in_kernel(kvm))
+	if (!lapic_in_kernel(kvm))
 		return -EINVAL;
 	/* no irq assignment to deassign */
 	if (!assigned_dev->irq_requested_type)
@@ -568,7 +568,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 	struct kvm_assigned_dev_kernel *match;
 	unsigned long host_irq_type, guest_irq_type;
 
-	if (!irqchip_in_kernel(kvm))
+	if (!lapic_in_kernel(kvm))
 		return r;
 
 	mutex_lock(&kvm->lock);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a1ec6a50..706e47a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
  */
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v->kvm))
 		return v->arch.interrupt.pending;
 
 	if (kvm_cpu_has_extint(v))
@@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
  */
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v->kvm))
 		return v->arch.interrupt.pending;
 
 	if (kvm_cpu_has_extint(v))
@@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 {
 	int vector;
 
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v->kvm))
 		return v->arch.interrupt.nr;
 
 	vector = kvm_cpu_get_extint(v);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ad68c73..e46abf3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -92,6 +92,17 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 	return ret;
 }
 
+static inline int irqchip_split(struct kvm *kvm)
+{
+	return kvm->arch.irqchip_split;
+}
+
+static inline int lapic_in_kernel(struct kvm *kvm)
+{
+	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
+}
+
+
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 9efff9e..f43c59a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
 	return kvm_set_irq_routing(kvm, default_routing,
 				   ARRAY_SIZE(default_routing), 0);
 }
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dc5b57b..bc392a6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -209,7 +209,8 @@ out:
 	if (old)
 		kfree_rcu(old, rcu);
 
-	kvm_vcpu_request_scan_ioapic(kvm);
+	if (!irqchip_split(kvm))
+		kvm_vcpu_request_scan_ioapic(kvm);
 }
 
 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -1819,7 +1820,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 				apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	kvm_rtc_eoi_tracking_restore_one(vcpu);
+	if (!irqchip_split(vcpu->kvm))
+		kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1902,7 +1904,8 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
 	    /* Cache not set: could be safe but we don't bother. */
 	    apic->highest_isr_cache == -1 ||
 	    /* Need EOI to update ioapic. */
-	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache) ||
+	    irqchip_split(vcpu->kvm)) {
 		/*
 		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
 		 * so we need not do anything here.
@@ -1958,7 +1961,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 reg = (msr - APIC_BASE_MSR) << 4;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
 		return 1;
 
 	if (reg == APIC_ICR2)
@@ -1975,7 +1978,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
 		return 1;
 
 	if (reg == APIC_DFR || reg == APIC_ICR2) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b78e83f..e5bf6db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3465,7 +3465,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 
 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 {
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
 		     kvm_event_needs_reinjection(vcpu)))
 		return false;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8954d7a..7ac9ec2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3054,7 +3054,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
 	r = cr_interception(svm);
-	if (irqchip_in_kernel(svm->vcpu.kvm))
+	if (lapic_in_kernel(svm->vcpu.kvm))
 		return r;
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
 		return r;
@@ -3295,7 +3295,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
+	if (!lapic_in_kernel(svm->vcpu.kvm) &&
 	    kvm_run->request_interrupt_window &&
 	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bcb61b0..a53747f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -948,7 +948,7 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
 
 static inline bool vm_need_tpr_shadow(struct kvm *kvm)
 {
-	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
 }
 
 static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -1064,7 +1064,7 @@ static inline bool cpu_has_vmx_ple(void)
 
 static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
-	return flexpriority_enabled && irqchip_in_kernel(kvm);
+	return flexpriority_enabled && lapic_in_kernel(kvm);
 }
 
 static inline bool cpu_has_vmx_vpid(void)
@@ -4341,7 +4341,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
 
 static int vmx_vm_has_apicv(struct kvm *kvm)
 {
-	return enable_apicv && irqchip_in_kernel(kvm);
+	return enable_apicv && lapic_in_kernel(kvm);
 }
 
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -5317,7 +5317,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 				u8 cr8 = (u8)val;
 				err = kvm_set_cr8(vcpu, cr8);
 				kvm_complete_insn_gp(vcpu, err);
-				if (irqchip_in_kernel(vcpu->kvm))
+				if (lapic_in_kernel(vcpu->kvm))
 					return 1;
 				if (cr8_prev <= cr8)
 					return 1;
@@ -5534,7 +5534,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (!irqchip_in_kernel(vcpu->kvm) &&
+	if (!lapic_in_kernel(vcpu->kvm) &&
 	    vcpu->run->request_interrupt_window &&
 	    !kvm_cpu_has_interrupt(vcpu)) {
 		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
@@ -9419,7 +9419,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
 	 */
-	vm_entry_controls_init(vmx, 
+	vm_entry_controls_init(vmx,
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
 			~VM_ENTRY_IA32E_MODE) |
 		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cde5d61..7505b39 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -782,7 +782,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	if (cr8 & CR8_RESERVED_BITS)
 		return 1;
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		kvm_lapic_set_tpr(vcpu, cr8);
 	else
 		vcpu->arch.cr8 = cr8;
@@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 {
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		return kvm_lapic_get_cr8(vcpu);
 	else
 		return vcpu->arch.cr8;
@@ -2800,6 +2800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_TSC_DEADLINE_TIMER:
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_DISABLE_QUIRKS:
+	case KVM_CAP_SPLIT_IRQCHIP:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
@@ -3002,7 +3003,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 {
 	if (irq->irq >= KVM_NR_INTERRUPTS)
 		return -EINVAL;
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		return -ENXIO;
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
@@ -3480,7 +3481,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		struct kvm_vapic_addr va;
 
 		r = -EINVAL;
-		if (!irqchip_in_kernel(vcpu->kvm))
+		if (!lapic_in_kernel(vcpu->kvm))
 			goto out;
 		r = -EFAULT;
 		if (copy_from_user(&va, argp, sizeof va))
@@ -3838,7 +3839,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 			bool line_status)
 {
-	if (!irqchip_in_kernel(kvm))
+	if (!lapic_in_kernel(kvm))
 		return -ENXIO;
 
 	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
@@ -4128,6 +4129,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
 		break;
 	}
+	case KVM_CREATE_SPLIT_IRQCHIP: {
+		mutex_lock(&kvm->lock);
+		r = -EEXIST;
+		if (lapic_in_kernel(kvm))
+			goto split_irqchip_unlock;
+		r = -EINVAL;
+		if (atomic_read(&kvm->online_vcpus))
+			goto split_irqchip_unlock;
+		r = kvm_setup_empty_irq_routing(kvm);
+		if (r)
+			goto split_irqchip_unlock;
+		kvm->arch.irqchip_split = true;
+		r = 0;
+split_irqchip_unlock:
+		mutex_unlock(&kvm->lock);
+		break;
+	}
+
 	default:
 		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
 	}
@@ -5893,7 +5912,7 @@ void kvm_arch_exit(void)
 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.halt_exits;
-	if (irqchip_in_kernel(vcpu->kvm)) {
+	if (lapic_in_kernel(vcpu->kvm)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
 		return 1;
 	} else {
@@ -6060,7 +6079,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
  */
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 {
-	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
+	return (!lapic_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
 		vcpu->run->request_interrupt_window &&
 		kvm_arch_interrupt_allowed(vcpu));
 }
@@ -6072,7 +6091,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu->kvm))
 		kvm_run->ready_for_interrupt_injection = 1;
 	else
 		kvm_run->ready_for_interrupt_injection =
@@ -6219,7 +6238,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
 	struct page *page = NULL;
 
-	if (!irqchip_in_kernel(vcpu->kvm))
+	if (!lapic_in_kernel(vcpu->kvm))
 		return;
 
 	if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6255,7 +6274,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
 	bool req_immediate_exit = false;
 
@@ -6644,7 +6663,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	/* re-sync apic's tpr */
-	if (!irqchip_in_kernel(vcpu->kvm)) {
+	if (!lapic_in_kernel(vcpu->kvm)) {
 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
 			r = -EINVAL;
 			goto out;
@@ -7340,7 +7359,7 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 {
-	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
 }
 
 struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7356,7 +7375,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pv.pv_unhalted = false;
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
 		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -7374,7 +7393,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto fail_free_pio_data;
 
-	if (irqchip_in_kernel(kvm)) {
+	if (lapic_in_kernel(kvm)) {
 		r = kvm_create_lapic(vcpu);
 		if (r < 0)
 			goto fail_mmu_destroy;
@@ -7437,7 +7456,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	kvm_mmu_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
-	if (!irqchip_in_kernel(vcpu->kvm))
+	if (!lapic_in_kernel(vcpu->kvm))
 		static_key_slow_dec(&kvm_no_apic_vcpu);
 }
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 133ea00..ffe1f4e 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -329,6 +329,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
+#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
 #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
 #define vgic_ready(k)		((k)->arch.vgic.ready)
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 87fd74a..277b7a1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -934,6 +934,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 #endif
 
 int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_setup_empty_irq_routing(struct kvm *kvm);
 int kvm_set_irq_routing(struct kvm *kvm,
 			const struct kvm_irq_routing_entry *entries,
 			unsigned nr,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 75bd9f7..7d06dc4 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_IRQ_STATE 114
 #define KVM_CAP_PPC_HWRNG 115
 #define KVM_CAP_DISABLE_QUIRKS 116
+#define KVM_CAP_SPLIT_IRQCHIP 117
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_S390_IRQ_STATE */
 #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
 #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
+/* Available with KVM_CAP_SPLIT_IRQCHIP */
+#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 1d56a90..8aaceed 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -73,7 +73,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
 {
 	struct kvm_kernel_irq_routing_entry route;
 
-	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+	if (!lapic_in_kernel(kvm) || msi->flags != 0)
 		return -EINVAL;
 
 	route.msi.address_lo = msi->address_lo;
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs
  2015-05-13  1:47 [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Steve Rutherford
@ 2015-05-13  1:47 ` Steve Rutherford
  2015-05-13  7:35   ` Paolo Bonzini
  2015-05-24 16:46   ` Avi Kivity
  2015-05-13  1:47 ` [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference Steve Rutherford
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13  1:47 UTC (permalink / raw)
  To: kvm; +Cc: ahonig

Adds KVM_EXIT_IOAPIC_EOI which passes the interrupt vector up to
userspace.

Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs
to be informed (which is identical to the EOI_EXIT_BITMAP field used
by modern x86 processors, but can also be used to elide kvm IOAPIC EOI
exits on older processors).

[Note: A prototype using ResampleFDs found that decoupling the EOI
from the VCPU's thread made it possible for the VCPU to not see a
recent EOI after reentering the guest. This does not match real
hardware.]

Compile tested for Intel x86.

Signed-off-by: Steve Rutherford <srutherford@google.com>
---
 Documentation/virtual/kvm/api.txt | 10 ++++++++++
 arch/x86/include/asm/kvm_host.h   |  3 +++
 arch/x86/kvm/lapic.c              |  9 +++++++++
 arch/x86/kvm/x86.c                | 11 +++++++++++
 include/linux/kvm_host.h          |  1 +
 include/uapi/linux/kvm.h          |  5 +++++
 6 files changed, 39 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 0744b4e..dd92996 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3285,6 +3285,16 @@ Valid values for 'type' are:
 	 */
 	__u64 kvm_valid_regs;
 	__u64 kvm_dirty_regs;
+
+	/* KVM_EXIT_IOAPIC_EOI */
+        struct {
+	       __u8 vector;
+        } eoi;
+
+Indicates that an eoi of a level triggered IOAPIC interrupt on vector has
+occurred, which should be handled by the userspace IOAPIC. Triggers when
+the Irqchip has been split between userspace and the kernel.
+
 	union {
 		struct kvm_sync_regs regs;
 		char padding[1024];
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3ddc134..b1978f1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -539,6 +539,9 @@ struct kvm_vcpu_arch {
 	struct {
 		bool pv_unhalted;
 	} pv;
+
+	u64 eoi_exit_bitmaps[4];
+	int pending_ioapic_eoi;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bc392a6..42fada6f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -860,6 +860,15 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 
 static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 {
+	if (irqchip_split(apic->vcpu->kvm)) {
+		if (test_bit(vector,
+			     (void *) apic->vcpu->arch.eoi_exit_bitmaps)) {
+			apic->vcpu->arch.pending_ioapic_eoi = vector;
+			kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+		}
+		return;
+	}
+
 	if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
 		int trigger_mode;
 		if (apic_test_vector(vector, apic->regs + APIC_TMR))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7505b39..cc27c35 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6324,6 +6324,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_handle_pmu_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
 			kvm_deliver_pmi(vcpu);
+		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+			if (test_bit(vcpu->arch.pending_ioapic_eoi,
+				     (void *) vcpu->arch.eoi_exit_bitmaps)) {
+				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+				vcpu->run->eoi.vector =
+						vcpu->arch.pending_ioapic_eoi;
+				r = 0;
+				goto out;
+			}
+		}
 		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
 			vcpu_scan_ioapic(vcpu);
 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 277b7a1..cef20ad 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -134,6 +134,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_ENABLE_IBS        23
 #define KVM_REQ_DISABLE_IBS       24
 #define KVM_REQ_APIC_PAGE_RELOAD  25
+#define KVM_REQ_IOAPIC_EOI_EXIT   26
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7d06dc4..2305948 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -183,6 +183,7 @@ struct kvm_s390_skeys {
 #define KVM_EXIT_EPR              23
 #define KVM_EXIT_SYSTEM_EVENT     24
 #define KVM_EXIT_S390_STSI        25
+#define KVM_EXIT_IOAPIC_EOI       26
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -329,6 +330,10 @@ struct kvm_run {
 			__u8 sel1;
 			__u16 sel2;
 		} s390_stsi;
+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  1:47 [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Steve Rutherford
  2015-05-13  1:47 ` [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs Steve Rutherford
@ 2015-05-13  1:47 ` Steve Rutherford
  2015-05-13  6:12   ` Jan Kiszka
  2015-05-13  7:51   ` Paolo Bonzini
  2015-05-13  1:47 ` [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace Steve Rutherford
  2015-05-13  7:57 ` [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Paolo Bonzini
  3 siblings, 2 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13  1:47 UTC (permalink / raw)
  To: kvm; +Cc: ahonig

In order to support a userspace IOAPIC interacting with an in kernel
APIC, the EOI exit bitmaps need to be configurable.

If the IOAPIC is in userspace (i.e. the irqchip has been split), the
EOI exit bitmaps will be set whenever the GSI Routes are configured.
In particular, for the low 24 MSI routes, the EOI Exit bit
corresponding to the destination vector will be set for the
destination VCPU.

The intention is for the userspace IOAPIC to use MSI routes [0,23] to
inject interrupts into the guest.

This is a slight abuse of the notion of an MSI Route, given that MSIs
classically bypass the IOAPIC. It might be worthwhile to add an
additional route type to improve clarity.

Compile tested for Intel x86.

Signed-off-by: Steve Rutherford <srutherford@google.com>
---
 arch/x86/kvm/ioapic.c    | 11 +++++++++++
 arch/x86/kvm/ioapic.h    |  1 +
 arch/x86/kvm/lapic.c     |  2 ++
 arch/x86/kvm/x86.c       | 13 +++++++++++--
 include/linux/kvm_host.h |  4 ++++
 virt/kvm/irqchip.c       | 32 ++++++++++++++++++++++++++++++++
 6 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f791..3323c86 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -672,3 +672,14 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	spin_unlock(&ioapic->lock);
 	return 0;
 }
+
+void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+	if (ioapic)
+		return;
+	if (!lapic_in_kernel(kvm))
+		return;
+	kvm_make_scan_ioapic_request(kvm);
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4..b7af71b 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -123,4 +123,5 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
 			u32 *tmr);
 
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 42fada6f..7533b87 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -211,6 +211,8 @@ out:
 
 	if (!irqchip_split(kvm))
 		kvm_vcpu_request_scan_ioapic(kvm);
+	else
+		kvm_vcpu_request_scan_userspace_ioapic(kvm);
 }
 
 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc27c35..6127fe7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6335,8 +6335,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 				goto out;
 			}
 		}
-		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
-			vcpu_scan_ioapic(vcpu);
+		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) {
+			if (irqchip_split(vcpu->kvm)) {
+				memset(vcpu->arch.eoi_exit_bitmaps, 0, 32);
+				kvm_scan_ioapic_routes(
+				    vcpu, vcpu->arch.eoi_exit_bitmaps);
+				kvm_x86_ops->load_eoi_exitmap(
+				    vcpu, vcpu->arch.eoi_exit_bitmaps);
+
+			} else
+				vcpu_scan_ioapic(vcpu);
+		}
 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
 			kvm_vcpu_reload_apic_access_page(vcpu);
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cef20ad..678215a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -438,10 +438,14 @@ void vcpu_put(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_HAVE_IOAPIC
 void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
+void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm);
 #else
 static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
 {
 }
+static inline void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
+{
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 8aaceed..8a253aa 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -205,6 +205,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
+	kvm_vcpu_request_scan_userspace_ioapic(kvm);
+
 	new = old;
 	r = 0;
 
@@ -212,3 +214,33 @@ out:
 	kfree(new);
 	return r;
 }
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_kernel_irq_routing_entry *entry;
+	struct kvm_irq_routing_table *table;
+	u32 i, nr_rt_entries;
+
+	mutex_lock(&kvm->irq_lock);
+	table = kvm->irq_routing;
+	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
+	for (i = 0; i < nr_rt_entries; ++i) {
+		hlist_for_each_entry(entry, &table->map[i], link) {
+			u32 dest_id, dest_mode;
+
+			if (entry->type != KVM_IRQ_ROUTING_MSI)
+				continue;
+			dest_id = (entry->msi.address_lo >> 12) & 0xff;
+			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
+			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
+						dest_mode)) {
+				u32 vector = entry->msi.data & 0xff;
+
+				__set_bit(vector,
+					  (unsigned long *) eoi_exit_bitmap);
+			}
+		}
+	}
+	mutex_unlock(&kvm->irq_lock);
+}
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace
  2015-05-13  1:47 [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Steve Rutherford
  2015-05-13  1:47 ` [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs Steve Rutherford
  2015-05-13  1:47 ` [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference Steve Rutherford
@ 2015-05-13  1:47 ` Steve Rutherford
  2015-05-13  6:12   ` Jan Kiszka
  2015-05-13  7:22   ` Paolo Bonzini
  2015-05-13  7:57 ` [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Paolo Bonzini
  3 siblings, 2 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13  1:47 UTC (permalink / raw)
  To: kvm; +Cc: ahonig

In order to enable userspace PIC support, the userspace PIC needs to
be able to inject local interrupt requests.

This adds the ioctl KVM_REQUEST_LOCAL_INTERRUPT and kvm exit
KVM_EXIT_GET_EXTINT.

The vm ioctl KVM_REQUEST_LOCAL_INTERRUPT makes a KVM_REQ_EVENT request
on the BSP, which causes the BSP to exit to userspace to fetch the
vector of the underlying external interrupt, which the BSP then
injects into the guest. This matches the PIC spec, and is necessary to
boot Windows.

Boots and passes the KVM unit tests on intel x86 with the
PIC/PIT/IOAPIC in userspace (under a non-QEMU VMM). Boots and passes
the KVM unit tests under normal conditions as well.

SVM support and device assignment are untested with this feature
enabled, but testing for both is in the works.

Compiles for ARM/x86/PPC.

Signed-off-by: Steve Rutherford <srutherford@google.com>
---
 Documentation/virtual/kvm/api.txt |  9 +++++++
 arch/x86/include/asm/kvm_host.h   |  1 +
 arch/x86/kvm/irq.c                |  6 ++++-
 arch/x86/kvm/lapic.c              |  7 ++++++
 arch/x86/kvm/lapic.h              |  2 ++
 arch/x86/kvm/x86.c                | 53 ++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/kvm.h          |  6 +++++
 7 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index dd92996..a650321 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2993,6 +2993,15 @@ the ioapic nor the pic in the kernel. Also, enables in kernel routing of
 interrupt requests. Fails if VCPU has already been created, or if the irqchip is
 already in the kernel.
 
+4.97 KVM_REQUEST_LOCAL_INTERRUPT
+
+Capability: KVM_CAP_SPLIT_IRQCHIP
+Type: VM ioctl
+Parameters: none
+Returns: 0 on success, -1 on error.
+
+Informs the kernel that userspace has a pending external interrupt.
+
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b1978f1..602ea70 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -542,6 +542,7 @@ struct kvm_vcpu_arch {
 
 	u64 eoi_exit_bitmaps[4];
 	int pending_ioapic_eoi;
+	bool pending_external;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 706e47a..487b5f5 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -43,7 +43,11 @@ EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
  */
 static int kvm_cpu_has_extint(struct kvm_vcpu *v)
 {
-	if (kvm_apic_accept_pic_intr(v))
+	u8 accept = kvm_apic_accept_pic_intr(v);
+
+	if (accept && irqchip_split(v->kvm))
+		return v->arch.pending_external;
+	else if (accept)
 		return pic_irqchip(v->kvm)->output;	/* PIC */
 	else
 		return 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 7533b87..9a021f7 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2089,3 +2089,10 @@ void kvm_lapic_init(void)
 	jump_label_rate_limit(&apic_hw_disabled, HZ);
 	jump_label_rate_limit(&apic_sw_disabled, HZ);
 }
+
+void kvm_request_local_interrupt(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pending_external = true;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 71b150c..66bb780 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -63,6 +63,8 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 		unsigned long *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
+void kvm_request_local_interrupt(struct kvm_vcpu *vcpu);
+
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6127fe7..b7ceff3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -65,6 +65,8 @@
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 
+#define GET_VECTOR_FROM_USERSPACE 1
+
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
@@ -4146,6 +4148,30 @@ split_irqchip_unlock:
 		mutex_unlock(&kvm->lock);
 		break;
 	}
+	case KVM_REQUEST_LOCAL_INTERRUPT: {
+		int i;
+		struct kvm_vcpu *vcpu;
+		struct kvm_vcpu *bsp_vcpu = NULL;
+
+		mutex_lock(&kvm->lock);
+		r = -EEXIST;
+		if (!irqchip_split(kvm))
+			goto out;
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (kvm_vcpu_is_reset_bsp(vcpu)) {
+				bsp_vcpu = vcpu;
+				continue;
+			}
+		}
+		r = -EINVAL;
+		if (bsp_vcpu == NULL)
+			goto interrupt_unlock;
+		kvm_request_local_interrupt(bsp_vcpu);
+		r = 0;
+interrupt_unlock:
+		mutex_unlock(&kvm->lock);
+		break;
+	}
 
 	default:
 		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
@@ -6123,6 +6149,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
+static int pending_userspace_external_intr(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.pending_external && kvm_apic_accept_pic_intr(vcpu)) {
+		vcpu->arch.pending_external = false;
+		return true;
+	} else
+		return false;
+
+}
+
 static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 {
 	int r;
@@ -6187,7 +6223,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 				return r;
 		}
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+			if (irqchip_split(vcpu->kvm) &&
+			    pending_userspace_external_intr(vcpu)) {
+				return GET_VECTOR_FROM_USERSPACE;
+			}
+			kvm_queue_interrupt(vcpu,
+					    kvm_cpu_get_interrupt(vcpu),
 					    false);
 			kvm_x86_ops->set_irq(vcpu);
 		}
@@ -6351,13 +6392,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		int event;
 		kvm_apic_accept_events(vcpu);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			r = 1;
 			goto out;
 		}
-
-		if (inject_pending_event(vcpu, req_int_win) != 0)
+		event = inject_pending_event(vcpu, req_int_win);
+		if (event == GET_VECTOR_FROM_USERSPACE) {
+			vcpu->run->exit_reason = KVM_EXIT_GET_EXTINT;
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
+			r = 0;
+			goto out;
+		} else if (event != 0)
 			req_immediate_exit = true;
 		/* enable NMI/IRQ window open exits if needed */
 		else if (vcpu->arch.nmi_pending)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2305948..368e80a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -184,6 +184,7 @@ struct kvm_s390_skeys {
 #define KVM_EXIT_SYSTEM_EVENT     24
 #define KVM_EXIT_S390_STSI        25
 #define KVM_EXIT_IOAPIC_EOI       26
+#define KVM_EXIT_GET_EXTINT       27
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -334,6 +335,10 @@ struct kvm_run {
 		struct {
 			__u8 vector;
 		} eoi;
+		/* KVM_EXIT_GET_EXTINT */
+		struct {
+			__u8 vector;
+		} extint;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -1208,6 +1213,7 @@ struct kvm_s390_ucas_mapping {
 #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
 /* Available with KVM_CAP_SPLIT_IRQCHIP */
 #define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
+#define KVM_REQUEST_LOCAL_INTERRUPT _IO(KVMIO, 0xb8)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  1:47 ` [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference Steve Rutherford
@ 2015-05-13  6:12   ` Jan Kiszka
  2015-05-13  8:04     ` Paolo Bonzini
  2015-05-13  7:51   ` Paolo Bonzini
  1 sibling, 1 reply; 41+ messages in thread
From: Jan Kiszka @ 2015-05-13  6:12 UTC (permalink / raw)
  To: Steve Rutherford, kvm; +Cc: ahonig

On 2015-05-13 03:47, Steve Rutherford wrote:
> In order to support a userspace IOAPIC interacting with an in kernel
> APIC, the EOI exit bitmaps need to be configurable.
> 
> If the IOAPIC is in userspace (i.e. the irqchip has been split), the
> EOI exit bitmaps will be set whenever the GSI Routes are configured.
> In particular, for the low 24 MSI routes, the EOI Exit bit
> corresponding to the destination vector will be set for the
> destination VCPU.
> 
> The intention is for the userspace IOAPIC to use MSI routes [0,23] to
> inject interrupts into the guest.
> 
> This is a slight abuse of the notion of an MSI Route, given that MSIs
> classically bypass the IOAPIC. It might be worthwhile to add an
> additional route type to improve clarity.
> 
> Compile tested for Intel x86.
> 
> Signed-off-by: Steve Rutherford <srutherford@google.com>
> ---
>  arch/x86/kvm/ioapic.c    | 11 +++++++++++
>  arch/x86/kvm/ioapic.h    |  1 +
>  arch/x86/kvm/lapic.c     |  2 ++
>  arch/x86/kvm/x86.c       | 13 +++++++++++--
>  include/linux/kvm_host.h |  4 ++++
>  virt/kvm/irqchip.c       | 32 ++++++++++++++++++++++++++++++++
>  6 files changed, 61 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
> index 856f791..3323c86 100644
> --- a/arch/x86/kvm/ioapic.c
> +++ b/arch/x86/kvm/ioapic.c
> @@ -672,3 +672,14 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
>  	spin_unlock(&ioapic->lock);
>  	return 0;
>  }
> +
> +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +
> +	if (ioapic)
> +		return;
> +	if (!lapic_in_kernel(kvm))
> +		return;
> +	kvm_make_scan_ioapic_request(kvm);
> +}
> diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
> index ca0b0b4..b7af71b 100644
> --- a/arch/x86/kvm/ioapic.h
> +++ b/arch/x86/kvm/ioapic.h
> @@ -123,4 +123,5 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
>  void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
>  			u32 *tmr);
>  
> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>  #endif
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 42fada6f..7533b87 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -211,6 +211,8 @@ out:
>  
>  	if (!irqchip_split(kvm))
>  		kvm_vcpu_request_scan_ioapic(kvm);
> +	else
> +		kvm_vcpu_request_scan_userspace_ioapic(kvm);
>  }
>  
>  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cc27c35..6127fe7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6335,8 +6335,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  				goto out;
>  			}
>  		}
> -		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
> -			vcpu_scan_ioapic(vcpu);
> +		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) {
> +			if (irqchip_split(vcpu->kvm)) {
> +				memset(vcpu->arch.eoi_exit_bitmaps, 0, 32);
> +				kvm_scan_ioapic_routes(
> +				    vcpu, vcpu->arch.eoi_exit_bitmaps);
> +				kvm_x86_ops->load_eoi_exitmap(
> +				    vcpu, vcpu->arch.eoi_exit_bitmaps);
> +
> +			} else
> +				vcpu_scan_ioapic(vcpu);
> +		}
>  		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
>  			kvm_vcpu_reload_apic_access_page(vcpu);
>  	}
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index cef20ad..678215a 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -438,10 +438,14 @@ void vcpu_put(struct kvm_vcpu *vcpu);
>  
>  #ifdef __KVM_HAVE_IOAPIC
>  void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
> +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm);
>  #else
>  static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
>  {
>  }
> +static inline void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> +{
> +}
>  #endif
>  
>  #ifdef CONFIG_HAVE_KVM_IRQFD
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 8aaceed..8a253aa 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -205,6 +205,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
>  
>  	synchronize_srcu_expedited(&kvm->irq_srcu);
>  
> +	kvm_vcpu_request_scan_userspace_ioapic(kvm);
> +
>  	new = old;
>  	r = 0;
>  
> @@ -212,3 +214,33 @@ out:
>  	kfree(new);
>  	return r;
>  }
> +
> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_kernel_irq_routing_entry *entry;
> +	struct kvm_irq_routing_table *table;
> +	u32 i, nr_rt_entries;
> +
> +	mutex_lock(&kvm->irq_lock);
> +	table = kvm->irq_routing;
> +	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
> +	for (i = 0; i < nr_rt_entries; ++i) {
> +		hlist_for_each_entry(entry, &table->map[i], link) {
> +			u32 dest_id, dest_mode;
> +
> +			if (entry->type != KVM_IRQ_ROUTING_MSI)
> +				continue;
> +			dest_id = (entry->msi.address_lo >> 12) & 0xff;
> +			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
> +			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
> +						dest_mode)) {
> +				u32 vector = entry->msi.data & 0xff;
> +
> +				__set_bit(vector,
> +					  (unsigned long *) eoi_exit_bitmap);
> +			}
> +		}
> +	}
> +	mutex_unlock(&kvm->irq_lock);
> +}
> 

This looks a bit frightening regarding the lookup costs. Do we really
have to run through the complete routing table to find the needed
information? There can be way more "real" MSI entries than IOAPIC pins.
There can even be multiple IOAPICs (thanks to your patches overcoming
the single in-kernel instance). And this is potentially serializing the
whole VM (kvm->irq_lock), not just those VCPUs that use the IOAPIC.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace
  2015-05-13  1:47 ` [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace Steve Rutherford
@ 2015-05-13  6:12   ` Jan Kiszka
  2015-05-13 22:41     ` Steve Rutherford
  2015-05-13  7:22   ` Paolo Bonzini
  1 sibling, 1 reply; 41+ messages in thread
From: Jan Kiszka @ 2015-05-13  6:12 UTC (permalink / raw)
  To: Steve Rutherford, kvm; +Cc: Andrew Honig

On 2015-05-13 03:47, Steve Rutherford wrote:
> In order to enable userspace PIC support, the userspace PIC needs to
> be able to inject local interrupt requests.
> 
> This adds the ioctl KVM_REQUEST_LOCAL_INTERRUPT and kvm exit
> KVM_EXIT_GET_EXTINT.
> 
> The vm ioctl KVM_REQUEST_LOCAL_INTERRUPT makes a KVM_REQ_EVENT request
> on the BSP, which causes the BSP to exit to userspace to fetch the
> vector of the underlying external interrupt, which the BSP then
> injects into the guest. This matches the PIC spec, and is necessary to
> boot Windows.

The API name seems too generic, given the fairly specific use case. As
it only allows to kick the BSP, not any VCPU, that should be clarified.
Maybe call it KVM_REQUEST_PIC_INJECTION or so. Or allow userspace to
specify the target VCPU, then it's a bit more generic again.

Actually, when looking at the MultiProcessor spec, you will find
multiple models for injecting PIC interrupts into CPU APICs. Just in the
PIC Mode, the BSP is the only possible target. In the other modes, all
APICs can receive PIC interrupts, and either the IOAPIC or the local
APIC's LINT0 configuration decide about the effective target. We should
allow to model all modes, based on userspace decisions.

> 
> Boots and passes the KVM unit tests on intel x86 with the
> PIC/PIT/IOAPIC in userspace (under a non-QEMU VMM). Boots and passes
> the KVM unit tests under normal conditions as well.

Do you plan to provide a reference implementation for an open source
userspace VMM as well, once the kernel side is settled?

> 
> SVM support and device assignment are untested with this feature
> enabled, but testing for both is in the works.
> 
> Compiles for ARM/x86/PPC.
> 
> Signed-off-by: Steve Rutherford <srutherford@google.com>
> ---
>  Documentation/virtual/kvm/api.txt |  9 +++++++
>  arch/x86/include/asm/kvm_host.h   |  1 +
>  arch/x86/kvm/irq.c                |  6 ++++-
>  arch/x86/kvm/lapic.c              |  7 ++++++
>  arch/x86/kvm/lapic.h              |  2 ++
>  arch/x86/kvm/x86.c                | 53 ++++++++++++++++++++++++++++++++++++---
>  include/uapi/linux/kvm.h          |  6 +++++
>  7 files changed, 80 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index dd92996..a650321 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2993,6 +2993,15 @@ the ioapic nor the pic in the kernel. Also, enables in kernel routing of
>  interrupt requests. Fails if VCPU has already been created, or if the irqchip is
>  already in the kernel.
>  
> +4.97 KVM_REQUEST_LOCAL_INTERRUPT
> +
> +Capability: KVM_CAP_SPLIT_IRQCHIP
> +Type: VM ioctl
> +Parameters: none
> +Returns: 0 on success, -1 on error.
> +
> +Informs the kernel that userspace has a pending external interrupt.
> +
>  
>  5. The kvm_run structure
>  ------------------------
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b1978f1..602ea70 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -542,6 +542,7 @@ struct kvm_vcpu_arch {
>  
>  	u64 eoi_exit_bitmaps[4];
>  	int pending_ioapic_eoi;
> +	bool pending_external;
>  };
>  
>  struct kvm_lpage_info {
> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> index 706e47a..487b5f5 100644
> --- a/arch/x86/kvm/irq.c
> +++ b/arch/x86/kvm/irq.c
> @@ -43,7 +43,11 @@ EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
>   */
>  static int kvm_cpu_has_extint(struct kvm_vcpu *v)
>  {
> -	if (kvm_apic_accept_pic_intr(v))
> +	u8 accept = kvm_apic_accept_pic_intr(v);
> +
> +	if (accept && irqchip_split(v->kvm))
> +		return v->arch.pending_external;
> +	else if (accept)
>  		return pic_irqchip(v->kvm)->output;	/* PIC */
>  	else
>  		return 0;
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 7533b87..9a021f7 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -2089,3 +2089,10 @@ void kvm_lapic_init(void)
>  	jump_label_rate_limit(&apic_hw_disabled, HZ);
>  	jump_label_rate_limit(&apic_sw_disabled, HZ);
>  }
> +
> +void kvm_request_local_interrupt(struct kvm_vcpu *vcpu)
> +{
> +	vcpu->arch.pending_external = true;
> +	kvm_make_request(KVM_REQ_EVENT, vcpu);
> +	kvm_vcpu_kick(vcpu);
> +}
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 71b150c..66bb780 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -63,6 +63,8 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
>  		unsigned long *dest_map);
>  int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
>  
> +void kvm_request_local_interrupt(struct kvm_vcpu *vcpu);
> +
>  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
>  		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6127fe7..b7ceff3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -65,6 +65,8 @@
>  #include <asm/pvclock.h>
>  #include <asm/div64.h>
>  
> +#define GET_VECTOR_FROM_USERSPACE 1
> +
>  #define MAX_IO_MSRS 256
>  #define KVM_MAX_MCE_BANKS 32
>  #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
> @@ -4146,6 +4148,30 @@ split_irqchip_unlock:
>  		mutex_unlock(&kvm->lock);
>  		break;
>  	}
> +	case KVM_REQUEST_LOCAL_INTERRUPT: {
> +		int i;
> +		struct kvm_vcpu *vcpu;
> +		struct kvm_vcpu *bsp_vcpu = NULL;
> +
> +		mutex_lock(&kvm->lock);
> +		r = -EEXIST;

Nitpicking, EEXIST seems confusing. I would go for EINVAL here as well.

> +		if (!irqchip_split(kvm))
> +			goto out;
> +		kvm_for_each_vcpu(i, vcpu, kvm) {
> +			if (kvm_vcpu_is_reset_bsp(vcpu)) {
> +				bsp_vcpu = vcpu;
> +				continue;
> +			}
> +		}
> +		r = -EINVAL;
> +		if (bsp_vcpu == NULL)
> +			goto interrupt_unlock;
> +		kvm_request_local_interrupt(bsp_vcpu);
> +		r = 0;
> +interrupt_unlock:
> +		mutex_unlock(&kvm->lock);
> +		break;
> +	}
>  
>  	default:
>  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
> @@ -6123,6 +6149,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
>  }
>  
> +static int pending_userspace_external_intr(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->arch.pending_external && kvm_apic_accept_pic_intr(vcpu)) {
> +		vcpu->arch.pending_external = false;
> +		return true;
> +	} else
> +		return false;
> +
> +}
> +
>  static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
>  {
>  	int r;
> @@ -6187,7 +6223,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
>  				return r;
>  		}
>  		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
> -			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
> +			if (irqchip_split(vcpu->kvm) &&
> +			    pending_userspace_external_intr(vcpu)) {
> +				return GET_VECTOR_FROM_USERSPACE;
> +			}
> +			kvm_queue_interrupt(vcpu,
> +					    kvm_cpu_get_interrupt(vcpu),
>  					    false);
>  			kvm_x86_ops->set_irq(vcpu);
>  		}
> @@ -6351,13 +6392,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	}
>  
>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> +		int event;
>  		kvm_apic_accept_events(vcpu);
>  		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>  			r = 1;
>  			goto out;
>  		}
> -
> -		if (inject_pending_event(vcpu, req_int_win) != 0)
> +		event = inject_pending_event(vcpu, req_int_win);
> +		if (event == GET_VECTOR_FROM_USERSPACE) {
> +			vcpu->run->exit_reason = KVM_EXIT_GET_EXTINT;
> +			kvm_make_request(KVM_REQ_EVENT, vcpu);
> +			r = 0;
> +			goto out;
> +		} else if (event != 0)
>  			req_immediate_exit = true;
>  		/* enable NMI/IRQ window open exits if needed */
>  		else if (vcpu->arch.nmi_pending)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 2305948..368e80a 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -184,6 +184,7 @@ struct kvm_s390_skeys {
>  #define KVM_EXIT_SYSTEM_EVENT     24
>  #define KVM_EXIT_S390_STSI        25
>  #define KVM_EXIT_IOAPIC_EOI       26
> +#define KVM_EXIT_GET_EXTINT       27
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -334,6 +335,10 @@ struct kvm_run {
>  		struct {
>  			__u8 vector;
>  		} eoi;
> +		/* KVM_EXIT_GET_EXTINT */
> +		struct {
> +			__u8 vector;
> +		} extint;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};
> @@ -1208,6 +1213,7 @@ struct kvm_s390_ucas_mapping {
>  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
>  /* Available with KVM_CAP_SPLIT_IRQCHIP */
>  #define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
> +#define KVM_REQUEST_LOCAL_INTERRUPT _IO(KVMIO, 0xb8)
>  
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
>  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> 

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace
  2015-05-13  1:47 ` [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace Steve Rutherford
  2015-05-13  6:12   ` Jan Kiszka
@ 2015-05-13  7:22   ` Paolo Bonzini
  2015-05-13 23:13     ` Steve Rutherford
  1 sibling, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-13  7:22 UTC (permalink / raw)
  To: Steve Rutherford, kvm; +Cc: ahonig



On 13/05/2015 03:47, Steve Rutherford wrote:
> In order to enable userspace PIC support, the userspace PIC needs to
> be able to inject local interrupt requests.
> 
> This adds the ioctl KVM_REQUEST_LOCAL_INTERRUPT and kvm exit
> KVM_EXIT_GET_EXTINT.
> 
> The vm ioctl KVM_REQUEST_LOCAL_INTERRUPT makes a KVM_REQ_EVENT request
> on the BSP, which causes the BSP to exit to userspace to fetch the
> vector of the underlying external interrupt, which the BSP then
> injects into the guest. This matches the PIC spec, and is necessary to
> boot Windows.
> 
> Boots and passes the KVM unit tests on intel x86 with the
> PIC/PIT/IOAPIC in userspace (under a non-QEMU VMM). Boots and passes
> the KVM unit tests under normal conditions as well.
> 
> SVM support and device assignment are untested with this feature
> enabled, but testing for both is in the works.
> 
> Compiles for ARM/x86/PPC.

This may be ENOCOFFEE, but where is extint.vector read?

Could you use KVM_INTERRUPT to set KVM_REQUEST_LOCAL_INTERRUPT _and_ the
interrupt vector at the same time?  This is exactly the logic that is
used for !irqchip, and it should work for the EXTINT case as well.

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs
  2015-05-13  1:47 ` [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs Steve Rutherford
@ 2015-05-13  7:35   ` Paolo Bonzini
  2015-05-13 22:18     ` Steve Rutherford
  2015-05-24 16:46   ` Avi Kivity
  1 sibling, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-13  7:35 UTC (permalink / raw)
  To: Steve Rutherford, kvm; +Cc: ahonig



On 13/05/2015 03:47, Steve Rutherford wrote:
> Adds KVM_EXIT_IOAPIC_EOI which passes the interrupt vector up to
> userspace.
> 
> Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs
> to be informed (which is identical to the EOI_EXIT_BITMAP field used
> by modern x86 processors, but can also be used to elide kvm IOAPIC EOI
> exits on older processors).
> 
> [Note: A prototype using ResampleFDs found that decoupling the EOI
> from the VCPU's thread made it possible for the VCPU to not see a
> recent EOI after reentering the guest. This does not match real
> hardware.]

Can you explain this better (and perhaps add a testcase to ioapic.c)?

Paolo

> Compile tested for Intel x86.
> 
> Signed-off-by: Steve Rutherford <srutherford@google.com>
> ---
>  Documentation/virtual/kvm/api.txt | 10 ++++++++++
>  arch/x86/include/asm/kvm_host.h   |  3 +++
>  arch/x86/kvm/lapic.c              |  9 +++++++++
>  arch/x86/kvm/x86.c                | 11 +++++++++++
>  include/linux/kvm_host.h          |  1 +
>  include/uapi/linux/kvm.h          |  5 +++++
>  6 files changed, 39 insertions(+)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 0744b4e..dd92996 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3285,6 +3285,16 @@ Valid values for 'type' are:
>  	 */
>  	__u64 kvm_valid_regs;
>  	__u64 kvm_dirty_regs;
> +
> +	/* KVM_EXIT_IOAPIC_EOI */
> +        struct {
> +	       __u8 vector;
> +        } eoi;
> +
> +Indicates that an eoi of a level triggered IOAPIC interrupt on vector has
> +occurred, which should be handled by the userspace IOAPIC. Triggers when
> +the Irqchip has been split between userspace and the kernel.
> +
>  	union {
>  		struct kvm_sync_regs regs;
>  		char padding[1024];
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3ddc134..b1978f1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -539,6 +539,9 @@ struct kvm_vcpu_arch {
>  	struct {
>  		bool pv_unhalted;
>  	} pv;
> +
> +	u64 eoi_exit_bitmaps[4];
> +	int pending_ioapic_eoi;
>  };
>  
>  struct kvm_lpage_info {
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index bc392a6..42fada6f 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -860,6 +860,15 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
>  
>  static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
>  {
> +	if (irqchip_split(apic->vcpu->kvm)) {
> +		if (test_bit(vector,
> +			     (void *) apic->vcpu->arch.eoi_exit_bitmaps)) {
> +			apic->vcpu->arch.pending_ioapic_eoi = vector;
> +			kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
> +		}
> +		return;
> +	}
> +
>  	if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
>  		int trigger_mode;
>  		if (apic_test_vector(vector, apic->regs + APIC_TMR))
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 7505b39..cc27c35 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6324,6 +6324,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  			kvm_handle_pmu_event(vcpu);
>  		if (kvm_check_request(KVM_REQ_PMI, vcpu))
>  			kvm_deliver_pmi(vcpu);
> +		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
> +			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
> +			if (test_bit(vcpu->arch.pending_ioapic_eoi,
> +				     (void *) vcpu->arch.eoi_exit_bitmaps)) {
> +				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
> +				vcpu->run->eoi.vector =
> +						vcpu->arch.pending_ioapic_eoi;
> +				r = 0;
> +				goto out;
> +			}
> +		}
>  		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
>  			vcpu_scan_ioapic(vcpu);
>  		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 277b7a1..cef20ad 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -134,6 +134,7 @@ static inline bool is_error_page(struct page *page)
>  #define KVM_REQ_ENABLE_IBS        23
>  #define KVM_REQ_DISABLE_IBS       24
>  #define KVM_REQ_APIC_PAGE_RELOAD  25
> +#define KVM_REQ_IOAPIC_EOI_EXIT   26
>  
>  #define KVM_USERSPACE_IRQ_SOURCE_ID		0
>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 7d06dc4..2305948 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -183,6 +183,7 @@ struct kvm_s390_skeys {
>  #define KVM_EXIT_EPR              23
>  #define KVM_EXIT_SYSTEM_EVENT     24
>  #define KVM_EXIT_S390_STSI        25
> +#define KVM_EXIT_IOAPIC_EOI       26
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -329,6 +330,10 @@ struct kvm_run {
>  			__u8 sel1;
>  			__u16 sel2;
>  		} s390_stsi;
> +		/* KVM_EXIT_IOAPIC_EOI */
> +		struct {
> +			__u8 vector;
> +		} eoi;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};
> 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  1:47 ` [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference Steve Rutherford
  2015-05-13  6:12   ` Jan Kiszka
@ 2015-05-13  7:51   ` Paolo Bonzini
  2015-05-13 22:24     ` Steve Rutherford
  1 sibling, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-13  7:51 UTC (permalink / raw)
  To: Steve Rutherford, kvm; +Cc: ahonig



On 13/05/2015 03:47, Steve Rutherford wrote:
> @@ -205,6 +205,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
>  
>  	synchronize_srcu_expedited(&kvm->irq_srcu);
>  
> +	kvm_vcpu_request_scan_userspace_ioapic(kvm);
> +
>  	new = old;
>  	r = 0;
>  

This can be done before synchronize_srcu_expedited, so that changes
ripple to the VCPUs faster.

> +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> +{
> +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> +
> +	if (ioapic)
> +		return;
> +	if (!lapic_in_kernel(kvm))
> +		return;
> +	kvm_make_scan_ioapic_request(kvm);
> +}

This is okay for use in kvm_set_irq_routing (perhaps renamed to
kvm_arch_irq_routing_update), but when it is used here:

>  	if (!irqchip_split(kvm))
>  		kvm_vcpu_request_scan_ioapic(kvm);
> +	else
> +		kvm_vcpu_request_scan_userspace_ioapic(kvm);

... you can simply do this:

- 	if (!irqchip_split(kvm))
-		kvm_vcpu_request_scan_ioapic(kvm);
+	kvm_make_scan_ioapic_request(kvm);

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-13  1:47 [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Steve Rutherford
                   ` (2 preceding siblings ...)
  2015-05-13  1:47 ` [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace Steve Rutherford
@ 2015-05-13  7:57 ` Paolo Bonzini
  2015-05-13 22:10   ` Steve Rutherford
  3 siblings, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-13  7:57 UTC (permalink / raw)
  To: Steve Rutherford, kvm; +Cc: ahonig



On 13/05/2015 03:47, Steve Rutherford wrote:
> First patch in a series which enables the relocation of the
> PIC/IOAPIC/PIT to userspace.
> 
> Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.
> 
> KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
> of the irqchip.

The ioctl name in the code is KVM_CREATE_SPLIT_IRQCHIP, but it doesn't
seem to create a local APIC.

> Compile tested for x86.

The capability is fine.  However, instead of introducing a new ioctl,
you can use KVM_CAP_ENABLE_CAP_VM and enable the capability once
(per-VM) using KVM_ENABLE_CAP.  The enabling code is basically the same
you have in your code.

Paolo

> Signed-off-by: Steve Rutherford <srutherford@google.com>
> Suggested-by: Andrew Honig <ahonig@google.com>
> ---
>  Documentation/virtual/kvm/api.txt | 15 ++++++++++++
>  arch/powerpc/kvm/irq.h            |  5 ++++
>  arch/s390/kvm/irq.h               |  4 ++++
>  arch/x86/include/asm/kvm_host.h   |  2 ++
>  arch/x86/kvm/assigned-dev.c       |  4 ++--
>  arch/x86/kvm/irq.c                |  6 ++---
>  arch/x86/kvm/irq.h                | 11 +++++++++
>  arch/x86/kvm/irq_comm.c           |  7 ++++++
>  arch/x86/kvm/lapic.c              | 13 +++++++----
>  arch/x86/kvm/mmu.c                |  2 +-
>  arch/x86/kvm/svm.c                |  4 ++--
>  arch/x86/kvm/vmx.c                | 12 +++++-----
>  arch/x86/kvm/x86.c                | 49 +++++++++++++++++++++++++++------------
>  include/kvm/arm_vgic.h            |  1 +
>  include/linux/kvm_host.h          |  1 +
>  include/uapi/linux/kvm.h          |  3 +++
>  virt/kvm/irqchip.c                |  2 +-
>  17 files changed, 106 insertions(+), 35 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 6955444..0744b4e 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2979,6 +2979,21 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
>  and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
>  which is the maximum number of possibly pending cpu-local interrupts.
>  
> +4.96 KVM_SPLIT_IRQCHIP
> +
> +Capability: KVM_CAP_SPLIT_IRQCHIP
> +Architectures: x86
> +Type:  VM ioctl
> +Parameters: None
> +Returns: 0 on success, -1 on error
> +
> +Create a local apic for each processor in the kernel.  This differs from
> +KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates neither
> +the ioapic nor the pic in the kernel. Also, enables in kernel routing of
> +interrupt requests. Fails if VCPU has already been created, or if the irqchip is
> +already in the kernel.
> +
> +
>  5. The kvm_run structure
>  ------------------------
>  
> diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> index 5a9a10b..5e6fa06 100644
> --- a/arch/powerpc/kvm/irq.h
> +++ b/arch/powerpc/kvm/irq.h
> @@ -17,4 +17,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
>  	return ret;
>  }
>  
> +static inline int lapic_in_kernel(struct kvm *kvm)
> +{
> +	return irqchip_in_kernel(kvm);
> +}
> +
>  #endif
> diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
> index d98e415..db876c3 100644
> --- a/arch/s390/kvm/irq.h
> +++ b/arch/s390/kvm/irq.h
> @@ -19,4 +19,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
>  	return 1;
>  }
>  
> +static inline int lapic_in_kernel(struct kvm *kvm)
> +{
> +	return irqchip_in_kernel(kvm);
> +}
>  #endif
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index bbb8f4e..3ddc134 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -638,6 +638,8 @@ struct kvm_arch {
>  	bool boot_vcpu_runs_old_kvmclock;
>  
>  	u64 disabled_quirks;
> +
> +	bool irqchip_split;
>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
> index d090ecf..1237e92 100644
> --- a/arch/x86/kvm/assigned-dev.c
> +++ b/arch/x86/kvm/assigned-dev.c
> @@ -291,7 +291,7 @@ static int kvm_deassign_irq(struct kvm *kvm,
>  {
>  	unsigned long guest_irq_type, host_irq_type;
>  
> -	if (!irqchip_in_kernel(kvm))
> +	if (!lapic_in_kernel(kvm))
>  		return -EINVAL;
>  	/* no irq assignment to deassign */
>  	if (!assigned_dev->irq_requested_type)
> @@ -568,7 +568,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
>  	struct kvm_assigned_dev_kernel *match;
>  	unsigned long host_irq_type, guest_irq_type;
>  
> -	if (!irqchip_in_kernel(kvm))
> +	if (!lapic_in_kernel(kvm))
>  		return r;
>  
>  	mutex_lock(&kvm->lock);
> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> index a1ec6a50..706e47a 100644
> --- a/arch/x86/kvm/irq.c
> +++ b/arch/x86/kvm/irq.c
> @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
>   */
>  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
>  {
> -	if (!irqchip_in_kernel(v->kvm))
> +	if (!lapic_in_kernel(v->kvm))
>  		return v->arch.interrupt.pending;
>  
>  	if (kvm_cpu_has_extint(v))
> @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
>   */
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
>  {
> -	if (!irqchip_in_kernel(v->kvm))
> +	if (!lapic_in_kernel(v->kvm))
>  		return v->arch.interrupt.pending;
>  
>  	if (kvm_cpu_has_extint(v))
> @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
>  {
>  	int vector;
>  
> -	if (!irqchip_in_kernel(v->kvm))
> +	if (!lapic_in_kernel(v->kvm))
>  		return v->arch.interrupt.nr;
>  
>  	vector = kvm_cpu_get_extint(v);
> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> index ad68c73..e46abf3 100644
> --- a/arch/x86/kvm/irq.h
> +++ b/arch/x86/kvm/irq.h
> @@ -92,6 +92,17 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
>  	return ret;
>  }
>  
> +static inline int irqchip_split(struct kvm *kvm)
> +{
> +	return kvm->arch.irqchip_split;
> +}
> +
> +static inline int lapic_in_kernel(struct kvm *kvm)
> +{
> +	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
> +}
> +
> +
>  void kvm_pic_reset(struct kvm_kpic_state *s);
>  
>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> index 9efff9e..f43c59a 100644
> --- a/arch/x86/kvm/irq_comm.c
> +++ b/arch/x86/kvm/irq_comm.c
> @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
>  	return kvm_set_irq_routing(kvm, default_routing,
>  				   ARRAY_SIZE(default_routing), 0);
>  }
> +
> +static const struct kvm_irq_routing_entry empty_routing[] = {};
> +
> +int kvm_setup_empty_irq_routing(struct kvm *kvm)
> +{
> +	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
> +}
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index dc5b57b..bc392a6 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -209,7 +209,8 @@ out:
>  	if (old)
>  		kfree_rcu(old, rcu);
>  
> -	kvm_vcpu_request_scan_ioapic(kvm);
> +	if (!irqchip_split(kvm))
> +		kvm_vcpu_request_scan_ioapic(kvm);
>  }
>  
>  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> @@ -1819,7 +1820,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
>  		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
>  				apic_find_highest_isr(apic));
>  	kvm_make_request(KVM_REQ_EVENT, vcpu);
> -	kvm_rtc_eoi_tracking_restore_one(vcpu);
> +	if (!irqchip_split(vcpu->kvm))
> +		kvm_rtc_eoi_tracking_restore_one(vcpu);
>  }
>  
>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> @@ -1902,7 +1904,8 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
>  	    /* Cache not set: could be safe but we don't bother. */
>  	    apic->highest_isr_cache == -1 ||
>  	    /* Need EOI to update ioapic. */
> -	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
> +	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache) ||
> +	    irqchip_split(vcpu->kvm)) {
>  		/*
>  		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
>  		 * so we need not do anything here.
> @@ -1958,7 +1961,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u32 reg = (msr - APIC_BASE_MSR) << 4;
>  
> -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
>  		return 1;
>  
>  	if (reg == APIC_ICR2)
> @@ -1975,7 +1978,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
>  
> -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
>  		return 1;
>  
>  	if (reg == APIC_DFR || reg == APIC_ICR2) {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index b78e83f..e5bf6db 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3465,7 +3465,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>  
>  static bool can_do_async_pf(struct kvm_vcpu *vcpu)
>  {
> -	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
> +	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
>  		     kvm_event_needs_reinjection(vcpu)))
>  		return false;
>  
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 8954d7a..7ac9ec2 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3054,7 +3054,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
>  	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
>  	/* instruction emulation calls kvm_set_cr8() */
>  	r = cr_interception(svm);
> -	if (irqchip_in_kernel(svm->vcpu.kvm))
> +	if (lapic_in_kernel(svm->vcpu.kvm))
>  		return r;
>  	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
>  		return r;
> @@ -3295,7 +3295,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
>  	 * If the user space waits to inject interrupts, exit as soon as
>  	 * possible
>  	 */
> -	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
> +	if (!lapic_in_kernel(svm->vcpu.kvm) &&
>  	    kvm_run->request_interrupt_window &&
>  	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
>  		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index bcb61b0..a53747f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -948,7 +948,7 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
>  
>  static inline bool vm_need_tpr_shadow(struct kvm *kvm)
>  {
> -	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
> +	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
>  }
>  
>  static inline bool cpu_has_secondary_exec_ctrls(void)
> @@ -1064,7 +1064,7 @@ static inline bool cpu_has_vmx_ple(void)
>  
>  static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
>  {
> -	return flexpriority_enabled && irqchip_in_kernel(kvm);
> +	return flexpriority_enabled && lapic_in_kernel(kvm);
>  }
>  
>  static inline bool cpu_has_vmx_vpid(void)
> @@ -4341,7 +4341,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
>  
>  static int vmx_vm_has_apicv(struct kvm *kvm)
>  {
> -	return enable_apicv && irqchip_in_kernel(kvm);
> +	return enable_apicv && lapic_in_kernel(kvm);
>  }
>  
>  static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
> @@ -5317,7 +5317,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
>  				u8 cr8 = (u8)val;
>  				err = kvm_set_cr8(vcpu, cr8);
>  				kvm_complete_insn_gp(vcpu, err);
> -				if (irqchip_in_kernel(vcpu->kvm))
> +				if (lapic_in_kernel(vcpu->kvm))
>  					return 1;
>  				if (cr8_prev <= cr8)
>  					return 1;
> @@ -5534,7 +5534,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
>  	 * If the user space waits to inject interrupts, exit as soon as
>  	 * possible
>  	 */
> -	if (!irqchip_in_kernel(vcpu->kvm) &&
> +	if (!lapic_in_kernel(vcpu->kvm) &&
>  	    vcpu->run->request_interrupt_window &&
>  	    !kvm_cpu_has_interrupt(vcpu)) {
>  		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> @@ -9419,7 +9419,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>  	 * emulated by vmx_set_efer(), below.
>  	 */
> -	vm_entry_controls_init(vmx, 
> +	vm_entry_controls_init(vmx,
>  		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
>  			~VM_ENTRY_IA32E_MODE) |
>  		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cde5d61..7505b39 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -782,7 +782,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
>  {
>  	if (cr8 & CR8_RESERVED_BITS)
>  		return 1;
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		kvm_lapic_set_tpr(vcpu, cr8);
>  	else
>  		vcpu->arch.cr8 = cr8;
> @@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
>  
>  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
>  {
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		return kvm_lapic_get_cr8(vcpu);
>  	else
>  		return vcpu->arch.cr8;
> @@ -2800,6 +2800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_TSC_DEADLINE_TIMER:
>  	case KVM_CAP_ENABLE_CAP_VM:
>  	case KVM_CAP_DISABLE_QUIRKS:
> +	case KVM_CAP_SPLIT_IRQCHIP:
>  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
>  	case KVM_CAP_ASSIGN_DEV_IRQ:
>  	case KVM_CAP_PCI_2_3:
> @@ -3002,7 +3003,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
>  {
>  	if (irq->irq >= KVM_NR_INTERRUPTS)
>  		return -EINVAL;
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		return -ENXIO;
>  
>  	kvm_queue_interrupt(vcpu, irq->irq, false);
> @@ -3480,7 +3481,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		struct kvm_vapic_addr va;
>  
>  		r = -EINVAL;
> -		if (!irqchip_in_kernel(vcpu->kvm))
> +		if (!lapic_in_kernel(vcpu->kvm))
>  			goto out;
>  		r = -EFAULT;
>  		if (copy_from_user(&va, argp, sizeof va))
> @@ -3838,7 +3839,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
>  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
>  			bool line_status)
>  {
> -	if (!irqchip_in_kernel(kvm))
> +	if (!lapic_in_kernel(kvm))
>  		return -ENXIO;
>  
>  	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> @@ -4128,6 +4129,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
>  		break;
>  	}
> +	case KVM_CREATE_SPLIT_IRQCHIP: {
> +		mutex_lock(&kvm->lock);
> +		r = -EEXIST;
> +		if (lapic_in_kernel(kvm))
> +			goto split_irqchip_unlock;
> +		r = -EINVAL;
> +		if (atomic_read(&kvm->online_vcpus))
> +			goto split_irqchip_unlock;
> +		r = kvm_setup_empty_irq_routing(kvm);
> +		if (r)
> +			goto split_irqchip_unlock;
> +		kvm->arch.irqchip_split = true;
> +		r = 0;
> +split_irqchip_unlock:
> +		mutex_unlock(&kvm->lock);
> +		break;
> +	}
> +
>  	default:
>  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
>  	}
> @@ -5893,7 +5912,7 @@ void kvm_arch_exit(void)
>  int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
>  {
>  	++vcpu->stat.halt_exits;
> -	if (irqchip_in_kernel(vcpu->kvm)) {
> +	if (lapic_in_kernel(vcpu->kvm)) {
>  		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
>  		return 1;
>  	} else {
> @@ -6060,7 +6079,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
>   */
>  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
>  {
> -	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
> +	return (!lapic_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
>  		vcpu->run->request_interrupt_window &&
>  		kvm_arch_interrupt_allowed(vcpu));
>  }
> @@ -6072,7 +6091,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
>  	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
>  	kvm_run->cr8 = kvm_get_cr8(vcpu);
>  	kvm_run->apic_base = kvm_get_apic_base(vcpu);
> -	if (irqchip_in_kernel(vcpu->kvm))
> +	if (lapic_in_kernel(vcpu->kvm))
>  		kvm_run->ready_for_interrupt_injection = 1;
>  	else
>  		kvm_run->ready_for_interrupt_injection =
> @@ -6219,7 +6238,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
>  {
>  	struct page *page = NULL;
>  
> -	if (!irqchip_in_kernel(vcpu->kvm))
> +	if (!lapic_in_kernel(vcpu->kvm))
>  		return;
>  
>  	if (!kvm_x86_ops->set_apic_access_page_addr)
> @@ -6255,7 +6274,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
>  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  {
>  	int r;
> -	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
> +	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
>  		vcpu->run->request_interrupt_window;
>  	bool req_immediate_exit = false;
>  
> @@ -6644,7 +6663,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  	}
>  
>  	/* re-sync apic's tpr */
> -	if (!irqchip_in_kernel(vcpu->kvm)) {
> +	if (!lapic_in_kernel(vcpu->kvm)) {
>  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
>  			r = -EINVAL;
>  			goto out;
> @@ -7340,7 +7359,7 @@ void kvm_arch_check_processor_compat(void *rtn)
>  
>  bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
>  {
> -	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> +	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
>  }
>  
>  struct static_key kvm_no_apic_vcpu __read_mostly;
> @@ -7356,7 +7375,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>  
>  	vcpu->arch.pv.pv_unhalted = false;
>  	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
> -	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> +	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
>  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>  	else
>  		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
> @@ -7374,7 +7393,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>  	if (r < 0)
>  		goto fail_free_pio_data;
>  
> -	if (irqchip_in_kernel(kvm)) {
> +	if (lapic_in_kernel(kvm)) {
>  		r = kvm_create_lapic(vcpu);
>  		if (r < 0)
>  			goto fail_mmu_destroy;
> @@ -7437,7 +7456,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  	kvm_mmu_destroy(vcpu);
>  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
>  	free_page((unsigned long)vcpu->arch.pio_data);
> -	if (!irqchip_in_kernel(vcpu->kvm))
> +	if (!lapic_in_kernel(vcpu->kvm))
>  		static_key_slow_dec(&kvm_no_apic_vcpu);
>  }
>  
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index 133ea00..ffe1f4e 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -329,6 +329,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
>  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
>  
>  #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
> +#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
>  #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
>  #define vgic_ready(k)		((k)->arch.vgic.ready)
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 87fd74a..277b7a1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -934,6 +934,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
>  #endif
>  
>  int kvm_setup_default_irq_routing(struct kvm *kvm);
> +int kvm_setup_empty_irq_routing(struct kvm *kvm);
>  int kvm_set_irq_routing(struct kvm *kvm,
>  			const struct kvm_irq_routing_entry *entries,
>  			unsigned nr,
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 75bd9f7..7d06dc4 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_S390_IRQ_STATE 114
>  #define KVM_CAP_PPC_HWRNG 115
>  #define KVM_CAP_DISABLE_QUIRKS 116
> +#define KVM_CAP_SPLIT_IRQCHIP 117
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
>  /* Available with KVM_CAP_S390_IRQ_STATE */
>  #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
>  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
> +/* Available with KVM_CAP_SPLIT_IRQCHIP */
> +#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
>  
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
>  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 1d56a90..8aaceed 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -73,7 +73,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
>  {
>  	struct kvm_kernel_irq_routing_entry route;
>  
> -	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
> +	if (!lapic_in_kernel(kvm) || msi->flags != 0)
>  		return -EINVAL;
>  
>  	route.msi.address_lo = msi->address_lo;
> 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  6:12   ` Jan Kiszka
@ 2015-05-13  8:04     ` Paolo Bonzini
  2015-05-13  8:10       ` Jan Kiszka
                         ` (2 more replies)
  0 siblings, 3 replies; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-13  8:04 UTC (permalink / raw)
  To: Jan Kiszka, Steve Rutherford, kvm; +Cc: ahonig



On 13/05/2015 08:12, Jan Kiszka wrote:
>> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
>> +{
>> +	struct kvm *kvm = vcpu->kvm;
>> +	struct kvm_kernel_irq_routing_entry *entry;
>> +	struct kvm_irq_routing_table *table;
>> +	u32 i, nr_rt_entries;
>> +
>> +	mutex_lock(&kvm->irq_lock);

This only needs irq_srcu protection, not irq_lock, so the lookup cost
becomes much smaller (all CPUs can proceed in parallel).

You would need to put an smp_mb here, to ensure that irq_routing is read
after KVM_SCAN_IOAPIC is cleared.  You can introduce
smb_mb__after_srcu_read_lock in order to elide it.

The matching memory barrier would be a smp_mb__before_atomic in
kvm_make_scan_ioapic_request.

>> +	table = kvm->irq_routing;
>> +	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
>> +	for (i = 0; i < nr_rt_entries; ++i) {
>> +		hlist_for_each_entry(entry, &table->map[i], link) {
>> +			u32 dest_id, dest_mode;
>> +
>> +			if (entry->type != KVM_IRQ_ROUTING_MSI)
>> +				continue;
>> +			dest_id = (entry->msi.address_lo >> 12) & 0xff;
>> +			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
>> +			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
>> +						dest_mode)) {
>> +				u32 vector = entry->msi.data & 0xff;
>> +
>> +				__set_bit(vector,
>> +					  (unsigned long *) eoi_exit_bitmap);
>> +			}
>> +		}
>> +	}
>> +	mutex_unlock(&kvm->irq_lock);
>> +}
>>
> 
> This looks a bit frightening regarding the lookup costs. Do we really
> have to run through the complete routing table to find the needed
> information? There can be way more "real" MSI entries than IOAPIC pins.

It does at most IOAPIC_NUM_PINS iterations however.

> There can even be multiple IOAPICs (thanks to your patches overcoming
> the single in-kernel instance).

With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means
that the above loop is broken for multiple IOAPICs.

But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
the number of IOAPIC routes that will cause EOI exits?

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  8:04     ` Paolo Bonzini
@ 2015-05-13  8:10       ` Jan Kiszka
  2015-05-13  9:24         ` Paolo Bonzini
  2015-05-13 22:21       ` Steve Rutherford
  2015-05-15  2:38       ` Steve Rutherford
  2 siblings, 1 reply; 41+ messages in thread
From: Jan Kiszka @ 2015-05-13  8:10 UTC (permalink / raw)
  To: Paolo Bonzini, Steve Rutherford, kvm; +Cc: ahonig

On 2015-05-13 10:04, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 08:12, Jan Kiszka wrote:
>>> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
>>> +{
>>> +	struct kvm *kvm = vcpu->kvm;
>>> +	struct kvm_kernel_irq_routing_entry *entry;
>>> +	struct kvm_irq_routing_table *table;
>>> +	u32 i, nr_rt_entries;
>>> +
>>> +	mutex_lock(&kvm->irq_lock);
> 
> This only needs irq_srcu protection, not irq_lock, so the lookup cost
> becomes much smaller (all CPUs can proceed in parallel).
> 
> You would need to put an smp_mb here, to ensure that irq_routing is read
> after KVM_SCAN_IOAPIC is cleared.  You can introduce
> smb_mb__after_srcu_read_lock in order to elide it.
> 
> The matching memory barrier would be a smp_mb__before_atomic in
> kvm_make_scan_ioapic_request.
> 
>>> +	table = kvm->irq_routing;
>>> +	nr_rt_entries = min_t(u32, table->nr_rt_entries, IOAPIC_NUM_PINS);
>>> +	for (i = 0; i < nr_rt_entries; ++i) {
>>> +		hlist_for_each_entry(entry, &table->map[i], link) {
>>> +			u32 dest_id, dest_mode;
>>> +
>>> +			if (entry->type != KVM_IRQ_ROUTING_MSI)
>>> +				continue;
>>> +			dest_id = (entry->msi.address_lo >> 12) & 0xff;
>>> +			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
>>> +			if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
>>> +						dest_mode)) {
>>> +				u32 vector = entry->msi.data & 0xff;
>>> +
>>> +				__set_bit(vector,
>>> +					  (unsigned long *) eoi_exit_bitmap);
>>> +			}
>>> +		}
>>> +	}
>>> +	mutex_unlock(&kvm->irq_lock);
>>> +}
>>>
>>
>> This looks a bit frightening regarding the lookup costs. Do we really
>> have to run through the complete routing table to find the needed
>> information? There can be way more "real" MSI entries than IOAPIC pins.
> 
> It does at most IOAPIC_NUM_PINS iterations however.
> 
>> There can even be multiple IOAPICs (thanks to your patches overcoming
>> the single in-kernel instance).
> 
> With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means

I don't think that the number of pins per IOAPIC increases. At least not
in the devices I've seen so far.

> that the above loop is broken for multiple IOAPICs.

The worst case remains #IOAPIC * 24 iterations - if we have means to
stop after the IOAPIC entries, not iterating over all routes.

> 
> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
> the number of IOAPIC routes that will cause EOI exits?

And you need to ensure that their routes can be found in the table
directly. Given IOAPIC hotplug, that may not be the first ones there...

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  8:10       ` Jan Kiszka
@ 2015-05-13  9:24         ` Paolo Bonzini
  2015-05-13 10:25           ` Jan Kiszka
  0 siblings, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-13  9:24 UTC (permalink / raw)
  To: Jan Kiszka, Steve Rutherford, kvm; +Cc: ahonig



On 13/05/2015 10:10, Jan Kiszka wrote:
>>> >> There can even be multiple IOAPICs (thanks to your patches overcoming
>>> >> the single in-kernel instance).
>> > 
>> > With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means
> I don't think that the number of pins per IOAPIC increases. At least not
> in the devices I've seen so far.

Sorry, that was supposed to be "more than 24 GSIs for the IOAPICs".

>> > that the above loop is broken for multiple IOAPICs.
> The worst case remains #IOAPIC * 24 iterations - if we have means to
> stop after the IOAPIC entries, not iterating over all routes.

Yes.  Which is not too bad if VCPUs can process it in parallel.

>> > But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>> > the number of IOAPIC routes that will cause EOI exits?
> And you need to ensure that their routes can be found in the table
> directly. Given IOAPIC hotplug, that may not be the first ones there...

Can you reserve a bunch of GSIs at the beginning of the GSI space, and
use rt->map[] to access them and build the EOI exit bitmap?

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  9:24         ` Paolo Bonzini
@ 2015-05-13 10:25           ` Jan Kiszka
  2015-05-13 13:04             ` Paolo Bonzini
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Kiszka @ 2015-05-13 10:25 UTC (permalink / raw)
  To: Paolo Bonzini, Steve Rutherford, kvm; +Cc: ahonig

On 2015-05-13 11:24, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 10:10, Jan Kiszka wrote:
>>>>>> There can even be multiple IOAPICs (thanks to your patches overcoming
>>>>>> the single in-kernel instance).
>>>>
>>>> With multiple IOAPICs you have more than 24 GSIs per IOAPIC.  That means
>> I don't think that the number of pins per IOAPIC increases. At least not
>> in the devices I've seen so far.
> 
> Sorry, that was supposed to be "more than 24 GSIs for the IOAPICs".
> 
>>>> that the above loop is broken for multiple IOAPICs.
>> The worst case remains #IOAPIC * 24 iterations - if we have means to
>> stop after the IOAPIC entries, not iterating over all routes.
> 
> Yes.  Which is not too bad if VCPUs can process it in parallel.
> 
>>>> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>>>> the number of IOAPIC routes that will cause EOI exits?
>> And you need to ensure that their routes can be found in the table
>> directly. Given IOAPIC hotplug, that may not be the first ones there...
> 
> Can you reserve a bunch of GSIs at the beginning of the GSI space, and
> use rt->map[] to access them and build the EOI exit bitmap?

Ideally, userspace could give the kernel a hint where to look. It has a
copy of the routing table and manages it, no?

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13 10:25           ` Jan Kiszka
@ 2015-05-13 13:04             ` Paolo Bonzini
  2015-05-13 13:19               ` Jan Kiszka
  0 siblings, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-13 13:04 UTC (permalink / raw)
  To: Jan Kiszka, Steve Rutherford, kvm; +Cc: ahonig



On 13/05/2015 12:25, Jan Kiszka wrote:
>>>> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>>>> the number of IOAPIC routes that will cause EOI exits?
>>> 
>>> And you need to ensure that their routes can be found in the table
>>> directly. Given IOAPIC hotplug, that may not be the first ones there...
>> 
>> Can you reserve a bunch of GSIs at the beginning of the GSI space, and
>> use rt->map[] to access them and build the EOI exit bitmap?
> 
> Ideally, userspace could give the kernel a hint where to look. It has a
> copy of the routing table and manages it, no?

Well, reserving space at the beginning of the table (and communicating
the size via KVM_ENABLE_CAP is a way to "give the kernel a hint where to
look".  Perhaps not the best, but simple and supports hotplug to some
extent.

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13 13:04             ` Paolo Bonzini
@ 2015-05-13 13:19               ` Jan Kiszka
  0 siblings, 0 replies; 41+ messages in thread
From: Jan Kiszka @ 2015-05-13 13:19 UTC (permalink / raw)
  To: Paolo Bonzini, Steve Rutherford, kvm; +Cc: ahonig

On 2015-05-13 15:04, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 12:25, Jan Kiszka wrote:
>>>>> But perhaps when enabling KVM_SPLIT_IRQCHIP we can use args[0] to pass
>>>>> the number of IOAPIC routes that will cause EOI exits?
>>>>
>>>> And you need to ensure that their routes can be found in the table
>>>> directly. Given IOAPIC hotplug, that may not be the first ones there...
>>>
>>> Can you reserve a bunch of GSIs at the beginning of the GSI space, and
>>> use rt->map[] to access them and build the EOI exit bitmap?
>>
>> Ideally, userspace could give the kernel a hint where to look. It has a
>> copy of the routing table and manages it, no?
> 
> Well, reserving space at the beginning of the table (and communicating
> the size via KVM_ENABLE_CAP is a way to "give the kernel a hint where to
> look".  Perhaps not the best, but simple and supports hotplug to some
> extent.

But then we need at least a way to differentiate between IOAPIC and MSI
routes so that the loop can actually stop when hitting the first
non-IOAPIC entry. Right now, this is not possible. But even that would
be a kind of ugly interface.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-13  7:57 ` [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Paolo Bonzini
@ 2015-05-13 22:10   ` Steve Rutherford
  2015-05-14  9:12     ` Wu, Feng
  0 siblings, 1 reply; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13 22:10 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, ahonig

On Wed, May 13, 2015 at 09:57:00AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 03:47, Steve Rutherford wrote:
> > First patch in a series which enables the relocation of the
> > PIC/IOAPIC/PIT to userspace.
> > 
> > Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.
> > 
> > KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
> > of the irqchip.
> 
> The ioctl name in the code is KVM_CREATE_SPLIT_IRQCHIP, but it doesn't
> seem to create a local APIC.
> 

 This enables the creation of LAPICs every time a VCPU is created, which I guess is different. I'll change the language to reflect that.

> > Compile tested for x86.
> 
> The capability is fine.  However, instead of introducing a new ioctl,
> you can use KVM_CAP_ENABLE_CAP_VM and enable the capability once
> (per-VM) using KVM_ENABLE_CAP.  The enabling code is basically the same
> you have in your code.
> 

ENABLE_CAP seems like a better way to go. I'll update this to use it instead of the IOCTL. 

Steve
> Paolo
> 
> > Signed-off-by: Steve Rutherford <srutherford@google.com>
> > Suggested-by: Andrew Honig <ahonig@google.com>
> > ---
> >  Documentation/virtual/kvm/api.txt | 15 ++++++++++++
> >  arch/powerpc/kvm/irq.h            |  5 ++++
> >  arch/s390/kvm/irq.h               |  4 ++++
> >  arch/x86/include/asm/kvm_host.h   |  2 ++
> >  arch/x86/kvm/assigned-dev.c       |  4 ++--
> >  arch/x86/kvm/irq.c                |  6 ++---
> >  arch/x86/kvm/irq.h                | 11 +++++++++
> >  arch/x86/kvm/irq_comm.c           |  7 ++++++
> >  arch/x86/kvm/lapic.c              | 13 +++++++----
> >  arch/x86/kvm/mmu.c                |  2 +-
> >  arch/x86/kvm/svm.c                |  4 ++--
> >  arch/x86/kvm/vmx.c                | 12 +++++-----
> >  arch/x86/kvm/x86.c                | 49 +++++++++++++++++++++++++++------------
> >  include/kvm/arm_vgic.h            |  1 +
> >  include/linux/kvm_host.h          |  1 +
> >  include/uapi/linux/kvm.h          |  3 +++
> >  virt/kvm/irqchip.c                |  2 +-
> >  17 files changed, 106 insertions(+), 35 deletions(-)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index 6955444..0744b4e 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -2979,6 +2979,21 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
> >  and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
> >  which is the maximum number of possibly pending cpu-local interrupts.
> >  
> > +4.96 KVM_SPLIT_IRQCHIP
> > +
> > +Capability: KVM_CAP_SPLIT_IRQCHIP
> > +Architectures: x86
> > +Type:  VM ioctl
> > +Parameters: None
> > +Returns: 0 on success, -1 on error
> > +
> > +Create a local apic for each processor in the kernel.  This differs from
> > +KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates neither
> > +the ioapic nor the pic in the kernel. Also, enables in kernel routing of
> > +interrupt requests. Fails if VCPU has already been created, or if the irqchip is
> > +already in the kernel.
> > +
> > +
> >  5. The kvm_run structure
> >  ------------------------
> >  
> > diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> > index 5a9a10b..5e6fa06 100644
> > --- a/arch/powerpc/kvm/irq.h
> > +++ b/arch/powerpc/kvm/irq.h
> > @@ -17,4 +17,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> >  	return ret;
> >  }
> >  
> > +static inline int lapic_in_kernel(struct kvm *kvm)
> > +{
> > +	return irqchip_in_kernel(kvm);
> > +}
> > +
> >  #endif
> > diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
> > index d98e415..db876c3 100644
> > --- a/arch/s390/kvm/irq.h
> > +++ b/arch/s390/kvm/irq.h
> > @@ -19,4 +19,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> >  	return 1;
> >  }
> >  
> > +static inline int lapic_in_kernel(struct kvm *kvm)
> > +{
> > +	return irqchip_in_kernel(kvm);
> > +}
> >  #endif
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index bbb8f4e..3ddc134 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -638,6 +638,8 @@ struct kvm_arch {
> >  	bool boot_vcpu_runs_old_kvmclock;
> >  
> >  	u64 disabled_quirks;
> > +
> > +	bool irqchip_split;
> >  };
> >  
> >  struct kvm_vm_stat {
> > diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
> > index d090ecf..1237e92 100644
> > --- a/arch/x86/kvm/assigned-dev.c
> > +++ b/arch/x86/kvm/assigned-dev.c
> > @@ -291,7 +291,7 @@ static int kvm_deassign_irq(struct kvm *kvm,
> >  {
> >  	unsigned long guest_irq_type, host_irq_type;
> >  
> > -	if (!irqchip_in_kernel(kvm))
> > +	if (!lapic_in_kernel(kvm))
> >  		return -EINVAL;
> >  	/* no irq assignment to deassign */
> >  	if (!assigned_dev->irq_requested_type)
> > @@ -568,7 +568,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
> >  	struct kvm_assigned_dev_kernel *match;
> >  	unsigned long host_irq_type, guest_irq_type;
> >  
> > -	if (!irqchip_in_kernel(kvm))
> > +	if (!lapic_in_kernel(kvm))
> >  		return r;
> >  
> >  	mutex_lock(&kvm->lock);
> > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> > index a1ec6a50..706e47a 100644
> > --- a/arch/x86/kvm/irq.c
> > +++ b/arch/x86/kvm/irq.c
> > @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
> >   */
> >  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> >  {
> > -	if (!irqchip_in_kernel(v->kvm))
> > +	if (!lapic_in_kernel(v->kvm))
> >  		return v->arch.interrupt.pending;
> >  
> >  	if (kvm_cpu_has_extint(v))
> > @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> >   */
> >  int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
> >  {
> > -	if (!irqchip_in_kernel(v->kvm))
> > +	if (!lapic_in_kernel(v->kvm))
> >  		return v->arch.interrupt.pending;
> >  
> >  	if (kvm_cpu_has_extint(v))
> > @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
> >  {
> >  	int vector;
> >  
> > -	if (!irqchip_in_kernel(v->kvm))
> > +	if (!lapic_in_kernel(v->kvm))
> >  		return v->arch.interrupt.nr;
> >  
> >  	vector = kvm_cpu_get_extint(v);
> > diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> > index ad68c73..e46abf3 100644
> > --- a/arch/x86/kvm/irq.h
> > +++ b/arch/x86/kvm/irq.h
> > @@ -92,6 +92,17 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> >  	return ret;
> >  }
> >  
> > +static inline int irqchip_split(struct kvm *kvm)
> > +{
> > +	return kvm->arch.irqchip_split;
> > +}
> > +
> > +static inline int lapic_in_kernel(struct kvm *kvm)
> > +{
> > +	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
> > +}
> > +
> > +
> >  void kvm_pic_reset(struct kvm_kpic_state *s);
> >  
> >  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
> > diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> > index 9efff9e..f43c59a 100644
> > --- a/arch/x86/kvm/irq_comm.c
> > +++ b/arch/x86/kvm/irq_comm.c
> > @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
> >  	return kvm_set_irq_routing(kvm, default_routing,
> >  				   ARRAY_SIZE(default_routing), 0);
> >  }
> > +
> > +static const struct kvm_irq_routing_entry empty_routing[] = {};
> > +
> > +int kvm_setup_empty_irq_routing(struct kvm *kvm)
> > +{
> > +	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
> > +}
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index dc5b57b..bc392a6 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -209,7 +209,8 @@ out:
> >  	if (old)
> >  		kfree_rcu(old, rcu);
> >  
> > -	kvm_vcpu_request_scan_ioapic(kvm);
> > +	if (!irqchip_split(kvm))
> > +		kvm_vcpu_request_scan_ioapic(kvm);
> >  }
> >  
> >  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> > @@ -1819,7 +1820,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
> >  		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
> >  				apic_find_highest_isr(apic));
> >  	kvm_make_request(KVM_REQ_EVENT, vcpu);
> > -	kvm_rtc_eoi_tracking_restore_one(vcpu);
> > +	if (!irqchip_split(vcpu->kvm))
> > +		kvm_rtc_eoi_tracking_restore_one(vcpu);
> >  }
> >  
> >  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> > @@ -1902,7 +1904,8 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
> >  	    /* Cache not set: could be safe but we don't bother. */
> >  	    apic->highest_isr_cache == -1 ||
> >  	    /* Need EOI to update ioapic. */
> > -	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
> > +	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache) ||
> > +	    irqchip_split(vcpu->kvm)) {
> >  		/*
> >  		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
> >  		 * so we need not do anything here.
> > @@ -1958,7 +1961,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >  	struct kvm_lapic *apic = vcpu->arch.apic;
> >  	u32 reg = (msr - APIC_BASE_MSR) << 4;
> >  
> > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> >  		return 1;
> >  
> >  	if (reg == APIC_ICR2)
> > @@ -1975,7 +1978,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
> >  	struct kvm_lapic *apic = vcpu->arch.apic;
> >  	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
> >  
> > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> >  		return 1;
> >  
> >  	if (reg == APIC_DFR || reg == APIC_ICR2) {
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index b78e83f..e5bf6db 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3465,7 +3465,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> >  
> >  static bool can_do_async_pf(struct kvm_vcpu *vcpu)
> >  {
> > -	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
> > +	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
> >  		     kvm_event_needs_reinjection(vcpu)))
> >  		return false;
> >  
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 8954d7a..7ac9ec2 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -3054,7 +3054,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
> >  	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
> >  	/* instruction emulation calls kvm_set_cr8() */
> >  	r = cr_interception(svm);
> > -	if (irqchip_in_kernel(svm->vcpu.kvm))
> > +	if (lapic_in_kernel(svm->vcpu.kvm))
> >  		return r;
> >  	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
> >  		return r;
> > @@ -3295,7 +3295,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
> >  	 * If the user space waits to inject interrupts, exit as soon as
> >  	 * possible
> >  	 */
> > -	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
> > +	if (!lapic_in_kernel(svm->vcpu.kvm) &&
> >  	    kvm_run->request_interrupt_window &&
> >  	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
> >  		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index bcb61b0..a53747f 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -948,7 +948,7 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
> >  
> >  static inline bool vm_need_tpr_shadow(struct kvm *kvm)
> >  {
> > -	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
> > +	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
> >  }
> >  
> >  static inline bool cpu_has_secondary_exec_ctrls(void)
> > @@ -1064,7 +1064,7 @@ static inline bool cpu_has_vmx_ple(void)
> >  
> >  static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
> >  {
> > -	return flexpriority_enabled && irqchip_in_kernel(kvm);
> > +	return flexpriority_enabled && lapic_in_kernel(kvm);
> >  }
> >  
> >  static inline bool cpu_has_vmx_vpid(void)
> > @@ -4341,7 +4341,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
> >  
> >  static int vmx_vm_has_apicv(struct kvm *kvm)
> >  {
> > -	return enable_apicv && irqchip_in_kernel(kvm);
> > +	return enable_apicv && lapic_in_kernel(kvm);
> >  }
> >  
> >  static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
> > @@ -5317,7 +5317,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
> >  				u8 cr8 = (u8)val;
> >  				err = kvm_set_cr8(vcpu, cr8);
> >  				kvm_complete_insn_gp(vcpu, err);
> > -				if (irqchip_in_kernel(vcpu->kvm))
> > +				if (lapic_in_kernel(vcpu->kvm))
> >  					return 1;
> >  				if (cr8_prev <= cr8)
> >  					return 1;
> > @@ -5534,7 +5534,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
> >  	 * If the user space waits to inject interrupts, exit as soon as
> >  	 * possible
> >  	 */
> > -	if (!irqchip_in_kernel(vcpu->kvm) &&
> > +	if (!lapic_in_kernel(vcpu->kvm) &&
> >  	    vcpu->run->request_interrupt_window &&
> >  	    !kvm_cpu_has_interrupt(vcpu)) {
> >  		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > @@ -9419,7 +9419,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> >  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
> >  	 * emulated by vmx_set_efer(), below.
> >  	 */
> > -	vm_entry_controls_init(vmx, 
> > +	vm_entry_controls_init(vmx,
> >  		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
> >  			~VM_ENTRY_IA32E_MODE) |
> >  		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index cde5d61..7505b39 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -782,7 +782,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
> >  {
> >  	if (cr8 & CR8_RESERVED_BITS)
> >  		return 1;
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		kvm_lapic_set_tpr(vcpu, cr8);
> >  	else
> >  		vcpu->arch.cr8 = cr8;
> > @@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
> >  
> >  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
> >  {
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		return kvm_lapic_get_cr8(vcpu);
> >  	else
> >  		return vcpu->arch.cr8;
> > @@ -2800,6 +2800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >  	case KVM_CAP_TSC_DEADLINE_TIMER:
> >  	case KVM_CAP_ENABLE_CAP_VM:
> >  	case KVM_CAP_DISABLE_QUIRKS:
> > +	case KVM_CAP_SPLIT_IRQCHIP:
> >  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
> >  	case KVM_CAP_ASSIGN_DEV_IRQ:
> >  	case KVM_CAP_PCI_2_3:
> > @@ -3002,7 +3003,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
> >  {
> >  	if (irq->irq >= KVM_NR_INTERRUPTS)
> >  		return -EINVAL;
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		return -ENXIO;
> >  
> >  	kvm_queue_interrupt(vcpu, irq->irq, false);
> > @@ -3480,7 +3481,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> >  		struct kvm_vapic_addr va;
> >  
> >  		r = -EINVAL;
> > -		if (!irqchip_in_kernel(vcpu->kvm))
> > +		if (!lapic_in_kernel(vcpu->kvm))
> >  			goto out;
> >  		r = -EFAULT;
> >  		if (copy_from_user(&va, argp, sizeof va))
> > @@ -3838,7 +3839,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
> >  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
> >  			bool line_status)
> >  {
> > -	if (!irqchip_in_kernel(kvm))
> > +	if (!lapic_in_kernel(kvm))
> >  		return -ENXIO;
> >  
> >  	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
> > @@ -4128,6 +4129,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >  		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
> >  		break;
> >  	}
> > +	case KVM_CREATE_SPLIT_IRQCHIP: {
> > +		mutex_lock(&kvm->lock);
> > +		r = -EEXIST;
> > +		if (lapic_in_kernel(kvm))
> > +			goto split_irqchip_unlock;
> > +		r = -EINVAL;
> > +		if (atomic_read(&kvm->online_vcpus))
> > +			goto split_irqchip_unlock;
> > +		r = kvm_setup_empty_irq_routing(kvm);
> > +		if (r)
> > +			goto split_irqchip_unlock;
> > +		kvm->arch.irqchip_split = true;
> > +		r = 0;
> > +split_irqchip_unlock:
> > +		mutex_unlock(&kvm->lock);
> > +		break;
> > +	}
> > +
> >  	default:
> >  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
> >  	}
> > @@ -5893,7 +5912,7 @@ void kvm_arch_exit(void)
> >  int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
> >  {
> >  	++vcpu->stat.halt_exits;
> > -	if (irqchip_in_kernel(vcpu->kvm)) {
> > +	if (lapic_in_kernel(vcpu->kvm)) {
> >  		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
> >  		return 1;
> >  	} else {
> > @@ -6060,7 +6079,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
> >   */
> >  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
> >  {
> > -	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
> > +	return (!lapic_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
> >  		vcpu->run->request_interrupt_window &&
> >  		kvm_arch_interrupt_allowed(vcpu));
> >  }
> > @@ -6072,7 +6091,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
> >  	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
> >  	kvm_run->cr8 = kvm_get_cr8(vcpu);
> >  	kvm_run->apic_base = kvm_get_apic_base(vcpu);
> > -	if (irqchip_in_kernel(vcpu->kvm))
> > +	if (lapic_in_kernel(vcpu->kvm))
> >  		kvm_run->ready_for_interrupt_injection = 1;
> >  	else
> >  		kvm_run->ready_for_interrupt_injection =
> > @@ -6219,7 +6238,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
> >  {
> >  	struct page *page = NULL;
> >  
> > -	if (!irqchip_in_kernel(vcpu->kvm))
> > +	if (!lapic_in_kernel(vcpu->kvm))
> >  		return;
> >  
> >  	if (!kvm_x86_ops->set_apic_access_page_addr)
> > @@ -6255,7 +6274,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
> >  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  {
> >  	int r;
> > -	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
> > +	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
> >  		vcpu->run->request_interrupt_window;
> >  	bool req_immediate_exit = false;
> >  
> > @@ -6644,7 +6663,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> >  	}
> >  
> >  	/* re-sync apic's tpr */
> > -	if (!irqchip_in_kernel(vcpu->kvm)) {
> > +	if (!lapic_in_kernel(vcpu->kvm)) {
> >  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
> >  			r = -EINVAL;
> >  			goto out;
> > @@ -7340,7 +7359,7 @@ void kvm_arch_check_processor_compat(void *rtn)
> >  
> >  bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
> >  {
> > -	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> > +	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> >  }
> >  
> >  struct static_key kvm_no_apic_vcpu __read_mostly;
> > @@ -7356,7 +7375,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> >  
> >  	vcpu->arch.pv.pv_unhalted = false;
> >  	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
> > -	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> > +	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> >  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >  	else
> >  		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
> > @@ -7374,7 +7393,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> >  	if (r < 0)
> >  		goto fail_free_pio_data;
> >  
> > -	if (irqchip_in_kernel(kvm)) {
> > +	if (lapic_in_kernel(kvm)) {
> >  		r = kvm_create_lapic(vcpu);
> >  		if (r < 0)
> >  			goto fail_mmu_destroy;
> > @@ -7437,7 +7456,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> >  	kvm_mmu_destroy(vcpu);
> >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> >  	free_page((unsigned long)vcpu->arch.pio_data);
> > -	if (!irqchip_in_kernel(vcpu->kvm))
> > +	if (!lapic_in_kernel(vcpu->kvm))
> >  		static_key_slow_dec(&kvm_no_apic_vcpu);
> >  }
> >  
> > diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> > index 133ea00..ffe1f4e 100644
> > --- a/include/kvm/arm_vgic.h
> > +++ b/include/kvm/arm_vgic.h
> > @@ -329,6 +329,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
> >  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
> >  
> >  #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
> > +#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
> >  #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
> >  #define vgic_ready(k)		((k)->arch.vgic.ready)
> >  
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 87fd74a..277b7a1 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -934,6 +934,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> >  #endif
> >  
> >  int kvm_setup_default_irq_routing(struct kvm *kvm);
> > +int kvm_setup_empty_irq_routing(struct kvm *kvm);
> >  int kvm_set_irq_routing(struct kvm *kvm,
> >  			const struct kvm_irq_routing_entry *entries,
> >  			unsigned nr,
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 75bd9f7..7d06dc4 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
> >  #define KVM_CAP_S390_IRQ_STATE 114
> >  #define KVM_CAP_PPC_HWRNG 115
> >  #define KVM_CAP_DISABLE_QUIRKS 116
> > +#define KVM_CAP_SPLIT_IRQCHIP 117
> >  
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >  
> > @@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
> >  /* Available with KVM_CAP_S390_IRQ_STATE */
> >  #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
> >  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
> > +/* Available with KVM_CAP_SPLIT_IRQCHIP */
> > +#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
> >  
> >  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
> >  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> > diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> > index 1d56a90..8aaceed 100644
> > --- a/virt/kvm/irqchip.c
> > +++ b/virt/kvm/irqchip.c
> > @@ -73,7 +73,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
> >  {
> >  	struct kvm_kernel_irq_routing_entry route;
> >  
> > -	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
> > +	if (!lapic_in_kernel(kvm) || msi->flags != 0)
> >  		return -EINVAL;
> >  
> >  	route.msi.address_lo = msi->address_lo;
> > 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs
  2015-05-13  7:35   ` Paolo Bonzini
@ 2015-05-13 22:18     ` Steve Rutherford
  0 siblings, 0 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13 22:18 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, ahonig

On Wed, May 13, 2015 at 09:35:34AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 03:47, Steve Rutherford wrote:
> > Adds KVM_EXIT_IOAPIC_EOI which passes the interrupt vector up to
> > userspace.
> > 
> > Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs
> > to be informed (which is identical to the EOI_EXIT_BITMAP field used
> > by modern x86 processors, but can also be used to elide kvm IOAPIC EOI
> > exits on older processors).
> > 
> > [Note: A prototype using ResampleFDs found that decoupling the EOI
> > from the VCPU's thread made it possible for the VCPU to not see a
> > recent EOI after reentering the guest. This does not match real
> > hardware.]
> 
> Can you explain this better (and perhaps add a testcase to ioapic.c)?

This is for passing EOIs to the IOAPIC for level-triggered IOAPIC interrupts. Note that this patch requires some other patch to set the bitmaps. 

The sequential level-triggered interrupts test in ioapic.c effectively tests one side of this (If an EOI fails to be delivered to the IOAPIC, then any subsequent interrupts on that EOI's vector will be coalesced, and the second interrupt in the test won't fire.). 
The other side, checking that the guest /only/ exits on the specified vectors, is harder to instrument from within the KVM unit tests.

Steve

> 
> Paolo
> 
> > Compile tested for Intel x86.
> > 
> > Signed-off-by: Steve Rutherford <srutherford@google.com>
> > ---
> >  Documentation/virtual/kvm/api.txt | 10 ++++++++++
> >  arch/x86/include/asm/kvm_host.h   |  3 +++
> >  arch/x86/kvm/lapic.c              |  9 +++++++++
> >  arch/x86/kvm/x86.c                | 11 +++++++++++
> >  include/linux/kvm_host.h          |  1 +
> >  include/uapi/linux/kvm.h          |  5 +++++
> >  6 files changed, 39 insertions(+)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index 0744b4e..dd92996 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -3285,6 +3285,16 @@ Valid values for 'type' are:
> >  	 */
> >  	__u64 kvm_valid_regs;
> >  	__u64 kvm_dirty_regs;
> > +
> > +	/* KVM_EXIT_IOAPIC_EOI */
> > +        struct {
> > +	       __u8 vector;
> > +        } eoi;
> > +
> > +Indicates that an eoi of a level triggered IOAPIC interrupt on vector has
> > +occurred, which should be handled by the userspace IOAPIC. Triggers when
> > +the Irqchip has been split between userspace and the kernel.
> > +
> >  	union {
> >  		struct kvm_sync_regs regs;
> >  		char padding[1024];
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 3ddc134..b1978f1 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -539,6 +539,9 @@ struct kvm_vcpu_arch {
> >  	struct {
> >  		bool pv_unhalted;
> >  	} pv;
> > +
> > +	u64 eoi_exit_bitmaps[4];
> > +	int pending_ioapic_eoi;
> >  };
> >  
> >  struct kvm_lpage_info {
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index bc392a6..42fada6f 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -860,6 +860,15 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
> >  
> >  static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
> >  {
> > +	if (irqchip_split(apic->vcpu->kvm)) {
> > +		if (test_bit(vector,
> > +			     (void *) apic->vcpu->arch.eoi_exit_bitmaps)) {
> > +			apic->vcpu->arch.pending_ioapic_eoi = vector;
> > +			kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
> > +		}
> > +		return;
> > +	}
> > +
> >  	if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
> >  		int trigger_mode;
> >  		if (apic_test_vector(vector, apic->regs + APIC_TMR))
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 7505b39..cc27c35 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -6324,6 +6324,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  			kvm_handle_pmu_event(vcpu);
> >  		if (kvm_check_request(KVM_REQ_PMI, vcpu))
> >  			kvm_deliver_pmi(vcpu);
> > +		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
> > +			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
> > +			if (test_bit(vcpu->arch.pending_ioapic_eoi,
> > +				     (void *) vcpu->arch.eoi_exit_bitmaps)) {
> > +				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
> > +				vcpu->run->eoi.vector =
> > +						vcpu->arch.pending_ioapic_eoi;
> > +				r = 0;
> > +				goto out;
> > +			}
> > +		}
> >  		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
> >  			vcpu_scan_ioapic(vcpu);
> >  		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 277b7a1..cef20ad 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -134,6 +134,7 @@ static inline bool is_error_page(struct page *page)
> >  #define KVM_REQ_ENABLE_IBS        23
> >  #define KVM_REQ_DISABLE_IBS       24
> >  #define KVM_REQ_APIC_PAGE_RELOAD  25
> > +#define KVM_REQ_IOAPIC_EOI_EXIT   26
> >  
> >  #define KVM_USERSPACE_IRQ_SOURCE_ID		0
> >  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 7d06dc4..2305948 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -183,6 +183,7 @@ struct kvm_s390_skeys {
> >  #define KVM_EXIT_EPR              23
> >  #define KVM_EXIT_SYSTEM_EVENT     24
> >  #define KVM_EXIT_S390_STSI        25
> > +#define KVM_EXIT_IOAPIC_EOI       26
> >  
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  /* Emulate instruction failed. */
> > @@ -329,6 +330,10 @@ struct kvm_run {
> >  			__u8 sel1;
> >  			__u16 sel2;
> >  		} s390_stsi;
> > +		/* KVM_EXIT_IOAPIC_EOI */
> > +		struct {
> > +			__u8 vector;
> > +		} eoi;
> >  		/* Fix the size of the union. */
> >  		char padding[256];
> >  	};
> > 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  8:04     ` Paolo Bonzini
  2015-05-13  8:10       ` Jan Kiszka
@ 2015-05-13 22:21       ` Steve Rutherford
  2015-05-15  2:38       ` Steve Rutherford
  2 siblings, 0 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13 22:21 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Jan Kiszka, kvm, ahonig

On Wed, May 13, 2015 at 10:04:53AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 08:12, Jan Kiszka wrote:
> >> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> >> +{
> >> +	struct kvm *kvm = vcpu->kvm;
> >> +	struct kvm_kernel_irq_routing_entry *entry;
> >> +	struct kvm_irq_routing_table *table;
> >> +	u32 i, nr_rt_entries;
> >> +
> >> +	mutex_lock(&kvm->irq_lock);
> 
> This only needs irq_srcu protection, not irq_lock, so the lookup cost
> becomes much smaller (all CPUs can proceed in parallel).
> 
> You would need to put an smp_mb here, to ensure that irq_routing is read
> after KVM_SCAN_IOAPIC is cleared.  You can introduce
> smb_mb__after_srcu_read_lock in order to elide it.
> 
> The matching memory barrier would be a smp_mb__before_atomic in
> kvm_make_scan_ioapic_request.
> 

Makes sense, I'll update this for the next iteration.

Steve

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  7:51   ` Paolo Bonzini
@ 2015-05-13 22:24     ` Steve Rutherford
  2015-05-14  9:20       ` Paolo Bonzini
  0 siblings, 1 reply; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13 22:24 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, ahonig

On Wed, May 13, 2015 at 09:51:43AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 03:47, Steve Rutherford wrote:
> > @@ -205,6 +205,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
> >  
> >  	synchronize_srcu_expedited(&kvm->irq_srcu);
> >  
> > +	kvm_vcpu_request_scan_userspace_ioapic(kvm);
> > +
> >  	new = old;
> >  	r = 0;
> >  
> 
> This can be done before synchronize_srcu_expedited, so that changes
> ripple to the VCPUs faster.
> 
> > +void kvm_vcpu_request_scan_userspace_ioapic(struct kvm *kvm)
> > +{
> > +	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> > +
> > +	if (ioapic)
> > +		return;
> > +	if (!lapic_in_kernel(kvm))
> > +		return;
> > +	kvm_make_scan_ioapic_request(kvm);
> > +}
> 
> This is okay for use in kvm_set_irq_routing (perhaps renamed to
> kvm_arch_irq_routing_update), but when it is used here:
> 
> >  	if (!irqchip_split(kvm))
> >  		kvm_vcpu_request_scan_ioapic(kvm);
> > +	else
> > +		kvm_vcpu_request_scan_userspace_ioapic(kvm);
> 
> ... you can simply do this:
> 
> - 	if (!irqchip_split(kvm))
> -		kvm_vcpu_request_scan_ioapic(kvm);
> +	kvm_make_scan_ioapic_request(kvm);
> 
> Paolo

Seems reasonable.

While we're on the topic of scanning the IOAPIC, should this also scan the IOAPIC when (un)registering irq ack notifiers? [Which is currently done for the in-kernel IOAPIC.]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace
  2015-05-13  6:12   ` Jan Kiszka
@ 2015-05-13 22:41     ` Steve Rutherford
  2015-05-15 13:27       ` Jan Kiszka
  0 siblings, 1 reply; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13 22:41 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: kvm, Andrew Honig

On Wed, May 13, 2015 at 08:12:59AM +0200, Jan Kiszka wrote:
> On 2015-05-13 03:47, Steve Rutherford wrote:
> > In order to enable userspace PIC support, the userspace PIC needs to
> > be able to inject local interrupt requests.
> > 
> > This adds the ioctl KVM_REQUEST_LOCAL_INTERRUPT and kvm exit
> > KVM_EXIT_GET_EXTINT.
> > 
> > The vm ioctl KVM_REQUEST_LOCAL_INTERRUPT makes a KVM_REQ_EVENT request
> > on the BSP, which causes the BSP to exit to userspace to fetch the
> > vector of the underlying external interrupt, which the BSP then
> > injects into the guest. This matches the PIC spec, and is necessary to
> > boot Windows.
> 
> The API name seems too generic, given the fairly specific use case. As
> it only allows to kick the BSP, not any VCPU, that should be clarified.
> Maybe call it KVM_REQUEST_PIC_INJECTION or so. Or allow userspace to
> specify the target VCPU, then it's a bit more generic again.
> 
> Actually, when looking at the MultiProcessor spec, you will find
> multiple models for injecting PIC interrupts into CPU APICs. Just in the
> PIC Mode, the BSP is the only possible target. In the other modes, all
> APICs can receive PIC interrupts, and either the IOAPIC or the local
> APIC's LINT0 configuration decide about the effective target. We should
> allow to model all modes, based on userspace decisions.
> 

Supporting the other modes seems reasonable, but I'm not certain I have an outlet for testing them for correctness. I'm not even certain which OSes use the other modes. Unless there is a common OS that uses the other modes, and a straightforward way for me to test the other modes, I'd rather just rename the API to be less generic.

> > 
> > Boots and passes the KVM unit tests on intel x86 with the
> > PIC/PIT/IOAPIC in userspace (under a non-QEMU VMM). Boots and passes
> > the KVM unit tests under normal conditions as well.
> 
> Do you plan to provide a reference implementation for an open source
> userspace VMM as well, once the kernel side is settled?
> 

Not at the moment (given that I'm not all that familiar with QEMU). I'm definitely willing to help guide someone else through the process.

Steve
> > 
> > SVM support and device assignment are untested with this feature
> > enabled, but testing for both is in the works.
> > 
> > Compiles for ARM/x86/PPC.
> > 
> > Signed-off-by: Steve Rutherford <srutherford@google.com>
> > ---
> >  Documentation/virtual/kvm/api.txt |  9 +++++++
> >  arch/x86/include/asm/kvm_host.h   |  1 +
> >  arch/x86/kvm/irq.c                |  6 ++++-
> >  arch/x86/kvm/lapic.c              |  7 ++++++
> >  arch/x86/kvm/lapic.h              |  2 ++
> >  arch/x86/kvm/x86.c                | 53 ++++++++++++++++++++++++++++++++++++---
> >  include/uapi/linux/kvm.h          |  6 +++++
> >  7 files changed, 80 insertions(+), 4 deletions(-)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index dd92996..a650321 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -2993,6 +2993,15 @@ the ioapic nor the pic in the kernel. Also, enables in kernel routing of
> >  interrupt requests. Fails if VCPU has already been created, or if the irqchip is
> >  already in the kernel.
> >  
> > +4.97 KVM_REQUEST_LOCAL_INTERRUPT
> > +
> > +Capability: KVM_CAP_SPLIT_IRQCHIP
> > +Type: VM ioctl
> > +Parameters: none
> > +Returns: 0 on success, -1 on error.
> > +
> > +Informs the kernel that userspace has a pending external interrupt.
> > +
> >  
> >  5. The kvm_run structure
> >  ------------------------
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index b1978f1..602ea70 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -542,6 +542,7 @@ struct kvm_vcpu_arch {
> >  
> >  	u64 eoi_exit_bitmaps[4];
> >  	int pending_ioapic_eoi;
> > +	bool pending_external;
> >  };
> >  
> >  struct kvm_lpage_info {
> > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> > index 706e47a..487b5f5 100644
> > --- a/arch/x86/kvm/irq.c
> > +++ b/arch/x86/kvm/irq.c
> > @@ -43,7 +43,11 @@ EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
> >   */
> >  static int kvm_cpu_has_extint(struct kvm_vcpu *v)
> >  {
> > -	if (kvm_apic_accept_pic_intr(v))
> > +	u8 accept = kvm_apic_accept_pic_intr(v);
> > +
> > +	if (accept && irqchip_split(v->kvm))
> > +		return v->arch.pending_external;
> > +	else if (accept)
> >  		return pic_irqchip(v->kvm)->output;	/* PIC */
> >  	else
> >  		return 0;
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 7533b87..9a021f7 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -2089,3 +2089,10 @@ void kvm_lapic_init(void)
> >  	jump_label_rate_limit(&apic_hw_disabled, HZ);
> >  	jump_label_rate_limit(&apic_sw_disabled, HZ);
> >  }
> > +
> > +void kvm_request_local_interrupt(struct kvm_vcpu *vcpu)
> > +{
> > +	vcpu->arch.pending_external = true;
> > +	kvm_make_request(KVM_REQ_EVENT, vcpu);
> > +	kvm_vcpu_kick(vcpu);
> > +}
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index 71b150c..66bb780 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -63,6 +63,8 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
> >  		unsigned long *dest_map);
> >  int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
> >  
> > +void kvm_request_local_interrupt(struct kvm_vcpu *vcpu);
> > +
> >  bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> >  		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
> >  
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 6127fe7..b7ceff3 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -65,6 +65,8 @@
> >  #include <asm/pvclock.h>
> >  #include <asm/div64.h>
> >  
> > +#define GET_VECTOR_FROM_USERSPACE 1
> > +
> >  #define MAX_IO_MSRS 256
> >  #define KVM_MAX_MCE_BANKS 32
> >  #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
> > @@ -4146,6 +4148,30 @@ split_irqchip_unlock:
> >  		mutex_unlock(&kvm->lock);
> >  		break;
> >  	}
> > +	case KVM_REQUEST_LOCAL_INTERRUPT: {
> > +		int i;
> > +		struct kvm_vcpu *vcpu;
> > +		struct kvm_vcpu *bsp_vcpu = NULL;
> > +
> > +		mutex_lock(&kvm->lock);
> > +		r = -EEXIST;
> 
> Nitpicking, EEXIST seems confusing. I would go for EINVAL here as well.
> 
> > +		if (!irqchip_split(kvm))
> > +			goto out;
> > +		kvm_for_each_vcpu(i, vcpu, kvm) {
> > +			if (kvm_vcpu_is_reset_bsp(vcpu)) {
> > +				bsp_vcpu = vcpu;
> > +				continue;
> > +			}
> > +		}
> > +		r = -EINVAL;
> > +		if (bsp_vcpu == NULL)
> > +			goto interrupt_unlock;
> > +		kvm_request_local_interrupt(bsp_vcpu);
> > +		r = 0;
> > +interrupt_unlock:
> > +		mutex_unlock(&kvm->lock);
> > +		break;
> > +	}
> >  
> >  	default:
> >  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
> > @@ -6123,6 +6149,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
> >  	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
> >  }
> >  
> > +static int pending_userspace_external_intr(struct kvm_vcpu *vcpu)
> > +{
> > +	if (vcpu->arch.pending_external && kvm_apic_accept_pic_intr(vcpu)) {
> > +		vcpu->arch.pending_external = false;
> > +		return true;
> > +	} else
> > +		return false;
> > +
> > +}
> > +
> >  static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
> >  {
> >  	int r;
> > @@ -6187,7 +6223,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
> >  				return r;
> >  		}
> >  		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
> > -			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
> > +			if (irqchip_split(vcpu->kvm) &&
> > +			    pending_userspace_external_intr(vcpu)) {
> > +				return GET_VECTOR_FROM_USERSPACE;
> > +			}
> > +			kvm_queue_interrupt(vcpu,
> > +					    kvm_cpu_get_interrupt(vcpu),
> >  					    false);
> >  			kvm_x86_ops->set_irq(vcpu);
> >  		}
> > @@ -6351,13 +6392,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  	}
> >  
> >  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
> > +		int event;
> >  		kvm_apic_accept_events(vcpu);
> >  		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
> >  			r = 1;
> >  			goto out;
> >  		}
> > -
> > -		if (inject_pending_event(vcpu, req_int_win) != 0)
> > +		event = inject_pending_event(vcpu, req_int_win);
> > +		if (event == GET_VECTOR_FROM_USERSPACE) {
> > +			vcpu->run->exit_reason = KVM_EXIT_GET_EXTINT;
> > +			kvm_make_request(KVM_REQ_EVENT, vcpu);
> > +			r = 0;
> > +			goto out;
> > +		} else if (event != 0)
> >  			req_immediate_exit = true;
> >  		/* enable NMI/IRQ window open exits if needed */
> >  		else if (vcpu->arch.nmi_pending)
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 2305948..368e80a 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -184,6 +184,7 @@ struct kvm_s390_skeys {
> >  #define KVM_EXIT_SYSTEM_EVENT     24
> >  #define KVM_EXIT_S390_STSI        25
> >  #define KVM_EXIT_IOAPIC_EOI       26
> > +#define KVM_EXIT_GET_EXTINT       27
> >  
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  /* Emulate instruction failed. */
> > @@ -334,6 +335,10 @@ struct kvm_run {
> >  		struct {
> >  			__u8 vector;
> >  		} eoi;
> > +		/* KVM_EXIT_GET_EXTINT */
> > +		struct {
> > +			__u8 vector;
> > +		} extint;
> >  		/* Fix the size of the union. */
> >  		char padding[256];
> >  	};
> > @@ -1208,6 +1213,7 @@ struct kvm_s390_ucas_mapping {
> >  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
> >  /* Available with KVM_CAP_SPLIT_IRQCHIP */
> >  #define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
> > +#define KVM_REQUEST_LOCAL_INTERRUPT _IO(KVMIO, 0xb8)
> >  
> >  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
> >  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> > 
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT RTC ITP SES-DE
> Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace
  2015-05-13  7:22   ` Paolo Bonzini
@ 2015-05-13 23:13     ` Steve Rutherford
  0 siblings, 0 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-13 23:13 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, ahonig

On Wed, May 13, 2015 at 09:22:21AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 03:47, Steve Rutherford wrote:
> > In order to enable userspace PIC support, the userspace PIC needs to
> > be able to inject local interrupt requests.
> > 
> > This adds the ioctl KVM_REQUEST_LOCAL_INTERRUPT and kvm exit
> > KVM_EXIT_GET_EXTINT.
> > 
> > The vm ioctl KVM_REQUEST_LOCAL_INTERRUPT makes a KVM_REQ_EVENT request
> > on the BSP, which causes the BSP to exit to userspace to fetch the
> > vector of the underlying external interrupt, which the BSP then
> > injects into the guest. This matches the PIC spec, and is necessary to
> > boot Windows.
> > 
> > Boots and passes the KVM unit tests on intel x86 with the
> > PIC/PIT/IOAPIC in userspace (under a non-QEMU VMM). Boots and passes
> > the KVM unit tests under normal conditions as well.
> > 
> > SVM support and device assignment are untested with this feature
> > enabled, but testing for both is in the works.
> > 
> > Compiles for ARM/x86/PPC.
> 
> This may be ENOCOFFEE, but where is extint.vector read?

Uhhh. It's not. Something's wrong here. Looks like I dropped some lines in porting these patches. Somehow this still passes the KVM unit tests o.O

I'll track down the error. 

Steve
> 
> Could you use KVM_INTERRUPT to set KVM_REQUEST_LOCAL_INTERRUPT _and_ the
> interrupt vector at the same time?  This is exactly the logic that is
> used for !irqchip, and it should work for the EXTINT case as well.
> 
> Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-13 22:10   ` Steve Rutherford
@ 2015-05-14  9:12     ` Wu, Feng
  2015-05-14 19:29       ` Andrew Honig
  0 siblings, 1 reply; 41+ messages in thread
From: Wu, Feng @ 2015-05-14  9:12 UTC (permalink / raw)
  To: Steve Rutherford, Paolo Bonzini; +Cc: kvm, ahonig, Wu, Feng



> -----Original Message-----
> From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On
> Behalf Of Steve Rutherford
> Sent: Thursday, May 14, 2015 6:10 AM
> To: Paolo Bonzini
> Cc: kvm@vger.kernel.org; ahonig@google.com
> Subject: Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
> 
> On Wed, May 13, 2015 at 09:57:00AM +0200, Paolo Bonzini wrote:
> >
> >
> > On 13/05/2015 03:47, Steve Rutherford wrote:
> > > First patch in a series which enables the relocation of the
> > > PIC/IOAPIC/PIT to userspace.
> > >
> > > Adds capability KVM_CAP_SPLIT_IRQCHIP and ioctl KVM_SPLIT_IRQCHIP.
> > >
> > > KVM_SPLIT_IRQCHIP enables the construction of LAPICs without the rest
> > > of the irqchip.
> >
> > The ioctl name in the code is KVM_CREATE_SPLIT_IRQCHIP, but it doesn't
> > seem to create a local APIC.
> >
> 
>  This enables the creation of LAPICs every time a VCPU is created, which I
> guess is different. I'll change the language to reflect that.

vlapic is always created by kvm_vcpu_init () --> kvm_arch_vcpu_init() --> kvm_create_lapic(),
right? So in this IOCTL, it just set a flag, which is a hint for us to create the lapic or not.

BTW, what is the purpose of this series. If I understand it correctly, you only want to
use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
benefit of it?

Thanks,
Feng

> 
> > > Compile tested for x86.
> >
> > The capability is fine.  However, instead of introducing a new ioctl,
> > you can use KVM_CAP_ENABLE_CAP_VM and enable the capability once
> > (per-VM) using KVM_ENABLE_CAP.  The enabling code is basically the same
> > you have in your code.
> >
> 
> ENABLE_CAP seems like a better way to go. I'll update this to use it instead of
> the IOCTL.
> 
> Steve
> > Paolo
> >
> > > Signed-off-by: Steve Rutherford <srutherford@google.com>
> > > Suggested-by: Andrew Honig <ahonig@google.com>
> > > ---
> > >  Documentation/virtual/kvm/api.txt | 15 ++++++++++++
> > >  arch/powerpc/kvm/irq.h            |  5 ++++
> > >  arch/s390/kvm/irq.h               |  4 ++++
> > >  arch/x86/include/asm/kvm_host.h   |  2 ++
> > >  arch/x86/kvm/assigned-dev.c       |  4 ++--
> > >  arch/x86/kvm/irq.c                |  6 ++---
> > >  arch/x86/kvm/irq.h                | 11 +++++++++
> > >  arch/x86/kvm/irq_comm.c           |  7 ++++++
> > >  arch/x86/kvm/lapic.c              | 13 +++++++----
> > >  arch/x86/kvm/mmu.c                |  2 +-
> > >  arch/x86/kvm/svm.c                |  4 ++--
> > >  arch/x86/kvm/vmx.c                | 12 +++++-----
> > >  arch/x86/kvm/x86.c                | 49
> +++++++++++++++++++++++++++------------
> > >  include/kvm/arm_vgic.h            |  1 +
> > >  include/linux/kvm_host.h          |  1 +
> > >  include/uapi/linux/kvm.h          |  3 +++
> > >  virt/kvm/irqchip.c                |  2 +-
> > >  17 files changed, 106 insertions(+), 35 deletions(-)
> > >
> > > diff --git a/Documentation/virtual/kvm/api.txt
> b/Documentation/virtual/kvm/api.txt
> > > index 6955444..0744b4e 100644
> > > --- a/Documentation/virtual/kvm/api.txt
> > > +++ b/Documentation/virtual/kvm/api.txt
> > > @@ -2979,6 +2979,21 @@ len must be a multiple of sizeof(struct
> kvm_s390_irq). It must be > 0
> > >  and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
> > >  which is the maximum number of possibly pending cpu-local interrupts.
> > >
> > > +4.96 KVM_SPLIT_IRQCHIP
> > > +
> > > +Capability: KVM_CAP_SPLIT_IRQCHIP
> > > +Architectures: x86
> > > +Type:  VM ioctl
> > > +Parameters: None
> > > +Returns: 0 on success, -1 on error
> > > +
> > > +Create a local apic for each processor in the kernel.  This differs from
> > > +KVM_CREATE_IRQCHIP in that it only creates the local apic; it creates
> neither
> > > +the ioapic nor the pic in the kernel. Also, enables in kernel routing of
> > > +interrupt requests. Fails if VCPU has already been created, or if the irqchip
> is
> > > +already in the kernel.
> > > +
> > > +
> > >  5. The kvm_run structure
> > >  ------------------------
> > >
> > > diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> > > index 5a9a10b..5e6fa06 100644
> > > --- a/arch/powerpc/kvm/irq.h
> > > +++ b/arch/powerpc/kvm/irq.h
> > > @@ -17,4 +17,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> > >  	return ret;
> > >  }
> > >
> > > +static inline int lapic_in_kernel(struct kvm *kvm)
> > > +{
> > > +	return irqchip_in_kernel(kvm);
> > > +}
> > > +
> > >  #endif
> > > diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
> > > index d98e415..db876c3 100644
> > > --- a/arch/s390/kvm/irq.h
> > > +++ b/arch/s390/kvm/irq.h
> > > @@ -19,4 +19,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> > >  	return 1;
> > >  }
> > >
> > > +static inline int lapic_in_kernel(struct kvm *kvm)
> > > +{
> > > +	return irqchip_in_kernel(kvm);
> > > +}
> > >  #endif
> > > diff --git a/arch/x86/include/asm/kvm_host.h
> b/arch/x86/include/asm/kvm_host.h
> > > index bbb8f4e..3ddc134 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -638,6 +638,8 @@ struct kvm_arch {
> > >  	bool boot_vcpu_runs_old_kvmclock;
> > >
> > >  	u64 disabled_quirks;
> > > +
> > > +	bool irqchip_split;
> > >  };
> > >
> > >  struct kvm_vm_stat {
> > > diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
> > > index d090ecf..1237e92 100644
> > > --- a/arch/x86/kvm/assigned-dev.c
> > > +++ b/arch/x86/kvm/assigned-dev.c
> > > @@ -291,7 +291,7 @@ static int kvm_deassign_irq(struct kvm *kvm,
> > >  {
> > >  	unsigned long guest_irq_type, host_irq_type;
> > >
> > > -	if (!irqchip_in_kernel(kvm))
> > > +	if (!lapic_in_kernel(kvm))
> > >  		return -EINVAL;
> > >  	/* no irq assignment to deassign */
> > >  	if (!assigned_dev->irq_requested_type)
> > > @@ -568,7 +568,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm
> *kvm,
> > >  	struct kvm_assigned_dev_kernel *match;
> > >  	unsigned long host_irq_type, guest_irq_type;
> > >
> > > -	if (!irqchip_in_kernel(kvm))
> > > +	if (!lapic_in_kernel(kvm))
> > >  		return r;
> > >
> > >  	mutex_lock(&kvm->lock);
> > > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> > > index a1ec6a50..706e47a 100644
> > > --- a/arch/x86/kvm/irq.c
> > > +++ b/arch/x86/kvm/irq.c
> > > @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
> > >   */
> > >  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> > >  {
> > > -	if (!irqchip_in_kernel(v->kvm))
> > > +	if (!lapic_in_kernel(v->kvm))
> > >  		return v->arch.interrupt.pending;
> > >
> > >  	if (kvm_cpu_has_extint(v))
> > > @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
> > >   */
> > >  int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
> > >  {
> > > -	if (!irqchip_in_kernel(v->kvm))
> > > +	if (!lapic_in_kernel(v->kvm))
> > >  		return v->arch.interrupt.pending;
> > >
> > >  	if (kvm_cpu_has_extint(v))
> > > @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
> > >  {
> > >  	int vector;
> > >
> > > -	if (!irqchip_in_kernel(v->kvm))
> > > +	if (!lapic_in_kernel(v->kvm))
> > >  		return v->arch.interrupt.nr;
> > >
> > >  	vector = kvm_cpu_get_extint(v);
> > > diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> > > index ad68c73..e46abf3 100644
> > > --- a/arch/x86/kvm/irq.h
> > > +++ b/arch/x86/kvm/irq.h
> > > @@ -92,6 +92,17 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> > >  	return ret;
> > >  }
> > >
> > > +static inline int irqchip_split(struct kvm *kvm)
> > > +{
> > > +	return kvm->arch.irqchip_split;
> > > +}
> > > +
> > > +static inline int lapic_in_kernel(struct kvm *kvm)
> > > +{
> > > +	return irqchip_split(kvm) || irqchip_in_kernel(kvm);
> > > +}
> > > +
> > > +
> > >  void kvm_pic_reset(struct kvm_kpic_state *s);
> > >
> > >  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
> > > diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> > > index 9efff9e..f43c59a 100644
> > > --- a/arch/x86/kvm/irq_comm.c
> > > +++ b/arch/x86/kvm/irq_comm.c
> > > @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm
> *kvm)
> > >  	return kvm_set_irq_routing(kvm, default_routing,
> > >  				   ARRAY_SIZE(default_routing), 0);
> > >  }
> > > +
> > > +static const struct kvm_irq_routing_entry empty_routing[] = {};
> > > +
> > > +int kvm_setup_empty_irq_routing(struct kvm *kvm)
> > > +{
> > > +	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
> > > +}
> > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > index dc5b57b..bc392a6 100644
> > > --- a/arch/x86/kvm/lapic.c
> > > +++ b/arch/x86/kvm/lapic.c
> > > @@ -209,7 +209,8 @@ out:
> > >  	if (old)
> > >  		kfree_rcu(old, rcu);
> > >
> > > -	kvm_vcpu_request_scan_ioapic(kvm);
> > > +	if (!irqchip_split(kvm))
> > > +		kvm_vcpu_request_scan_ioapic(kvm);
> > >  }
> > >
> > >  static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
> > > @@ -1819,7 +1820,8 @@ void kvm_apic_post_state_restore(struct
> kvm_vcpu *vcpu,
> > >  		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
> > >  				apic_find_highest_isr(apic));
> > >  	kvm_make_request(KVM_REQ_EVENT, vcpu);
> > > -	kvm_rtc_eoi_tracking_restore_one(vcpu);
> > > +	if (!irqchip_split(vcpu->kvm))
> > > +		kvm_rtc_eoi_tracking_restore_one(vcpu);
> > >  }
> > >
> > >  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> > > @@ -1902,7 +1904,8 @@ static void apic_sync_pv_eoi_to_guest(struct
> kvm_vcpu *vcpu,
> > >  	    /* Cache not set: could be safe but we don't bother. */
> > >  	    apic->highest_isr_cache == -1 ||
> > >  	    /* Need EOI to update ioapic. */
> > > -	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
> > > +	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)
> ||
> > > +	    irqchip_split(vcpu->kvm)) {
> > >  		/*
> > >  		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
> > >  		 * so we need not do anything here.
> > > @@ -1958,7 +1961,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu
> *vcpu, u32 msr, u64 data)
> > >  	struct kvm_lapic *apic = vcpu->arch.apic;
> > >  	u32 reg = (msr - APIC_BASE_MSR) << 4;
> > >
> > > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > >  		return 1;
> > >
> > >  	if (reg == APIC_ICR2)
> > > @@ -1975,7 +1978,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu
> *vcpu, u32 msr, u64 *data)
> > >  	struct kvm_lapic *apic = vcpu->arch.apic;
> > >  	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
> > >
> > > -	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > > +	if (!lapic_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
> > >  		return 1;
> > >
> > >  	if (reg == APIC_DFR || reg == APIC_ICR2) {
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index b78e83f..e5bf6db 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -3465,7 +3465,7 @@ static int kvm_arch_setup_async_pf(struct
> kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> > >
> > >  static bool can_do_async_pf(struct kvm_vcpu *vcpu)
> > >  {
> > > -	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
> > > +	if (unlikely(!lapic_in_kernel(vcpu->kvm) ||
> > >  		     kvm_event_needs_reinjection(vcpu)))
> > >  		return false;
> > >
> > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > > index 8954d7a..7ac9ec2 100644
> > > --- a/arch/x86/kvm/svm.c
> > > +++ b/arch/x86/kvm/svm.c
> > > @@ -3054,7 +3054,7 @@ static int cr8_write_interception(struct
> vcpu_svm *svm)
> > >  	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
> > >  	/* instruction emulation calls kvm_set_cr8() */
> > >  	r = cr_interception(svm);
> > > -	if (irqchip_in_kernel(svm->vcpu.kvm))
> > > +	if (lapic_in_kernel(svm->vcpu.kvm))
> > >  		return r;
> > >  	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
> > >  		return r;
> > > @@ -3295,7 +3295,7 @@ static int interrupt_window_interception(struct
> vcpu_svm *svm)
> > >  	 * If the user space waits to inject interrupts, exit as soon as
> > >  	 * possible
> > >  	 */
> > > -	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
> > > +	if (!lapic_in_kernel(svm->vcpu.kvm) &&
> > >  	    kvm_run->request_interrupt_window &&
> > >  	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
> > >  		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > > index bcb61b0..a53747f 100644
> > > --- a/arch/x86/kvm/vmx.c
> > > +++ b/arch/x86/kvm/vmx.c
> > > @@ -948,7 +948,7 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
> > >
> > >  static inline bool vm_need_tpr_shadow(struct kvm *kvm)
> > >  {
> > > -	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
> > > +	return (cpu_has_vmx_tpr_shadow()) && lapic_in_kernel(kvm);
> > >  }
> > >
> > >  static inline bool cpu_has_secondary_exec_ctrls(void)
> > > @@ -1064,7 +1064,7 @@ static inline bool cpu_has_vmx_ple(void)
> > >
> > >  static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
> > >  {
> > > -	return flexpriority_enabled && irqchip_in_kernel(kvm);
> > > +	return flexpriority_enabled && lapic_in_kernel(kvm);
> > >  }
> > >
> > >  static inline bool cpu_has_vmx_vpid(void)
> > > @@ -4341,7 +4341,7 @@ static void
> vmx_disable_intercept_msr_write_x2apic(u32 msr)
> > >
> > >  static int vmx_vm_has_apicv(struct kvm *kvm)
> > >  {
> > > -	return enable_apicv && irqchip_in_kernel(kvm);
> > > +	return enable_apicv && lapic_in_kernel(kvm);
> > >  }
> > >
> > >  static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu
> *vcpu)
> > > @@ -5317,7 +5317,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
> > >  				u8 cr8 = (u8)val;
> > >  				err = kvm_set_cr8(vcpu, cr8);
> > >  				kvm_complete_insn_gp(vcpu, err);
> > > -				if (irqchip_in_kernel(vcpu->kvm))
> > > +				if (lapic_in_kernel(vcpu->kvm))
> > >  					return 1;
> > >  				if (cr8_prev <= cr8)
> > >  					return 1;
> > > @@ -5534,7 +5534,7 @@ static int handle_interrupt_window(struct
> kvm_vcpu *vcpu)
> > >  	 * If the user space waits to inject interrupts, exit as soon as
> > >  	 * possible
> > >  	 */
> > > -	if (!irqchip_in_kernel(vcpu->kvm) &&
> > > +	if (!lapic_in_kernel(vcpu->kvm) &&
> > >  	    vcpu->run->request_interrupt_window &&
> > >  	    !kvm_cpu_has_interrupt(vcpu)) {
> > >  		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> > > @@ -9419,7 +9419,7 @@ static void prepare_vmcs02(struct kvm_vcpu
> *vcpu, struct vmcs12 *vmcs12)
> > >  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and
> VM_ENTRY_IA32E_MODE are
> > >  	 * emulated by vmx_set_efer(), below.
> > >  	 */
> > > -	vm_entry_controls_init(vmx,
> > > +	vm_entry_controls_init(vmx,
> > >  		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER
> &
> > >  			~VM_ENTRY_IA32E_MODE) |
> > >  		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index cde5d61..7505b39 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -782,7 +782,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned
> long cr8)
> > >  {
> > >  	if (cr8 & CR8_RESERVED_BITS)
> > >  		return 1;
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		kvm_lapic_set_tpr(vcpu, cr8);
> > >  	else
> > >  		vcpu->arch.cr8 = cr8;
> > > @@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
> > >
> > >  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
> > >  {
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		return kvm_lapic_get_cr8(vcpu);
> > >  	else
> > >  		return vcpu->arch.cr8;
> > > @@ -2800,6 +2800,7 @@ int kvm_vm_ioctl_check_extension(struct kvm
> *kvm, long ext)
> > >  	case KVM_CAP_TSC_DEADLINE_TIMER:
> > >  	case KVM_CAP_ENABLE_CAP_VM:
> > >  	case KVM_CAP_DISABLE_QUIRKS:
> > > +	case KVM_CAP_SPLIT_IRQCHIP:
> > >  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
> > >  	case KVM_CAP_ASSIGN_DEV_IRQ:
> > >  	case KVM_CAP_PCI_2_3:
> > > @@ -3002,7 +3003,7 @@ static int kvm_vcpu_ioctl_interrupt(struct
> kvm_vcpu *vcpu,
> > >  {
> > >  	if (irq->irq >= KVM_NR_INTERRUPTS)
> > >  		return -EINVAL;
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		return -ENXIO;
> > >
> > >  	kvm_queue_interrupt(vcpu, irq->irq, false);
> > > @@ -3480,7 +3481,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> > >  		struct kvm_vapic_addr va;
> > >
> > >  		r = -EINVAL;
> > > -		if (!irqchip_in_kernel(vcpu->kvm))
> > > +		if (!lapic_in_kernel(vcpu->kvm))
> > >  			goto out;
> > >  		r = -EFAULT;
> > >  		if (copy_from_user(&va, argp, sizeof va))
> > > @@ -3838,7 +3839,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
> struct kvm_dirty_log *log)
> > >  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level
> *irq_event,
> > >  			bool line_status)
> > >  {
> > > -	if (!irqchip_in_kernel(kvm))
> > > +	if (!lapic_in_kernel(kvm))
> > >  		return -ENXIO;
> > >
> > >  	irq_event->status = kvm_set_irq(kvm,
> KVM_USERSPACE_IRQ_SOURCE_ID,
> > > @@ -4128,6 +4129,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
> > >  		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
> > >  		break;
> > >  	}
> > > +	case KVM_CREATE_SPLIT_IRQCHIP: {
> > > +		mutex_lock(&kvm->lock);
> > > +		r = -EEXIST;
> > > +		if (lapic_in_kernel(kvm))
> > > +			goto split_irqchip_unlock;
> > > +		r = -EINVAL;
> > > +		if (atomic_read(&kvm->online_vcpus))
> > > +			goto split_irqchip_unlock;
> > > +		r = kvm_setup_empty_irq_routing(kvm);
> > > +		if (r)
> > > +			goto split_irqchip_unlock;
> > > +		kvm->arch.irqchip_split = true;
> > > +		r = 0;
> > > +split_irqchip_unlock:
> > > +		mutex_unlock(&kvm->lock);
> > > +		break;
> > > +	}
> > > +
> > >  	default:
> > >  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
> > >  	}
> > > @@ -5893,7 +5912,7 @@ void kvm_arch_exit(void)
> > >  int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
> > >  {
> > >  	++vcpu->stat.halt_exits;
> > > -	if (irqchip_in_kernel(vcpu->kvm)) {
> > > +	if (lapic_in_kernel(vcpu->kvm)) {
> > >  		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
> > >  		return 1;
> > >  	} else {
> > > @@ -6060,7 +6079,7 @@ static int emulator_fix_hypercall(struct
> x86_emulate_ctxt *ctxt)
> > >   */
> > >  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
> > >  {
> > > -	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu)
> &&
> > > +	return (!lapic_in_kernel(vcpu->kvm)
> && !kvm_cpu_has_interrupt(vcpu) &&
> > >  		vcpu->run->request_interrupt_window &&
> > >  		kvm_arch_interrupt_allowed(vcpu));
> > >  }
> > > @@ -6072,7 +6091,7 @@ static void post_kvm_run_save(struct kvm_vcpu
> *vcpu)
> > >  	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
> > >  	kvm_run->cr8 = kvm_get_cr8(vcpu);
> > >  	kvm_run->apic_base = kvm_get_apic_base(vcpu);
> > > -	if (irqchip_in_kernel(vcpu->kvm))
> > > +	if (lapic_in_kernel(vcpu->kvm))
> > >  		kvm_run->ready_for_interrupt_injection = 1;
> > >  	else
> > >  		kvm_run->ready_for_interrupt_injection =
> > > @@ -6219,7 +6238,7 @@ void kvm_vcpu_reload_apic_access_page(struct
> kvm_vcpu *vcpu)
> > >  {
> > >  	struct page *page = NULL;
> > >
> > > -	if (!irqchip_in_kernel(vcpu->kvm))
> > > +	if (!lapic_in_kernel(vcpu->kvm))
> > >  		return;
> > >
> > >  	if (!kvm_x86_ops->set_apic_access_page_addr)
> > > @@ -6255,7 +6274,7 @@ void
> kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
> > >  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > >  {
> > >  	int r;
> > > -	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
> > > +	bool req_int_win = !lapic_in_kernel(vcpu->kvm) &&
> > >  		vcpu->run->request_interrupt_window;
> > >  	bool req_immediate_exit = false;
> > >
> > > @@ -6644,7 +6663,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> *vcpu, struct kvm_run *kvm_run)
> > >  	}
> > >
> > >  	/* re-sync apic's tpr */
> > > -	if (!irqchip_in_kernel(vcpu->kvm)) {
> > > +	if (!lapic_in_kernel(vcpu->kvm)) {
> > >  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
> > >  			r = -EINVAL;
> > >  			goto out;
> > > @@ -7340,7 +7359,7 @@ void kvm_arch_check_processor_compat(void
> *rtn)
> > >
> > >  bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
> > >  {
> > > -	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> > > +	return lapic_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
> > >  }
> > >
> > >  struct static_key kvm_no_apic_vcpu __read_mostly;
> > > @@ -7356,7 +7375,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> > >
> > >  	vcpu->arch.pv.pv_unhalted = false;
> > >  	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
> > > -	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> > > +	if (!lapic_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
> > >  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> > >  	else
> > >  		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
> > > @@ -7374,7 +7393,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> > >  	if (r < 0)
> > >  		goto fail_free_pio_data;
> > >
> > > -	if (irqchip_in_kernel(kvm)) {
> > > +	if (lapic_in_kernel(kvm)) {
> > >  		r = kvm_create_lapic(vcpu);
> > >  		if (r < 0)
> > >  			goto fail_mmu_destroy;
> > > @@ -7437,7 +7456,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu
> *vcpu)
> > >  	kvm_mmu_destroy(vcpu);
> > >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > >  	free_page((unsigned long)vcpu->arch.pio_data);
> > > -	if (!irqchip_in_kernel(vcpu->kvm))
> > > +	if (!lapic_in_kernel(vcpu->kvm))
> > >  		static_key_slow_dec(&kvm_no_apic_vcpu);
> > >  }
> > >
> > > diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> > > index 133ea00..ffe1f4e 100644
> > > --- a/include/kvm/arm_vgic.h
> > > +++ b/include/kvm/arm_vgic.h
> > > @@ -329,6 +329,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu
> *vcpu);
> > >  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
> > >
> > >  #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
> > > +#define lapic_in_kernel(k)      (irqchip_in_kernel(k))
> > >  #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
> > >  #define vgic_ready(k)		((k)->arch.vgic.ready)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 87fd74a..277b7a1 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -934,6 +934,7 @@ static inline int mmu_notifier_retry(struct kvm
> *kvm, unsigned long mmu_seq)
> > >  #endif
> > >
> > >  int kvm_setup_default_irq_routing(struct kvm *kvm);
> > > +int kvm_setup_empty_irq_routing(struct kvm *kvm);
> > >  int kvm_set_irq_routing(struct kvm *kvm,
> > >  			const struct kvm_irq_routing_entry *entries,
> > >  			unsigned nr,
> > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > index 75bd9f7..7d06dc4 100644
> > > --- a/include/uapi/linux/kvm.h
> > > +++ b/include/uapi/linux/kvm.h
> > > @@ -815,6 +815,7 @@ struct kvm_ppc_smmu_info {
> > >  #define KVM_CAP_S390_IRQ_STATE 114
> > >  #define KVM_CAP_PPC_HWRNG 115
> > >  #define KVM_CAP_DISABLE_QUIRKS 116
> > > +#define KVM_CAP_SPLIT_IRQCHIP 117
> > >
> > >  #ifdef KVM_CAP_IRQ_ROUTING
> > >
> > > @@ -1200,6 +1201,8 @@ struct kvm_s390_ucas_mapping {
> > >  /* Available with KVM_CAP_S390_IRQ_STATE */
> > >  #define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct
> kvm_s390_irq_state)
> > >  #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct
> kvm_s390_irq_state)
> > > +/* Available with KVM_CAP_SPLIT_IRQCHIP */
> > > +#define KVM_CREATE_SPLIT_IRQCHIP  _IO(KVMIO, 0xb7)
> > >
> > >  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
> > >  #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> > > diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> > > index 1d56a90..8aaceed 100644
> > > --- a/virt/kvm/irqchip.c
> > > +++ b/virt/kvm/irqchip.c
> > > @@ -73,7 +73,7 @@ int kvm_send_userspace_msi(struct kvm *kvm,
> struct kvm_msi *msi)
> > >  {
> > >  	struct kvm_kernel_irq_routing_entry route;
> > >
> > > -	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
> > > +	if (!lapic_in_kernel(kvm) || msi->flags != 0)
> > >  		return -EINVAL;
> > >
> > >  	route.msi.address_lo = msi->address_lo;
> > >
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13 22:24     ` Steve Rutherford
@ 2015-05-14  9:20       ` Paolo Bonzini
  2015-05-14 15:23         ` Alex Williamson
  0 siblings, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-14  9:20 UTC (permalink / raw)
  To: Steve Rutherford; +Cc: kvm, ahonig, Alex Williamson



On 14/05/2015 00:24, Steve Rutherford wrote:
> Seems reasonable.
> 
> While we're on the topic of scanning the IOAPIC, should this also
> scan the IOAPIC when (un)registering irq ack notifiers? [Which is
> currently done for the in-kernel IOAPIC.]

Would irq_ack_notifiers be used at all with this patch set?  Resampling
of IOAPIC level-triggered interrupts would be implemented in userspace.
 For the same reason, assigned devices using legacy device assignment
probably would not be able to use INTX (so this feature should depend on
!KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
irq_ack_notifiers goes away.

Alex, how does VFIO do INTX resampling if you're using TCG or -machine
kernel_irqchip=off?  (Context: this series keeps the local APIC
emulation in the kernel, thus including MSI, but moves the IOAPIC
emualtion to userspace).

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-14  9:20       ` Paolo Bonzini
@ 2015-05-14 15:23         ` Alex Williamson
  2015-05-14 15:46           ` Paolo Bonzini
  0 siblings, 1 reply; 41+ messages in thread
From: Alex Williamson @ 2015-05-14 15:23 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Steve Rutherford, kvm, ahonig

On Thu, 2015-05-14 at 11:20 +0200, Paolo Bonzini wrote:
> 
> On 14/05/2015 00:24, Steve Rutherford wrote:
> > Seems reasonable.
> > 
> > While we're on the topic of scanning the IOAPIC, should this also
> > scan the IOAPIC when (un)registering irq ack notifiers? [Which is
> > currently done for the in-kernel IOAPIC.]
> 
> Would irq_ack_notifiers be used at all with this patch set?  Resampling
> of IOAPIC level-triggered interrupts would be implemented in userspace.
>  For the same reason, assigned devices using legacy device assignment
> probably would not be able to use INTX (so this feature should depend on
> !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> irq_ack_notifiers goes away.
> 
> Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> kernel_irqchip=off?  (Context: this series keeps the local APIC
> emulation in the kernel, thus including MSI, but moves the IOAPIC
> emualtion to userspace).

Without KVM irqchip, we use a rudimentary approach where we disable
mmaps to the device when an interrupt occurs.  This makes us trap all
accesses to the device.  We then handle any device access from the guest
as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
we've at least rate-limited it.  If it doesn't, a timer eventually
re-enables devices mmaps.  It's not very efficient, but it works (for
both PCI and platform devices) and avoids IRQ APIs through QEMU that get
very platform/machine specific.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-14 15:23         ` Alex Williamson
@ 2015-05-14 15:46           ` Paolo Bonzini
  2015-05-14 16:04             ` Alex Williamson
  0 siblings, 1 reply; 41+ messages in thread
From: Paolo Bonzini @ 2015-05-14 15:46 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Steve Rutherford, kvm, ahonig



On 14/05/2015 17:23, Alex Williamson wrote:
> > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > of IOAPIC level-triggered interrupts would be implemented in userspace.
> >  For the same reason, assigned devices using legacy device assignment
> > probably would not be able to use INTX (so this feature should depend on
> > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > irq_ack_notifiers goes away.
> > 
> > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > emualtion to userspace).
> 
> Without KVM irqchip, we use a rudimentary approach where we disable
> mmaps to the device when an interrupt occurs.  This makes us trap all
> accesses to the device.  We then handle any device access from the guest
> as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> we've at least rate-limited it.  If it doesn't, a timer eventually
> re-enables devices mmaps.  It's not very efficient, but it works (for
> both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> very platform/machine specific.  Thanks,

If we move the IOAPIC back to userspace, it probably would be easier to
implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
start!

Paolo

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-14 15:46           ` Paolo Bonzini
@ 2015-05-14 16:04             ` Alex Williamson
  2015-05-14 22:10               ` Steve Rutherford
  0 siblings, 1 reply; 41+ messages in thread
From: Alex Williamson @ 2015-05-14 16:04 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Steve Rutherford, kvm, ahonig

On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> 
> On 14/05/2015 17:23, Alex Williamson wrote:
> > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > >  For the same reason, assigned devices using legacy device assignment
> > > probably would not be able to use INTX (so this feature should depend on
> > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > irq_ack_notifiers goes away.
> > > 
> > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > emualtion to userspace).
> > 
> > Without KVM irqchip, we use a rudimentary approach where we disable
> > mmaps to the device when an interrupt occurs.  This makes us trap all
> > accesses to the device.  We then handle any device access from the guest
> > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > we've at least rate-limited it.  If it doesn't, a timer eventually
> > re-enables devices mmaps.  It's not very efficient, but it works (for
> > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > very platform/machine specific.  Thanks,
> 
> If we move the IOAPIC back to userspace, it probably would be easier to
> implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> start!

The current solution is merely functional with the benefit of being
architecture and machine-type independent.  If someone actually cared
about vfio assigned device performance without KVM irqchip, we'd
certainly need a more deterministic and efficient EOI path.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-14  9:12     ` Wu, Feng
@ 2015-05-14 19:29       ` Andrew Honig
  2015-05-15  1:28         ` Wu, Feng
  2015-05-15  5:03         ` Wanpeng Li
  0 siblings, 2 replies; 41+ messages in thread
From: Andrew Honig @ 2015-05-14 19:29 UTC (permalink / raw)
  To: Wu, Feng; +Cc: Steve Rutherford, Paolo Bonzini, kvm

>
> BTW, what is the purpose of this series. If I understand it correctly, you only want to
> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
> benefit of it?

The purpose is to achieve the security benefit of removing some of the
interrupt handling into userspace, without incurring a significant
performance penalty.  If you move the entire IRQCHIP into userspace,
we've seen perf impacts from 15-200% depending on the workload.  With
this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
latency, TCP throughput, and Disk I/O).  See
(https://lwn.net/Articles/619332/)

>
> Thanks,
> Feng
>
>>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-14 16:04             ` Alex Williamson
@ 2015-05-14 22:10               ` Steve Rutherford
  2015-05-14 22:35                 ` Alex Williamson
  0 siblings, 1 reply; 41+ messages in thread
From: Steve Rutherford @ 2015-05-14 22:10 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Paolo Bonzini, kvm, ahonig

On Thu, May 14, 2015 at 10:04:40AM -0600, Alex Williamson wrote:
> On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> > 
> > On 14/05/2015 17:23, Alex Williamson wrote:
> > > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > > >  For the same reason, assigned devices using legacy device assignment
> > > > probably would not be able to use INTX (so this feature should depend on
> > > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > > irq_ack_notifiers goes away.
> > > > 
> > > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > > emualtion to userspace).
> > > 
> > > Without KVM irqchip, we use a rudimentary approach where we disable
> > > mmaps to the device when an interrupt occurs.  This makes us trap all
> > > accesses to the device.  We then handle any device access from the guest
> > > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > > we've at least rate-limited it.  If it doesn't, a timer eventually
> > > re-enables devices mmaps.  It's not very efficient, but it works (for
> > > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > > very platform/machine specific.  Thanks,
> > 
> > If we move the IOAPIC back to userspace, it probably would be easier to
> > implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> > start!
> 
> The current solution is merely functional with the benefit of being
> architecture and machine-type independent.  If someone actually cared
> about vfio assigned device performance without KVM irqchip, we'd
> certainly need a more deterministic and efficient EOI path.  Thanks,
> 
> Alex
> 

I'd like this feature to not preclude fast /modern/ device assignment (i.e. devices using MSIs). A perf hit (or even incompatibility) for legacy devices is fine [that's sort of the premise of this patchset]. What's necessary for the device assignment to work with the split irqchip?

My take regarding the original question: No, we don't need to scan the IOAPIC when the irqchip is split, given that the userspace IOAPIC is not using resamplefds to fetch EOIs from the kernel. 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-14 22:10               ` Steve Rutherford
@ 2015-05-14 22:35                 ` Alex Williamson
  2015-05-14 23:21                   ` Steve Rutherford
  0 siblings, 1 reply; 41+ messages in thread
From: Alex Williamson @ 2015-05-14 22:35 UTC (permalink / raw)
  To: Steve Rutherford; +Cc: Paolo Bonzini, kvm, ahonig

On Thu, 2015-05-14 at 15:10 -0700, Steve Rutherford wrote:
> On Thu, May 14, 2015 at 10:04:40AM -0600, Alex Williamson wrote:
> > On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> > > 
> > > On 14/05/2015 17:23, Alex Williamson wrote:
> > > > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > > > >  For the same reason, assigned devices using legacy device assignment
> > > > > probably would not be able to use INTX (so this feature should depend on
> > > > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > > > irq_ack_notifiers goes away.
> > > > > 
> > > > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > > > emualtion to userspace).
> > > > 
> > > > Without KVM irqchip, we use a rudimentary approach where we disable
> > > > mmaps to the device when an interrupt occurs.  This makes us trap all
> > > > accesses to the device.  We then handle any device access from the guest
> > > > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > > > we've at least rate-limited it.  If it doesn't, a timer eventually
> > > > re-enables devices mmaps.  It's not very efficient, but it works (for
> > > > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > > > very platform/machine specific.  Thanks,
> > > 
> > > If we move the IOAPIC back to userspace, it probably would be easier to
> > > implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> > > start!
> > 
> > The current solution is merely functional with the benefit of being
> > architecture and machine-type independent.  If someone actually cared
> > about vfio assigned device performance without KVM irqchip, we'd
> > certainly need a more deterministic and efficient EOI path.  Thanks,
> > 
> > Alex
> > 
> 
> I'd like this feature to not preclude fast /modern/ device assignment
> (i.e. devices using MSIs). A perf hit (or even incompatibility) for
> legacy devices is fine [that's sort of the premise of this patchset].
> What's necessary for the device assignment to work with the split
> irqchip?

The current code will attempt to use kvm_irqchip_add_msi_route() and
kvm_irqchip_add_irqfd_notifier() to route the MSI through the KVM
irqchip.  Without that, QEMU will receive the MSI eventfd and call
msi_notify() or msix_notify(), which would be significantly slower.
This should be pretty much the same as virtio, which you've hopefully
already taken into consideration.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-14 22:35                 ` Alex Williamson
@ 2015-05-14 23:21                   ` Steve Rutherford
  0 siblings, 0 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-14 23:21 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Paolo Bonzini, kvm, ahonig

On Thu, May 14, 2015 at 04:35:19PM -0600, Alex Williamson wrote:
> On Thu, 2015-05-14 at 15:10 -0700, Steve Rutherford wrote:
> > On Thu, May 14, 2015 at 10:04:40AM -0600, Alex Williamson wrote:
> > > On Thu, 2015-05-14 at 17:46 +0200, Paolo Bonzini wrote:
> > > > 
> > > > On 14/05/2015 17:23, Alex Williamson wrote:
> > > > > > Would irq_ack_notifiers be used at all with this patch set?  Resampling
> > > > > > of IOAPIC level-triggered interrupts would be implemented in userspace.
> > > > > >  For the same reason, assigned devices using legacy device assignment
> > > > > > probably would not be able to use INTX (so this feature should depend on
> > > > > > !KVM_DEVICE_ASSIGNMENT).  Add the emulated i8254 and the last user of
> > > > > > irq_ack_notifiers goes away.
> > > > > > 
> > > > > > Alex, how does VFIO do INTX resampling if you're using TCG or -machine
> > > > > > kernel_irqchip=off?  (Context: this series keeps the local APIC
> > > > > > emulation in the kernel, thus including MSI, but moves the IOAPIC
> > > > > > emualtion to userspace).
> > > > > 
> > > > > Without KVM irqchip, we use a rudimentary approach where we disable
> > > > > mmaps to the device when an interrupt occurs.  This makes us trap all
> > > > > accesses to the device.  We then handle any device access from the guest
> > > > > as a potential EOI and unmask the interrupt.  If the interrupt re-fires,
> > > > > we've at least rate-limited it.  If it doesn't, a timer eventually
> > > > > re-enables devices mmaps.  It's not very efficient, but it works (for
> > > > > both PCI and platform devices) and avoids IRQ APIs through QEMU that get
> > > > > very platform/machine specific.  Thanks,
> > > > 
> > > > If we move the IOAPIC back to userspace, it probably would be easier to
> > > > implement irqfd/resamplefd in userspace as well.  Let the bikeshedding
> > > > start!
> > > 
> > > The current solution is merely functional with the benefit of being
> > > architecture and machine-type independent.  If someone actually cared
> > > about vfio assigned device performance without KVM irqchip, we'd
> > > certainly need a more deterministic and efficient EOI path.  Thanks,
> > > 
> > > Alex
> > > 
> > 
> > I'd like this feature to not preclude fast /modern/ device assignment
> > (i.e. devices using MSIs). A perf hit (or even incompatibility) for
> > legacy devices is fine [that's sort of the premise of this patchset].
> > What's necessary for the device assignment to work with the split
> > irqchip?
> 
> The current code will attempt to use kvm_irqchip_add_msi_route() and
> kvm_irqchip_add_irqfd_notifier() to route the MSI through the KVM
> irqchip.  Without that, QEMU will receive the MSI eventfd and call
> msi_notify() or msix_notify(), which would be significantly slower.
> This should be pretty much the same as virtio, which you've hopefully
> already taken into consideration.  Thanks,
> 
> Alex
> 

Those will be easy to update, given that the routing table is staying in the kernel. The notion of "kvm_irqchip_in_kernel" will need to be updated when support for this is added to QEMU, but that's not a huge deal.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-14 19:29       ` Andrew Honig
@ 2015-05-15  1:28         ` Wu, Feng
  2015-05-15  5:03         ` Wanpeng Li
  1 sibling, 0 replies; 41+ messages in thread
From: Wu, Feng @ 2015-05-15  1:28 UTC (permalink / raw)
  To: Andrew Honig; +Cc: Steve Rutherford, Paolo Bonzini, kvm, Wu, Feng



> -----Original Message-----
> From: Andrew Honig [mailto:ahonig@google.com]
> Sent: Friday, May 15, 2015 3:29 AM
> To: Wu, Feng
> Cc: Steve Rutherford; Paolo Bonzini; kvm@vger.kernel.org
> Subject: Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
> 
> >
> > BTW, what is the purpose of this series. If I understand it correctly, you only
> want to
> > use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is
> the
> > benefit of it?
> 
> The purpose is to achieve the security benefit of removing some of the
> interrupt handling into userspace, without incurring a significant
> performance penalty.  If you move the entire IRQCHIP into userspace,
> we've seen perf impacts from 15-200% depending on the workload.  With
> this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
> latency, TCP throughput, and Disk I/O).  See
> (https://lwn.net/Articles/619332/)

Good to know this, thanks for the sharing!

Thanks,
Feng

> 
> >
> > Thanks,
> > Feng
> >
> >>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference
  2015-05-13  8:04     ` Paolo Bonzini
  2015-05-13  8:10       ` Jan Kiszka
  2015-05-13 22:21       ` Steve Rutherford
@ 2015-05-15  2:38       ` Steve Rutherford
  2 siblings, 0 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-15  2:38 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Jan Kiszka, kvm, ahonig

On Wed, May 13, 2015 at 10:04:53AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 08:12, Jan Kiszka wrote:
> >> +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> >> +{
> >> +	struct kvm *kvm = vcpu->kvm;
> >> +	struct kvm_kernel_irq_routing_entry *entry;
> >> +	struct kvm_irq_routing_table *table;
> >> +	u32 i, nr_rt_entries;
> >> +
> >> +	mutex_lock(&kvm->irq_lock);
> 
> This only needs irq_srcu protection, not irq_lock, so the lookup cost
> becomes much smaller (all CPUs can proceed in parallel).

I've updated the next iteration to include this. 

> 
> You would need to put an smp_mb here, to ensure that irq_routing is read
> after KVM_SCAN_IOAPIC is cleared.  You can introduce
> smb_mb__after_srcu_read_lock in order to elide it.
> 
> The matching memory barrier would be a smp_mb__before_atomic in
> kvm_make_scan_ioapic_request. 

Wait, what could happen if KVM_SCAN_IOAPIC is cleared after irq_routing is read? I'm imagining the following, but I'm not sure it makes sense.

After reading irq_routing, another routing update could roll through (which would be followed the setting of KVM_SCAN_IOAPIC), both of which could be followed by the reordered clearing of KVM_SCAN_IOAPIC. This would lead to only one scan, which would use the stale value for kvm->irq_routing.

The reading of kvm->irq_routing (and the reads in srcu_read_lock) should be able to be reordered back to before the clearing of the bit in vcpu->requests (but after the read of vcpu->requests), because reads can be reordered with unrelated writes, but not with other reads. If a routing update came through on another cpu during this window, the issue above could happen. 

Adding a memory barrier (but not an wmb or an rmb) before reading irq_routing (ideally not in the srcu critical section) should prevent this from happening?

Sorry if this is long winded. Memory ordering always feels subtle. 

Steve

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-14 19:29       ` Andrew Honig
  2015-05-15  1:28         ` Wu, Feng
@ 2015-05-15  5:03         ` Wanpeng Li
  2015-05-15 18:10           ` Steve Rutherford
  1 sibling, 1 reply; 41+ messages in thread
From: Wanpeng Li @ 2015-05-15  5:03 UTC (permalink / raw)
  To: Andrew Honig; +Cc: Wu, Feng, Steve Rutherford, Paolo Bonzini, kvm

On Thu, May 14, 2015 at 12:29:21PM -0700, Andrew Honig wrote:
>>
>> BTW, what is the purpose of this series. If I understand it correctly, you only want to
>> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
>> benefit of it?
>
>The purpose is to achieve the security benefit of removing some of the
>interrupt handling into userspace, without incurring a significant
>performance penalty.  If you move the entire IRQCHIP into userspace,
>we've seen perf impacts from 15-200% depending on the workload.  With
>this patch series, we're seeing perf penalty <1% on our tests (TCP_RR

Why keep pic and ioapic in kernel space not get obvious benefit, what's
the bottleneck?

Regards,
Wanpeng Li 

>latency, TCP throughput, and Disk I/O).  See
>(https://lwn.net/Articles/619332/)
>
>>
>> Thanks,
>> Feng
>>
>>>
>--
>To unsubscribe from this list: send the line "unsubscribe kvm" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace
  2015-05-13 22:41     ` Steve Rutherford
@ 2015-05-15 13:27       ` Jan Kiszka
  0 siblings, 0 replies; 41+ messages in thread
From: Jan Kiszka @ 2015-05-15 13:27 UTC (permalink / raw)
  To: Steve Rutherford; +Cc: kvm, Andrew Honig

On 2015-05-14 00:41, Steve Rutherford wrote:
> On Wed, May 13, 2015 at 08:12:59AM +0200, Jan Kiszka wrote:
>> On 2015-05-13 03:47, Steve Rutherford wrote:
>>> In order to enable userspace PIC support, the userspace PIC needs to
>>> be able to inject local interrupt requests.
>>>
>>> This adds the ioctl KVM_REQUEST_LOCAL_INTERRUPT and kvm exit
>>> KVM_EXIT_GET_EXTINT.
>>>
>>> The vm ioctl KVM_REQUEST_LOCAL_INTERRUPT makes a KVM_REQ_EVENT request
>>> on the BSP, which causes the BSP to exit to userspace to fetch the
>>> vector of the underlying external interrupt, which the BSP then
>>> injects into the guest. This matches the PIC spec, and is necessary to
>>> boot Windows.
>>
>> The API name seems too generic, given the fairly specific use case. As
>> it only allows to kick the BSP, not any VCPU, that should be clarified.
>> Maybe call it KVM_REQUEST_PIC_INJECTION or so. Or allow userspace to
>> specify the target VCPU, then it's a bit more generic again.
>>
>> Actually, when looking at the MultiProcessor spec, you will find
>> multiple models for injecting PIC interrupts into CPU APICs. Just in the
>> PIC Mode, the BSP is the only possible target. In the other modes, all
>> APICs can receive PIC interrupts, and either the IOAPIC or the local
>> APIC's LINT0 configuration decide about the effective target. We should
>> allow to model all modes, based on userspace decisions.
>>
> 
> Supporting the other modes seems reasonable, but I'm not certain I have an outlet for testing them for correctness. I'm not even certain which OSes use the other modes. Unless there is a common OS that uses the other modes, and a straightforward way for me to test the other modes, I'd rather just rename the API to be less generic.

The OS has to configure what the hardware provides, I bet Windows does
so as well. This is a hardware property, thus something userspace (QEMU
& Co.) may want to configure.

> 
>>>
>>> Boots and passes the KVM unit tests on intel x86 with the
>>> PIC/PIT/IOAPIC in userspace (under a non-QEMU VMM). Boots and passes
>>> the KVM unit tests under normal conditions as well.
>>
>> Do you plan to provide a reference implementation for an open source
>> userspace VMM as well, once the kernel side is settled?
>>
> 
> Not at the moment (given that I'm not all that familiar with QEMU). I'm definitely willing to help guide someone else through the process.

It would be fairly valuable to have this tested against a known, public
machine emulator so that we can validate all needs before setting the
new kernel ABI in stone.

I do have an interest in this API as well, sitting on IRQ remapping
hacks and their lacking x2APIC support for too long, but I unfortunately
can't promise bandwidth either. :/

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-15  5:03         ` Wanpeng Li
@ 2015-05-15 18:10           ` Steve Rutherford
  2015-05-18  2:11             ` Wanpeng Li
  0 siblings, 1 reply; 41+ messages in thread
From: Steve Rutherford @ 2015-05-15 18:10 UTC (permalink / raw)
  To: Wanpeng Li; +Cc: Andrew Honig, Wu, Feng, Paolo Bonzini, kvm

On Fri, May 15, 2015 at 01:03:02PM +0800, Wanpeng Li wrote:
> On Thu, May 14, 2015 at 12:29:21PM -0700, Andrew Honig wrote:
> >>
> >> BTW, what is the purpose of this series. If I understand it correctly, you only want to
> >> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
> >> benefit of it?
> >
> >The purpose is to achieve the security benefit of removing some of the
> >interrupt handling into userspace, without incurring a significant
> >performance penalty.  If you move the entire IRQCHIP into userspace,
> >we've seen perf impacts from 15-200% depending on the workload.  With
> >this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
> 
> Why keep pic and ioapic in kernel space not get obvious benefit, what's
> the bottleneck?

It's the other way around. The PIC and IOAPIC are going up to userspace, and the APICs are staying in the kernel.

> 
> Regards,
> Wanpeng Li 
> 
> >latency, TCP throughput, and Disk I/O).  See
> >(https://lwn.net/Articles/619332/)
> >
> >>
> >> Thanks,
> >> Feng
> >>
> >>>
> >--
> >To unsubscribe from this list: send the line "unsubscribe kvm" in
> >the body of a message to majordomo@vger.kernel.org
> >More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP.
  2015-05-15 18:10           ` Steve Rutherford
@ 2015-05-18  2:11             ` Wanpeng Li
  0 siblings, 0 replies; 41+ messages in thread
From: Wanpeng Li @ 2015-05-18  2:11 UTC (permalink / raw)
  To: Steve Rutherford; +Cc: Andrew Honig, Wu, Feng, Paolo Bonzini, kvm

On Fri, May 15, 2015 at 11:10:23AM -0700, Steve Rutherford wrote:
>On Fri, May 15, 2015 at 01:03:02PM +0800, Wanpeng Li wrote:
>> On Thu, May 14, 2015 at 12:29:21PM -0700, Andrew Honig wrote:
>> >>
>> >> BTW, what is the purpose of this series. If I understand it correctly, you only want to
>> >> use the in-kernel lapic and leave the others (pic, ioapic) in userspace, what is the
>> >> benefit of it?
>> >
>> >The purpose is to achieve the security benefit of removing some of the
>> >interrupt handling into userspace, without incurring a significant
>> >performance penalty.  If you move the entire IRQCHIP into userspace,
>> >we've seen perf impacts from 15-200% depending on the workload.  With
>> >this patch series, we're seeing perf penalty <1% on our tests (TCP_RR
>> 
>> Why keep pic and ioapic in kernel space not get obvious benefit, what's
>> the bottleneck?
>
>It's the other way around. The PIC and IOAPIC are going up to userspace, and the APICs are staying in the kernel.

Yeah, this is what you have done in your patchset. I mean do you observe 
that why keep pic and ioapic in kernel space not get obvious benefit than 
move pic and ioapic to userspace.

Regards,
Wanpeng Li 

>
>> 
>> Regards,
>> Wanpeng Li 
>> 
>> >latency, TCP throughput, and Disk I/O).  See
> >(https://lwn.net/Articles/619332/)
>> >
>> >>
>> >> Thanks,
>> >> Feng
>> >>
>> >>>
>> >--
>> >To unsubscribe from this list: send the line "unsubscribe kvm" in
>> >the body of a message to majordomo@vger.kernel.org
>> >More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs
  2015-05-13  1:47 ` [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs Steve Rutherford
  2015-05-13  7:35   ` Paolo Bonzini
@ 2015-05-24 16:46   ` Avi Kivity
  2015-05-27  2:06     ` Steve Rutherford
  1 sibling, 1 reply; 41+ messages in thread
From: Avi Kivity @ 2015-05-24 16:46 UTC (permalink / raw)
  To: Steve Rutherford, kvm; +Cc: ahonig

On 05/13/2015 04:47 AM, Steve Rutherford wrote:
> Adds KVM_EXIT_IOAPIC_EOI which passes the interrupt vector up to
> userspace.
>
> Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs
> to be informed (which is identical to the EOI_EXIT_BITMAP field used
> by modern x86 processors, but can also be used to elide kvm IOAPIC EOI
> exits on older processors).
>
> [Note: A prototype using ResampleFDs found that decoupling the EOI
> from the VCPU's thread made it possible for the VCPU to not see a
> recent EOI after reentering the guest. This does not match real
> hardware.]
>
> Compile tested for Intel x86.
>
> Signed-off-by: Steve Rutherford <srutherford@google.com>
> ---
>   Documentation/virtual/kvm/api.txt | 10 ++++++++++
>   arch/x86/include/asm/kvm_host.h   |  3 +++
>   arch/x86/kvm/lapic.c              |  9 +++++++++
>   arch/x86/kvm/x86.c                | 11 +++++++++++
>   include/linux/kvm_host.h          |  1 +
>   include/uapi/linux/kvm.h          |  5 +++++
>   6 files changed, 39 insertions(+)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 0744b4e..dd92996 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3285,6 +3285,16 @@ Valid values for 'type' are:
>   	 */
>   	__u64 kvm_valid_regs;
>   	__u64 kvm_dirty_regs;
> +
> +	/* KVM_EXIT_IOAPIC_EOI */
> +        struct {
> +	       __u8 vector;
> +        } eoi;
> +
> +Indicates that an eoi of a level triggered IOAPIC interrupt on vector has
> +occurred, which should be handled by the userspace IOAPIC. Triggers when
> +the Irqchip has been split between userspace and the kernel.
> +

The ioapic is a global resource, so it doesn't make sense for 
information about it to be returned in a per-vcpu structure (or to block 
the vcpu while it is being processed).

The way I'd model it is to emulate the APIC bus that connects local 
APICs and the IOAPIC, using a socket pair.  When the user-space ioapic 
wants to inject an interrupt, it sends a message to the local APICs 
which then inject it, and when it's ack'ed the EOI is sent back on the 
same bus.

It's true that the APIC bus no longer exists, but modern processors 
still pretend it does.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs
  2015-05-24 16:46   ` Avi Kivity
@ 2015-05-27  2:06     ` Steve Rutherford
  2015-05-27  5:32       ` Avi Kivity
  0 siblings, 1 reply; 41+ messages in thread
From: Steve Rutherford @ 2015-05-27  2:06 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, ahonig

On Sun, May 24, 2015 at 07:46:03PM +0300, Avi Kivity wrote:
> On 05/13/2015 04:47 AM, Steve Rutherford wrote:
> >Adds KVM_EXIT_IOAPIC_EOI which passes the interrupt vector up to
> >userspace.
> >
> >Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs
> >to be informed (which is identical to the EOI_EXIT_BITMAP field used
> >by modern x86 processors, but can also be used to elide kvm IOAPIC EOI
> >exits on older processors).
> >
> >[Note: A prototype using ResampleFDs found that decoupling the EOI
> >from the VCPU's thread made it possible for the VCPU to not see a
> >recent EOI after reentering the guest. This does not match real
> >hardware.]
> >
> >Compile tested for Intel x86.
> >
> >Signed-off-by: Steve Rutherford <srutherford@google.com>
> >---
> >  Documentation/virtual/kvm/api.txt | 10 ++++++++++
> >  arch/x86/include/asm/kvm_host.h   |  3 +++
> >  arch/x86/kvm/lapic.c              |  9 +++++++++
> >  arch/x86/kvm/x86.c                | 11 +++++++++++
> >  include/linux/kvm_host.h          |  1 +
> >  include/uapi/linux/kvm.h          |  5 +++++
> >  6 files changed, 39 insertions(+)
> >
> >diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> >index 0744b4e..dd92996 100644
> >--- a/Documentation/virtual/kvm/api.txt
> >+++ b/Documentation/virtual/kvm/api.txt
> >@@ -3285,6 +3285,16 @@ Valid values for 'type' are:
> >  	 */
> >  	__u64 kvm_valid_regs;
> >  	__u64 kvm_dirty_regs;
> >+
> >+	/* KVM_EXIT_IOAPIC_EOI */
> >+        struct {
> >+	       __u8 vector;
> >+        } eoi;
> >+
> >+Indicates that an eoi of a level triggered IOAPIC interrupt on vector has
> >+occurred, which should be handled by the userspace IOAPIC. Triggers when
> >+the Irqchip has been split between userspace and the kernel.
> >+
> 
> The ioapic is a global resource, so it doesn't make sense for
> information about it to be returned in a per-vcpu structure
EOI exits are a per-vcpu behavior, so this doesn't seem all that strange.

> (or to block the vcpu while it is being processed).

Blocking doesn't feel clean, but doesn't seem all that bad, given
that these operations are relatively rare on modern configurations.

> 
> The way I'd model it is to emulate the APIC bus that connects local
> APICs and the IOAPIC, using a socket pair.  When the user-space
> ioapic wants to inject an interrupt, it sends a message to the local
> APICs which then inject it, and when it's ack'ed the EOI is sent
> back on the same bus.
Although I'm not certain about this, it sounds to me like this would
require a kernel thread to be waiting (in some way) on this socket, which
seems rather heavy handed.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs
  2015-05-27  2:06     ` Steve Rutherford
@ 2015-05-27  5:32       ` Avi Kivity
  2015-05-28 21:58         ` Steve Rutherford
  0 siblings, 1 reply; 41+ messages in thread
From: Avi Kivity @ 2015-05-27  5:32 UTC (permalink / raw)
  To: Steve Rutherford; +Cc: kvm, ahonig

On 05/27/2015 05:06 AM, Steve Rutherford wrote:
> On Sun, May 24, 2015 at 07:46:03PM +0300, Avi Kivity wrote:
>> On 05/13/2015 04:47 AM, Steve Rutherford wrote:
>>> Adds KVM_EXIT_IOAPIC_EOI which passes the interrupt vector up to
>>> userspace.
>>>
>>> Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs
>>> to be informed (which is identical to the EOI_EXIT_BITMAP field used
>>> by modern x86 processors, but can also be used to elide kvm IOAPIC EOI
>>> exits on older processors).
>>>
>>> [Note: A prototype using ResampleFDs found that decoupling the EOI
>> >from the VCPU's thread made it possible for the VCPU to not see a
>>> recent EOI after reentering the guest. This does not match real
>>> hardware.]
>>>
>>> Compile tested for Intel x86.
>>>
>>> Signed-off-by: Steve Rutherford <srutherford@google.com>
>>> ---
>>>   Documentation/virtual/kvm/api.txt | 10 ++++++++++
>>>   arch/x86/include/asm/kvm_host.h   |  3 +++
>>>   arch/x86/kvm/lapic.c              |  9 +++++++++
>>>   arch/x86/kvm/x86.c                | 11 +++++++++++
>>>   include/linux/kvm_host.h          |  1 +
>>>   include/uapi/linux/kvm.h          |  5 +++++
>>>   6 files changed, 39 insertions(+)
>>>
>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>> index 0744b4e..dd92996 100644
>>> --- a/Documentation/virtual/kvm/api.txt
>>> +++ b/Documentation/virtual/kvm/api.txt
>>> @@ -3285,6 +3285,16 @@ Valid values for 'type' are:
>>>   	 */
>>>   	__u64 kvm_valid_regs;
>>>   	__u64 kvm_dirty_regs;
>>> +
>>> +	/* KVM_EXIT_IOAPIC_EOI */
>>> +        struct {
>>> +	       __u8 vector;
>>> +        } eoi;
>>> +
>>> +Indicates that an eoi of a level triggered IOAPIC interrupt on vector has
>>> +occurred, which should be handled by the userspace IOAPIC. Triggers when
>>> +the Irqchip has been split between userspace and the kernel.
>>> +
>> The ioapic is a global resource, so it doesn't make sense for
>> information about it to be returned in a per-vcpu structure
> EOI exits are a per-vcpu behavior, so this doesn't seem all that strange.
>
>> (or to block the vcpu while it is being processed).
> Blocking doesn't feel clean, but doesn't seem all that bad, given
> that these operations are relatively rare on modern configurations.

Agree, maybe the realtime people have an interest here.

>> The way I'd model it is to emulate the APIC bus that connects local
>> APICs and the IOAPIC, using a socket pair.  When the user-space
>> ioapic wants to inject an interrupt, it sends a message to the local
>> APICs which then inject it, and when it's ack'ed the EOI is sent
>> back on the same bus.
> Although I'm not certain about this, it sounds to me like this would
> require a kernel thread to be waiting (in some way) on this socket, which
> seems rather heavy handed.

It's been a while since I did kernel programming, but I think you can 
queue a callback to be called when an I/O is ready, and not require a 
thread.  IIRC we do that with irqfd to cause an interrupt to be injected.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs
  2015-05-27  5:32       ` Avi Kivity
@ 2015-05-28 21:58         ` Steve Rutherford
  0 siblings, 0 replies; 41+ messages in thread
From: Steve Rutherford @ 2015-05-28 21:58 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, ahonig

On Wed, May 27, 2015 at 08:32:04AM +0300, Avi Kivity wrote:
> On 05/27/2015 05:06 AM, Steve Rutherford wrote:
> >On Sun, May 24, 2015 at 07:46:03PM +0300, Avi Kivity wrote:
> >>On 05/13/2015 04:47 AM, Steve Rutherford wrote:
> >>>Adds KVM_EXIT_IOAPIC_EOI which passes the interrupt vector up to
> >>>userspace.
> >>>
> >>>Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs
> >>>to be informed (which is identical to the EOI_EXIT_BITMAP field used
> >>>by modern x86 processors, but can also be used to elide kvm IOAPIC EOI
> >>>exits on older processors).
> >>>
> >>>[Note: A prototype using ResampleFDs found that decoupling the EOI
> >>>from the VCPU's thread made it possible for the VCPU to not see a
> >>>recent EOI after reentering the guest. This does not match real
> >>>hardware.]
> >>>
> >>>Compile tested for Intel x86.
> >>>
> >>>Signed-off-by: Steve Rutherford <srutherford@google.com>
> >>>---
> >>>  Documentation/virtual/kvm/api.txt | 10 ++++++++++
> >>>  arch/x86/include/asm/kvm_host.h   |  3 +++
> >>>  arch/x86/kvm/lapic.c              |  9 +++++++++
> >>>  arch/x86/kvm/x86.c                | 11 +++++++++++
> >>>  include/linux/kvm_host.h          |  1 +
> >>>  include/uapi/linux/kvm.h          |  5 +++++
> >>>  6 files changed, 39 insertions(+)
> >>>
> >>>diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> >>>index 0744b4e..dd92996 100644
> >>>--- a/Documentation/virtual/kvm/api.txt
> >>>+++ b/Documentation/virtual/kvm/api.txt
> >>>@@ -3285,6 +3285,16 @@ Valid values for 'type' are:
> >>>  	 */
> >>>  	__u64 kvm_valid_regs;
> >>>  	__u64 kvm_dirty_regs;
> >>>+
> >>>+	/* KVM_EXIT_IOAPIC_EOI */
> >>>+        struct {
> >>>+	       __u8 vector;
> >>>+        } eoi;
> >>>+
> >>>+Indicates that an eoi of a level triggered IOAPIC interrupt on vector has
> >>>+occurred, which should be handled by the userspace IOAPIC. Triggers when
> >>>+the Irqchip has been split between userspace and the kernel.
> >>>+
> >>The ioapic is a global resource, so it doesn't make sense for
> >>information about it to be returned in a per-vcpu structure
> >EOI exits are a per-vcpu behavior, so this doesn't seem all that strange.
> >
> >>(or to block the vcpu while it is being processed).
> >Blocking doesn't feel clean, but doesn't seem all that bad, given
> >that these operations are relatively rare on modern configurations.
> 
> Agree, maybe the realtime people have an interest here.
> 
> >>The way I'd model it is to emulate the APIC bus that connects local
> >>APICs and the IOAPIC, using a socket pair.  When the user-space
> >>ioapic wants to inject an interrupt, it sends a message to the local
> >>APICs which then inject it, and when it's ack'ed the EOI is sent
> >>back on the same bus.
> >Although I'm not certain about this, it sounds to me like this would
> >require a kernel thread to be waiting (in some way) on this socket, which
> >seems rather heavy handed.
> 
> It's been a while since I did kernel programming, but I think you
> can queue a callback to be called when an I/O is ready, and not
> require a thread.  IIRC we do that with irqfd to cause an interrupt
> to be injected.
> 

This should be possible, but it's going to add a ton of complexity, and I don't really see any compelling benefits. If there is a compelling reason to switch to a socket based interface, I'm definitely willing to refactor.

Steve

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2015-05-28 21:58 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-13  1:47 [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Steve Rutherford
2015-05-13  1:47 ` [RFC PATCH 2/4] KVM: x86: Add KVM exit for IOAPIC EOIs Steve Rutherford
2015-05-13  7:35   ` Paolo Bonzini
2015-05-13 22:18     ` Steve Rutherford
2015-05-24 16:46   ` Avi Kivity
2015-05-27  2:06     ` Steve Rutherford
2015-05-27  5:32       ` Avi Kivity
2015-05-28 21:58         ` Steve Rutherford
2015-05-13  1:47 ` [RFC PATCH 3/4] KVM: x86: Add EOI exit bitmap inference Steve Rutherford
2015-05-13  6:12   ` Jan Kiszka
2015-05-13  8:04     ` Paolo Bonzini
2015-05-13  8:10       ` Jan Kiszka
2015-05-13  9:24         ` Paolo Bonzini
2015-05-13 10:25           ` Jan Kiszka
2015-05-13 13:04             ` Paolo Bonzini
2015-05-13 13:19               ` Jan Kiszka
2015-05-13 22:21       ` Steve Rutherford
2015-05-15  2:38       ` Steve Rutherford
2015-05-13  7:51   ` Paolo Bonzini
2015-05-13 22:24     ` Steve Rutherford
2015-05-14  9:20       ` Paolo Bonzini
2015-05-14 15:23         ` Alex Williamson
2015-05-14 15:46           ` Paolo Bonzini
2015-05-14 16:04             ` Alex Williamson
2015-05-14 22:10               ` Steve Rutherford
2015-05-14 22:35                 ` Alex Williamson
2015-05-14 23:21                   ` Steve Rutherford
2015-05-13  1:47 ` [RFC PATCH 4/4] KVM: x86: Add support for local interrupt requests from userspace Steve Rutherford
2015-05-13  6:12   ` Jan Kiszka
2015-05-13 22:41     ` Steve Rutherford
2015-05-15 13:27       ` Jan Kiszka
2015-05-13  7:22   ` Paolo Bonzini
2015-05-13 23:13     ` Steve Rutherford
2015-05-13  7:57 ` [RFC PATCH 1/4] KVM: x86: Split the APIC from the rest of IRQCHIP Paolo Bonzini
2015-05-13 22:10   ` Steve Rutherford
2015-05-14  9:12     ` Wu, Feng
2015-05-14 19:29       ` Andrew Honig
2015-05-15  1:28         ` Wu, Feng
2015-05-15  5:03         ` Wanpeng Li
2015-05-15 18:10           ` Steve Rutherford
2015-05-18  2:11             ` Wanpeng Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.