On Mon, Feb 04, 2019 at 12:30:39PM +0100, Cédric Le Goater wrote: > On 2/4/19 5:45 AM, David Gibson wrote: > > On Mon, Jan 07, 2019 at 07:43:18PM +0100, Cédric Le Goater wrote: > >> This will let the guest create a memory mapping to expose the ESB MMIO > >> regions used to control the interrupt sources, to trigger events, to > >> EOI or to turn off the sources. > >> > >> Signed-off-by: Cédric Le Goater > >> --- > >> arch/powerpc/include/uapi/asm/kvm.h | 4 ++ > >> arch/powerpc/kvm/book3s_xive_native.c | 97 +++++++++++++++++++++++++++ > >> 2 files changed, 101 insertions(+) > >> > >> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h > >> index 8c876c166ef2..6bb61ba141c2 100644 > >> --- a/arch/powerpc/include/uapi/asm/kvm.h > >> +++ b/arch/powerpc/include/uapi/asm/kvm.h > >> @@ -675,4 +675,8 @@ struct kvm_ppc_cpu_char { > >> #define KVM_XICS_PRESENTED (1ULL << 43) > >> #define KVM_XICS_QUEUED (1ULL << 44) > >> > >> +/* POWER9 XIVE Native Interrupt Controller */ > >> +#define KVM_DEV_XIVE_GRP_CTRL 1 > >> +#define KVM_DEV_XIVE_GET_ESB_FD 1 > > > > Introducing a new FD for ESB and TIMA seems overkill. Can't you get > > to both with an mmap() directly on the xive device fd? Using the > > offset to distinguish which one to map, obviously. > > The page offset would define some sort of user API. It seems feasible. > But I am not sure this would be practical in the future if we need to > tune the length. Um... why not? I mean, yes, the XIVE supports rather a lot of interrupts, but we have 64 bits of offset we can play with - we can leave room for billions of ESB slots and still have room for billions of VPs. > The TIMA has two pages that can be exposed at guest level for interrupt > management : the OS and the USER page. That should be OK. > > But we might want to map only portions of the interrupt ESB space, for > PCI passthrough for instance as Paul proposed. I am still looking at that. > > Thanks, > > C. 
> > >> #endif /* __LINUX_KVM_POWERPC_H */ > >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c > >> index 115143e76c45..e20081f0c8d4 100644 > >> --- a/arch/powerpc/kvm/book3s_xive_native.c > >> +++ b/arch/powerpc/kvm/book3s_xive_native.c > >> @@ -153,6 +153,85 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, > >> return rc; > >> } > >> > >> +static int xive_native_esb_fault(struct vm_fault *vmf) > >> +{ > >> + struct vm_area_struct *vma = vmf->vma; > >> + struct kvmppc_xive *xive = vma->vm_file->private_data; > >> + struct kvmppc_xive_src_block *sb; > >> + struct kvmppc_xive_irq_state *state; > >> + struct xive_irq_data *xd; > >> + u32 hw_num; > >> + u16 src; > >> + u64 page; > >> + unsigned long irq; > >> + > >> + /* > >> + * Linux/KVM uses a two pages ESB setting, one for trigger and > >> + * one for EOI > >> + */ > >> + irq = vmf->pgoff / 2; > >> + > >> + sb = kvmppc_xive_find_source(xive, irq, &src); > >> + if (!sb) { > >> + pr_err("%s: source %lx not found !\n", __func__, irq); > >> + return VM_FAULT_SIGBUS; > >> + } > >> + > >> + state = &sb->irq_state[src]; > >> + kvmppc_xive_select_irq(state, &hw_num, &xd); > >> + > >> + arch_spin_lock(&sb->lock); > >> + > >> + /* > >> + * first/even page is for trigger > >> + * second/odd page is for EOI and management. > >> + */ > >> + page = vmf->pgoff % 2 ? 
xd->eoi_page : xd->trig_page; > >> + arch_spin_unlock(&sb->lock); > >> + > >> + if (!page) { > >> + pr_err("%s: acessing invalid ESB page for source %lx !\n", > >> + __func__, irq); > >> + return VM_FAULT_SIGBUS; > >> + } > >> + > >> + vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT); > >> + return VM_FAULT_NOPAGE; > >> +} > >> + > >> +static const struct vm_operations_struct xive_native_esb_vmops = { > >> + .fault = xive_native_esb_fault, > >> +}; > >> + > >> +static int xive_native_esb_mmap(struct file *file, struct vm_area_struct *vma) > >> +{ > >> + /* There are two ESB pages (trigger and EOI) per IRQ */ > >> + if (vma_pages(vma) + vma->vm_pgoff > KVMPPC_XIVE_NR_IRQS * 2) > >> + return -EINVAL; > >> + > >> + vma->vm_flags |= VM_IO | VM_PFNMAP; > >> + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); > >> + vma->vm_ops = &xive_native_esb_vmops; > >> + return 0; > >> +} > >> + > >> +static const struct file_operations xive_native_esb_fops = { > >> + .mmap = xive_native_esb_mmap, > >> +}; > >> + > >> +static int kvmppc_xive_native_get_esb_fd(struct kvmppc_xive *xive, u64 addr) > >> +{ > >> + u64 __user *ubufp = (u64 __user *) addr; > >> + int ret; > >> + > >> + ret = anon_inode_getfd("[xive-esb]", &xive_native_esb_fops, xive, > >> + O_RDWR | O_CLOEXEC); > >> + if (ret < 0) > >> + return ret; > >> + > >> + return put_user(ret, ubufp); > >> +} > >> + > >> static int kvmppc_xive_native_set_attr(struct kvm_device *dev, > >> struct kvm_device_attr *attr) > >> { > >> @@ -162,12 +241,30 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev, > >> static int kvmppc_xive_native_get_attr(struct kvm_device *dev, > >> struct kvm_device_attr *attr) > >> { > >> + struct kvmppc_xive *xive = dev->private; > >> + > >> + switch (attr->group) { > >> + case KVM_DEV_XIVE_GRP_CTRL: > >> + switch (attr->attr) { > >> + case KVM_DEV_XIVE_GET_ESB_FD: > >> + return kvmppc_xive_native_get_esb_fd(xive, attr->addr); > >> + } > >> + break; > >> + } > >> return -ENXIO; > >> 
} > >> > >> static int kvmppc_xive_native_has_attr(struct kvm_device *dev, > >> struct kvm_device_attr *attr) > >> { > >> + switch (attr->group) { > >> + case KVM_DEV_XIVE_GRP_CTRL: > >> + switch (attr->attr) { > >> + case KVM_DEV_XIVE_GET_ESB_FD: > >> + return 0; > >> + } > >> + break; > >> + } > >> return -ENXIO; > >> } > >> > > > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson