From: Auger Eric <eric.auger@redhat.com>
To: Alex Williamson <alex.williamson@redhat.com>
Cc: eric.auger.pro@gmail.com, iommu@lists.linux-foundation.org,
linux-kernel@vger.kernel.org, kvm@vger.kernel.org,
kvmarm@lists.cs.columbia.edu, joro@8bytes.org,
jacob.jun.pan@linux.intel.com, yi.l.liu@linux.intel.com,
jean-philippe.brucker@arm.com, will.deacon@arm.com,
robin.murphy@arm.com, kevin.tian@intel.com, ashok.raj@intel.com,
marc.zyngier@arm.com, christoffer.dall@arm.com,
peter.maydell@linaro.org
Subject: Re: [RFC v3 18/21] vfio-pci: Add a new VFIO_REGION_TYPE_NESTED region type
Date: Mon, 14 Jan 2019 21:48:06 +0100 [thread overview]
Message-ID: <497dad10-8832-3f2c-3ff8-fbcdd4c41ae4@redhat.com> (raw)
In-Reply-To: <20190111165838.06a22ab8@x1.home>
Hi Alex,
On 1/12/19 12:58 AM, Alex Williamson wrote:
> On Tue, 8 Jan 2019 11:26:30 +0100
> Eric Auger <eric.auger@redhat.com> wrote:
>
>> This patch adds a new 64kB region aiming to report nested mode
>> translation faults.
>>
>> The region contains a header with the size of the queue,
>> the producer and consumer indices and then the actual
>> fault queue data. The producer is updated by the kernel while
>> the consumer is updated by the userspace.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>
>> ---
>> ---
>> drivers/vfio/pci/vfio_pci.c | 102 +++++++++++++++++++++++++++-
>> drivers/vfio/pci/vfio_pci_private.h | 2 +
>> include/uapi/linux/vfio.h | 15 ++++
>> 3 files changed, 118 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index ff60bd1ea587..2ba181ab2edd 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -56,6 +56,11 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
>> MODULE_PARM_DESC(disable_idle_d3,
>> "Disable using the PCI D3 low power state for idle, unused devices");
>>
>> +#define VFIO_FAULT_REGION_SIZE 0x10000
>
> Why 64K?
For the region to be mmappable with 64kB page size.
>
>> +#define VFIO_FAULT_QUEUE_SIZE \
>> + ((VFIO_FAULT_REGION_SIZE - sizeof(struct vfio_fault_region_header)) / \
>> + sizeof(struct iommu_fault))
>> +
>> static inline bool vfio_vga_disabled(void)
>> {
>> #ifdef CONFIG_VFIO_PCI_VGA
>> @@ -1226,6 +1231,100 @@ static const struct vfio_device_ops vfio_pci_ops = {
>> static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
>> static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
>>
>> +static size_t
>> +vfio_pci_dma_fault_rw(struct vfio_pci_device *vdev, char __user *buf,
>> + size_t count, loff_t *ppos, bool iswrite)
>> +{
>> + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
>> + void *base = vdev->region[i].data;
>> + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
>> +
>> + if (pos >= vdev->region[i].size)
>> + return -EINVAL;
>> +
>> + count = min(count, (size_t)(vdev->region[i].size - pos));
>> +
>> + if (copy_to_user(buf, base + pos, count))
>> + return -EFAULT;
>> +
>> + *ppos += count;
>> +
>> + return count;
>> +}
>> +
>> +static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
>> + struct vfio_pci_region *region,
>> + struct vm_area_struct *vma)
>> +{
>> + u64 phys_len, req_len, pgoff, req_start;
>> + unsigned long long addr;
>> + unsigned int index;
>> +
>> + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
>> +
>> + if (vma->vm_end < vma->vm_start)
>> + return -EINVAL;
>> + if ((vma->vm_flags & VM_SHARED) == 0)
>> + return -EINVAL;
>> +
>> + phys_len = VFIO_FAULT_REGION_SIZE;
>> +
>> + req_len = vma->vm_end - vma->vm_start;
>> + pgoff = vma->vm_pgoff &
>> + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
>> + req_start = pgoff << PAGE_SHIFT;
>> +
>> + if (req_start + req_len > phys_len)
>> + return -EINVAL;
>> +
>> + addr = virt_to_phys(vdev->fault_region);
>> + vma->vm_private_data = vdev;
>> + vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff;
>> +
>> + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
>> + req_len, vma->vm_page_prot);
>> +}
>> +
>> +void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev,
>> + struct vfio_pci_region *region)
>> +{
>> +}
>> +
>> +static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
>> + .rw = vfio_pci_dma_fault_rw,
>> + .mmap = vfio_pci_dma_fault_mmap,
>> + .release = vfio_pci_dma_fault_release,
>> +};
>> +
>> +static int vfio_pci_init_dma_fault_region(struct vfio_pci_device *vdev)
>> +{
>> + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
>> + VFIO_REGION_INFO_FLAG_MMAP;
>> + int ret;
>> +
>> + spin_lock_init(&vdev->fault_queue_lock);
>> +
>> + vdev->fault_region = kmalloc(VFIO_FAULT_REGION_SIZE, GFP_KERNEL);
>> + if (!vdev->fault_region)
>> + return -ENOMEM;
>> +
>> + ret = vfio_pci_register_dev_region(vdev,
>> + VFIO_REGION_TYPE_NESTED,
>> + VFIO_REGION_SUBTYPE_NESTED_FAULT_REGION,
>> + &vfio_pci_dma_fault_regops, VFIO_FAULT_REGION_SIZE,
>> + flags, vdev->fault_region);
>> + if (ret) {
>> + kfree(vdev->fault_region);
>> + return ret;
>> + }
>> +
>> + vdev->fault_region->header.prod = 0;
>> + vdev->fault_region->header.cons = 0;
>> + vdev->fault_region->header.reserved = 0;
>
> Use kzalloc above or else we're leaking kernel memory to userspace
> anyway.
sure
>
>> + vdev->fault_region->header.size = VFIO_FAULT_QUEUE_SIZE;
>> + return 0;
>> +}
>> +
>> static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>> {
>> struct vfio_pci_device *vdev;
>> @@ -1300,7 +1399,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>> pci_set_power_state(pdev, PCI_D3hot);
>> }
>>
>> - return ret;
>> + return vfio_pci_init_dma_fault_region(vdev);
>
> Missing lots of cleanup should this fail. Why is this done on probe
> anyway? This looks like something we'd do from vfio_pci_enable() and
> therefore our release callback would free fault_region rather than what
> we have below.
OK. It's fine to put this in vfio_pci_enable().
>
>> }
>>
>> static void vfio_pci_remove(struct pci_dev *pdev)
>> @@ -1315,6 +1414,7 @@ static void vfio_pci_remove(struct pci_dev *pdev)
>>
>> vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
>> kfree(vdev->region);
>> + kfree(vdev->fault_region);
>> mutex_destroy(&vdev->ioeventfds_lock);
>> kfree(vdev);
>>
>> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
>> index 8c0009f00818..38b5d1764a26 100644
>> --- a/drivers/vfio/pci/vfio_pci_private.h
>> +++ b/drivers/vfio/pci/vfio_pci_private.h
>> @@ -120,6 +120,8 @@ struct vfio_pci_device {
>> int ioeventfds_nr;
>> struct eventfd_ctx *err_trigger;
>> struct eventfd_ctx *req_trigger;
>> + spinlock_t fault_queue_lock;
>> + struct vfio_fault_region *fault_region;
>> struct list_head dummy_resources_list;
>> struct mutex ioeventfds_lock;
>> struct list_head ioeventfds_list;
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 352e795a93c8..b78c2c62af6d 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -307,6 +307,9 @@ struct vfio_region_info_cap_type {
>> #define VFIO_REGION_TYPE_GFX (1)
>> #define VFIO_REGION_SUBTYPE_GFX_EDID (1)
>>
>> +#define VFIO_REGION_TYPE_NESTED (2)
>> +#define VFIO_REGION_SUBTYPE_NESTED_FAULT_REGION (1)
>> +
>> /**
>> * struct vfio_region_gfx_edid - EDID region layout.
>> *
>> @@ -697,6 +700,18 @@ struct vfio_device_ioeventfd {
>>
>> #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16)
>>
>> +struct vfio_fault_region_header {
>> + __u32 size; /* Read-Only */
>> + __u32 prod; /* Read-Only */
>
> We can't really enforce read-only if it's mmap'd.
Do we really need to? Assuming the kernel always uses
VFIO_FAULT_QUEUE_SIZE to check the prod and cons indices - which is not
the case at the moment, by the way :-( - the queue cannot
be overflowed.
The header can also be checked each time the kernel fills any event in
the queue
(vfio_pci_iommu_dev_fault_handler). If inconsistent, the kernel may stop
using the queue. If the userspace tampers with those RO fields, this
will break error reporting on the guest, but the problem should be
confined there?
> I worry about synchronization here too, perhaps there should be a ring offset such
> that the ring can be in a separate page from the header and then sparse
> mmap support can ensure that the user access is restricted.
I was assuming a single writer and single reader lock-free circular
buffer here. My understanding was it was safe to consider concurrent
read and write. What I am missing anyway is atomic counter operations to
guarantee the indices are updated after the push/pop action, as explained in
https://www.kernel.org/doc/Documentation/circular-buffers.txt. I am not
comfortable with how to enforce this on the user side, though.
If I split the header and the actual buffer into two separate
64kB pages, the first one will be very sparsely used.
> wonder if there are other transports that make sense here, this almost
> feels like a vhost sort of thing. Thanks,
Using something more sophisticated may be useful for PRI where answers
need to be provided. For the case of unrecoverable faults, I wonder
whether it is worth the pain exposing a fault region compared to the
original IOCTL approach introduced in
[RFC v2 18/20] vfio: VFIO_IOMMU_GET_FAULT_EVENTS
https://lkml.org/lkml/2018/9/18/1094
Thanks
Eric
>
> Alex
>
>> + __u32 cons;
>> + __u32 reserved; /* must be 0 */
>> +};
>> +
>> +struct vfio_fault_region {
>> + struct vfio_fault_region_header header;
>> + struct iommu_fault queue[0];
>> +};
>> +
>> /* -------- API for Type1 VFIO IOMMU -------- */
>>
>> /**
>
next prev parent reply other threads:[~2019-01-14 20:48 UTC|newest]
Thread overview: 59+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-01-08 10:26 [RFC v3 00/21] SMMUv3 Nested Stage Setup Eric Auger
2019-01-08 10:26 ` [RFC v3 01/21] iommu: Introduce set_pasid_table API Eric Auger
2019-01-11 18:16 ` Jean-Philippe Brucker
2019-01-25 8:39 ` Auger Eric
2019-01-25 8:55 ` Auger Eric
2019-01-25 10:33 ` Jean-Philippe Brucker
2019-01-11 18:43 ` Alex Williamson
2019-01-25 9:20 ` Auger Eric
2019-01-08 10:26 ` [RFC v3 02/21] iommu: Introduce cache_invalidate API Eric Auger
2019-01-11 21:30 ` Alex Williamson
2019-01-25 16:49 ` Auger Eric
2019-01-28 17:32 ` Jean-Philippe Brucker
2019-01-29 17:49 ` Auger Eric
2019-01-29 23:16 ` Alex Williamson
2019-01-30 8:48 ` Auger Eric
2019-01-08 10:26 ` [RFC v3 03/21] iommu: Introduce bind_guest_msi Eric Auger
2019-01-11 22:44 ` Alex Williamson
2019-01-25 17:51 ` Auger Eric
2019-01-25 18:11 ` Auger Eric
2019-01-08 10:26 ` [RFC v3 04/21] vfio: VFIO_IOMMU_SET_PASID_TABLE Eric Auger
2019-01-11 22:50 ` Alex Williamson
2019-01-15 21:34 ` Auger Eric
2019-01-08 10:26 ` [RFC v3 05/21] vfio: VFIO_IOMMU_CACHE_INVALIDATE Eric Auger
2019-01-08 10:26 ` [RFC v3 06/21] vfio: VFIO_IOMMU_BIND_MSI Eric Auger
2019-01-11 23:02 ` Alex Williamson
2019-01-11 23:23 ` Alex Williamson
2019-01-08 10:26 ` [RFC v3 07/21] iommu/arm-smmu-v3: Link domains and devices Eric Auger
2019-01-08 10:26 ` [RFC v3 08/21] iommu/arm-smmu-v3: Maintain a SID->device structure Eric Auger
2019-01-08 10:26 ` [RFC v3 09/21] iommu/smmuv3: Get prepared for nested stage support Eric Auger
2019-01-11 16:04 ` Jean-Philippe Brucker
2019-01-25 19:27 ` Robin Murphy
2019-01-08 10:26 ` [RFC v3 10/21] iommu/smmuv3: Implement set_pasid_table Eric Auger
2019-01-08 10:26 ` [RFC v3 11/21] iommu/smmuv3: Implement cache_invalidate Eric Auger
2019-01-11 16:59 ` Jean-Philippe Brucker
2019-01-08 10:26 ` [RFC v3 12/21] dma-iommu: Implement NESTED_MSI cookie Eric Auger
2019-01-08 10:26 ` [RFC v3 13/21] iommu/smmuv3: Implement bind_guest_msi Eric Auger
2019-01-08 10:26 ` [RFC v3 14/21] iommu: introduce device fault data Eric Auger
[not found] ` <20190110104544.26f3bcb1@jacob-builder>
2019-01-11 11:06 ` Jean-Philippe Brucker
2019-01-14 22:32 ` Jacob Pan
2019-01-16 15:52 ` Jean-Philippe Brucker
2019-01-16 18:33 ` Auger Eric
2019-01-15 21:27 ` Auger Eric
2019-01-16 16:54 ` Jean-Philippe Brucker
2019-01-08 10:26 ` [RFC v3 15/21] driver core: add per device iommu param Eric Auger
2019-01-08 10:26 ` [RFC v3 16/21] iommu: introduce device fault report API Eric Auger
2019-01-08 10:26 ` [RFC v3 17/21] iommu/smmuv3: Report non recoverable faults Eric Auger
2019-01-11 17:46 ` Jean-Philippe Brucker
2019-01-15 21:06 ` Auger Eric
2019-01-16 12:25 ` Jean-Philippe Brucker
2019-01-16 12:49 ` Auger Eric
2019-01-08 10:26 ` [RFC v3 18/21] vfio-pci: Add a new VFIO_REGION_TYPE_NESTED region type Eric Auger
2019-01-11 23:58 ` Alex Williamson
2019-01-14 20:48 ` Auger Eric [this message]
2019-01-14 23:04 ` Alex Williamson
2019-01-15 21:56 ` Auger Eric
2019-01-08 10:26 ` [RFC v3 19/21] vfio-pci: Register an iommu fault handler Eric Auger
2019-01-08 10:26 ` [RFC v3 20/21] vfio-pci: Add VFIO_PCI_DMA_FAULT_IRQ_INDEX Eric Auger
2019-01-08 10:26 ` [RFC v3 21/21] vfio: Document nested stage control Eric Auger
2019-01-18 10:02 ` [RFC v3 00/21] SMMUv3 Nested Stage Setup Auger Eric
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=497dad10-8832-3f2c-3ff8-fbcdd4c41ae4@redhat.com \
--to=eric.auger@redhat.com \
--cc=alex.williamson@redhat.com \
--cc=ashok.raj@intel.com \
--cc=christoffer.dall@arm.com \
--cc=eric.auger.pro@gmail.com \
--cc=iommu@lists.linux-foundation.org \
--cc=jacob.jun.pan@linux.intel.com \
--cc=jean-philippe.brucker@arm.com \
--cc=joro@8bytes.org \
--cc=kevin.tian@intel.com \
--cc=kvm@vger.kernel.org \
--cc=kvmarm@lists.cs.columbia.edu \
--cc=linux-kernel@vger.kernel.org \
--cc=marc.zyngier@arm.com \
--cc=peter.maydell@linaro.org \
--cc=robin.murphy@arm.com \
--cc=will.deacon@arm.com \
--cc=yi.l.liu@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).