From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Michael S. Tsirkin" Subject: Re: [PATCH v3 5/7] iommu: Add virtio-iommu driver Date: Fri, 12 Oct 2018 12:35:58 -0400 Message-ID: <20181012120953-mutt-send-email-mst__42960.1825694137$1539362050$gmane$org@kernel.org> References: <20181012145917.6840-1-jean-philippe.brucker@arm.com> <20181012145917.6840-6-jean-philippe.brucker@arm.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: Content-Disposition: inline In-Reply-To: <20181012145917.6840-6-jean-philippe.brucker@arm.com> List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: virtualization-bounces@lists.linux-foundation.org Errors-To: virtualization-bounces@lists.linux-foundation.org To: Jean-Philippe Brucker Cc: mark.rutland@arm.com, peter.maydell@linaro.org, lorenzo.pieralisi@arm.com, tnowicki@caviumnetworks.com, devicetree@vger.kernel.org, linux-pci@vger.kernel.org, joro@8bytes.org, will.deacon@arm.com, virtualization@lists.linux-foundation.org, eric.auger@redhat.com, iommu@lists.linux-foundation.org, robh+dt@kernel.org, marc.zyngier@arm.com, robin.murphy@arm.com, kvmarm@lists.cs.columbia.edu List-Id: virtualization@lists.linuxfoundation.org On Fri, Oct 12, 2018 at 03:59:15PM +0100, Jean-Philippe Brucker wrote: > The virtio IOMMU is a para-virtualized device, allowing to send IOMMU > requests such as map/unmap over virtio transport without emulating page > tables. This implementation handles ATTACH, DETACH, MAP and UNMAP > requests. > > The bulk of the code transforms calls coming from the IOMMU API into > corresponding virtio requests. Mappings are kept in an interval tree > instead of page tables. > > Signed-off-by: Jean-Philippe Brucker > --- > MAINTAINERS | 7 + > drivers/iommu/Kconfig | 11 + > drivers/iommu/Makefile | 1 + > drivers/iommu/virtio-iommu.c | 938 ++++++++++++++++++++++++++++++ > include/uapi/linux/virtio_ids.h | 1 + > include/uapi/linux/virtio_iommu.h | 101 ++++ > 6 files changed, 1059 insertions(+) > create mode 100644 drivers/iommu/virtio-iommu.c > create mode 100644 include/uapi/linux/virtio_iommu.h > > diff --git a/MAINTAINERS b/MAINTAINERS > index 48a65c3a4189..f02fa65f47e2 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -15599,6 +15599,13 @@ S: Maintained > F: drivers/virtio/virtio_input.c > F: include/uapi/linux/virtio_input.h > > +VIRTIO IOMMU DRIVER > +M: Jean-Philippe Brucker > +L: virtualization@lists.linux-foundation.org > +S: Maintained > +F: drivers/iommu/virtio-iommu.c > +F: include/uapi/linux/virtio_iommu.h > + > VIRTUAL BOX GUEST DEVICE DRIVER > M: Hans de Goede > M: Arnd Bergmann > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig > index c60395b7470f..2dc016dc2b92 100644 > --- a/drivers/iommu/Kconfig > +++ b/drivers/iommu/Kconfig > @@ -414,4 +414,15 @@ config QCOM_IOMMU > help > Support for IOMMU on certain Qualcomm SoCs. > > +config VIRTIO_IOMMU > + bool "Virtio IOMMU driver" > + depends on VIRTIO=y > + select IOMMU_API > + select INTERVAL_TREE > + select ARM_DMA_USE_IOMMU if ARM > + help > + Para-virtualised IOMMU driver with virtio. > + > + Say Y here if you intend to run this kernel as a guest. > + > endif # IOMMU_SUPPORT > diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile > index ab5eba6edf82..4cd643408e49 100644 > --- a/drivers/iommu/Makefile > +++ b/drivers/iommu/Makefile > @@ -31,3 +31,4 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o > obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o > obj-$(CONFIG_S390_IOMMU) += s390-iommu.o > obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o > +obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o > diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c > new file mode 100644 > index 000000000000..9fb38cd3b727 > --- /dev/null > +++ b/drivers/iommu/virtio-iommu.c > @@ -0,0 +1,938 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Virtio driver for the paravirtualized IOMMU > + * > + * Copyright (C) 2018 Arm Limited > + */ > + > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include > + > +#define MSI_IOVA_BASE 0x8000000 > +#define MSI_IOVA_LENGTH 0x100000 > + > +#define VIOMMU_REQUEST_VQ 0 > +#define VIOMMU_NR_VQS 1 > + > +/* > + * During development, it is convenient to time out rather than wait > + * indefinitely in atomic context when a device misbehaves and a request doesn't > + * return. In production however, some requests shouldn't return until they are > + * successful. > + */ > +#ifdef DEBUG > +#define VIOMMU_REQUEST_TIMEOUT 10000 /* 10s */ > +#endif > + > +struct viommu_dev { > + struct iommu_device iommu; > + struct device *dev; > + struct virtio_device *vdev; > + > + struct ida domain_ids; > + > + struct virtqueue *vqs[VIOMMU_NR_VQS]; > + spinlock_t request_lock; > + struct list_head requests; > + > + /* Device configuration */ > + struct iommu_domain_geometry geometry; > + u64 pgsize_bitmap; > + u8 domain_bits; > +}; > + > +struct viommu_mapping { > + phys_addr_t paddr; > + struct interval_tree_node iova; > + u32 flags; > +}; > + > +struct viommu_domain { > + struct iommu_domain domain; > + struct viommu_dev *viommu; > + struct mutex mutex; > + unsigned int id; > + > + spinlock_t mappings_lock; > + struct rb_root_cached mappings; > + > + unsigned long nr_endpoints; > +}; > + > +struct viommu_endpoint { > + struct viommu_dev *viommu; > + struct viommu_domain *vdomain; > +}; > + > +struct viommu_request { > + struct list_head list; > + void *writeback; > + unsigned int write_offset; > + unsigned int len; > + char buf[]; > +}; > + > +#define to_viommu_domain(domain) \ > + container_of(domain, struct viommu_domain, domain) > + > +static int viommu_get_req_errno(void *buf, size_t len) > +{ > + struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail); > + > + switch (tail->status) { > + case VIRTIO_IOMMU_S_OK: > + return 0; > + case VIRTIO_IOMMU_S_UNSUPP: > + return -ENOSYS; > + case VIRTIO_IOMMU_S_INVAL: > + return -EINVAL; > + case VIRTIO_IOMMU_S_RANGE: > + return -ERANGE; > + case VIRTIO_IOMMU_S_NOENT: > + return -ENOENT; > + case VIRTIO_IOMMU_S_FAULT: > + return -EFAULT; > + case VIRTIO_IOMMU_S_IOERR: > + case VIRTIO_IOMMU_S_DEVERR: > + default: > + return -EIO; > + } > +} > + > +static void viommu_set_req_status(void *buf, size_t len, int status) > +{ > + struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail); > + > + tail->status = status; > +} > + > +static off_t viommu_get_req_offset(struct viommu_dev *viommu, > + struct virtio_iommu_req_head *req, > + size_t len) > +{ > + size_t tail_size = sizeof(struct virtio_iommu_req_tail); > + > + return len - tail_size; > +} > + > +/* > + * __viommu_sync_req - Complete all in-flight requests > + * > + * Wait for all added requests to complete. When this function returns, all > + * requests that were in-flight at the time of the call have completed. > + */ > +static int __viommu_sync_req(struct viommu_dev *viommu) > +{ > + int ret = 0; > + unsigned int len; > + size_t write_len; > + ktime_t timeout = 0; > + struct viommu_request *req; > + struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ]; > + > + assert_spin_locked(&viommu->request_lock); > +#ifdef DEBUG > + timeout = ktime_add_ms(ktime_get(), VIOMMU_REQUEST_TIMEOUT); > +#endif > + virtqueue_kick(vq); > + > + while (!list_empty(&viommu->requests)) { > + len = 0; > + req = virtqueue_get_buf(vq, &len); > + if (req == NULL) { > + if (!timeout || ktime_before(ktime_get(), timeout)) > + continue; > + > + /* After timeout, remove all requests */ > + req = list_first_entry(&viommu->requests, > + struct viommu_request, list); > + ret = -ETIMEDOUT; > + } > + > + if (!len) > + viommu_set_req_status(req->buf, req->len, > + VIRTIO_IOMMU_S_IOERR); > + > + write_len = req->len - req->write_offset; > + if (req->writeback && len >= write_len) > + memcpy(req->writeback, req->buf + req->write_offset, > + write_len); > + > + list_del(&req->list); > + kfree(req); So with DEBUG set, this will actually free memory that device still DMA's into. Hardly pretty. I think you want to mark device broken, queue the request and then wait for device to be reset. > + } > + > + return ret; > +} > + > +static int viommu_sync_req(struct viommu_dev *viommu) > +{ > + int ret; > + unsigned long flags; > + > + spin_lock_irqsave(&viommu->request_lock, flags); > + ret = __viommu_sync_req(viommu); > + if (ret) > + dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret); > + spin_unlock_irqrestore(&viommu->request_lock, flags); > + > + return ret; > +} > + > +/* > + * __viommu_add_request - Add one request to the queue > + * @buf: pointer to the request buffer > + * @len: length of the request buffer > + * @writeback: copy data back to the buffer when the request completes. > + * > + * Add a request to the queue. Only synchronize the queue if it's already full. > + * Otherwise don't kick the queue nor wait for requests to complete. > + * > + * When @writeback is true, data written by the device, including the request > + * status, is copied into @buf after the request completes. This is unsafe if > + * the caller allocates @buf on stack and drops the lock between add_req() and > + * sync_req(). > + * > + * Return 0 if the request was successfully added to the queue. > + */ > +static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len, > + bool writeback) > +{ > + int ret; > + off_t write_offset; > + struct viommu_request *req; > + struct scatterlist top_sg, bottom_sg; > + struct scatterlist *sg[2] = { &top_sg, &bottom_sg }; > + struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ]; > + > + assert_spin_locked(&viommu->request_lock); > + > + write_offset = viommu_get_req_offset(viommu, buf, len); > + if (!write_offset) > + return -EINVAL; > + > + req = kzalloc(sizeof(*req) + len, GFP_ATOMIC); > + if (!req) > + return -ENOMEM; > + > + req->len = len; > + if (writeback) { > + req->writeback = buf + write_offset; > + req->write_offset = write_offset; > + } > + memcpy(&req->buf, buf, write_offset); > + > + sg_init_one(&top_sg, req->buf, write_offset); > + sg_init_one(&bottom_sg, req->buf + write_offset, len - write_offset); > + > + ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC); > + if (ret == -ENOSPC) { > + /* If the queue is full, sync and retry */ > + if (!__viommu_sync_req(viommu)) > + ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC); > + } > + if (ret) > + goto err_free; > + > + list_add_tail(&req->list, &viommu->requests); > + return 0; > + > +err_free: > + kfree(req); > + return ret; > +} > + > +static int viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len) > +{ > + int ret; > + unsigned long flags; > + > + spin_lock_irqsave(&viommu->request_lock, flags); > + ret = __viommu_add_req(viommu, buf, len, false); > + if (ret) > + dev_dbg(viommu->dev, "could not add request: %d\n", ret); > + spin_unlock_irqrestore(&viommu->request_lock, flags); > + > + return ret; > +} > + > +/* > + * Send a request and wait for it to complete. Return the request status (as an > + * errno) > + */ > +static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf, > + size_t len) > +{ > + int ret; > + unsigned long flags; > + > + spin_lock_irqsave(&viommu->request_lock, flags); > + > + ret = __viommu_add_req(viommu, buf, len, true); > + if (ret) { > + dev_dbg(viommu->dev, "could not add request (%d)\n", ret); > + goto out_unlock; > + } > + > + ret = __viommu_sync_req(viommu); > + if (ret) { > + dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret); > + /* Fall-through (get the actual request status) */ > + } > + > + ret = viommu_get_req_errno(buf, len); > +out_unlock: > + spin_unlock_irqrestore(&viommu->request_lock, flags); > + return ret; > +} > + > +/* > + * viommu_add_mapping - add a mapping to the internal tree > + * > + * On success, return the new mapping. Otherwise return NULL. > + */ > +static struct viommu_mapping * > +viommu_add_mapping(struct viommu_domain *vdomain, unsigned long iova, > + phys_addr_t paddr, size_t size, u32 flags) > +{ > + unsigned long irqflags; > + struct viommu_mapping *mapping; > + > + mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC); > + if (!mapping) > + return NULL; > + > + mapping->paddr = paddr; > + mapping->iova.start = iova; > + mapping->iova.last = iova + size - 1; > + mapping->flags = flags; > + > + spin_lock_irqsave(&vdomain->mappings_lock, irqflags); > + interval_tree_insert(&mapping->iova, &vdomain->mappings); > + spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags); > + > + return mapping; > +} > + > +/* > + * viommu_del_mappings - remove mappings from the internal tree > + * > + * @vdomain: the domain > + * @iova: start of the range > + * @size: size of the range. A size of 0 corresponds to the entire address > + * space. > + * > + * On success, returns the number of unmapped bytes (>= size) > + */ > +static size_t viommu_del_mappings(struct viommu_domain *vdomain, > + unsigned long iova, size_t size) > +{ > + size_t unmapped = 0; > + unsigned long flags; > + unsigned long last = iova + size - 1; > + struct viommu_mapping *mapping = NULL; > + struct interval_tree_node *node, *next; > + > + spin_lock_irqsave(&vdomain->mappings_lock, flags); > + next = interval_tree_iter_first(&vdomain->mappings, iova, last); > + while (next) { > + node = next; > + mapping = container_of(node, struct viommu_mapping, iova); > + next = interval_tree_iter_next(node, iova, last); > + > + /* Trying to split a mapping? */ > + if (mapping->iova.start < iova) > + break; > + > + /* > + * Note that for a partial range, this will return the full > + * mapping so we avoid sending split requests to the device. > + */ > + unmapped += mapping->iova.last - mapping->iova.start + 1; > + > + interval_tree_remove(node, &vdomain->mappings); > + kfree(mapping); > + } > + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); > + > + return unmapped; > +} > + > +/* > + * viommu_replay_mappings - re-send MAP requests > + * > + * When reattaching a domain that was previously detached from all endpoints, > + * mappings were deleted from the device. Re-create the mappings available in > + * the internal tree. > + */ > +static int viommu_replay_mappings(struct viommu_domain *vdomain) > +{ > + int ret; > + unsigned long flags; > + struct viommu_mapping *mapping; > + struct interval_tree_node *node; > + struct virtio_iommu_req_map map; > + > + spin_lock_irqsave(&vdomain->mappings_lock, flags); > + node = interval_tree_iter_first(&vdomain->mappings, 0, -1UL); > + while (node) { > + mapping = container_of(node, struct viommu_mapping, iova); > + map = (struct virtio_iommu_req_map) { > + .head.type = VIRTIO_IOMMU_T_MAP, > + .domain = cpu_to_le32(vdomain->id), > + .virt_start = cpu_to_le64(mapping->iova.start), > + .virt_end = cpu_to_le64(mapping->iova.last), > + .phys_start = cpu_to_le64(mapping->paddr), > + .flags = cpu_to_le32(mapping->flags), > + }; > + > + ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); > + if (ret) > + break; > + > + node = interval_tree_iter_next(node, 0, -1UL); > + } > + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); > + > + return ret; > +} > + > +/* IOMMU API */ > + > +static struct iommu_domain *viommu_domain_alloc(unsigned type) > +{ > + struct viommu_domain *vdomain; > + > + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) > + return NULL; > + > + vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL); > + if (!vdomain) > + return NULL; > + > + mutex_init(&vdomain->mutex); > + spin_lock_init(&vdomain->mappings_lock); > + vdomain->mappings = RB_ROOT_CACHED; > + > + if (type == IOMMU_DOMAIN_DMA && > + iommu_get_dma_cookie(&vdomain->domain)) { > + kfree(vdomain); > + return NULL; > + } > + > + return &vdomain->domain; > +} > + > +static int viommu_domain_finalise(struct viommu_dev *viommu, > + struct iommu_domain *domain) > +{ > + int ret; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + unsigned int max_domain = viommu->domain_bits > 31 ? ~0 : > + (1U << viommu->domain_bits) - 1; > + > + vdomain->viommu = viommu; > + > + domain->pgsize_bitmap = viommu->pgsize_bitmap; > + domain->geometry = viommu->geometry; > + > + ret = ida_alloc_max(&viommu->domain_ids, max_domain, GFP_KERNEL); > + if (ret >= 0) > + vdomain->id = (unsigned int)ret; > + > + return ret > 0 ? 0 : ret; > +} > + > +static void viommu_domain_free(struct iommu_domain *domain) > +{ > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + iommu_put_dma_cookie(domain); > + > + /* Free all remaining mappings (size 2^64) */ > + viommu_del_mappings(vdomain, 0, 0); > + > + if (vdomain->viommu) > + ida_free(&vdomain->viommu->domain_ids, vdomain->id); > + > + kfree(vdomain); > +} > + > +static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) > +{ > + int i; > + int ret = 0; > + struct virtio_iommu_req_attach req; > + struct iommu_fwspec *fwspec = dev->iommu_fwspec; > + struct viommu_endpoint *vdev = fwspec->iommu_priv; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + mutex_lock(&vdomain->mutex); > + if (!vdomain->viommu) { > + /* > + * Initialize the domain proper now that we know which viommu > + * owns it. > + */ > + ret = viommu_domain_finalise(vdev->viommu, domain); > + } else if (vdomain->viommu != vdev->viommu) { > + dev_err(dev, "cannot attach to foreign vIOMMU\n"); > + ret = -EXDEV; > + } > + mutex_unlock(&vdomain->mutex); > + > + if (ret) > + return ret; > + > + /* > + * In the virtio-iommu device, when attaching the endpoint to a new > + * domain, it is detached from the old one and, if as as a result the > + * old domain isn't attached to any endpoint, all mappings are removed > + * from the old domain and it is freed. > + * > + * In the driver the old domain still exists, and its mappings will be > + * recreated if it gets reattached to an endpoint. Otherwise it will be > + * freed explicitly. > + * > + * vdev->vdomain is protected by group->mutex > + */ > + if (vdev->vdomain) > + vdev->vdomain->nr_endpoints--; > + > + req = (struct virtio_iommu_req_attach) { > + .head.type = VIRTIO_IOMMU_T_ATTACH, > + .domain = cpu_to_le32(vdomain->id), > + }; > + > + for (i = 0; i < fwspec->num_ids; i++) { > + req.endpoint = cpu_to_le32(fwspec->ids[i]); > + > + ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req)); > + if (ret) > + return ret; > + } > + > + if (!vdomain->nr_endpoints) { > + /* > + * This endpoint is the first to be attached to the domain. > + * Replay existing mappings (e.g. SW MSI). > + */ > + ret = viommu_replay_mappings(vdomain); > + if (ret) > + return ret; > + } > + > + vdomain->nr_endpoints++; > + vdev->vdomain = vdomain; > + > + return 0; > +} > + > +static int viommu_map(struct iommu_domain *domain, unsigned long iova, > + phys_addr_t paddr, size_t size, int prot) > +{ > + int ret; > + int flags; > + struct viommu_mapping *mapping; > + struct virtio_iommu_req_map map; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + flags = (prot & IOMMU_READ ? VIRTIO_IOMMU_MAP_F_READ : 0) | > + (prot & IOMMU_WRITE ? VIRTIO_IOMMU_MAP_F_WRITE : 0) | > + (prot & IOMMU_MMIO ? VIRTIO_IOMMU_MAP_F_MMIO : 0); > + > + mapping = viommu_add_mapping(vdomain, iova, paddr, size, flags); > + if (!mapping) > + return -ENOMEM; > + > + map = (struct virtio_iommu_req_map) { > + .head.type = VIRTIO_IOMMU_T_MAP, > + .domain = cpu_to_le32(vdomain->id), > + .virt_start = cpu_to_le64(iova), > + .phys_start = cpu_to_le64(paddr), > + .virt_end = cpu_to_le64(iova + size - 1), > + .flags = cpu_to_le32(flags), > + }; > + > + if (!vdomain->nr_endpoints) > + return 0; > + > + ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); > + if (ret) > + viommu_del_mappings(vdomain, iova, size); > + > + return ret; > +} > + > +static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova, > + size_t size) > +{ > + int ret = 0; > + size_t unmapped; > + struct virtio_iommu_req_unmap unmap; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + unmapped = viommu_del_mappings(vdomain, iova, size); > + if (unmapped < size) > + return 0; > + > + /* Device already removed all mappings after detach. */ > + if (!vdomain->nr_endpoints) > + return unmapped; > + > + unmap = (struct virtio_iommu_req_unmap) { > + .head.type = VIRTIO_IOMMU_T_UNMAP, > + .domain = cpu_to_le32(vdomain->id), > + .virt_start = cpu_to_le64(iova), > + .virt_end = cpu_to_le64(iova + unmapped - 1), > + }; > + > + ret = viommu_add_req(vdomain->viommu, &unmap, sizeof(unmap)); > + return ret ? 0 : unmapped; > +} > + > +static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain, > + dma_addr_t iova) > +{ > + u64 paddr = 0; > + unsigned long flags; > + struct viommu_mapping *mapping; > + struct interval_tree_node *node; > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + spin_lock_irqsave(&vdomain->mappings_lock, flags); > + node = interval_tree_iter_first(&vdomain->mappings, iova, iova); > + if (node) { > + mapping = container_of(node, struct viommu_mapping, iova); > + paddr = mapping->paddr + (iova - mapping->iova.start); > + } > + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); > + > + return paddr; > +} > + > +static void viommu_iotlb_sync(struct iommu_domain *domain) > +{ > + struct viommu_domain *vdomain = to_viommu_domain(domain); > + > + viommu_sync_req(vdomain->viommu); > +} > + > +static void viommu_get_resv_regions(struct device *dev, struct list_head *head) > +{ > + struct iommu_resv_region *region; > + int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; > + > + region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, prot, > + IOMMU_RESV_SW_MSI); > + if (!region) > + return; > + > + list_add_tail(®ion->list, head); > + iommu_dma_get_resv_regions(dev, head); > +} > + > +static void viommu_put_resv_regions(struct device *dev, struct list_head *head) > +{ > + struct iommu_resv_region *entry, *next; > + > + list_for_each_entry_safe(entry, next, head, list) > + kfree(entry); > +} > + > +static struct iommu_ops viommu_ops; > +static struct virtio_driver virtio_iommu_drv; > + > +static int viommu_match_node(struct device *dev, void *data) > +{ > + return dev->parent->fwnode == data; > +} > + > +static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode) > +{ > + struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL, > + fwnode, viommu_match_node); > + put_device(dev); > + > + return dev ? dev_to_virtio(dev)->priv : NULL; > +} > + > +static int viommu_add_device(struct device *dev) > +{ > + int ret; > + struct iommu_group *group; > + struct viommu_endpoint *vdev; > + struct viommu_dev *viommu = NULL; > + struct iommu_fwspec *fwspec = dev->iommu_fwspec; > + > + if (!fwspec || fwspec->ops != &viommu_ops) > + return -ENODEV; > + > + viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode); > + if (!viommu) > + return -ENODEV; > + > + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); > + if (!vdev) > + return -ENOMEM; > + > + vdev->viommu = viommu; > + fwspec->iommu_priv = vdev; > + > + ret = iommu_device_link(&viommu->iommu, dev); > + if (ret) > + goto err_free_dev; > + > + /* > + * Last step creates a default domain and attaches to it. Everything > + * must be ready. > + */ > + group = iommu_group_get_for_dev(dev); > + if (IS_ERR(group)) { > + ret = PTR_ERR(group); > + goto err_unlink_dev; > + } > + > + iommu_group_put(group); > + > + return PTR_ERR_OR_ZERO(group); > + > +err_unlink_dev: > + iommu_device_unlink(&viommu->iommu, dev); > + > +err_free_dev: > + kfree(vdev); > + > + return ret; > +} > + > +static void viommu_remove_device(struct device *dev) > +{ > + struct viommu_endpoint *vdev; > + struct iommu_fwspec *fwspec = dev->iommu_fwspec; > + > + if (!fwspec || fwspec->ops != &viommu_ops) > + return; > + > + vdev = fwspec->iommu_priv; > + > + iommu_group_remove_device(dev); > + iommu_device_unlink(&vdev->viommu->iommu, dev); > + kfree(vdev); > +} > + > +static struct iommu_group *viommu_device_group(struct device *dev) > +{ > + if (dev_is_pci(dev)) > + return pci_device_group(dev); > + else > + return generic_device_group(dev); > +} > + > +static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args) > +{ > + return iommu_fwspec_add_ids(dev, args->args, 1); > +} > + > +static struct iommu_ops viommu_ops = { > + .domain_alloc = viommu_domain_alloc, > + .domain_free = viommu_domain_free, > + .attach_dev = viommu_attach_dev, > + .map = viommu_map, > + .unmap = viommu_unmap, > + .iova_to_phys = viommu_iova_to_phys, > + .iotlb_sync = viommu_iotlb_sync, > + .add_device = viommu_add_device, > + .remove_device = viommu_remove_device, > + .device_group = viommu_device_group, > + .get_resv_regions = viommu_get_resv_regions, > + .put_resv_regions = viommu_put_resv_regions, > + .of_xlate = viommu_of_xlate, > +}; > + > +static int viommu_init_vqs(struct viommu_dev *viommu) > +{ > + struct virtio_device *vdev = dev_to_virtio(viommu->dev); > + const char *name = "request"; > + void *ret; > + > + ret = virtio_find_single_vq(vdev, NULL, name); > + if (IS_ERR(ret)) { > + dev_err(viommu->dev, "cannot find VQ\n"); > + return PTR_ERR(ret); > + } > + > + viommu->vqs[VIOMMU_REQUEST_VQ] = ret; > + > + return 0; > +} > + > +static int viommu_probe(struct virtio_device *vdev) > +{ > + struct device *parent_dev = vdev->dev.parent; > + struct viommu_dev *viommu = NULL; > + struct device *dev = &vdev->dev; > + u64 input_start = 0; > + u64 input_end = -1UL; > + int ret; > + > + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) > + return -ENODEV; I'm a bit confused about what will happen if this device happens to be behind an iommu itself. If we can't handle that, should we clear PLATFORM_IOMMU e.g. like the balloon does? > + > + viommu = devm_kzalloc(dev, sizeof(*viommu), GFP_KERNEL); > + if (!viommu) > + return -ENOMEM; > + > + spin_lock_init(&viommu->request_lock); > + ida_init(&viommu->domain_ids); > + viommu->dev = dev; > + viommu->vdev = vdev; > + INIT_LIST_HEAD(&viommu->requests); > + > + ret = viommu_init_vqs(viommu); > + if (ret) > + return ret; > + > + virtio_cread(vdev, struct virtio_iommu_config, page_size_mask, > + &viommu->pgsize_bitmap); > + > + if (!viommu->pgsize_bitmap) { > + ret = -EINVAL; > + goto err_free_vqs; > + } > + > + viommu->domain_bits = 32; > + > + /* Optional features */ > + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, > + struct virtio_iommu_config, input_range.start, > + &input_start); > + > + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, > + struct virtio_iommu_config, input_range.end, > + &input_end); > + > + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_BITS, > + struct virtio_iommu_config, domain_bits, > + &viommu->domain_bits); > + > + viommu->geometry = (struct iommu_domain_geometry) { > + .aperture_start = input_start, > + .aperture_end = input_end, > + .force_aperture = true, > + }; > + > + viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; > + > + virtio_device_ready(vdev); > + > + ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s", > + virtio_bus_name(vdev)); > + if (ret) > + goto err_free_vqs; > + > + iommu_device_set_ops(&viommu->iommu, &viommu_ops); > + iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode); > + > + iommu_device_register(&viommu->iommu); > + > +#ifdef CONFIG_PCI > + if (pci_bus_type.iommu_ops != &viommu_ops) { > + pci_request_acs(); > + ret = bus_set_iommu(&pci_bus_type, &viommu_ops); > + if (ret) > + goto err_unregister; > + } > +#endif > +#ifdef CONFIG_ARM_AMBA > + if (amba_bustype.iommu_ops != &viommu_ops) { > + ret = bus_set_iommu(&amba_bustype, &viommu_ops); > + if (ret) > + goto err_unregister; > + } > +#endif > + if (platform_bus_type.iommu_ops != &viommu_ops) { > + ret = bus_set_iommu(&platform_bus_type, &viommu_ops); > + if (ret) > + goto err_unregister; > + } > + > + vdev->priv = viommu; > + > + dev_info(dev, "input address: %u bits\n", > + order_base_2(viommu->geometry.aperture_end)); > + dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap); > + > + return 0; > + > +err_unregister: > + iommu_device_sysfs_remove(&viommu->iommu); > + iommu_device_unregister(&viommu->iommu); > +err_free_vqs: > + vdev->config->del_vqs(vdev); > + > + return ret; > +} > + > +static void viommu_remove(struct virtio_device *vdev) > +{ > + struct viommu_dev *viommu = vdev->priv; > + > + iommu_device_sysfs_remove(&viommu->iommu); > + iommu_device_unregister(&viommu->iommu); > + > + /* Stop all virtqueues */ > + vdev->config->reset(vdev); > + vdev->config->del_vqs(vdev); > + > + dev_info(&vdev->dev, "device removed\n"); > +} > + > +static void viommu_config_changed(struct virtio_device *vdev) > +{ > + dev_warn(&vdev->dev, "config changed\n"); > +} > + > +static unsigned int features[] = { > + VIRTIO_IOMMU_F_MAP_UNMAP, > + VIRTIO_IOMMU_F_DOMAIN_BITS, > + VIRTIO_IOMMU_F_INPUT_RANGE, > +}; > + > +static struct virtio_device_id id_table[] = { > + { VIRTIO_ID_IOMMU, VIRTIO_DEV_ANY_ID }, > + { 0 }, > +}; > + > +static struct virtio_driver virtio_iommu_drv = { > + .driver.name = KBUILD_MODNAME, > + .driver.owner = THIS_MODULE, > + .id_table = id_table, > + .feature_table = features, > + .feature_table_size = ARRAY_SIZE(features), > + .probe = viommu_probe, > + .remove = viommu_remove, > + .config_changed = viommu_config_changed, > +}; > + > +module_virtio_driver(virtio_iommu_drv); > + > +MODULE_DESCRIPTION("Virtio IOMMU driver"); > +MODULE_AUTHOR("Jean-Philippe Brucker "); > +MODULE_LICENSE("GPL v2"); > diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h > index 6d5c3b2d4f4d..cfe47c5d9a56 100644 > --- a/include/uapi/linux/virtio_ids.h > +++ b/include/uapi/linux/virtio_ids.h > @@ -43,5 +43,6 @@ > #define VIRTIO_ID_INPUT 18 /* virtio input */ > #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ > #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ > +#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ > > #endif /* _LINUX_VIRTIO_IDS_H */ > diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h > new file mode 100644 > index 000000000000..e808fc7fbe82 > --- /dev/null > +++ b/include/uapi/linux/virtio_iommu.h > @@ -0,0 +1,101 @@ > +/* SPDX-License-Identifier: BSD-3-Clause */ > +/* > + * Virtio-iommu definition v0.8 > + * > + * Copyright (C) 2018 Arm Ltd. > + */ > +#ifndef _UAPI_LINUX_VIRTIO_IOMMU_H > +#define _UAPI_LINUX_VIRTIO_IOMMU_H > + > +#include > + > +/* Feature bits */ > +#define VIRTIO_IOMMU_F_INPUT_RANGE 0 > +#define VIRTIO_IOMMU_F_DOMAIN_BITS 1 > +#define VIRTIO_IOMMU_F_MAP_UNMAP 2 > +#define VIRTIO_IOMMU_F_BYPASS 3 > + > +struct virtio_iommu_config { > + /* Supported page sizes */ > + __u64 page_size_mask; > + /* Supported IOVA range */ > + struct virtio_iommu_range { I'd rather we moved the definition outside even though gcc allows it - some old userspace compilers might not. > + __u64 start; > + __u64 end; > + } input_range; > + /* Max domain ID size */ > + __u8 domain_bits; Let's add explicit padding here as well? > +}; > + > +/* Request types */ > +#define VIRTIO_IOMMU_T_ATTACH 0x01 > +#define VIRTIO_IOMMU_T_DETACH 0x02 > +#define VIRTIO_IOMMU_T_MAP 0x03 > +#define VIRTIO_IOMMU_T_UNMAP 0x04 > + > +/* Status types */ > +#define VIRTIO_IOMMU_S_OK 0x00 > +#define VIRTIO_IOMMU_S_IOERR 0x01 > +#define VIRTIO_IOMMU_S_UNSUPP 0x02 > +#define VIRTIO_IOMMU_S_DEVERR 0x03 > +#define VIRTIO_IOMMU_S_INVAL 0x04 > +#define VIRTIO_IOMMU_S_RANGE 0x05 > +#define VIRTIO_IOMMU_S_NOENT 0x06 > +#define VIRTIO_IOMMU_S_FAULT 0x07 > + > +struct virtio_iommu_req_head { > + __u8 type; > + __u8 reserved[3]; > +}; > + > +struct virtio_iommu_req_tail { > + __u8 status; > + __u8 reserved[3]; > +}; > + > +struct virtio_iommu_req_attach { > + struct virtio_iommu_req_head head; > + __le32 domain; > + __le32 endpoint; > + __u8 reserved[8]; > + struct virtio_iommu_req_tail tail; > +}; > + > +struct virtio_iommu_req_detach { > + struct virtio_iommu_req_head head; > + __le32 domain; > + __le32 endpoint; > + __u8 reserved[8]; > + struct virtio_iommu_req_tail tail; > +}; > + > +#define VIRTIO_IOMMU_MAP_F_READ (1 << 0) > +#define VIRTIO_IOMMU_MAP_F_WRITE (1 << 1) > +#define VIRTIO_IOMMU_MAP_F_EXEC (1 << 2) > +#define VIRTIO_IOMMU_MAP_F_MMIO (1 << 3) > + > +#define VIRTIO_IOMMU_MAP_F_MASK (VIRTIO_IOMMU_MAP_F_READ | \ > + VIRTIO_IOMMU_MAP_F_WRITE | \ > + VIRTIO_IOMMU_MAP_F_EXEC | \ > + VIRTIO_IOMMU_MAP_F_MMIO) > + > +struct virtio_iommu_req_map { > + struct virtio_iommu_req_head head; > + __le32 domain; > + __le64 virt_start; > + __le64 virt_end; > + __le64 phys_start; > + __le32 flags; > + struct virtio_iommu_req_tail tail; > +}; > + > +struct virtio_iommu_req_unmap { > + struct virtio_iommu_req_head head; > + __le32 domain; > + __le64 virt_start; > + __le64 virt_end; > + __u8 reserved[4]; > + struct virtio_iommu_req_tail tail; > +}; > + > +#endif > -- > 2.19.1