linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
@ 2010-04-15 20:55 Tom Lyon
  2010-04-17 10:43 ` Joerg Roedel
  2010-04-17 18:34 ` Avi Kivity
  0 siblings, 2 replies; 4+ messages in thread
From: Tom Lyon @ 2010-04-15 20:55 UTC (permalink / raw)
  To: mst, hjk, gregkh, chrisw, joro, avi, kvm, linux-kernel

This is the second of 2 related, but independent, patches. This is for 
uio.c, the previous is for uio_pci_generic.c.

The 2 patches were previously one large patch. Changes for this version:
- uio_pci_generic.c just gets extensions so that a single fd can be used
  by non-privileged processes for interrupt control and mmaps
- All of the DMA and IOMMU related stuff move to uio.c; no longer a need
  to pass ioctls to individual uio drivers. It turns out that the code 
  is not PCI specific anyways.
- A new ioctl to pin DMA buffers to certain IO virtual addresses for KVM.
- New eventfd based interrupt notifications, including support for PCI
  specific MSI and MSI-X interrupts.
- PCI specific code to reset PCI functions before and after use

diff -ruNP linux-2.6.33/drivers/uio/uio.c uio-2.6.33/drivers/uio/uio.c
--- linux-2.6.33/drivers/uio/uio.c	2010-02-24 10:52:17.000000000 -0800
+++ uio-2.6.33/drivers/uio/uio.c	2010-04-15 12:39:02.000000000 -0700
@@ -23,6 +23,11 @@
 #include <linux/string.h>
 #include <linux/kobject.h>
 #include <linux/uio_driver.h>
+#include <linux/mmu_notifier.h>
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/eventfd.h>
+#include <linux/semaphore.h>
 
 #define UIO_MAX_DEVICES 255
 
@@ -37,8 +42,37 @@
 	struct uio_info		*info;
 	struct kobject		*map_dir;
 	struct kobject		*portio_dir;
+	struct pci_dev		*pdev;
+	int			pmaster;
+	struct semaphore	gate;
+	int			listeners;
+	atomic_t		mapcount;
+	struct msix_entry	*msix;
+	int			nvec;
+	struct iommu_domain	*domain;
+	int			cachec;
+	struct eventfd_ctx	*ev_irq;
+	struct eventfd_ctx	*ev_msi;
+	struct eventfd_ctx	**ev_msix;
 };
 
+/*
+ * Structure for keeping track of memory nailed down by the
+ * user for DMA
+ */
+struct dma_map_page {
+        struct list_head list;
+        struct page     **pages;
+	struct scatterlist *sg;
+        dma_addr_t      daddr;
+	unsigned long	vaddr;
+	int		npage;
+	int		rdwr;
+};
+
+static void uio_disable_msi(struct uio_device *);
+static void uio_disable_msix(struct uio_device *);
+
 static int uio_major;
 static DEFINE_IDR(uio_idr);
 static const struct file_operations uio_fops;
@@ -440,17 +474,38 @@
 	struct uio_device *idev = (struct uio_device *)dev_id;
 	irqreturn_t ret = idev->info->handler(irq, idev->info);
 
-	if (ret == IRQ_HANDLED)
+	if (ret != IRQ_HANDLED)
+		return ret;
+	if (idev->ev_irq)
+		eventfd_signal(idev->ev_irq, 1);
+	else
 		uio_event_notify(idev->info);
 
 	return ret;
 }
 
+/*
+ * MSI and MSI-X Interrupt handler.
+ * Just record an event
+ */
+static irqreturn_t msihandler(int irq, void *arg)
+{
+	struct eventfd_ctx *ctx = arg;
+
+	eventfd_signal(ctx, 1);
+	return IRQ_HANDLED;
+}
+
 struct uio_listener {
 	struct uio_device *dev;
 	s32 event_count;
+	struct mm_struct	*mm;
+	struct mmu_notifier	mmu_notifier;
+	struct list_head	dm_list;
 };
 
+static void uio_dma_unmapall(struct uio_listener *);
+
 static int uio_open(struct inode *inode, struct file *filep)
 {
 	struct uio_device *idev;
@@ -470,7 +525,7 @@
 		goto out;
 	}
 
-	listener = kmalloc(sizeof(*listener), GFP_KERNEL);
+	listener = kzalloc(sizeof(*listener), GFP_KERNEL);
 	if (!listener) {
 		ret = -ENOMEM;
 		goto err_alloc_listener;
@@ -478,8 +533,22 @@
 
 	listener->dev = idev;
 	listener->event_count = atomic_read(&idev->event);
+	INIT_LIST_HEAD(&listener->dm_list);
 	filep->private_data = listener;
 
+	down(&idev->gate);
+	if (idev->listeners == 0) {		/* first open */
+		if (idev->pmaster && !iommu_found() && !capable(CAP_SYS_RAWIO)) {
+			up(&idev->gate);
+			return -EPERM;
+		}
+		/* reset to known state if we can */
+		if (idev->pdev)
+			(void) pci_reset_function(idev->pdev);
+	}
+	idev->listeners++;
+	up(&idev->gate);
+
 	if (idev->info->open) {
 		ret = idev->info->open(idev->info, inode);
 		if (ret)
@@ -514,6 +583,34 @@
 	if (idev->info->release)
 		ret = idev->info->release(idev->info, inode);
 
+	uio_dma_unmapall(listener);
+	if (listener->mm) {
+		mmu_notifier_unregister(&listener->mmu_notifier, listener->mm);
+		listener->mm = NULL;
+	}
+
+	down(&idev->gate);
+	if (--idev->listeners <= 0) {
+		if (idev->msix) {
+			uio_disable_msix(idev);
+		}
+		if (idev->ev_msi) {
+			uio_disable_msi(idev);
+		}
+		if (idev->ev_irq) {
+			eventfd_ctx_put(idev->ev_msi);
+			idev->ev_irq = NULL;
+		}
+		if (idev->domain) {
+			iommu_domain_free(idev->domain);
+			idev->domain = NULL;
+		}
+		/* reset to known state if we can */
+		if (idev->pdev)
+			(void) pci_reset_function(idev->pdev);
+	}
+	up(&idev->gate);
+
 	module_put(idev->owner);
 	kfree(listener);
 	return ret;
@@ -730,12 +827,510 @@
 	}
 }
 
+/* Unmap DMA region */
+static void uio_dma_unmap(struct uio_listener *listener,
+			struct dma_map_page *mlp)
+{
+	int i;
+	struct uio_device *idev = listener->dev;
+
+	list_del(&mlp->list);
+	if (mlp->sg) {
+		dma_unmap_sg(idev->dev->parent, mlp->sg, mlp->npage, DMA_BIDIRECTIONAL);
+		kfree(mlp->sg);
+	} else {
+		for (i=0; i<mlp->npage; i++)
+			(void) iommu_unmap_range(idev->domain,
+					mlp->daddr + i*PAGE_SIZE, PAGE_SIZE);
+	}
+	for (i=0; i<mlp->npage; i++) {
+		if (mlp->rdwr)
+			SetPageDirty(mlp->pages[i]);
+		put_page(mlp->pages[i]);
+	}
+	listener->mm->locked_vm -= mlp->npage;
+	kfree(mlp->pages);
+	kfree(mlp);
+	atomic_dec(&idev->mapcount);
+}
+
+/* Unmap ALL DMA regions */
+static void uio_dma_unmapall(struct uio_listener *listener)
+{
+        struct list_head *pos, *pos2;
+        struct dma_map_page *mlp;
+
+        list_for_each_safe(pos, pos2, &listener->dm_list) {
+                mlp = list_entry(pos, struct dma_map_page, list);
+		uio_dma_unmap(listener, mlp);
+        }
+}
+
+int uio_dma_unmap_dm(struct uio_listener *listener, struct uio_dma_map *dmp)
+{
+	unsigned long start, npage;
+	struct dma_map_page *mlp;
+        struct list_head *pos, *pos2;
+	int ret;
+
+	start = dmp->vaddr & ~PAGE_SIZE;
+	npage = dmp->size >> PAGE_SHIFT;
+
+	ret = -ENXIO;
+	list_for_each_safe(pos, pos2, &listener->dm_list) {
+		mlp = list_entry(pos, struct dma_map_page, list);
+		if (dmp->vaddr != mlp->vaddr || mlp->npage != npage)
+			continue;
+		ret = 0;
+		uio_dma_unmap(listener, mlp);
+		break;
+	}
+	return ret;
+}
+
+/* Handle MMU notifications - user process freed or realloced memory
+ * which may be in use in a DMA region. Clean up region if so.
+ */
+static void uio_dma_handle_mmu_notify(struct mmu_notifier *mn,
+		unsigned long start, unsigned long end)
+{
+	struct uio_listener *listener;
+	unsigned long myend;
+        struct list_head *pos, *pos2;
+        struct dma_map_page *mlp;
+
+	listener = container_of(mn, struct uio_listener, mmu_notifier);
+	list_for_each_safe(pos, pos2, &listener->dm_list) {
+		mlp = list_entry(pos, struct dma_map_page, list);
+		if (mlp->vaddr >= end)
+			continue;
+		/*
+		 * Ranges overlap if they're not disjoint; and they're
+		 * disjoint if the end of one is before the start of
+		 * the other one.
+		 */
+		myend = mlp->vaddr + (mlp->npage << PAGE_SHIFT) - 1;
+		if (!(myend <= start || end <= mlp->vaddr)) {
+			printk(KERN_WARNING
+				"%s: demap start %lx end %lx va %lx pa %lx\n",
+				__func__, start, end, mlp->vaddr, (long)mlp->daddr);
+			uio_dma_unmap(listener, mlp);
+		}
+	}
+}
+
+static void uio_dma_inval_page(struct mmu_notifier *mn,
+		struct mm_struct *mm, unsigned long addr)
+{
+	uio_dma_handle_mmu_notify(mn, addr, addr + PAGE_SIZE);
+}
+
+static void uio_dma_inval_range_start(struct mmu_notifier *mn,
+		struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+	uio_dma_handle_mmu_notify(mn, start, end);
+}
+
+static const struct mmu_notifier_ops uio_dma_mmu_notifier_ops = {
+	.invalidate_page = uio_dma_inval_page,
+	.invalidate_range_start = uio_dma_inval_range_start,
+};
+
+static int uio_dma_map_iova(
+		struct uio_listener *listener,
+		unsigned long start_iova,
+		struct page **pages,
+		int npage,
+		int rdwr,
+		struct dma_map_page **mlpp)
+{
+	struct uio_device *idev = listener->dev;
+	int ret;
+	int i;
+	phys_addr_t hpa;
+	struct dma_map_page *mlp;
+	unsigned long iova = start_iova;
+
+	if (idev->domain == NULL) {
+		if (!list_empty(&listener->dm_list))	/* no mix with anywhere */
+			return -EINVAL;
+		if (!iommu_found())
+			return -EINVAL;
+		idev->domain = iommu_domain_alloc();
+		if (idev->domain == NULL)
+			return -ENXIO;
+		idev->cachec = iommu_domain_has_cap(idev->domain,
+					IOMMU_CAP_CACHE_COHERENCY);
+		ret = iommu_attach_device(idev->domain, idev->dev->parent);
+		if (ret) {
+			iommu_domain_free(idev->domain);
+			idev->domain = NULL;
+			printk(KERN_ERR "%s: device_attach failed %d\n", __func__, ret);
+			return ret;
+		}
+	}
+	for (i=0; i<npage; i++) {
+		if (iommu_iova_to_phys(idev->domain, iova + i*PAGE_SIZE))
+			return -EBUSY;
+	}
+	rdwr = rdwr ? IOMMU_READ|IOMMU_WRITE : IOMMU_READ;
+	if (idev->cachec) rdwr |= IOMMU_CACHE;
+	for (i=0; i<npage; i++) {
+		hpa = page_to_phys(pages[i]);
+		ret = iommu_map_range(idev->domain, iova, 
+			hpa, PAGE_SIZE, rdwr);
+		if (ret) {
+			while (--i > 0) {
+				iova -= PAGE_SIZE;
+				(void) iommu_unmap_range(idev->domain, iova, PAGE_SIZE);
+			}
+			return ret;
+		}
+		iova += PAGE_SIZE;
+	} 
+
+	mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
+	mlp->pages = pages;
+	mlp->daddr = start_iova;
+	mlp->npage = npage;
+	*mlpp = mlp;
+	atomic_inc(&idev->mapcount);
+	return 0;
+}
+
+static int uio_dma_map_anywhere(
+		struct uio_listener *listener,
+		struct page **pages,
+		int npage,
+		int rdwr,
+		struct dma_map_page **mlpp)
+{
+	struct uio_device *idev = listener->dev;
+	struct scatterlist *sg, *nsg;
+	int i, nents;
+	struct dma_map_page *mlp;
+	unsigned long length;
+
+	if (idev->domain) {
+		/* map anywhere and map iova don't mix */
+		if (atomic_read(&idev->mapcount) == 0) {
+			iommu_domain_free(idev->domain);
+			idev->domain = NULL;
+		} else
+			return -EINVAL;
+	}
+	sg = kzalloc(npage * sizeof(struct scatterlist), GFP_KERNEL);
+	if (sg == NULL)
+		return -ENOMEM;
+	for (i=0; i<npage; i++) {
+		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+	}
+	nents = dma_map_sg(idev->dev->parent, sg, npage,
+			rdwr ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE);
+	/* The API for dma_map_sg suggests that it may squash together
+	 * adjacent pages, but noone seems to really do that. So we squash 
+	 * it ourselves, because the user level wants a single buffer.
+	 * This works if (a) there is an iommu, or (b) the user allocates 
+	 * large buffers from a huge page
+	 */
+	nsg = sg;
+	for (i=1; i<nents; i++) {
+		length = sg[i].dma_length;
+		sg[i].dma_length = 0;
+		if (sg[i].dma_address == (nsg->dma_address + nsg->dma_length)) {
+			nsg->dma_length += length;
+		} else {
+			nsg++;
+			nsg->dma_address = sg[i].dma_address;
+			nsg->dma_length = length;
+		}
+	}
+	nents = 1 + (nsg - sg);
+	if (nents != 1) {
+		if (nents > 0)
+			dma_unmap_sg(idev->dev->parent, sg, npage, DMA_BIDIRECTIONAL);
+		for (i=0; i<npage; i++)
+			put_page(pages[i]);
+		kfree(sg);
+		printk(KERN_ERR "%s: sequential dma mapping failed\n", __func__);
+		return -EFAULT;
+	}
+
+	mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
+	mlp->pages = pages;
+	mlp->sg = sg;
+	mlp->daddr = sg_dma_address(sg);
+	mlp->npage = npage;
+	*mlpp = mlp;
+	atomic_inc(&idev->mapcount);
+	return 0;
+}
+
+static int uio_dma_map_common(struct uio_listener *listener,
+		unsigned int cmd, struct uio_dma_map *dmp)
+{
+	int locked, lock_limit;
+	struct page **pages;
+	int npage;
+	struct dma_map_page *mlp = NULL;
+	int ret=0;
+
+	if (dmp->vaddr & (PAGE_SIZE-1))
+		return -EINVAL;
+	if (dmp->size & (PAGE_SIZE-1))
+		return -EINVAL;
+	if (dmp->size <= 0)
+		return -EINVAL;
+	npage = dmp->size >> PAGE_SHIFT;
+
+	/* account for locked pages */
+	locked = npage + current->mm->locked_vm;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK exceeded\n",
+			__func__);
+		return -ENOMEM;
+	}
+	/* only 1 address space per fd */
+	if (current->mm != listener->mm) {
+		if (listener->mm != NULL)
+			return -EINVAL;
+		listener->mm = current->mm;
+		listener->mmu_notifier.ops = &uio_dma_mmu_notifier_ops;
+		ret = mmu_notifier_register(&listener->mmu_notifier,
+						listener->mm);
+		if (ret)
+			printk(KERN_ERR "%s: mmu_notifier_register failed %d\n",
+				__func__, ret);
+	}
+
+	pages = kzalloc(npage * sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL)
+		return -ENOMEM;
+	ret = get_user_pages_fast(dmp->vaddr, npage, dmp->rdwr, pages);
+	if (ret != npage) {
+		printk(KERN_ERR "%s: get_user_pages_fast returns %d, not %d\n",
+			__func__, ret, npage);
+		kfree(pages);
+		return -EFAULT;
+	}
+
+	if (cmd == UIO_DMA_MAP_IOVA)
+		ret = uio_dma_map_iova(listener, dmp->dmaaddr,
+				pages, npage, dmp->rdwr, &mlp);
+	else
+		ret = uio_dma_map_anywhere(listener, pages,
+				npage, dmp->rdwr, &mlp);
+	if (ret) {
+		kfree(pages);
+		return ret;
+	}
+	mlp->vaddr = dmp->vaddr;
+	mlp->rdwr = dmp->rdwr;
+	dmp->dmaaddr = mlp->daddr;
+	list_add(&mlp->list, &listener->dm_list);
+
+	current->mm->locked_vm += npage;
+	return 0;
+}
+
+static void uio_disable_msi(struct uio_device *idev)
+{
+	struct pci_dev *pdev = idev->pdev;
+
+	if (!pdev)		/* pci specific */
+		return;
+	if (idev->ev_msi) {
+		eventfd_ctx_put(idev->ev_msi);
+		free_irq(pdev->irq, idev->ev_msi);
+		idev->ev_msi = NULL;
+	}
+	pci_disable_msi(pdev);
+}
+
+static int uio_enable_msi(struct uio_device *idev, int fd)
+{
+	struct pci_dev *pdev = idev->pdev;
+	int ret;
+
+	if (!pdev)		/* pci specific */
+		return -EINVAL;
+	idev->ev_msi = eventfd_ctx_fdget(fd);
+	if (idev->ev_msi == NULL)
+		return -EINVAL;
+	pci_enable_msi(pdev);
+	ret = request_irq(pdev->irq, msihandler, 0,
+			idev->info->name, idev->ev_msi);
+	if (ret) {
+		eventfd_ctx_put(idev->ev_msi);
+		pci_disable_msi(pdev);
+		idev->ev_msi = NULL;
+	}
+	return ret;
+}
+
+static void uio_disable_msix(struct uio_device *idev)
+{
+	struct pci_dev *pdev = idev->pdev;
+	int i;
+
+	if (!pdev)		/* pci specific */
+		return;
+	if (idev->ev_msix && idev->msix) {
+		for (i=0; i<idev->nvec; i++) {
+			free_irq(idev->msix[i].vector, idev->ev_msix[i]);
+			eventfd_ctx_put(idev->ev_msix[i]);
+		}
+	}
+	if (idev->ev_msix) {
+		kfree(idev->ev_msix);
+		idev->ev_msix = NULL;
+	}
+	if (idev->msix) {
+		kfree(idev->msix);
+		idev->msix = NULL;
+	}
+	idev->nvec = 0;
+	pci_disable_msix(pdev);
+}
+
+static int uio_enable_msix(struct uio_device *idev, int nvec, void __user *uarg)
+{
+	struct pci_dev *pdev = idev->pdev;
+	struct eventfd_ctx *ctx;
+	int ret = 0;
+	int i;
+	int fd;
+
+	if (!pdev)		/* pci specific */
+		return -EINVAL;
+	idev->msix = kzalloc(nvec * sizeof(struct msix_entry), GFP_KERNEL);
+	idev->ev_msix = kzalloc(nvec * sizeof(struct eventfd_ctx *), GFP_KERNEL);
+	if (idev->msix == NULL || idev->ev_msix == NULL)
+		ret = -ENOMEM;
+	else {
+		for (i=0; i<nvec; i++) {
+			if (copy_from_user(&fd, uarg, sizeof fd)) {
+				ret = -EFAULT;
+				break;
+			}
+			uarg += sizeof fd;
+			ctx = eventfd_ctx_fdget(fd);
+			if (ctx == NULL) {
+				ret = -EINVAL;
+				break;
+			}
+			idev->msix[i].entry = i;
+			idev->ev_msix[i] = ctx;
+		}
+	}
+	if (!ret)
+		ret = pci_enable_msix(pdev, idev->msix, nvec);
+	idev->nvec = 0;
+	for (i=0; i<nvec && !ret; i++) {
+		ret = request_irq(idev->msix[i].vector, msihandler, 0,
+			idev->info->name, idev->ev_msix[i]);
+		if (ret)
+			break;
+		idev->nvec = i+1;
+	}
+	if (ret)
+		uio_disable_msix(idev);
+	return ret;
+}
+
+static int uio_ioctl(struct inode *inode, struct file *filep,
+			unsigned int cmd, unsigned long arg) 
+{
+	struct uio_listener *listener = filep->private_data;
+	struct uio_device *idev = listener->dev;
+	void __user *uarg = (void __user *)arg;
+	struct pci_dev *pdev = idev->pdev;
+	struct uio_dma_map dm;
+	int ret = 0;
+	u64 mask;
+	int fd, nfd;
+
+	if (idev == NULL || idev->info == NULL)
+		return -EINVAL;
+
+	switch(cmd) {
+
+	case UIO_DMA_MAP_ANYWHERE:
+	case UIO_DMA_MAP_IOVA:
+		if (copy_from_user(&dm, uarg, sizeof dm))
+                        return -EFAULT;
+		ret = uio_dma_map_common(listener, cmd, &dm);
+		if (!ret && copy_to_user(uarg, &dm, sizeof dm))
+			ret = -EFAULT;
+		break;
+
+	case UIO_DMA_UNMAP:
+		if (copy_from_user(&dm, uarg, sizeof dm))
+			return -EFAULT;
+		ret = uio_dma_unmap_dm(listener, &dm);
+		break;
+
+	case UIO_DMA_MASK:	/* set master mode and DMA mask */
+                if (copy_from_user(&mask, uarg, sizeof mask))
+                        return -EFAULT;
+		if (pdev) {
+			pci_set_master(pdev);
+			ret = pci_set_dma_mask(pdev, mask);
+		} else {
+			ret = dma_set_mask(idev->dev->parent, mask);
+		}
+		break;
+
+	case UIO_EVENTFD_IRQ:
+                if (copy_from_user(&fd, uarg, sizeof fd))
+                        return -EFAULT;
+		if (idev->ev_irq)
+			eventfd_ctx_put(idev->ev_irq);
+		if (fd >= 0) {
+			idev->ev_irq = eventfd_ctx_fdget(fd);
+			if (idev->ev_irq == NULL)
+				ret = -EINVAL;
+		}
+		break;
+
+	case UIO_EVENTFD_MSI:
+		if (copy_from_user(&fd, uarg, sizeof fd))
+			return -EFAULT;
+		if (fd >= 0 && idev->ev_msi == NULL && idev->ev_msix == NULL) {
+			ret = uio_enable_msi(idev, fd);
+		} else if (fd < 0 && idev->ev_msi) {
+			uio_disable_msi(idev);
+		} else
+			ret = -EINVAL;
+		break;
+
+	case UIO_EVENTFDS_MSIX:
+		if (copy_from_user(&nfd, uarg, sizeof nfd))
+			return -EFAULT;
+		uarg += sizeof nfd;
+		if (nfd > 0 && idev->ev_msi == NULL && idev->ev_msix == NULL) 
+			ret = uio_enable_msix(idev, nfd, uarg);
+		else if (nfd == 0 && idev->ev_msix)
+			uio_disable_msix(idev);
+		else {
+			ret = -EINVAL;
+		}
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	return ret;
+}
+
 static const struct file_operations uio_fops = {
 	.owner		= THIS_MODULE,
 	.open		= uio_open,
 	.release	= uio_release,
 	.read		= uio_read,
 	.write		= uio_write,
+	.ioctl		= uio_ioctl,
 	.mmap		= uio_mmap,
 	.poll		= uio_poll,
 	.fasync		= uio_fasync,
@@ -836,6 +1431,7 @@
 		ret = -ENOMEM;
 		goto err_kzalloc;
 	}
+	init_MUTEX(&idev->gate);
 
 	idev->owner = owner;
 	idev->info = info;
@@ -861,6 +1457,22 @@
 
 	info->uio_dev = idev;
 
+	/* See if we have a PCI device */
+	if (parent->bus == &pci_bus_type) {
+		u16 old_cmd, cmd;
+		struct pci_dev *pdev = to_pci_dev(parent);
+
+		idev->pdev = pdev;
+		/* see if its a master */
+		pci_read_config_word(pdev, PCI_COMMAND, &old_cmd);
+		cmd = old_cmd | PCI_COMMAND_MASTER;
+		pci_write_config_word(pdev, PCI_COMMAND, cmd);
+		pci_read_config_word(pdev, PCI_COMMAND, &cmd);
+		pci_write_config_word(pdev, PCI_COMMAND, old_cmd);
+		if (cmd & PCI_COMMAND_MASTER)
+			idev->pmaster = 1;
+	}
+
 	if (idev->info->irq >= 0) {
 		ret = request_irq(idev->info->irq, uio_interrupt,
 				  idev->info->irq_flags, idev->info->name, idev);
diff -ruNP linux-2.6.33/include/linux/uio_driver.h uio-2.6.33/include/linux/uio_driver.h
--- linux-2.6.33/include/linux/uio_driver.h	2010-02-24 10:52:17.000000000 -0800
+++ uio-2.6.33/include/linux/uio_driver.h	2010-04-14 15:42:57.000000000 -0700
@@ -14,8 +14,12 @@
 #ifndef _UIO_DRIVER_H_
 #define _UIO_DRIVER_H_
 
-#include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/ioctl.h>
+
+#ifdef __KERNEL__
+
+#include <linux/module.h>
 #include <linux/interrupt.h>
 
 struct uio_map;
@@ -122,4 +126,23 @@
 #define UIO_PORT_GPIO	2
 #define UIO_PORT_OTHER	3
 
+#endif	/* __KERNEL__ */
+
+// Kernel & User level defines for ioctls
+
+struct uio_dma_map {
+	unsigned long	vaddr;
+	unsigned long long dmaaddr;
+	int	size;
+	int	rdwr;
+};
+
+#define	UIO_DMA_MAP_ANYWHERE	_IOWR(';', 100, struct uio_dma_map)
+#define	UIO_DMA_MAP_IOVA	_IOWR(';', 101, struct uio_dma_map)
+#define	UIO_DMA_UNMAP		_IOW(';', 102, struct uio_dma_map)
+#define	UIO_DMA_MASK		_IOW(';', 103, unsigned long long)
+#define	UIO_EVENTFD_IRQ		_IOW(';', 104, int)
+#define	UIO_EVENTFD_MSI		_IOW(';', 105, int)
+#define	UIO_EVENTFDS_MSIX	_IOW(';', 106, int)
+
 #endif /* _LINUX_UIO_DRIVER_H_ */

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
  2010-04-15 20:55 [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc Tom Lyon
@ 2010-04-17 10:43 ` Joerg Roedel
  2010-04-17 17:04   ` Tom Lyon
  2010-04-17 18:34 ` Avi Kivity
  1 sibling, 1 reply; 4+ messages in thread
From: Joerg Roedel @ 2010-04-17 10:43 UTC (permalink / raw)
  To: Tom Lyon; +Cc: mst, hjk, gregkh, chrisw, avi, kvm, linux-kernel

On Thu, Apr 15, 2010 at 01:55:29PM -0700, Tom Lyon wrote:

> +	down(&idev->gate);
> +	if (idev->listeners == 0) {		/* first open */
> +		if (idev->pmaster && !iommu_found() && !capable(CAP_SYS_RAWIO)) {
> +			up(&idev->gate);
> +			return -EPERM;
> +		}
> +		/* reset to known state if we can */
> +		if (idev->pdev)
> +			(void) pci_reset_function(idev->pdev);
> +	}
> +	idev->listeners++;
> +	up(&idev->gate);

Why do you want to allow multiple opens for a device? Is it not
sufficient to allow only one?


> +	if (idev->domain == NULL) {
> +		if (!list_empty(&listener->dm_list))	/* no mix with anywhere */
> +			return -EINVAL;
> +		if (!iommu_found())
> +			return -EINVAL;
> +		idev->domain = iommu_domain_alloc();
> +		if (idev->domain == NULL)
> +			return -ENXIO;
> +		idev->cachec = iommu_domain_has_cap(idev->domain,
> +					IOMMU_CAP_CACHE_COHERENCY);
> +		ret = iommu_attach_device(idev->domain, idev->dev->parent);
> +		if (ret) {
> +			iommu_domain_free(idev->domain);
> +			idev->domain = NULL;
> +			printk(KERN_ERR "%s: device_attach failed %d\n", __func__, ret);
> +			return ret;
> +		}
> +	}

If userspace calls this path this will make all the addresses mapped
with DMA-API paths unusable by the device. This doesn't look like a sane
userspace interface.

For better and more in-depth review I suggest that you split up this
large patch into a series of smaler which implement specific aspects of
your work.

	Joerg

P.S.: I got these warning when applying your patches ...

Applying: drivers/uio/uio_pci_generic.c: allow access for non-privileged processes
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:86: trailing whitespace.
                        info->mem[j].name = name; 
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:87: trailing whitespace.
                        info->mem[j].addr = pci_resource_start(pdev, i); 
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:89: trailing whitespace.
                        info->mem[j].memtype = UIO_MEM_PHYS; 
warning: 3 lines add whitespace errors.
Applying: drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:315: trailing whitespace.
                ret = iommu_map_range(idev->domain, iova, 
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:325: trailing whitespace.
        } 
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:366: trailing whitespace.
         * adjacent pages, but noone seems to really do that. So we squash 
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:368: trailing whitespace.
         * This works if (a) there is an iommu, or (b) the user allocates 
/home/joro/src/linux.trees.git/.git/rebase-apply/patch:578: trailing whitespace.
                        unsigned int cmd, unsigned long arg) 
warning: squelched 1 whitespace error
warning: 6 lines add whitespace errors.

And there are also some coding-style issues.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
  2010-04-17 10:43 ` Joerg Roedel
@ 2010-04-17 17:04   ` Tom Lyon
  0 siblings, 0 replies; 4+ messages in thread
From: Tom Lyon @ 2010-04-17 17:04 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: mst, hjk, gregkh, chrisw, avi, kvm, linux-kernel

The current uio and uio_pci_generic allow multiple opens; I was just preserving that behavior.

On Saturday 17 April 2010 03:43:09 am Joerg Roedel wrote:
> On Thu, Apr 15, 2010 at 01:55:29PM -0700, Tom Lyon wrote:
> 
> > +	down(&idev->gate);
> > +	if (idev->listeners == 0) {		/* first open */
> > +		if (idev->pmaster && !iommu_found() && !capable(CAP_SYS_RAWIO)) {
> > +			up(&idev->gate);
> > +			return -EPERM;
> > +		}
> > +		/* reset to known state if we can */
> > +		if (idev->pdev)
> > +			(void) pci_reset_function(idev->pdev);
> > +	}
> > +	idev->listeners++;
> > +	up(&idev->gate);
> 
> Why do you want to allow multiple opens for a device? Is it not
> sufficient to allow only one?
> 
> 
> > +	if (idev->domain == NULL) {
> > +		if (!list_empty(&listener->dm_list))	/* no mix with anywhere */
> > +			return -EINVAL;
> > +		if (!iommu_found())
> > +			return -EINVAL;
> > +		idev->domain = iommu_domain_alloc();
> > +		if (idev->domain == NULL)
> > +			return -ENXIO;
> > +		idev->cachec = iommu_domain_has_cap(idev->domain,
> > +					IOMMU_CAP_CACHE_COHERENCY);
> > +		ret = iommu_attach_device(idev->domain, idev->dev->parent);
> > +		if (ret) {
> > +			iommu_domain_free(idev->domain);
> > +			idev->domain = NULL;
> > +			printk(KERN_ERR "%s: device_attach failed %d\n", __func__, ret);
> > +			return ret;
> > +		}
> > +	}
> 
> If userspace calls this path this will make all the addresses mapped
> with DMA-API paths unusable by the device. This doesn't look like a sane
> userspace interface.
> 
> For better and more in-depth review I suggest that you split up this
> large patch into a series of smaler which implement specific aspects of
> your work.
> 
> 	Joerg
> 
> P.S.: I got these warning when applying your patches ...
> 
> Applying: drivers/uio/uio_pci_generic.c: allow access for non-privileged processes
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:86: trailing whitespace.
>                         info->mem[j].name = name; 
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:87: trailing whitespace.
>                         info->mem[j].addr = pci_resource_start(pdev, i); 
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:89: trailing whitespace.
>                         info->mem[j].memtype = UIO_MEM_PHYS; 
> warning: 3 lines add whitespace errors.
> Applying: drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:315: trailing whitespace.
>                 ret = iommu_map_range(idev->domain, iova, 
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:325: trailing whitespace.
>         } 
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:366: trailing whitespace.
>          * adjacent pages, but noone seems to really do that. So we squash 
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:368: trailing whitespace.
>          * This works if (a) there is an iommu, or (b) the user allocates 
> /home/joro/src/linux.trees.git/.git/rebase-apply/patch:578: trailing whitespace.
>                         unsigned int cmd, unsigned long arg) 
> warning: squelched 1 whitespace error
> warning: 6 lines add whitespace errors.
> 
> And there are also some coding-style issues.
> 



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc.
  2010-04-15 20:55 [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc Tom Lyon
  2010-04-17 10:43 ` Joerg Roedel
@ 2010-04-17 18:34 ` Avi Kivity
  1 sibling, 0 replies; 4+ messages in thread
From: Avi Kivity @ 2010-04-17 18:34 UTC (permalink / raw)
  To: Tom Lyon; +Cc: mst, hjk, gregkh, chrisw, joro, kvm, linux-kernel

On 04/15/2010 11:55 PM, Tom Lyon wrote:
> This is the second of 2 related, but independent, patches. This is for
> uio.c, the previous is for uio_pci_generic.c.
>
> The 2 patches were previously one large patch. Changes for this version:
> - uio_pci_generic.c just gets extensions so that a single fd can be used
>    by non-privileged processes for interrupt control and mmaps
> - All of the DMA and IOMMU related stuff move to uio.c; no longer a need
>    to pass ioctls to individual uio drivers. It turns out that the code
>    is not PCI specific anyways.
> - A new ioctl to pin DMA buffers to certain IO virtual addresses for KVM.
> - New eventfd based interrupt notifications, including support for PCI
>    specific MSI and MSI-X interrupts.
> - PCI specific code to reset PCI functions before and after use
>
>
> @@ -122,4 +126,23 @@
>   #define UIO_PORT_GPIO	2
>   #define UIO_PORT_OTHER	3
>
> +#endif	/* __KERNEL__ */
> +
> +// Kernel&  User level defines for ioctls
> +
> +struct uio_dma_map {
> +	unsigned long	vaddr;
> +	unsigned long long dmaaddr;
>    

Use __u64 for both, otherwise you need to rewrite the structures in the 
kernel in case 32-bit userspace calls a 64-bit kernel.

> +	int	size;
>    

What units?  Size is probably too small.  Suggest unsigned type to avoid 
an extra check in the kernel.

> +	int	rdwr;
>    

What values can this hold?

> +};
> +
> +#define	UIO_DMA_MAP_ANYWHERE	_IOWR(';', 100, struct uio_dma_map)
>    

What does this do?  Ignore vaddr?

> +#define	UIO_DMA_MAP_IOVA	_IOWR(';', 101, struct uio_dma_map)
> +#define	UIO_DMA_UNMAP		_IOW(';', 102, struct uio_dma_map)
> +#define	UIO_DMA_MASK		_IOW(';', 103, unsigned long long)
> +#define	UIO_EVENTFD_IRQ		_IOW(';', 104, int)
> +#define	UIO_EVENTFD_MSI		_IOW(';', 105, int)
> +#define	UIO_EVENTFDS_MSIX	_IOW(';', 106, int)
>    

These three need some documentation.


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-04-17 18:35 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-15 20:55 [PATCH V2] drivers/uio/uio.c: DMA mapping, interrupt extensions, etc Tom Lyon
2010-04-17 10:43 ` Joerg Roedel
2010-04-17 17:04   ` Tom Lyon
2010-04-17 18:34 ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).