linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH v2] uacce: Add uacce_ctrl misc device
@ 2021-01-25  8:34 Zhou Wang
  2021-01-25  9:28 ` Greg Kroah-Hartman
  2021-01-25 15:47 ` Jason Gunthorpe
  0 siblings, 2 replies; 18+ messages in thread
From: Zhou Wang @ 2021-01-25  8:34 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao
  Cc: linux-accelerators, linux-kernel, iommu, linux-mm, song.bao.hua,
	liguozhu, Zhou Wang, Sihang Chen

Uacce driver supports to use devices in user space safely by SVA. However,
IO page fault may happen when doing DMA operations, which will affect DMA
performance severely.

For some accelerators which need stable performance, it is better to
avoid IO page fault totally. Current memory related APIs, like mlock,
could not achieve this requirement. Idealy we should have a system call,
like "mpin", to pin related pages. However, there is no such API in
kernel, currently drivers which need pin pages implement ioctl interfaces
in their own drivers, like v412, gpu, infiniband, media, vfio etc.

This patch also tries to do this by introducing a pin user page interface in
uacce driver. This patch introduces a new char device named /dev/uacce_ctrl
to help to maintain pin/unpin pages. User space can do pin/unpin pages by
ioctls of an open file of /dev/uacce_ctrl, all pinned pages under one file
will be unpinned in file release process.

Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Sihang Chen <chensihang1@hisilicon.com>
Suggested-by: Barry Song <song.bao.hua@hisilicon.com>
---
Changes v1 -> v2:
 - Some tiny fixes
 - Follow Greg's suggestion to get mm-list and iommu-list involved.

v1: https://lwn.net/Articles/843432/
---
 drivers/misc/uacce/uacce.c      | 172 +++++++++++++++++++++++++++++++++++++++-
 include/uapi/misc/uacce/uacce.h |  16 ++++
 2 files changed, 187 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index d07af4e..69d3ba8 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -2,16 +2,28 @@
 #include <linux/compat.h>
 #include <linux/dma-mapping.h>
 #include <linux/iommu.h>
+#include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/uacce.h>
+#include <linux/vmalloc.h>
 
 static struct class *uacce_class;
 static dev_t uacce_devt;
 static DEFINE_MUTEX(uacce_mutex);
 static DEFINE_XARRAY_ALLOC(uacce_xa);
 
+struct uacce_pin_container {
+	struct xarray array;
+};
+
+struct pin_pages {
+	unsigned long first;
+	unsigned long nr_pages;
+	struct page **pages;
+};
+
 static int uacce_start_queue(struct uacce_queue *q)
 {
 	int ret = 0;
@@ -497,6 +509,151 @@ void uacce_remove(struct uacce_device *uacce)
 }
 EXPORT_SYMBOL_GPL(uacce_remove);
 
+static int uacce_ctrl_open(struct inode *inode, struct file *file)
+{
+	struct uacce_pin_container *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	file->private_data = p;
+
+	xa_init(&p->array);
+
+	return 0;
+}
+
+static int uacce_ctrl_release(struct inode *inode, struct file *file)
+{
+	struct uacce_pin_container *priv = file->private_data;
+	struct pin_pages *p;
+	unsigned long idx;
+
+	xa_for_each(&priv->array, idx, p) {
+		unpin_user_pages(p->pages, p->nr_pages);
+		xa_erase(&priv->array, p->first);
+		vfree(p->pages);
+		kfree(p);
+	}
+
+	xa_destroy(&priv->array);
+	kfree(priv);
+
+	return 0;
+}
+
+static int uacce_pin_page(struct uacce_pin_container *priv,
+			  struct uacce_pin_address *addr)
+{
+	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
+	unsigned long first, last, nr_pages;
+	struct page **pages;
+	struct pin_pages *p;
+	int ret;
+
+	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
+	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
+	nr_pages = last - first + 1;
+
+	pages = vmalloc(nr_pages * sizeof(struct page *));
+	if (!pages)
+		return -ENOMEM;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
+				  flags | FOLL_LONGTERM, pages);
+	if (ret != nr_pages) {
+		pr_err("uacce: Failed to pin page\n");
+		goto free_p;
+	}
+	p->first = first;
+	p->nr_pages = nr_pages;
+	p->pages = pages;
+
+	ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
+	if (ret)
+		goto unpin_pages;
+
+	return 0;
+
+unpin_pages:
+	unpin_user_pages(pages, nr_pages);
+free_p:
+	kfree(p);
+free:
+	vfree(pages);
+	return ret;
+}
+
+static int uacce_unpin_page(struct uacce_pin_container *priv,
+			    struct uacce_pin_address *addr)
+{
+	unsigned long first, last, nr_pages;
+	struct pin_pages *p;
+
+	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
+	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
+	nr_pages = last - first + 1;
+
+	/* find pin_pages */
+	p = xa_load(&priv->array, first);
+	if (!p)
+		return -ENODEV;
+
+	if (p->nr_pages != nr_pages)
+		return -EINVAL;
+
+	/* unpin */
+	unpin_user_pages(p->pages, p->nr_pages);
+
+	/* release resource */
+	xa_erase(&priv->array, first);
+	vfree(p->pages);
+	kfree(p);
+
+	return 0;
+}
+
+static long uacce_ctrl_unl_ioctl(struct file *filep, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct uacce_pin_container *p = filep->private_data;
+	struct uacce_pin_address addr;
+
+	if (copy_from_user(&addr, (void __user *)arg,
+			   sizeof(struct uacce_pin_address)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case UACCE_CMD_PIN:
+		return uacce_pin_page(p, &addr);
+
+	case UACCE_CMD_UNPIN:
+		return uacce_unpin_page(p, &addr);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations uacce_ctrl_fops = {
+	.owner = THIS_MODULE,
+	.open = uacce_ctrl_open,
+	.release = uacce_ctrl_release,
+	.unlocked_ioctl	= uacce_ctrl_unl_ioctl,
+};
+
+static struct miscdevice uacce_ctrl_miscdev = {
+	.name = "uacce_ctrl",
+	.minor = MISC_DYNAMIC_MINOR,
+	.fops = &uacce_ctrl_fops,
+};
+
 static int __init uacce_init(void)
 {
 	int ret;
@@ -507,13 +664,26 @@ static int __init uacce_init(void)
 
 	ret = alloc_chrdev_region(&uacce_devt, 0, MINORMASK, UACCE_NAME);
 	if (ret)
-		class_destroy(uacce_class);
+		goto destroy_class;
+
+	ret = misc_register(&uacce_ctrl_miscdev);
+	if (ret) {
+		pr_err("uacce: ctrl dev registration failed\n");
+		goto unregister_cdev;
+	}
 
+	return 0;
+
+unregister_cdev:
+	unregister_chrdev_region(uacce_devt, MINORMASK);
+destroy_class:
+	class_destroy(uacce_class);
 	return ret;
 }
 
 static __exit void uacce_exit(void)
 {
+	misc_deregister(&uacce_ctrl_miscdev);
 	unregister_chrdev_region(uacce_devt, MINORMASK);
 	class_destroy(uacce_class);
 }
diff --git a/include/uapi/misc/uacce/uacce.h b/include/uapi/misc/uacce/uacce.h
index cc71856..0b10551 100644
--- a/include/uapi/misc/uacce/uacce.h
+++ b/include/uapi/misc/uacce/uacce.h
@@ -35,4 +35,20 @@ enum uacce_qfrt {
 	UACCE_QFRT_DUS = 1,
 };
 
+/**
+ * struct uacce_pin_address - Expected pin user space address and size
+ * @addr: Address to pin
+ * @size: Size of pin address
+ */
+struct uacce_pin_address {
+	__u64 addr;
+	__u64 size;
+};
+
+/* UACCE_CMD_PIN: Pin a range of memory */
+#define UACCE_CMD_PIN		_IOW('W', 2, struct uacce_pin_address)
+
+/* UACCE_CMD_UNPIN: Unpin a range of memory */
+#define UACCE_CMD_UNPIN		_IOW('W', 3, struct uacce_pin_address)
+
 #endif
-- 
2.8.1



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25  8:34 [RFC PATCH v2] uacce: Add uacce_ctrl misc device Zhou Wang
@ 2021-01-25  9:28 ` Greg Kroah-Hartman
  2021-01-25 12:47   ` Zhou Wang
  2021-01-25 15:47 ` Jason Gunthorpe
  1 sibling, 1 reply; 18+ messages in thread
From: Greg Kroah-Hartman @ 2021-01-25  9:28 UTC (permalink / raw)
  To: Zhou Wang
  Cc: Arnd Bergmann, Zhangfei Gao, linux-accelerators, linux-kernel,
	iommu, linux-mm, song.bao.hua, liguozhu, Sihang Chen

On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
> +static int uacce_pin_page(struct uacce_pin_container *priv,
> +			  struct uacce_pin_address *addr)
> +{
> +	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
> +	unsigned long first, last, nr_pages;
> +	struct page **pages;
> +	struct pin_pages *p;
> +	int ret;
> +
> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> +	nr_pages = last - first + 1;
> +
> +	pages = vmalloc(nr_pages * sizeof(struct page *));
> +	if (!pages)
> +		return -ENOMEM;
> +
> +	p = kzalloc(sizeof(*p), GFP_KERNEL);
> +	if (!p) {
> +		ret = -ENOMEM;
> +		goto free;
> +	}
> +
> +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> +				  flags | FOLL_LONGTERM, pages);
> +	if (ret != nr_pages) {
> +		pr_err("uacce: Failed to pin page\n");
> +		goto free_p;
> +	}
> +	p->first = first;
> +	p->nr_pages = nr_pages;
> +	p->pages = pages;
> +
> +	ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
> +	if (ret)
> +		goto unpin_pages;
> +
> +	return 0;
> +
> +unpin_pages:
> +	unpin_user_pages(pages, nr_pages);
> +free_p:
> +	kfree(p);
> +free:
> +	vfree(pages);
> +	return ret;
> +}

No error checking on the memory locations or size of memory to be
'pinned', what could ever go wrong?

Note, this opens a huge hole in the kernel that needs to be documented
really really really well somewhere, as it can cause very strange
results if you do not know exactly what you are doing, which is why I am
going to require that the mm developers sign off on this type of thing.

And to give more context, I really don't think this is needed, but if it
is, it should be a new syscall, not buried in an ioctl for a random
misc driver, but the author seems to want it tied to this specific
driver...

thanks,

greg k-h


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25  9:28 ` Greg Kroah-Hartman
@ 2021-01-25 12:47   ` Zhou Wang
  0 siblings, 0 replies; 18+ messages in thread
From: Zhou Wang @ 2021-01-25 12:47 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: song.bao.hua, Sihang Chen, Arnd Bergmann, linux-kernel, linux-mm,
	iommu, Zhangfei Gao, liguozhu, linux-accelerators

On 2021/1/25 17:28, Greg Kroah-Hartman wrote:
> On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
>> +static int uacce_pin_page(struct uacce_pin_container *priv,
>> +			  struct uacce_pin_address *addr)
>> +{
>> +	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
>> +	unsigned long first, last, nr_pages;
>> +	struct page **pages;
>> +	struct pin_pages *p;
>> +	int ret;
>> +
>> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
>> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> +	nr_pages = last - first + 1;
>> +
>> +	pages = vmalloc(nr_pages * sizeof(struct page *));
>> +	if (!pages)
>> +		return -ENOMEM;
>> +
>> +	p = kzalloc(sizeof(*p), GFP_KERNEL);
>> +	if (!p) {
>> +		ret = -ENOMEM;
>> +		goto free;
>> +	}
>> +
>> +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
>> +				  flags | FOLL_LONGTERM, pages);
>> +	if (ret != nr_pages) {
>> +		pr_err("uacce: Failed to pin page\n");
>> +		goto free_p;
>> +	}
>> +	p->first = first;
>> +	p->nr_pages = nr_pages;
>> +	p->pages = pages;
>> +
>> +	ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
>> +	if (ret)
>> +		goto unpin_pages;
>> +
>> +	return 0;
>> +
>> +unpin_pages:
>> +	unpin_user_pages(pages, nr_pages);
>> +free_p:
>> +	kfree(p);
>> +free:
>> +	vfree(pages);
>> +	return ret;
>> +}
> 
> No error checking on the memory locations or size of memory to be
> 'pinned', what could ever go wrong?

These problems has been considered if I understand it right.

I have checked pin_user_pages_fast, it checks memory location by access_ok.
For the size of memory to pin, we added a limitation, like limiting pin
page size to 1GB, however, it has been removed in the post patch. The
reason is the permission of /dev/uacce_ctrl is 600 root:root, /dev/uacce_ctrl
has to been added to trusted groups by root to be used.

> 
> Note, this opens a huge hole in the kernel that needs to be documented
> really really really well somewhere, as it can cause very strange
> results if you do not know exactly what you are doing, which is why I am
> going to require that the mm developers sign off on this type of thing.
> 
> And to give more context, I really don't think this is needed, but if it

Maybe I do not explain the problem clearly. Let us see it again.

From the view of functionality, pin page is no needed at all, however,
from the view of performance, we need make DMA physical pages fixed as
the latency of IO page fault currently is relatively high, for example
for ARM SMMUv3 IO page fault, it will be at least 20us+. When a DMA
transaction triggers a IO page fault, the performance will be bad. See
from a long term, the DMA performance will be not stable.

Here we use pinned pages to create a memory pool in user space, users'
lib/app can use the memory in above pinned pages based memory pool to
avoid IO page fault.

Best,
Zhou

> is, it should be a new syscall, not buried in an ioctl for a random
> misc driver, but the author seems to want it tied to this specific
> driver...
> 
> thanks,
> 
> greg k-h
> 
> .
> 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25  8:34 [RFC PATCH v2] uacce: Add uacce_ctrl misc device Zhou Wang
  2021-01-25  9:28 ` Greg Kroah-Hartman
@ 2021-01-25 15:47 ` Jason Gunthorpe
  2021-01-25 22:21   ` Song Bao Hua (Barry Song)
  2021-01-26  9:00   ` Zhou Wang
  1 sibling, 2 replies; 18+ messages in thread
From: Jason Gunthorpe @ 2021-01-25 15:47 UTC (permalink / raw)
  To: Zhou Wang
  Cc: Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm, song.bao.hua,
	liguozhu, Sihang Chen

On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:

> +static int uacce_pin_page(struct uacce_pin_container *priv,
> +			  struct uacce_pin_address *addr)
> +{
> +	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
> +	unsigned long first, last, nr_pages;
> +	struct page **pages;
> +	struct pin_pages *p;
> +	int ret;
> +
> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> +	nr_pages = last - first + 1;
> +
> +	pages = vmalloc(nr_pages * sizeof(struct page *));
> +	if (!pages)
> +		return -ENOMEM;
> +
> +	p = kzalloc(sizeof(*p), GFP_KERNEL);
> +	if (!p) {
> +		ret = -ENOMEM;
> +		goto free;
> +	}
> +
> +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> +				  flags | FOLL_LONGTERM, pages);

This needs to copy the RLIMIT_MEMLOCK and can_do_mlock() stuff from
other places, like ib_umem_get

> +	ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));

And this is really weird, I don't think it makes sense to make handles
for DMA based on the starting VA.

> +static int uacce_unpin_page(struct uacce_pin_container *priv,
> +			    struct uacce_pin_address *addr)
> +{
> +	unsigned long first, last, nr_pages;
> +	struct pin_pages *p;
> +
> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> +	nr_pages = last - first + 1;
> +
> +	/* find pin_pages */
> +	p = xa_load(&priv->array, first);
> +	if (!p)
> +		return -ENODEV;
> +
> +	if (p->nr_pages != nr_pages)
> +		return -EINVAL;
> +
> +	/* unpin */
> +	unpin_user_pages(p->pages, p->nr_pages);

And unpinning without guaranteeing there is no ongoing DMA is really
weird

Are you abusing this in conjunction with a SVA scheme just to prevent
page motion? Why wasn't mlock good enough?

Jason


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25 15:47 ` Jason Gunthorpe
@ 2021-01-25 22:21   ` Song Bao Hua (Barry Song)
  2021-01-25 23:16     ` Jason Gunthorpe
  2021-01-26  9:00   ` Zhou Wang
  1 sibling, 1 reply; 18+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-01-25 22:21 UTC (permalink / raw)
  To: Jason Gunthorpe, Wangzhou (B)
  Cc: Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm,
	Liguozhu (Kenneth), chensihang (A)



> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Tuesday, January 26, 2021 4:47 AM
> To: Wangzhou (B) <wangzhou1@hisilicon.com>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>; Arnd Bergmann
> <arnd@arndb.de>; Zhangfei Gao <zhangfei.gao@linaro.org>;
> linux-accelerators@lists.ozlabs.org; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org; Song Bao Hua (Barry Song)
> <song.bao.hua@hisilicon.com>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> chensihang (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> 
> On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
> 
> > +static int uacce_pin_page(struct uacce_pin_container *priv,
> > +			  struct uacce_pin_address *addr)
> > +{
> > +	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
> > +	unsigned long first, last, nr_pages;
> > +	struct page **pages;
> > +	struct pin_pages *p;
> > +	int ret;
> > +
> > +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> > +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> > +	nr_pages = last - first + 1;
> > +
> > +	pages = vmalloc(nr_pages * sizeof(struct page *));
> > +	if (!pages)
> > +		return -ENOMEM;
> > +
> > +	p = kzalloc(sizeof(*p), GFP_KERNEL);
> > +	if (!p) {
> > +		ret = -ENOMEM;
> > +		goto free;
> > +	}
> > +
> > +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
> > +				  flags | FOLL_LONGTERM, pages);
> 
> This needs to copy the RLIMIT_MEMLOCK and can_do_mlock() stuff from
> other places, like ib_umem_get
> 
> > +	ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
> 
> And this is really weird, I don't think it makes sense to make handles
> for DMA based on the starting VA.
> 
> > +static int uacce_unpin_page(struct uacce_pin_container *priv,
> > +			    struct uacce_pin_address *addr)
> > +{
> > +	unsigned long first, last, nr_pages;
> > +	struct pin_pages *p;
> > +
> > +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
> > +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
> > +	nr_pages = last - first + 1;
> > +
> > +	/* find pin_pages */
> > +	p = xa_load(&priv->array, first);
> > +	if (!p)
> > +		return -ENODEV;
> > +
> > +	if (p->nr_pages != nr_pages)
> > +		return -EINVAL;
> > +
> > +	/* unpin */
> > +	unpin_user_pages(p->pages, p->nr_pages);
> 
> And unpinning without guaranteeing there is no ongoing DMA is really
> weird

In SVA case, kernel has no idea if accelerators are accessing
the memory so I would assume SVA has a method to prevent
the pages being transferred from migration or release. Otherwise,
SVA will crash easily in a system with high memory pressure.

Anyway, This is a problem worth further investigating.

> 
> Are you abusing this in conjunction with a SVA scheme just to prevent
> page motion? Why wasn't mlock good enough?

Page migration won't cause any disfunction in SVA case as IO page
fault will get a valid page again. It is only a performance issue
as IO page fault has larger latency than the usual page fault,
would be 3-80slower than page fault[1]

mlock, while certainly be able to prevent swapping out, it won't
be able to stop page moving due to:
* memory compaction in alloc_pages()
* making huge pages
* numa balance
* memory compaction in CMA
etc.

[1] https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7482091&tag=1
> 
> Jason

Thanks
Barry


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25 22:21   ` Song Bao Hua (Barry Song)
@ 2021-01-25 23:16     ` Jason Gunthorpe
  2021-01-25 23:35       ` Song Bao Hua (Barry Song)
  0 siblings, 1 reply; 18+ messages in thread
From: Jason Gunthorpe @ 2021-01-25 23:16 UTC (permalink / raw)
  To: Song Bao Hua (Barry Song)
  Cc: Wangzhou (B),
	Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm,
	Liguozhu (Kenneth), chensihang (A)

On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> mlock, while certainly be able to prevent swapping out, it won't
> be able to stop page moving due to:
> * memory compaction in alloc_pages()
> * making huge pages
> * numa balance
> * memory compaction in CMA

Enabling those things is a major reason to have SVA device in the
first place, providing a SW API to turn it all off seems like the
wrong direction.

If the device doesn't want to use SVA then don't use it, use normal
DMA pinning like everything else.

Jason


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25 23:16     ` Jason Gunthorpe
@ 2021-01-25 23:35       ` Song Bao Hua (Barry Song)
  2021-01-26  1:13         ` Jason Gunthorpe
  0 siblings, 1 reply; 18+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-01-25 23:35 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Wangzhou (B),
	Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm,
	Liguozhu (Kenneth), chensihang (A)



> -----Original Message-----
> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On Behalf Of
> Jason Gunthorpe
> Sent: Tuesday, January 26, 2021 12:16 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; Greg Kroah-Hartman
> <gregkh@linuxfoundation.org>; Arnd Bergmann <arnd@arndb.de>; Zhangfei Gao
> <zhangfei.gao@linaro.org>; linux-accelerators@lists.ozlabs.org;
> linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> linux-mm@kvack.org; Liguozhu (Kenneth) <liguozhu@hisilicon.com>; chensihang
> (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> 
> On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > mlock, while certainly be able to prevent swapping out, it won't
> > be able to stop page moving due to:
> > * memory compaction in alloc_pages()
> > * making huge pages
> > * numa balance
> > * memory compaction in CMA
> 
> Enabling those things is a major reason to have SVA device in the
> first place, providing a SW API to turn it all off seems like the
> wrong direction.

I wouldn't say this is a major reason to have SVA. If we read the
history of SVA and papers, people would think easy programming due
to data struct sharing between cpu and device, and process space
isolation in device would be the major reasons for SVA. SVA also
declares it supports zero-copy while zero-copy doesn't necessarily
depend on SVA.

Page migration and I/O page fault overhead, on the other hand, would
probably be the major problems which block SVA becoming a 
high-performance and more popular solution.

> 
> If the device doesn't want to use SVA then don't use it, use normal
> DMA pinning like everything else.
> 

If we disable SVA, we won't get the benefits of SVA on address sharing,
and process space isolation.

> Jason

Thanks
Barry


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25 23:35       ` Song Bao Hua (Barry Song)
@ 2021-01-26  1:13         ` Jason Gunthorpe
  2021-01-26  1:26           ` Song Bao Hua (Barry Song)
  0 siblings, 1 reply; 18+ messages in thread
From: Jason Gunthorpe @ 2021-01-26  1:13 UTC (permalink / raw)
  To: Song Bao Hua (Barry Song)
  Cc: Wangzhou (B),
	Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm,
	Liguozhu (Kenneth), chensihang (A)

On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:

> > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > > mlock, while certainly be able to prevent swapping out, it won't
> > > be able to stop page moving due to:
> > > * memory compaction in alloc_pages()
> > > * making huge pages
> > > * numa balance
> > > * memory compaction in CMA
> > 
> > Enabling those things is a major reason to have SVA device in the
> > first place, providing a SW API to turn it all off seems like the
> > wrong direction.
> 
> I wouldn't say this is a major reason to have SVA. If we read the
> history of SVA and papers, people would think easy programming due
> to data struct sharing between cpu and device, and process space
> isolation in device would be the major reasons for SVA. SVA also
> declares it supports zero-copy while zero-copy doesn't necessarily
> depend on SVA.

Once you have to explicitly make system calls to declare memory under
IO, you loose all of that.

Since you've asked the app to be explicit about the DMAs it intends to
do, there is not really much reason to use SVA for those DMAs anymore.

Jason


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-26  1:13         ` Jason Gunthorpe
@ 2021-01-26  1:26           ` Song Bao Hua (Barry Song)
  2021-01-26 18:20             ` Jason Gunthorpe
  2021-01-29 10:09             ` Tian, Kevin
  0 siblings, 2 replies; 18+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-01-26  1:26 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Wangzhou (B),
	Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm,
	Liguozhu (Kenneth), chensihang (A)



> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Tuesday, January 26, 2021 2:13 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; Greg Kroah-Hartman
> <gregkh@linuxfoundation.org>; Arnd Bergmann <arnd@arndb.de>; Zhangfei Gao
> <zhangfei.gao@linaro.org>; linux-accelerators@lists.ozlabs.org;
> linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> linux-mm@kvack.org; Liguozhu (Kenneth) <liguozhu@hisilicon.com>; chensihang
> (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> 
> On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:
> 
> > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > be able to stop page moving due to:
> > > > * memory compaction in alloc_pages()
> > > > * making huge pages
> > > > * numa balance
> > > > * memory compaction in CMA
> > >
> > > Enabling those things is a major reason to have SVA device in the
> > > first place, providing a SW API to turn it all off seems like the
> > > wrong direction.
> >
> > I wouldn't say this is a major reason to have SVA. If we read the
> > history of SVA and papers, people would think easy programming due
> > to data struct sharing between cpu and device, and process space
> > isolation in device would be the major reasons for SVA. SVA also
> > declares it supports zero-copy while zero-copy doesn't necessarily
> > depend on SVA.
> 
> Once you have to explicitly make system calls to declare memory under
> IO, you loose all of that.
> 
> Since you've asked the app to be explicit about the DMAs it intends to
> do, there is not really much reason to use SVA for those DMAs anymore.

Let's see a non-SVA case. We are not using SVA, we can have
a memory pool by hugetlb or pin, and app can allocate memory
from this pool, and get stable I/O performance on the memory
from the pool. But device has its separate page table which
is not bound with this process, thus lacking the protection
of process space isolation. Plus, CPU and device are using
different address.

And then we move to SVA case, we can still have a memory pool
by hugetlb or pin, and app can allocate memory from this pool
since this pool is mapped to the address space of the process,
and we are able to get stable I/O performance since it is always
there. But in this case, device is using the page table of
process with the full permission control.
And they are using same address and can possibly enjoy the easy
programming if HW supports.

SVA is not doom to work with IO page fault only. If we have SVA+pin,
we would get both sharing address and stable I/O latency.

> 
> Jason

Thanks
Barry



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-25 15:47 ` Jason Gunthorpe
  2021-01-25 22:21   ` Song Bao Hua (Barry Song)
@ 2021-01-26  9:00   ` Zhou Wang
  1 sibling, 0 replies; 18+ messages in thread
From: Zhou Wang @ 2021-01-26  9:00 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm, song.bao.hua,
	liguozhu, Sihang Chen

On 2021/1/25 23:47, Jason Gunthorpe wrote:
> On Mon, Jan 25, 2021 at 04:34:56PM +0800, Zhou Wang wrote:
> 
>> +static int uacce_pin_page(struct uacce_pin_container *priv,
>> +			  struct uacce_pin_address *addr)
>> +{
>> +	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
>> +	unsigned long first, last, nr_pages;
>> +	struct page **pages;
>> +	struct pin_pages *p;
>> +	int ret;
>> +
>> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
>> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> +	nr_pages = last - first + 1;
>> +
>> +	pages = vmalloc(nr_pages * sizeof(struct page *));
>> +	if (!pages)
>> +		return -ENOMEM;
>> +
>> +	p = kzalloc(sizeof(*p), GFP_KERNEL);
>> +	if (!p) {
>> +		ret = -ENOMEM;
>> +		goto free;
>> +	}
>> +
>> +	ret = pin_user_pages_fast(addr->addr & PAGE_MASK, nr_pages,
>> +				  flags | FOLL_LONGTERM, pages);
> 
> This needs to copy the RLIMIT_MEMLOCK and can_do_mlock() stuff from
> other places, like ib_umem_get

I am confused as can_do_mlock seems to be about the limitation of mlock,
which has different meaning with pin memory?

> 
>> +	ret = xa_err(xa_store(&priv->array, p->first, p, GFP_KERNEL));
> 
> And this is really weird, I don't think it makes sense to make handles
> for DMA based on the starting VA.

Here starting VA is used as an index of pinned pages. When doing unpinning,
starting VA is used to get pinned pages, check unpinned VA/size.

But it has problem here to use xa_store here as new one will replace old one :(
Seems we can use xa_insert here, which returns -EBUSY if the entry at one
index is not empty. The design here will be that we only support to pin same
VA once.

> 
>> +static int uacce_unpin_page(struct uacce_pin_container *priv,
>> +			    struct uacce_pin_address *addr)
>> +{
>> +	unsigned long first, last, nr_pages;
>> +	struct pin_pages *p;
>> +
>> +	first = (addr->addr & PAGE_MASK) >> PAGE_SHIFT;
>> +	last = ((addr->addr + addr->size - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> +	nr_pages = last - first + 1;
>> +
>> +	/* find pin_pages */
>> +	p = xa_load(&priv->array, first);
>> +	if (!p)
>> +		return -ENODEV;
>> +
>> +	if (p->nr_pages != nr_pages)
>> +		return -EINVAL;
>> +
>> +	/* unpin */
>> +	unpin_user_pages(p->pages, p->nr_pages);
> 
> And unpinning without guaranteeing there is no ongoing DMA is really
> weird
> 
> Are you abusing this in conjunction with a SVA scheme just to prevent
> page motion? Why wasn't mlock good enough?

Just as Barry said, mlock can not avoid IOPF from page migration.

Best,
Zhou

> 
> Jason
> 
> .
> 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-26  1:26           ` Song Bao Hua (Barry Song)
@ 2021-01-26 18:20             ` Jason Gunthorpe
  2021-01-28  1:28               ` Song Bao Hua (Barry Song)
  2021-01-29 10:09             ` Tian, Kevin
  1 sibling, 1 reply; 18+ messages in thread
From: Jason Gunthorpe @ 2021-01-26 18:20 UTC (permalink / raw)
  To: Song Bao Hua (Barry Song)
  Cc: Wangzhou (B),
	Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm,
	Liguozhu (Kenneth), chensihang (A)

On Tue, Jan 26, 2021 at 01:26:45AM +0000, Song Bao Hua (Barry Song) wrote:
> > On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:
> > 
> > > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song) wrote:
> > > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > > be able to stop page moving due to:
> > > > > * memory compaction in alloc_pages()
> > > > > * making huge pages
> > > > > * numa balance
> > > > > * memory compaction in CMA
> > > >
> > > > Enabling those things is a major reason to have SVA device in the
> > > > first place, providing a SW API to turn it all off seems like the
> > > > wrong direction.
> > >
> > > I wouldn't say this is a major reason to have SVA. If we read the
> > > history of SVA and papers, people would think easy programming due
> > > to data struct sharing between cpu and device, and process space
> > > isolation in device would be the major reasons for SVA. SVA also
> > > declares it supports zero-copy while zero-copy doesn't necessarily
> > > depend on SVA.
> > 
> > Once you have to explicitly make system calls to declare memory under
> > IO, you loose all of that.
> > 
> > Since you've asked the app to be explicit about the DMAs it intends to
> > do, there is not really much reason to use SVA for those DMAs anymore.
> 
> Let's see a non-SVA case. We are not using SVA, we can have
> a memory pool by hugetlb or pin, and app can allocate memory
> from this pool, and get stable I/O performance on the memory
> from the pool. But device has its separate page table which
> is not bound with this process, thus lacking the protection
> of process space isolation. Plus, CPU and device are using
> different address.

So you are relying on the platform to do the SVA for the device?

This feels like it goes back to another topic where I felt the SVA
setup uAPI should be shared and not buried into every driver's unique
ioctls.

Having something like this in a shared SVA system is somewhat less
strange.

Jason


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-26 18:20             ` Jason Gunthorpe
@ 2021-01-28  1:28               ` Song Bao Hua (Barry Song)
  0 siblings, 0 replies; 18+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-01-28  1:28 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Wangzhou (B),
	Greg Kroah-Hartman, Arnd Bergmann, Zhangfei Gao,
	linux-accelerators, linux-kernel, iommu, linux-mm,
	Liguozhu (Kenneth), chensihang (A)



> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Wednesday, January 27, 2021 7:20 AM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; Greg Kroah-Hartman
> <gregkh@linuxfoundation.org>; Arnd Bergmann <arnd@arndb.de>; Zhangfei Gao
> <zhangfei.gao@linaro.org>; linux-accelerators@lists.ozlabs.org;
> linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> linux-mm@kvack.org; Liguozhu (Kenneth) <liguozhu@hisilicon.com>; chensihang
> (A) <chensihang1@hisilicon.com>
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> 
> On Tue, Jan 26, 2021 at 01:26:45AM +0000, Song Bao Hua (Barry Song) wrote:
> > > On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song) wrote:
> > >
> > > > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song)
> wrote:
> > > > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > > > be able to stop page moving due to:
> > > > > > * memory compaction in alloc_pages()
> > > > > > * making huge pages
> > > > > > * numa balance
> > > > > > * memory compaction in CMA
> > > > >
> > > > > Enabling those things is a major reason to have SVA device in the
> > > > > first place, providing a SW API to turn it all off seems like the
> > > > > wrong direction.
> > > >
> > > > I wouldn't say this is a major reason to have SVA. If we read the
> > > > history of SVA and papers, people would think easy programming due
> > > > to data struct sharing between cpu and device, and process space
> > > > isolation in device would be the major reasons for SVA. SVA also
> > > > declares it supports zero-copy while zero-copy doesn't necessarily
> > > > depend on SVA.
> > >
> > > Once you have to explicitly make system calls to declare memory under
> > > IO, you loose all of that.
> > >
> > > Since you've asked the app to be explicit about the DMAs it intends to
> > > do, there is not really much reason to use SVA for those DMAs anymore.
> >
> > Let's see a non-SVA case. We are not using SVA, we can have
> > a memory pool by hugetlb or pin, and app can allocate memory
> > from this pool, and get stable I/O performance on the memory
> > from the pool. But device has its separate page table which
> > is not bound with this process, thus lacking the protection
> > of process space isolation. Plus, CPU and device are using
> > different address.
> 
> So you are relying on the platform to do the SVA for the device?
> 

Sorry for late response.

uacce and its userspace framework UADK depend on SVA, leveraging
the enhanced security by isolated process address space.

This patch is mainly an extension for performance optimization to
get stable high-performance I/O on pinned memory even though the
hardware supports IO page fault to get pages back after swapping
out or page migration.
But IO page fault will cause serious latency jitter for high-speed
I/O.
For slow speed device, they don't need to use this extension.

> This feels like it goes back to another topic where I felt the SVA
> setup uAPI should be shared and not buried into every driver's unique
> ioctls.
> 
> Having something like this in a shared SVA system is somewhat less
> strange.

Sounds reasonable. On the other hand, uacce seems to be an common
uAPI for SVA, and probably the only one for this moment.

uacce is a framework not a specific driver as any accelerators
can hook into this framework as long as a device provides
uacce_ops and register itself by uacce_register(). Uacce, for
itself, doesn't bind with any specific hardware. So uacce interfaces
are kind of common uAPI :-)

> 
> Jason

Thanks
Barry



^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-26  1:26           ` Song Bao Hua (Barry Song)
  2021-01-26 18:20             ` Jason Gunthorpe
@ 2021-01-29 10:09             ` Tian, Kevin
  2021-01-29 10:33               ` Song Bao Hua (Barry Song)
  2021-02-01 23:44               ` Jason Gunthorpe
  1 sibling, 2 replies; 18+ messages in thread
From: Tian, Kevin @ 2021-01-29 10:09 UTC (permalink / raw)
  To: Song Bao Hua (Barry Song), Jason Gunthorpe
  Cc: chensihang (A),
	Arnd Bergmann, Greg Kroah-Hartman, linux-kernel, iommu, linux-mm,
	Zhangfei Gao, Liguozhu (Kenneth),
	linux-accelerators

> From: Song Bao Hua (Barry Song)
> Sent: Tuesday, January 26, 2021 9:27 AM
> 
> > -----Original Message-----
> > From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> > Sent: Tuesday, January 26, 2021 2:13 PM
> > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> > Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org>; Arnd Bergmann <arnd@arndb.de>;
> Zhangfei Gao
> > <zhangfei.gao@linaro.org>; linux-accelerators@lists.ozlabs.org;
> > linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> > linux-mm@kvack.org; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> chensihang
> > (A) <chensihang1@hisilicon.com>
> > Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> >
> > On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song)
> wrote:
> >
> > > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song)
> wrote:
> > > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > > be able to stop page moving due to:
> > > > > * memory compaction in alloc_pages()
> > > > > * making huge pages
> > > > > * numa balance
> > > > > * memory compaction in CMA
> > > >
> > > > Enabling those things is a major reason to have SVA device in the
> > > > first place, providing a SW API to turn it all off seems like the
> > > > wrong direction.
> > >
> > > I wouldn't say this is a major reason to have SVA. If we read the
> > > history of SVA and papers, people would think easy programming due
> > > to data struct sharing between cpu and device, and process space
> > > isolation in device would be the major reasons for SVA. SVA also
> > > declares it supports zero-copy while zero-copy doesn't necessarily
> > > depend on SVA.
> >
> > Once you have to explicitly make system calls to declare memory under
> > IO, you loose all of that.
> >
> > Since you've asked the app to be explicit about the DMAs it intends to
> > do, there is not really much reason to use SVA for those DMAs anymore.
> 
> Let's see a non-SVA case. We are not using SVA, we can have
> a memory pool by hugetlb or pin, and app can allocate memory
> from this pool, and get stable I/O performance on the memory
> from the pool. But device has its separate page table which
> is not bound with this process, thus lacking the protection
> of process space isolation. Plus, CPU and device are using
> different address.
> 
> And then we move to SVA case, we can still have a memory pool
> by hugetlb or pin, and app can allocate memory from this pool
> since this pool is mapped to the address space of the process,
> and we are able to get stable I/O performance since it is always
> there. But in this case, device is using the page table of
> process with the full permission control.
> And they are using same address and can possibly enjoy the easy
> programming if HW supports.
> 
> SVA is not doom to work with IO page fault only. If we have SVA+pin,
> we would get both sharing address and stable I/O latency.
> 

Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying 
cpu_va of the memory pool as the iova? 

Thanks
Kevin


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-29 10:09             ` Tian, Kevin
@ 2021-01-29 10:33               ` Song Bao Hua (Barry Song)
  2021-02-01 23:44               ` Jason Gunthorpe
  1 sibling, 0 replies; 18+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-01-29 10:33 UTC (permalink / raw)
  To: Tian, Kevin, Jason Gunthorpe
  Cc: chensihang (A),
	Arnd Bergmann, Greg Kroah-Hartman, linux-kernel, iommu, linux-mm,
	Zhangfei Gao, Liguozhu (Kenneth),
	linux-accelerators



> -----Original Message-----
> From: Tian, Kevin [mailto:kevin.tian@intel.com]
> Sent: Friday, January 29, 2021 11:09 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; Jason Gunthorpe
> <jgg@ziepe.ca>
> Cc: chensihang (A) <chensihang1@hisilicon.com>; Arnd Bergmann
> <arnd@arndb.de>; Greg Kroah-Hartman <gregkh@linuxfoundation.org>;
> linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> linux-mm@kvack.org; Zhangfei Gao <zhangfei.gao@linaro.org>; Liguozhu
> (Kenneth) <liguozhu@hisilicon.com>; linux-accelerators@lists.ozlabs.org
> Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> 
> > From: Song Bao Hua (Barry Song)
> > Sent: Tuesday, January 26, 2021 9:27 AM
> >
> > > -----Original Message-----
> > > From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> > > Sent: Tuesday, January 26, 2021 2:13 PM
> > > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> > > Cc: Wangzhou (B) <wangzhou1@hisilicon.com>; Greg Kroah-Hartman
> > > <gregkh@linuxfoundation.org>; Arnd Bergmann <arnd@arndb.de>;
> > Zhangfei Gao
> > > <zhangfei.gao@linaro.org>; linux-accelerators@lists.ozlabs.org;
> > > linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> > > linux-mm@kvack.org; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> > chensihang
> > > (A) <chensihang1@hisilicon.com>
> > > Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> > >
> > > On Mon, Jan 25, 2021 at 11:35:22PM +0000, Song Bao Hua (Barry Song)
> > wrote:
> > >
> > > > > On Mon, Jan 25, 2021 at 10:21:14PM +0000, Song Bao Hua (Barry Song)
> > wrote:
> > > > > > mlock, while certainly be able to prevent swapping out, it won't
> > > > > > be able to stop page moving due to:
> > > > > > * memory compaction in alloc_pages()
> > > > > > * making huge pages
> > > > > > * numa balance
> > > > > > * memory compaction in CMA
> > > > >
> > > > > Enabling those things is a major reason to have SVA device in the
> > > > > first place, providing a SW API to turn it all off seems like the
> > > > > wrong direction.
> > > >
> > > > I wouldn't say this is a major reason to have SVA. If we read the
> > > > history of SVA and papers, people would think easy programming due
> > > > to data struct sharing between cpu and device, and process space
> > > > isolation in device would be the major reasons for SVA. SVA also
> > > > declares it supports zero-copy while zero-copy doesn't necessarily
> > > > depend on SVA.
> > >
> > > Once you have to explicitly make system calls to declare memory under
> > > IO, you loose all of that.
> > >
> > > Since you've asked the app to be explicit about the DMAs it intends to
> > > do, there is not really much reason to use SVA for those DMAs anymore.
> >
> > Let's see a non-SVA case. We are not using SVA, we can have
> > a memory pool by hugetlb or pin, and app can allocate memory
> > from this pool, and get stable I/O performance on the memory
> > from the pool. But device has its separate page table which
> > is not bound with this process, thus lacking the protection
> > of process space isolation. Plus, CPU and device are using
> > different address.
> >
> > And then we move to SVA case, we can still have a memory pool
> > by hugetlb or pin, and app can allocate memory from this pool
> > since this pool is mapped to the address space of the process,
> > and we are able to get stable I/O performance since it is always
> > there. But in this case, device is using the page table of
> > process with the full permission control.
> > And they are using same address and can possibly enjoy the easy
> > programming if HW supports.
> >
> > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > we would get both sharing address and stable I/O latency.
> >
> 
> Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> cpu_va of the memory pool as the iova?

I think it enjoys the advantage of stable I/O latency of
traditional MAP_DMA, and also uses the process page table
which SVA can provide. The major difference is that in
SVA case, iova totally belongs to process and is as normal
as other heap/stack/data:
p = mmap(.....MAP_ANON....);
ioctl(/dev/acc, p, PIN);

SVA for itself, provides the ability to guarantee the
address space isolation of multiple processes.  If the
device can access the data struct  such as list, tree
directly, they can further enjoy the convenience of
programming SVA gives.

So we are looking for a combination of stable io latency
of traditional DMA map and the ability of SVA.

> 
> Thanks
> Kevin

Thanks
Barry



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-01-29 10:09             ` Tian, Kevin
  2021-01-29 10:33               ` Song Bao Hua (Barry Song)
@ 2021-02-01 23:44               ` Jason Gunthorpe
  2021-02-02  0:22                 ` Song Bao Hua (Barry Song)
  2021-02-02  2:51                 ` Tian, Kevin
  1 sibling, 2 replies; 18+ messages in thread
From: Jason Gunthorpe @ 2021-02-01 23:44 UTC (permalink / raw)
  To: Tian, Kevin
  Cc: Song Bao Hua (Barry Song), chensihang (A),
	Arnd Bergmann, Greg Kroah-Hartman, linux-kernel, iommu, linux-mm,
	Zhangfei Gao, Liguozhu (Kenneth),
	linux-accelerators

On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > we would get both sharing address and stable I/O latency.
> 
> Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying 
> cpu_va of the memory pool as the iova? 

I think their issue is the HW can't do the cpu_va trick without also
involving the system IOMMU in a SVA mode

It really is something that belongs under some general /dev/sva as we
talked on the vfio thread

Jason


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-02-01 23:44               ` Jason Gunthorpe
@ 2021-02-02  0:22                 ` Song Bao Hua (Barry Song)
  2021-02-02  2:51                 ` Tian, Kevin
  1 sibling, 0 replies; 18+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-02-02  0:22 UTC (permalink / raw)
  To: Jason Gunthorpe, Tian, Kevin
  Cc: chensihang (A),
	Arnd Bergmann, Greg Kroah-Hartman, linux-kernel, iommu, linux-mm,
	Zhangfei Gao, Liguozhu (Kenneth),
	linux-accelerators



> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@ziepe.ca]
> Sent: Tuesday, February 2, 2021 12:44 PM
> To: Tian, Kevin <kevin.tian@intel.com>
> Cc: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; chensihang (A)
> <chensihang1@hisilicon.com>; Arnd Bergmann <arnd@arndb.de>; Greg
> Kroah-Hartman <gregkh@linuxfoundation.org>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org; Zhangfei Gao
> <zhangfei.gao@linaro.org>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> linux-accelerators@lists.ozlabs.org
> Subject: Re: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> 
> On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > > we would get both sharing address and stable I/O latency.
> >
> > Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> > cpu_va of the memory pool as the iova?
> 
> I think their issue is the HW can't do the cpu_va trick without also
> involving the system IOMMU in a SVA mode
> 
> It really is something that belongs under some general /dev/sva as we
> talked on the vfio thread

AFAIK, there is no this /dev/sva so /dev/uacce is an uAPI
which belongs to sva.

Another option is that we add a system call like
fs/userfaultfd.c, and move the file_operations and  ioctl
to the anon inode by creating fd via anon_inode_getfd().
Then nothing will be buried by uacce.

> 
> Jason

Thanks
Barry



^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-02-01 23:44               ` Jason Gunthorpe
  2021-02-02  0:22                 ` Song Bao Hua (Barry Song)
@ 2021-02-02  2:51                 ` Tian, Kevin
  2021-02-02  3:47                   ` Song Bao Hua (Barry Song)
  1 sibling, 1 reply; 18+ messages in thread
From: Tian, Kevin @ 2021-02-02  2:51 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Song Bao Hua (Barry Song), chensihang (A),
	Arnd Bergmann, Greg Kroah-Hartman, linux-kernel, iommu, linux-mm,
	Zhangfei Gao, Liguozhu (Kenneth),
	linux-accelerators

> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Tuesday, February 2, 2021 7:44 AM
> 
> On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > > we would get both sharing address and stable I/O latency.
> >
> > Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> > cpu_va of the memory pool as the iova?
> 
> I think their issue is the HW can't do the cpu_va trick without also
> involving the system IOMMU in a SVA mode
> 

This is the part that I didn't understand. Using cpu_va in a MAP_DMA
interface doesn't require device support. It's just an user-specified
address to be mapped into the IOMMU page table. On the other hand,
sharing CPU page table through a SVA interface for an usage where I/O 
page faults must be completely avoided seems a misleading attempt. 
Even if people do want this model (e.g. mix pinning+fault), it should be
a mm syscall as Greg pointed out, not specific to sva.

Thanks
Kevin


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
  2021-02-02  2:51                 ` Tian, Kevin
@ 2021-02-02  3:47                   ` Song Bao Hua (Barry Song)
  0 siblings, 0 replies; 18+ messages in thread
From: Song Bao Hua (Barry Song) @ 2021-02-02  3:47 UTC (permalink / raw)
  To: Tian, Kevin, Jason Gunthorpe
  Cc: chensihang (A),
	Arnd Bergmann, Greg Kroah-Hartman, linux-kernel, iommu, linux-mm,
	Zhangfei Gao, Liguozhu (Kenneth),
	linux-accelerators



> -----Original Message-----
> From: Tian, Kevin [mailto:kevin.tian@intel.com]
> Sent: Tuesday, February 2, 2021 3:52 PM
> To: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; chensihang (A)
> <chensihang1@hisilicon.com>; Arnd Bergmann <arnd@arndb.de>; Greg
> Kroah-Hartman <gregkh@linuxfoundation.org>; linux-kernel@vger.kernel.org;
> iommu@lists.linux-foundation.org; linux-mm@kvack.org; Zhangfei Gao
> <zhangfei.gao@linaro.org>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;
> linux-accelerators@lists.ozlabs.org
> Subject: RE: [RFC PATCH v2] uacce: Add uacce_ctrl misc device
> 
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Tuesday, February 2, 2021 7:44 AM
> >
> > On Fri, Jan 29, 2021 at 10:09:03AM +0000, Tian, Kevin wrote:
> > > > SVA is not doom to work with IO page fault only. If we have SVA+pin,
> > > > we would get both sharing address and stable I/O latency.
> > >
> > > Isn't it like a traditional MAP_DMA API (imply pinning) plus specifying
> > > cpu_va of the memory pool as the iova?
> >
> > I think their issue is the HW can't do the cpu_va trick without also
> > involving the system IOMMU in a SVA mode
> >
> 
> This is the part that I didn't understand. Using cpu_va in a MAP_DMA
> interface doesn't require device support. It's just an user-specified
> address to be mapped into the IOMMU page table. On the other hand,

The background is that uacce is based on SVA and we are building
applications on uacce:
https://www.kernel.org/doc/html/v5.10/misc-devices/uacce.html
so IOMMU simply uses the page table of MMU, and don't do any
special mapping to an user-specified address. We don't break
the basic assumption that uacce is using SVA, otherwise, we
need to re-build uacce and the whole base.

> sharing CPU page table through a SVA interface for an usage where I/O
> page faults must be completely avoided seems a misleading attempt.

That is not for completely avoiding IO page fault, that is just
an extension for high-performance I/O case, providing a way to
avoid IO latency jitter. Using it or not is totally up to users.

> Even if people do want this model (e.g. mix pinning+fault), it should be
> a mm syscall as Greg pointed out, not specific to sva.
> 

We are glad to make it a syscall if people are happy with
it. The simplest way would be a syscall similar with
userfaultfd  if we don't want to mess up mm_struct.

> Thanks
> Kevin

Thanks
Barry


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2021-02-02  3:47 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-25  8:34 [RFC PATCH v2] uacce: Add uacce_ctrl misc device Zhou Wang
2021-01-25  9:28 ` Greg Kroah-Hartman
2021-01-25 12:47   ` Zhou Wang
2021-01-25 15:47 ` Jason Gunthorpe
2021-01-25 22:21   ` Song Bao Hua (Barry Song)
2021-01-25 23:16     ` Jason Gunthorpe
2021-01-25 23:35       ` Song Bao Hua (Barry Song)
2021-01-26  1:13         ` Jason Gunthorpe
2021-01-26  1:26           ` Song Bao Hua (Barry Song)
2021-01-26 18:20             ` Jason Gunthorpe
2021-01-28  1:28               ` Song Bao Hua (Barry Song)
2021-01-29 10:09             ` Tian, Kevin
2021-01-29 10:33               ` Song Bao Hua (Barry Song)
2021-02-01 23:44               ` Jason Gunthorpe
2021-02-02  0:22                 ` Song Bao Hua (Barry Song)
2021-02-02  2:51                 ` Tian, Kevin
2021-02-02  3:47                   ` Song Bao Hua (Barry Song)
2021-01-26  9:00   ` Zhou Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).