* [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
@ 2022-06-26  9:04 Cindy Lu
  2022-06-26 10:01   ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Cindy Lu @ 2022-06-26  9:04 UTC (permalink / raw)
  To: mst, jasowang, lulu, virtualization, linux-kernel

We count pinned_vm as follows in vhost-vDPA:

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
         ret = -ENOMEM;
         goto unlock;
}
This means that if we have two vDPA devices for the same VM, the pages
would be counted twice. To avoid this, add a tree to record the pages
that have already been counted, so they will not be counted again.

Add vdpa_mem_tree to save the memory that has already been counted.
Use a hlist to save the root of each per-mm vdpa_mem_tree.

Signed-off-by: Cindy Lu <lulu@redhat.com>
---
 drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  1 +
 2 files changed, 64 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 40097826cff0..4ca8b1ed944b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -32,6 +32,8 @@
 #include <linux/kcov.h>
 
 #include "vhost.h"
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
 
 static ushort max_mem_regions = 64;
 module_param(max_mem_regions, ushort, 0444);
@@ -49,6 +51,14 @@ enum {
 #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
+struct vhost_vdpa_rbtree_node {
+	struct hlist_node node;
+	struct rb_root_cached vdpa_mem_tree;
+	struct mm_struct *mm_using;
+};
+static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
+int vhost_vdpa_rbtree_hlist_status;
+
 #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
 static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
 {
@@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
 	dev->mm = NULL;
 }
 
+struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
+{
+	struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
+	struct rb_root_cached *vdpa_tree;
+	u32 key;
+
+	/* No hashed table yet, initialize one */
+	if (vhost_vdpa_rbtree_hlist_status == 0) {
+		hash_init(vhost_vdpa_rbtree_hlist);
+		vhost_vdpa_rbtree_hlist_status = 1;
+	}
+
+	key = jhash_1word((u64)mm, JHASH_INITVAL);
+	hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
+			       key) {
+		if (rbtree_root->mm_using == mm)
+			return &(rbtree_root->vdpa_mem_tree);
+	}
+	rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
+	if (!rbtree_root)
+		return NULL;
+	rbtree_root->mm_using = mm;
+	rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
+	hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
+	vdpa_tree = &(rbtree_root->vdpa_mem_tree);
+	return vdpa_tree;
+}
+
+void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
+{
+	struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
+	u32 key;
+
+	key = jhash_1word((u64)mm, JHASH_INITVAL);
+
+	/* Find the rbtree root saved for this mm and free it */
+	hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
+			       key) {
+		if (rbtree_root->mm_using == mm) {
+			hash_del(&rbtree_root->node);
+			kfree(rbtree_root);
+		}
+	}
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	err = vhost_dev_alloc_iovecs(dev);
 	if (err)
 		goto err_cgroup;
+	dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
+	if (dev->vdpa_mem_tree == NULL) {
+		err = -ENOMEM;
+		goto err_cgroup;
+	}
 
 	return 0;
 err_cgroup:
@@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 		dev->worker = NULL;
 	}
 err_worker:
+	vhost_vdpa_relase_mem_tree(dev->mm);
 	vhost_detach_mm(dev);
 	dev->kcov_handle = 0;
 err_mm:
@@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		dev->worker = NULL;
 		dev->kcov_handle = 0;
 	}
+
+	vhost_vdpa_relase_mem_tree(dev->mm);
 	vhost_detach_mm(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d9109107af08..84de33de3abf 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -160,6 +160,7 @@ struct vhost_dev {
 	int byte_weight;
 	u64 kcov_handle;
 	bool use_worker;
+	struct rb_root_cached *vdpa_mem_tree;
 	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
 			   struct vhost_iotlb_msg *msg);
 };
-- 
2.34.3


^ permalink raw reply related	[flat|nested] 24+ messages in thread
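
The patch above only introduces the per-mm tree; the code that actually
consults it while pinning pages is expected in patch 2/2 of the series,
which is not included here. As a rough sketch of how such a tree could be
queried so that a PFN range already tracked for this mm is not charged
against RLIMIT_MEMLOCK a second time, assuming the stock
<linux/interval_tree.h> helpers (the function below is hypothetical and
not part of the patch):

#include <linux/interval_tree.h>
#include <linux/minmax.h>

/*
 * Hypothetical helper: return how many pages in [pfn_start, pfn_last]
 * are not yet present in the per-mm tree, i.e. how many still need to
 * be charged to mm->pinned_vm.  Assumes the stored ranges do not
 * overlap one another.
 */
static u64 vhost_vdpa_uncounted_pages(struct rb_root_cached *tree,
				      unsigned long pfn_start,
				      unsigned long pfn_last)
{
	struct interval_tree_node *it;
	u64 tracked = 0;

	for (it = interval_tree_iter_first(tree, pfn_start, pfn_last); it;
	     it = interval_tree_iter_next(it, pfn_start, pfn_last)) {
		unsigned long s = max(it->start, pfn_start);
		unsigned long l = min(it->last, pfn_last);

		tracked += l - s + 1;
	}

	return (pfn_last - pfn_start + 1) - tracked;
}

Only the returned count would then be added to mm->pinned_vm and checked
against the RLIMIT_MEMLOCK limit quoted in the commit message.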

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-26  9:04 [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem Cindy Lu
@ 2022-06-26 10:01   ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-26 10:01 UTC (permalink / raw)
  To: Cindy Lu; +Cc: jasowang, virtualization, linux-kernel

On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> We count pinned_vm as follow in vhost-vDPA
> 
> lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
>          ret = -ENOMEM;
>          goto unlock;
> }
> This means if we have two vDPA devices for the same VM the pages
> would be counted twice. So we add a tree to save the page that
> counted and we will not count it again.
> 
> Add vdpa_mem_tree to saved the mem that already counted.
> use a hlist to saved the root for vdpa_mem_tree.
> 
> Signed-off-by: Cindy Lu <lulu@redhat.com>
> ---
>  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |  1 +
>  2 files changed, 64 insertions(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 40097826cff0..4ca8b1ed944b 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -32,6 +32,8 @@
>  #include <linux/kcov.h>
>  
>  #include "vhost.h"
> +#include <linux/hashtable.h>
> +#include <linux/jhash.h>
>  
>  static ushort max_mem_regions = 64;
>  module_param(max_mem_regions, ushort, 0444);
> @@ -49,6 +51,14 @@ enum {
>  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
>  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
>  
> +struct vhost_vdpa_rbtree_node {
> +	struct hlist_node node;
> +	struct rb_root_cached vdpa_mem_tree;
> +	struct mm_struct *mm_using;
> +};
> +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> +int vhost_vdpa_rbtree_hlist_status;
> +
>  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
>  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
>  {

Are you trying to save some per-mm information here?
Can't we just add it to mm_struct?
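
For reference, the alternative suggested here would keep the tree in
mm_struct itself. A minimal sketch, assuming a new mm_struct member that
does not exist upstream, of what the lookup helper quoted further below
would then reduce to:

/* Hypothetical: 'vdpa_mem_tree' as a lazily allocated mm_struct member;
 * the global hash table and jhash lookup are no longer needed.
 * Locking and lifetime handling are omitted in this sketch.
 */
static struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
{
	if (!mm->vdpa_mem_tree) {
		struct rb_root_cached *tree = kzalloc(sizeof(*tree),
						      GFP_KERNEL);

		if (!tree)
			return NULL;
		*tree = RB_ROOT_CACHED;
		mm->vdpa_mem_tree = tree;	/* assumed new field */
	}
	return mm->vdpa_mem_tree;
}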



> @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
>  	dev->mm = NULL;
>  }
>  
> +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> +{
> +	struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> +	struct rb_root_cached *vdpa_tree;
> +	u32 key;
> +
> +	/* No hased table, init one */
> +	if (vhost_vdpa_rbtree_hlist_status == 0) {
> +		hash_init(vhost_vdpa_rbtree_hlist);
> +		vhost_vdpa_rbtree_hlist_status = 1;
> +	}
> +
> +	key = jhash_1word((u64)mm, JHASH_INITVAL);
> +	hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> +			       key) {
> +		if (rbtree_root->mm_using == mm)
> +			return &(rbtree_root->vdpa_mem_tree);
> +	}
> +	rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> +	if (!rbtree_root)
> +		return NULL;
> +	rbtree_root->mm_using = mm;
> +	rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> +	hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> +	vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> +	return vdpa_tree;
> +}
> +
> +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> +{
> +	struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> +	u32 key;
> +
> +	key = jhash_1word((u64)mm, JHASH_INITVAL);
> +
> +	/* No hased table, init one */
> +	hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> +			       key) {
> +		if (rbtree_root->mm_using == mm) {
> +			hash_del(&rbtree_root->node);
> +			kfree(rbtree_root);
> +		}
> +	}
> +}
> +
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  	err = vhost_dev_alloc_iovecs(dev);
>  	if (err)
>  		goto err_cgroup;
> +	dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> +	if (dev->vdpa_mem_tree == NULL) {
> +		err = -ENOMEM;
> +		goto err_cgroup;
> +	}
>  
>  	return 0;
>  err_cgroup:
> @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  		dev->worker = NULL;
>  	}
>  err_worker:
> +	vhost_vdpa_relase_mem_tree(dev->mm);
>  	vhost_detach_mm(dev);
>  	dev->kcov_handle = 0;
>  err_mm:
> @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  		dev->worker = NULL;
>  		dev->kcov_handle = 0;
>  	}
> +
> +	vhost_vdpa_relase_mem_tree(dev->mm);
>  	vhost_detach_mm(dev);
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index d9109107af08..84de33de3abf 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -160,6 +160,7 @@ struct vhost_dev {
>  	int byte_weight;
>  	u64 kcov_handle;
>  	bool use_worker;
> +	struct rb_root_cached *vdpa_mem_tree;
>  	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
>  			   struct vhost_iotlb_msg *msg);
>  };
> -- 
> 2.34.3


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-26 10:01   ` Michael S. Tsirkin
@ 2022-06-27  8:12   ` Cindy Lu
  2022-06-27 10:01       ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Cindy Lu @ 2022-06-27  8:12 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, linux-kernel

On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > We count pinned_vm as follow in vhost-vDPA
> >
> > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> >          ret = -ENOMEM;
> >          goto unlock;
> > }
> > This means if we have two vDPA devices for the same VM the pages
> > would be counted twice. So we add a tree to save the page that
> > counted and we will not count it again.
> >
> > Add vdpa_mem_tree to saved the mem that already counted.
> > use a hlist to saved the root for vdpa_mem_tree.
> >
> > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > ---
> >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> >  drivers/vhost/vhost.h |  1 +
> >  2 files changed, 64 insertions(+)
> >
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 40097826cff0..4ca8b1ed944b 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -32,6 +32,8 @@
> >  #include <linux/kcov.h>
> >
> >  #include "vhost.h"
> > +#include <linux/hashtable.h>
> > +#include <linux/jhash.h>
> >
> >  static ushort max_mem_regions = 64;
> >  module_param(max_mem_regions, ushort, 0444);
> > @@ -49,6 +51,14 @@ enum {
> >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> >
> > +struct vhost_vdpa_rbtree_node {
> > +     struct hlist_node node;
> > +     struct rb_root_cached vdpa_mem_tree;
> > +     struct mm_struct *mm_using;
> > +};
> > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > +int vhost_vdpa_rbtree_hlist_status;
> > +
> >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> >  {
>
> Are you trying to save some per-mm information here?
> Can't we just add it to mm_struct?
>
Yes, this is per-mm information, but I checked with Jason before;
it seems it may be difficult to change mm_struct upstream,
so I added a hlist instead.
Thanks
Cindy

>
>
> > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> >       dev->mm = NULL;
> >  }
> >
> > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > +{
> > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > +     struct rb_root_cached *vdpa_tree;
> > +     u32 key;
> > +
> > +     /* No hased table, init one */
> > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > +             hash_init(vhost_vdpa_rbtree_hlist);
> > +             vhost_vdpa_rbtree_hlist_status = 1;
> > +     }
> > +
> > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > +                            key) {
> > +             if (rbtree_root->mm_using == mm)
> > +                     return &(rbtree_root->vdpa_mem_tree);
> > +     }
> > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > +     if (!rbtree_root)
> > +             return NULL;
> > +     rbtree_root->mm_using = mm;
> > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > +     return vdpa_tree;
> > +}
> > +
> > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > +{
> > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > +     u32 key;
> > +
> > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > +
> > +     /* No hased table, init one */
> > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > +                            key) {
> > +             if (rbtree_root->mm_using == mm) {
> > +                     hash_del(&rbtree_root->node);
> > +                     kfree(rbtree_root);
> > +             }
> > +     }
> > +}
> > +
> >  /* Caller should have device mutex */
> >  long vhost_dev_set_owner(struct vhost_dev *dev)
> >  {
> > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> >       err = vhost_dev_alloc_iovecs(dev);
> >       if (err)
> >               goto err_cgroup;
> > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > +     if (dev->vdpa_mem_tree == NULL) {
> > +             err = -ENOMEM;
> > +             goto err_cgroup;
> > +     }
> >
> >       return 0;
> >  err_cgroup:
> > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> >               dev->worker = NULL;
> >       }
> >  err_worker:
> > +     vhost_vdpa_relase_mem_tree(dev->mm);
> >       vhost_detach_mm(dev);
> >       dev->kcov_handle = 0;
> >  err_mm:
> > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> >               dev->worker = NULL;
> >               dev->kcov_handle = 0;
> >       }
> > +
> > +     vhost_vdpa_relase_mem_tree(dev->mm);
> >       vhost_detach_mm(dev);
> >  }
> >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > index d9109107af08..84de33de3abf 100644
> > --- a/drivers/vhost/vhost.h
> > +++ b/drivers/vhost/vhost.h
> > @@ -160,6 +160,7 @@ struct vhost_dev {
> >       int byte_weight;
> >       u64 kcov_handle;
> >       bool use_worker;
> > +     struct rb_root_cached *vdpa_mem_tree;
> >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> >                          struct vhost_iotlb_msg *msg);
> >  };
> > --
> > 2.34.3
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-27  8:12   ` Cindy Lu
@ 2022-06-27 10:01       ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-27 10:01 UTC (permalink / raw)
  To: Cindy Lu; +Cc: linux-kernel, virtualization

On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > We count pinned_vm as follow in vhost-vDPA
> > >
> > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > >          ret = -ENOMEM;
> > >          goto unlock;
> > > }
> > > This means if we have two vDPA devices for the same VM the pages
> > > would be counted twice. So we add a tree to save the page that
> > > counted and we will not count it again.
> > >
> > > Add vdpa_mem_tree to saved the mem that already counted.
> > > use a hlist to saved the root for vdpa_mem_tree.
> > >
> > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > ---
> > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > >  drivers/vhost/vhost.h |  1 +
> > >  2 files changed, 64 insertions(+)
> > >
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 40097826cff0..4ca8b1ed944b 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -32,6 +32,8 @@
> > >  #include <linux/kcov.h>
> > >
> > >  #include "vhost.h"
> > > +#include <linux/hashtable.h>
> > > +#include <linux/jhash.h>
> > >
> > >  static ushort max_mem_regions = 64;
> > >  module_param(max_mem_regions, ushort, 0444);
> > > @@ -49,6 +51,14 @@ enum {
> > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > >
> > > +struct vhost_vdpa_rbtree_node {
> > > +     struct hlist_node node;
> > > +     struct rb_root_cached vdpa_mem_tree;
> > > +     struct mm_struct *mm_using;
> > > +};
> > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > +int vhost_vdpa_rbtree_hlist_status;
> > > +
> > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > >  {
> >
> > Are you trying to save some per-mm information here?
> > Can't we just add it to mm_struct?
> >
> yes, this is a per-mm information, but I have checked with jason before,
> seems it maybe difficult to change the mm_struct in upstream
> so I add an to add a hlist  instead
> Thanks
> Cindy

Difficult how? You get more scrutiny if you try, for sure,
and you need to generalize it enough that it looks
useful outside the driver. But I guess that's good?
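
One way to read "generalize it" is a small, driver-agnostic registry that
hands out a single pinned-range tree per mm_struct and refcounts it. A
sketch under that assumption (none of these names exist upstream):

#include <linux/hashtable.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct mm_pin_tree {
	struct mm_struct	*mm;
	struct rb_root_cached	ranges;		/* pinned PFN ranges */
	refcount_t		users;		/* devices sharing this mm */
	struct hlist_node	node;
};

static DEFINE_HASHTABLE(mm_pin_trees, 8);
static DEFINE_MUTEX(mm_pin_lock);

/* Look up or create the shared tree for @mm; a matching put helper
 * would free it when the last user goes away (omitted here). */
static struct mm_pin_tree *mm_pin_tree_get(struct mm_struct *mm)
{
	struct mm_pin_tree *t;

	mutex_lock(&mm_pin_lock);
	hash_for_each_possible(mm_pin_trees, t, node, (unsigned long)mm) {
		if (t->mm == mm) {
			refcount_inc(&t->users);
			goto out;
		}
	}
	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (t) {
		t->mm = mm;
		t->ranges = RB_ROOT_CACHED;
		refcount_set(&t->users, 1);
		hash_add(mm_pin_trees, &t->node, (unsigned long)mm);
	}
out:
	mutex_unlock(&mm_pin_lock);
	return t;
}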

> >
> >
> > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > >       dev->mm = NULL;
> > >  }
> > >
> > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > +{
> > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > +     struct rb_root_cached *vdpa_tree;
> > > +     u32 key;
> > > +
> > > +     /* No hased table, init one */
> > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > +     }
> > > +
> > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > +                            key) {
> > > +             if (rbtree_root->mm_using == mm)
> > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > +     }
> > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > +     if (!rbtree_root)
> > > +             return NULL;
> > > +     rbtree_root->mm_using = mm;
> > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > +     return vdpa_tree;
> > > +}
> > > +
> > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > +{
> > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > +     u32 key;
> > > +
> > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > +
> > > +     /* No hased table, init one */
> > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > +                            key) {
> > > +             if (rbtree_root->mm_using == mm) {
> > > +                     hash_del(&rbtree_root->node);
> > > +                     kfree(rbtree_root);
> > > +             }
> > > +     }
> > > +}
> > > +
> > >  /* Caller should have device mutex */
> > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > >  {
> > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > >       err = vhost_dev_alloc_iovecs(dev);
> > >       if (err)
> > >               goto err_cgroup;
> > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > +     if (dev->vdpa_mem_tree == NULL) {
> > > +             err = -ENOMEM;
> > > +             goto err_cgroup;
> > > +     }
> > >
> > >       return 0;
> > >  err_cgroup:
> > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > >               dev->worker = NULL;
> > >       }
> > >  err_worker:
> > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > >       vhost_detach_mm(dev);
> > >       dev->kcov_handle = 0;
> > >  err_mm:
> > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > >               dev->worker = NULL;
> > >               dev->kcov_handle = 0;
> > >       }
> > > +
> > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > >       vhost_detach_mm(dev);
> > >  }
> > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index d9109107af08..84de33de3abf 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > >       int byte_weight;
> > >       u64 kcov_handle;
> > >       bool use_worker;
> > > +     struct rb_root_cached *vdpa_mem_tree;
> > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > >                          struct vhost_iotlb_msg *msg);
> > >  };
> > > --
> > > 2.34.3
> >


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-27 10:01       ` Michael S. Tsirkin
@ 2022-06-28  3:54         ` Jason Wang
  0 siblings, 0 replies; 24+ messages in thread
From: Jason Wang @ 2022-06-28  3:54 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Cindy Lu, virtualization, linux-kernel

On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > We count pinned_vm as follow in vhost-vDPA
> > > >
> > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > >          ret = -ENOMEM;
> > > >          goto unlock;
> > > > }
> > > > This means if we have two vDPA devices for the same VM the pages
> > > > would be counted twice. So we add a tree to save the page that
> > > > counted and we will not count it again.
> > > >
> > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > use a hlist to saved the root for vdpa_mem_tree.
> > > >
> > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > ---
> > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > >  drivers/vhost/vhost.h |  1 +
> > > >  2 files changed, 64 insertions(+)
> > > >
> > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > --- a/drivers/vhost/vhost.c
> > > > +++ b/drivers/vhost/vhost.c
> > > > @@ -32,6 +32,8 @@
> > > >  #include <linux/kcov.h>
> > > >
> > > >  #include "vhost.h"
> > > > +#include <linux/hashtable.h>
> > > > +#include <linux/jhash.h>
> > > >
> > > >  static ushort max_mem_regions = 64;
> > > >  module_param(max_mem_regions, ushort, 0444);
> > > > @@ -49,6 +51,14 @@ enum {
> > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > >
> > > > +struct vhost_vdpa_rbtree_node {
> > > > +     struct hlist_node node;
> > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > +     struct mm_struct *mm_using;
> > > > +};
> > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > +
> > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > >  {
> > >
> > > Are you trying to save some per-mm information here?
> > > Can't we just add it to mm_struct?
> > >
> > yes, this is a per-mm information, but I have checked with jason before,
> > seems it maybe difficult to change the mm_struct in upstream
> > so I add an to add a hlist  instead
> > Thanks
> > Cindy
>
> Difficult how?

It is probably only useful for vDPA. Though it could be used by VFIO
as well, VFIO does pinning/accounting at the container level and that
has been there for years. vDPA has an implicit "container", the
mm_struct, but the accounting is done per device right now.

In the future, when vDPA switches to iommufd, this can then be solved
at the iommufd level.

And if we do this in mm, it will bring extra overhead.

Thanks
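
For context, "accounting at the container level" means the RLIMIT_MEMLOCK
charge lives with one object shared by every device of the same owner
(VFIO's container; for vhost-vDPA that shared object is implicitly the
mm_struct), so attaching a second device does not charge the same pages
again. An illustrative sketch of that model, not VFIO's actual code:

#include <linux/refcount.h>
#include <linux/sched/signal.h>

struct owner_account {
	struct mm_struct	*mm;
	u64			pinned;	/* pages charged once per owner */
	refcount_t		users;	/* devices sharing this account */
};

/* Charge @npages once for the whole owner rather than per device. */
static int owner_account_pin(struct owner_account *acct, u64 npages)
{
	u64 limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if (acct->pinned + npages > limit)
		return -ENOMEM;
	acct->pinned += npages;
	return 0;
}

With today's per-device accounting, each vhost_dev runs the check quoted
in the commit message independently, which is how the same guest memory
ends up counted twice.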

> You get more scrutiny if you try, for sure,
> and you need to need to generalize it enough that it looks
> useful outside the driver. But I guess that's good?
>
> > >
> > >
> > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > >       dev->mm = NULL;
> > > >  }
> > > >
> > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > +{
> > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > +     struct rb_root_cached *vdpa_tree;
> > > > +     u32 key;
> > > > +
> > > > +     /* No hased table, init one */
> > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > +     }
> > > > +
> > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > +                            key) {
> > > > +             if (rbtree_root->mm_using == mm)
> > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > +     }
> > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > +     if (!rbtree_root)
> > > > +             return NULL;
> > > > +     rbtree_root->mm_using = mm;
> > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > +     return vdpa_tree;
> > > > +}
> > > > +
> > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > +{
> > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > +     u32 key;
> > > > +
> > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > +
> > > > +     /* No hased table, init one */
> > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > +                            key) {
> > > > +             if (rbtree_root->mm_using == mm) {
> > > > +                     hash_del(&rbtree_root->node);
> > > > +                     kfree(rbtree_root);
> > > > +             }
> > > > +     }
> > > > +}
> > > > +
> > > >  /* Caller should have device mutex */
> > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > >  {
> > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > >       err = vhost_dev_alloc_iovecs(dev);
> > > >       if (err)
> > > >               goto err_cgroup;
> > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > +             err = -ENOMEM;
> > > > +             goto err_cgroup;
> > > > +     }
> > > >
> > > >       return 0;
> > > >  err_cgroup:
> > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > >               dev->worker = NULL;
> > > >       }
> > > >  err_worker:
> > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > >       vhost_detach_mm(dev);
> > > >       dev->kcov_handle = 0;
> > > >  err_mm:
> > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > >               dev->worker = NULL;
> > > >               dev->kcov_handle = 0;
> > > >       }
> > > > +
> > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > >       vhost_detach_mm(dev);
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > index d9109107af08..84de33de3abf 100644
> > > > --- a/drivers/vhost/vhost.h
> > > > +++ b/drivers/vhost/vhost.h
> > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > >       int byte_weight;
> > > >       u64 kcov_handle;
> > > >       bool use_worker;
> > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > >                          struct vhost_iotlb_msg *msg);
> > > >  };
> > > > --
> > > > 2.34.3
> > >
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  3:54         ` Jason Wang
@ 2022-06-28  5:58           ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-28  5:58 UTC (permalink / raw)
  To: Jason Wang; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > We count pinned_vm as follow in vhost-vDPA
> > > > >
> > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > >          ret = -ENOMEM;
> > > > >          goto unlock;
> > > > > }
> > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > would be counted twice. So we add a tree to save the page that
> > > > > counted and we will not count it again.
> > > > >
> > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > >
> > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > ---
> > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  drivers/vhost/vhost.h |  1 +
> > > > >  2 files changed, 64 insertions(+)
> > > > >
> > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > --- a/drivers/vhost/vhost.c
> > > > > +++ b/drivers/vhost/vhost.c
> > > > > @@ -32,6 +32,8 @@
> > > > >  #include <linux/kcov.h>
> > > > >
> > > > >  #include "vhost.h"
> > > > > +#include <linux/hashtable.h>
> > > > > +#include <linux/jhash.h>
> > > > >
> > > > >  static ushort max_mem_regions = 64;
> > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > @@ -49,6 +51,14 @@ enum {
> > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > >
> > > > > +struct vhost_vdpa_rbtree_node {
> > > > > +     struct hlist_node node;
> > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > +     struct mm_struct *mm_using;
> > > > > +};
> > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > +
> > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > >  {
> > > >
> > > > Are you trying to save some per-mm information here?
> > > > Can't we just add it to mm_struct?
> > > >
> > > yes, this is a per-mm information, but I have checked with jason before,
> > > seems it maybe difficult to change the mm_struct in upstream
> > > so I add an to add a hlist  instead
> > > Thanks
> > > Cindy
> >
> > Difficult how?
> 
> It is only useful for vDPA probably. Though it could be used by VFIO
> as well, VFIO does pinning/accounting at the container level and it
> has been there for years.

Yes it's been there, I'm not sure this means it's perfect.
Also, rdma guys might be interested too I guess?

> vDPA have an implicit "container" the
> mm_struct, but the accounting is done per device right now.
> 
> In the future, when vDPA switches to iommufd, it can be then solved at
> iommufd level.

So is it even worth fixing now?

> And if we do this in mm, it will bring extra overheads.
> 
> Thanks

Pointer per mm, not too bad ...

> > You get more scrutiny if you try, for sure,
> > and you need to need to generalize it enough that it looks
> > useful outside the driver. But I guess that's good?
> >
> > > >
> > > >
> > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > >       dev->mm = NULL;
> > > > >  }
> > > > >
> > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > +{
> > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > +     u32 key;
> > > > > +
> > > > > +     /* No hased table, init one */
> > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > +     }
> > > > > +
> > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > +                            key) {
> > > > > +             if (rbtree_root->mm_using == mm)
> > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > +     }
> > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > +     if (!rbtree_root)
> > > > > +             return NULL;
> > > > > +     rbtree_root->mm_using = mm;
> > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > +     return vdpa_tree;
> > > > > +}
> > > > > +
> > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > +{
> > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > +     u32 key;
> > > > > +
> > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > +
> > > > > +     /* No hased table, init one */
> > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > +                            key) {
> > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > +                     hash_del(&rbtree_root->node);
> > > > > +                     kfree(rbtree_root);
> > > > > +             }
> > > > > +     }
> > > > > +}
> > > > > +
> > > > >  /* Caller should have device mutex */
> > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > >  {
> > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > >       if (err)
> > > > >               goto err_cgroup;
> > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > +             err = -ENOMEM;
> > > > > +             goto err_cgroup;
> > > > > +     }
> > > > >
> > > > >       return 0;
> > > > >  err_cgroup:
> > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > >               dev->worker = NULL;
> > > > >       }
> > > > >  err_worker:
> > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > >       vhost_detach_mm(dev);
> > > > >       dev->kcov_handle = 0;
> > > > >  err_mm:
> > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > >               dev->worker = NULL;
> > > > >               dev->kcov_handle = 0;
> > > > >       }
> > > > > +
> > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > >       vhost_detach_mm(dev);
> > > > >  }
> > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > index d9109107af08..84de33de3abf 100644
> > > > > --- a/drivers/vhost/vhost.h
> > > > > +++ b/drivers/vhost/vhost.h
> > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > >       int byte_weight;
> > > > >       u64 kcov_handle;
> > > > >       bool use_worker;
> > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > >                          struct vhost_iotlb_msg *msg);
> > > > >  };
> > > > > --
> > > > > 2.34.3
> > > >
> >


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
@ 2022-06-28  5:58           ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-28  5:58 UTC (permalink / raw)
  To: Jason Wang; +Cc: linux-kernel, Cindy Lu, virtualization

On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > We count pinned_vm as follow in vhost-vDPA
> > > > >
> > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > >          ret = -ENOMEM;
> > > > >          goto unlock;
> > > > > }
> > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > would be counted twice. So we add a tree to save the page that
> > > > > counted and we will not count it again.
> > > > >
> > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > >
> > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > ---
> > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  drivers/vhost/vhost.h |  1 +
> > > > >  2 files changed, 64 insertions(+)
> > > > >
> > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > --- a/drivers/vhost/vhost.c
> > > > > +++ b/drivers/vhost/vhost.c
> > > > > @@ -32,6 +32,8 @@
> > > > >  #include <linux/kcov.h>
> > > > >
> > > > >  #include "vhost.h"
> > > > > +#include <linux/hashtable.h>
> > > > > +#include <linux/jhash.h>
> > > > >
> > > > >  static ushort max_mem_regions = 64;
> > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > @@ -49,6 +51,14 @@ enum {
> > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > >
> > > > > +struct vhost_vdpa_rbtree_node {
> > > > > +     struct hlist_node node;
> > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > +     struct mm_struct *mm_using;
> > > > > +};
> > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > +
> > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > >  {
> > > >
> > > > Are you trying to save some per-mm information here?
> > > > Can't we just add it to mm_struct?
> > > >
> > > yes, this is per-mm information, but I have checked with Jason before;
> > > it seems it may be difficult to change mm_struct upstream,
> > > so I added a hlist instead
> > > Thanks
> > > Cindy
> >
> > Difficult how?
> 
> It is only useful for vDPA probably. Though it could be used by VFIO
> as well, VFIO does pinning/accounting at the container level and it
> has been there for years.

Yes it's been there, I'm not sure this means it's perfect.
Also, rdma guys might be interested too I guess?

> vDPA have an implicit "container" the
> mm_struct, but the accounting is done per device right now.
> 
> In the future, when vDPA switches to iommufd, it can be then solved at
> iommufd level.

So is it even worth fixing now?

> And if we do this in mm, it will bring extra overheads.
> 
> Thanks

Pointer per mm, not too bad ...
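
E.g. (purely illustrative -- 'pinned_acct_tree' is not an existing
mm_struct field, it just stands in for that per-mm pointer, and locking
against concurrent users is ignored) the driver-side lookup would shrink
to a plain dereference:

static struct rb_root_cached *vhost_get_mem_tree(struct mm_struct *mm)
{
	/* hypothetical, lazily allocated mm_struct member */
	if (!mm->pinned_acct_tree) {
		struct rb_root_cached *tree = kzalloc(sizeof(*tree), GFP_KERNEL);

		if (!tree)
			return NULL;
		*tree = RB_ROOT_CACHED;
		mm->pinned_acct_tree = tree;
	}
	return mm->pinned_acct_tree;
}

instead of hashing the mm pointer and walking an hlist.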

> > You get more scrutiny if you try, for sure,
> > and you need to need to generalize it enough that it looks
> > useful outside the driver. But I guess that's good?
> >
> > > >
> > > >
> > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > >       dev->mm = NULL;
> > > > >  }
> > > > >
> > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > +{
> > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > +     u32 key;
> > > > > +
> > > > > +     /* No hased table, init one */
> > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > +     }
> > > > > +
> > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > +                            key) {
> > > > > +             if (rbtree_root->mm_using == mm)
> > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > +     }
> > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > +     if (!rbtree_root)
> > > > > +             return NULL;
> > > > > +     rbtree_root->mm_using = mm;
> > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > +     return vdpa_tree;
> > > > > +}
> > > > > +
> > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > +{
> > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > +     u32 key;
> > > > > +
> > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > +
> > > > > +     /* No hased table, init one */
> > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > +                            key) {
> > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > +                     hash_del(&rbtree_root->node);
> > > > > +                     kfree(rbtree_root);
> > > > > +             }
> > > > > +     }
> > > > > +}
> > > > > +
> > > > >  /* Caller should have device mutex */
> > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > >  {
> > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > >       if (err)
> > > > >               goto err_cgroup;
> > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > +             err = -ENOMEM;
> > > > > +             goto err_cgroup;
> > > > > +     }
> > > > >
> > > > >       return 0;
> > > > >  err_cgroup:
> > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > >               dev->worker = NULL;
> > > > >       }
> > > > >  err_worker:
> > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > >       vhost_detach_mm(dev);
> > > > >       dev->kcov_handle = 0;
> > > > >  err_mm:
> > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > >               dev->worker = NULL;
> > > > >               dev->kcov_handle = 0;
> > > > >       }
> > > > > +
> > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > >       vhost_detach_mm(dev);
> > > > >  }
> > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > index d9109107af08..84de33de3abf 100644
> > > > > --- a/drivers/vhost/vhost.h
> > > > > +++ b/drivers/vhost/vhost.h
> > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > >       int byte_weight;
> > > > >       u64 kcov_handle;
> > > > >       bool use_worker;
> > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > >                          struct vhost_iotlb_msg *msg);
> > > > >  };
> > > > > --
> > > > > 2.34.3
> > > >
> >


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  5:58           ` Michael S. Tsirkin
@ 2022-06-28  6:03             ` Jason Wang
  -1 siblings, 0 replies; 24+ messages in thread
From: Jason Wang @ 2022-06-28  6:03 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: linux-kernel, Cindy Lu, virtualization

On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > >
> > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > >          ret = -ENOMEM;
> > > > > >          goto unlock;
> > > > > > }
> > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > counted and we will not count it again.
> > > > > >
> > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > >
> > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > ---
> > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > >  2 files changed, 64 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > --- a/drivers/vhost/vhost.c
> > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > @@ -32,6 +32,8 @@
> > > > > >  #include <linux/kcov.h>
> > > > > >
> > > > > >  #include "vhost.h"
> > > > > > +#include <linux/hashtable.h>
> > > > > > +#include <linux/jhash.h>
> > > > > >
> > > > > >  static ushort max_mem_regions = 64;
> > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > >
> > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > +     struct hlist_node node;
> > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > +     struct mm_struct *mm_using;
> > > > > > +};
> > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > +
> > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > >  {
> > > > >
> > > > > Are you trying to save some per-mm information here?
> > > > > Can't we just add it to mm_struct?
> > > > >
> > > > yes, this is per-mm information, but I have checked with Jason before;
> > > > it seems it may be difficult to change mm_struct upstream,
> > > > so I added a hlist instead
> > > > Thanks
> > > > Cindy
> > >
> > > Difficult how?
> >
> > It is only useful for vDPA probably. Though it could be used by VFIO
> > as well, VFIO does pinning/accounting at the container level and it
> > has been there for years.
>
> Yes it's been there, I'm not sure this means it's perfect.
> Also, rdma guys might be interested too I guess?

It looks to me they plan to go to iommufd as well.

>
> > vDPA have an implicit "container" the
> > mm_struct, but the accounting is done per device right now.
> >
> > In the future, when vDPA switches to iommufd, it can be then solved at
> > iommufd level.
>
> So is it even worth fixing now?

Not sure, but I guess it's better. (Or we need to teach the libvirt to
have special care on this).

>
> > And if we do this in mm, it will bring extra overheads.
> >
> > Thanks
>
> Pointer per mm, not too bad ...

Unless we enable it unconditionally, it requires a lot of tree
operations at least.

Thanks

>
> > > You get more scrutiny if you try, for sure,
> > > and you need to need to generalize it enough that it looks
> > > useful outside the driver. But I guess that's good?
> > >
> > > > >
> > > > >
> > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > >       dev->mm = NULL;
> > > > > >  }
> > > > > >
> > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > +{
> > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > +     u32 key;
> > > > > > +
> > > > > > +     /* No hased table, init one */
> > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > +     }
> > > > > > +
> > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > +                            key) {
> > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > +     }
> > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > +     if (!rbtree_root)
> > > > > > +             return NULL;
> > > > > > +     rbtree_root->mm_using = mm;
> > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > +     return vdpa_tree;
> > > > > > +}
> > > > > > +
> > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > +{
> > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > +     u32 key;
> > > > > > +
> > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > +
> > > > > > +     /* No hased table, init one */
> > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > +                            key) {
> > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > +                     kfree(rbtree_root);
> > > > > > +             }
> > > > > > +     }
> > > > > > +}
> > > > > > +
> > > > > >  /* Caller should have device mutex */
> > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > >  {
> > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > >       if (err)
> > > > > >               goto err_cgroup;
> > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > +             err = -ENOMEM;
> > > > > > +             goto err_cgroup;
> > > > > > +     }
> > > > > >
> > > > > >       return 0;
> > > > > >  err_cgroup:
> > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > >               dev->worker = NULL;
> > > > > >       }
> > > > > >  err_worker:
> > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > >       vhost_detach_mm(dev);
> > > > > >       dev->kcov_handle = 0;
> > > > > >  err_mm:
> > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > >               dev->worker = NULL;
> > > > > >               dev->kcov_handle = 0;
> > > > > >       }
> > > > > > +
> > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > >       vhost_detach_mm(dev);
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > --- a/drivers/vhost/vhost.h
> > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > >       int byte_weight;
> > > > > >       u64 kcov_handle;
> > > > > >       bool use_worker;
> > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > >  };
> > > > > > --
> > > > > > 2.34.3
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
@ 2022-06-28  6:03             ` Jason Wang
  0 siblings, 0 replies; 24+ messages in thread
From: Jason Wang @ 2022-06-28  6:03 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > >
> > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > >          ret = -ENOMEM;
> > > > > >          goto unlock;
> > > > > > }
> > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > counted and we will not count it again.
> > > > > >
> > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > >
> > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > ---
> > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > >  2 files changed, 64 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > --- a/drivers/vhost/vhost.c
> > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > @@ -32,6 +32,8 @@
> > > > > >  #include <linux/kcov.h>
> > > > > >
> > > > > >  #include "vhost.h"
> > > > > > +#include <linux/hashtable.h>
> > > > > > +#include <linux/jhash.h>
> > > > > >
> > > > > >  static ushort max_mem_regions = 64;
> > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > >
> > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > +     struct hlist_node node;
> > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > +     struct mm_struct *mm_using;
> > > > > > +};
> > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > +
> > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > >  {
> > > > >
> > > > > Are you trying to save some per-mm information here?
> > > > > Can't we just add it to mm_struct?
> > > > >
> > > > yes, this is per-mm information, but I have checked with Jason before;
> > > > it seems it may be difficult to change mm_struct upstream,
> > > > so I added a hlist instead
> > > > Thanks
> > > > Cindy
> > >
> > > Difficult how?
> >
> > It is only useful for vDPA probably. Though it could be used by VFIO
> > as well, VFIO does pinning/accounting at the container level and it
> > has been there for years.
>
> Yes it's been there, I'm not sure this means it's perfect.
> Also, rdma guys might be interested too I guess?

It looks to me they plan to go to iommufd as well.

>
> > vDPA have an implicit "container" the
> > mm_struct, but the accounting is done per device right now.
> >
> > In the future, when vDPA switches to iommufd, it can be then solved at
> > iommufd level.
>
> So is it even worth fixing now?

Not sure, but I guess it's better. (Or we need to teach the libvirt to
have special care on this).

>
> > And if we do this in mm, it will bring extra overheads.
> >
> > Thanks
>
> Pointer per mm, not too bad ...

Unless we enable it unconditionally, it requires a lot of tree
operations at least.

Thanks

>
> > > You get more scrutiny if you try, for sure,
> > > and you need to need to generalize it enough that it looks
> > > useful outside the driver. But I guess that's good?
> > >
> > > > >
> > > > >
> > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > >       dev->mm = NULL;
> > > > > >  }
> > > > > >
> > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > +{
> > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > +     u32 key;
> > > > > > +
> > > > > > +     /* No hased table, init one */
> > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > +     }
> > > > > > +
> > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > +                            key) {
> > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > +     }
> > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > +     if (!rbtree_root)
> > > > > > +             return NULL;
> > > > > > +     rbtree_root->mm_using = mm;
> > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > +     return vdpa_tree;
> > > > > > +}
> > > > > > +
> > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > +{
> > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > +     u32 key;
> > > > > > +
> > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > +
> > > > > > +     /* No hased table, init one */
> > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > +                            key) {
> > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > +                     kfree(rbtree_root);
> > > > > > +             }
> > > > > > +     }
> > > > > > +}
> > > > > > +
> > > > > >  /* Caller should have device mutex */
> > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > >  {
> > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > >       if (err)
> > > > > >               goto err_cgroup;
> > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > +             err = -ENOMEM;
> > > > > > +             goto err_cgroup;
> > > > > > +     }
> > > > > >
> > > > > >       return 0;
> > > > > >  err_cgroup:
> > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > >               dev->worker = NULL;
> > > > > >       }
> > > > > >  err_worker:
> > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > >       vhost_detach_mm(dev);
> > > > > >       dev->kcov_handle = 0;
> > > > > >  err_mm:
> > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > >               dev->worker = NULL;
> > > > > >               dev->kcov_handle = 0;
> > > > > >       }
> > > > > > +
> > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > >       vhost_detach_mm(dev);
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > --- a/drivers/vhost/vhost.h
> > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > >       int byte_weight;
> > > > > >       u64 kcov_handle;
> > > > > >       bool use_worker;
> > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > >  };
> > > > > > --
> > > > > > 2.34.3
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  6:03             ` Jason Wang
@ 2022-06-28  6:07               ` Michael S. Tsirkin
  -1 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-28  6:07 UTC (permalink / raw)
  To: Jason Wang; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > >
> > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > >          ret = -ENOMEM;
> > > > > > >          goto unlock;
> > > > > > > }
> > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > counted and we will not count it again.
> > > > > > >
> > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > >
> > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > ---
> > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > >  2 files changed, 64 insertions(+)
> > > > > > >
> > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > @@ -32,6 +32,8 @@
> > > > > > >  #include <linux/kcov.h>
> > > > > > >
> > > > > > >  #include "vhost.h"
> > > > > > > +#include <linux/hashtable.h>
> > > > > > > +#include <linux/jhash.h>
> > > > > > >
> > > > > > >  static ushort max_mem_regions = 64;
> > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > >
> > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > +     struct hlist_node node;
> > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > +     struct mm_struct *mm_using;
> > > > > > > +};
> > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > +
> > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > >  {
> > > > > >
> > > > > > Are you trying to save some per-mm information here?
> > > > > > Can't we just add it to mm_struct?
> > > > > >
> > > > > yes, this is per-mm information, but I have checked with Jason before;
> > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > so I added a hlist instead
> > > > > Thanks
> > > > > Cindy
> > > >
> > > > Difficult how?
> > >
> > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > as well, VFIO does pinning/accounting at the container level and it
> > > has been there for years.
> >
> > Yes it's been there, I'm not sure this means it's perfect.
> > Also, rdma guys might be interested too I guess?
> 
> It looks to me they plan to go to iommufd as well.
> 
> >
> > > vDPA have an implicit "container" the
> > > mm_struct, but the accounting is done per device right now.
> > >
> > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > iommufd level.
> >
> > So is it even worth fixing now?
> 
> Not sure, but I guess it's better. (Or we need to teach the libvirt to
> have special care on this).

It already has to for existing kernels.  Let's just move to iommufd
faster then?

> >
> > > And if we do this in mm, it will bring extra overheads.
> > >
> > > Thanks
> >
> > Pointer per mm, not too bad ...
> 
> Unless we enable it unconditionally, it requires a lot of tree
> operations at least.
> 
> Thanks

Not sure I understand.

> >
> > > > You get more scrutiny if you try, for sure,
> > > > and you need to need to generalize it enough that it looks
> > > > useful outside the driver. But I guess that's good?
> > > >
> > > > > >
> > > > > >
> > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > >       dev->mm = NULL;
> > > > > > >  }
> > > > > > >
> > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > +{
> > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > +     u32 key;
> > > > > > > +
> > > > > > > +     /* No hased table, init one */
> > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > +                            key) {
> > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > +     }
> > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > +     if (!rbtree_root)
> > > > > > > +             return NULL;
> > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > +     return vdpa_tree;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > +{
> > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > +     u32 key;
> > > > > > > +
> > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > +
> > > > > > > +     /* No hased table, init one */
> > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > +                            key) {
> > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > +                     kfree(rbtree_root);
> > > > > > > +             }
> > > > > > > +     }
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Caller should have device mutex */
> > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > >  {
> > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > >       if (err)
> > > > > > >               goto err_cgroup;
> > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > +             err = -ENOMEM;
> > > > > > > +             goto err_cgroup;
> > > > > > > +     }
> > > > > > >
> > > > > > >       return 0;
> > > > > > >  err_cgroup:
> > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > >               dev->worker = NULL;
> > > > > > >       }
> > > > > > >  err_worker:
> > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > >       vhost_detach_mm(dev);
> > > > > > >       dev->kcov_handle = 0;
> > > > > > >  err_mm:
> > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > >               dev->worker = NULL;
> > > > > > >               dev->kcov_handle = 0;
> > > > > > >       }
> > > > > > > +
> > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > >       vhost_detach_mm(dev);
> > > > > > >  }
> > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > >       int byte_weight;
> > > > > > >       u64 kcov_handle;
> > > > > > >       bool use_worker;
> > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > >  };
> > > > > > > --
> > > > > > > 2.34.3
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
@ 2022-06-28  6:07               ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-28  6:07 UTC (permalink / raw)
  To: Jason Wang; +Cc: linux-kernel, Cindy Lu, virtualization

On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > >
> > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > >          ret = -ENOMEM;
> > > > > > >          goto unlock;
> > > > > > > }
> > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > counted and we will not count it again.
> > > > > > >
> > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > >
> > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > ---
> > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > >  2 files changed, 64 insertions(+)
> > > > > > >
> > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > @@ -32,6 +32,8 @@
> > > > > > >  #include <linux/kcov.h>
> > > > > > >
> > > > > > >  #include "vhost.h"
> > > > > > > +#include <linux/hashtable.h>
> > > > > > > +#include <linux/jhash.h>
> > > > > > >
> > > > > > >  static ushort max_mem_regions = 64;
> > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > >
> > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > +     struct hlist_node node;
> > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > +     struct mm_struct *mm_using;
> > > > > > > +};
> > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > +
> > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > >  {
> > > > > >
> > > > > > Are you trying to save some per-mm information here?
> > > > > > Can't we just add it to mm_struct?
> > > > > >
> > > > > yes, this is per-mm information, but I have checked with Jason before;
> > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > so I added a hlist instead
> > > > > Thanks
> > > > > Cindy
> > > >
> > > > Difficult how?
> > >
> > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > as well, VFIO does pinning/accounting at the container level and it
> > > has been there for years.
> >
> > Yes it's been there, I'm not sure this means it's perfect.
> > Also, rdma guys might be interested too I guess?
> 
> It looks to me they plan to go to iommufd as well.
> 
> >
> > > vDPA have an implicit "container" the
> > > mm_struct, but the accounting is done per device right now.
> > >
> > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > iommufd level.
> >
> > So is it even worth fixing now?
> 
> Not sure, but I guess it's better. (Or we need to teach the libvirt to
> have special care on this).

It already has to for existing kernels.  Let's just move to iommufd
faster then?

> >
> > > And if we do this in mm, it will bring extra overheads.
> > >
> > > Thanks
> >
> > Pointer per mm, not too bad ...
> 
> Unless we enable it unconditionally, it requires a lot of tree
> operations at least.
> 
> Thanks

Not sure I understand.

> >
> > > > You get more scrutiny if you try, for sure,
> > > > and you need to need to generalize it enough that it looks
> > > > useful outside the driver. But I guess that's good?
> > > >
> > > > > >
> > > > > >
> > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > >       dev->mm = NULL;
> > > > > > >  }
> > > > > > >
> > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > +{
> > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > +     u32 key;
> > > > > > > +
> > > > > > > +     /* No hased table, init one */
> > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > +                            key) {
> > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > +     }
> > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > +     if (!rbtree_root)
> > > > > > > +             return NULL;
> > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > +     return vdpa_tree;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > +{
> > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > +     u32 key;
> > > > > > > +
> > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > +
> > > > > > > +     /* No hased table, init one */
> > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > +                            key) {
> > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > +                     kfree(rbtree_root);
> > > > > > > +             }
> > > > > > > +     }
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Caller should have device mutex */
> > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > >  {
> > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > >       if (err)
> > > > > > >               goto err_cgroup;
> > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > +             err = -ENOMEM;
> > > > > > > +             goto err_cgroup;
> > > > > > > +     }
> > > > > > >
> > > > > > >       return 0;
> > > > > > >  err_cgroup:
> > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > >               dev->worker = NULL;
> > > > > > >       }
> > > > > > >  err_worker:
> > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > >       vhost_detach_mm(dev);
> > > > > > >       dev->kcov_handle = 0;
> > > > > > >  err_mm:
> > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > >               dev->worker = NULL;
> > > > > > >               dev->kcov_handle = 0;
> > > > > > >       }
> > > > > > > +
> > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > >       vhost_detach_mm(dev);
> > > > > > >  }
> > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > >       int byte_weight;
> > > > > > >       u64 kcov_handle;
> > > > > > >       bool use_worker;
> > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > >  };
> > > > > > > --
> > > > > > > 2.34.3
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  6:07               ` Michael S. Tsirkin
@ 2022-06-28  6:10                 ` Jason Wang
  -1 siblings, 0 replies; 24+ messages in thread
From: Jason Wang @ 2022-06-28  6:10 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 2:07 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> > On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > > >
> > > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > > >          ret = -ENOMEM;
> > > > > > > >          goto unlock;
> > > > > > > > }
> > > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > > counted and we will not count it again.
> > > > > > > >
> > > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > > >
> > > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > > ---
> > > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > > >  2 files changed, 64 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > @@ -32,6 +32,8 @@
> > > > > > > >  #include <linux/kcov.h>
> > > > > > > >
> > > > > > > >  #include "vhost.h"
> > > > > > > > +#include <linux/hashtable.h>
> > > > > > > > +#include <linux/jhash.h>
> > > > > > > >
> > > > > > > >  static ushort max_mem_regions = 64;
> > > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > > >
> > > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > > +     struct hlist_node node;
> > > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > > +     struct mm_struct *mm_using;
> > > > > > > > +};
> > > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > > +
> > > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > > >  {
> > > > > > >
> > > > > > > Are you trying to save some per-mm information here?
> > > > > > > Can't we just add it to mm_struct?
> > > > > > >
> > > > > > yes, this is per-mm information, but I have checked with Jason before;
> > > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > > so I added a hlist instead
> > > > > > Thanks
> > > > > > Cindy
> > > > >
> > > > > Difficult how?
> > > >
> > > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > > as well, VFIO does pinning/accounting at the container level and it
> > > > has been there for years.
> > >
> > > Yes it's been there, I'm not sure this means it's perfect.
> > > Also, rdma guys might be interested too I guess?
> >
> > It looks to me they plan to go to iommufd as well.
> >
> > >
> > > > vDPA have an implicit "container" the
> > > > mm_struct, but the accounting is done per device right now.
> > > >
> > > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > > iommufd level.
> > >
> > > So is it even worth fixing now?
> >
> > Not sure, but I guess it's better. (Or we need to teach the libvirt to
> > have special care on this).
>
> It already has to for existing kernels.  Let's just move to iommufd
> faster then?

This is fine, Cindy, any idea on this?

>
> > >
> > > > And if we do this in mm, it will bring extra overheads.
> > > >
> > > > Thanks
> > >
> > > Pointer per mm, not too bad ...
> >
> > Unless we enable it unconditionally, it requires a lot of tree
> > operations at least.
> >
> > Thanks
>
> Not sure I understand.

I mean, in order not to count pinned pages multiple times, we need to
keep track of the pages that are already pinned in an rbtree with
refcounts. This means we need to populate the tree whenever userspace
pins or unpins pages.
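
Something like the sketch below is what I have in mind. It is only meant
to make the bookkeeping concrete, not what this series implements: the
node layout, the per-pfn granularity and the missing locking are all
assumptions of the example.

#include <linux/rbtree.h>
#include <linux/slab.h>

struct pinned_page {			/* hypothetical: one node per pinned pfn */
	struct rb_node node;
	unsigned long pfn;
	unsigned int refcount;		/* pins from all vhost-vDPA devices of this mm */
};

/* Returns true if the pfn was not accounted yet, i.e. add it to pinned_vm. */
static bool pinned_tree_get(struct rb_root_cached *tree, unsigned long pfn)
{
	struct rb_node **link = &tree->rb_root.rb_node, *parent = NULL;
	struct pinned_page *p;
	bool leftmost = true;

	while (*link) {
		parent = *link;
		p = rb_entry(parent, struct pinned_page, node);
		if (pfn == p->pfn) {
			p->refcount++;
			return false;		/* already counted once */
		}
		if (pfn < p->pfn) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return false;			/* sketch: skip accounting on alloc failure */
	p->pfn = pfn;
	p->refcount = 1;
	rb_link_node(&p->node, parent, link);
	rb_insert_color_cached(&p->node, tree, leftmost);
	return true;
}

/* Returns true when the last user dropped the pfn, i.e. subtract it again. */
static bool pinned_tree_put(struct rb_root_cached *tree, unsigned long pfn)
{
	struct rb_node *n = tree->rb_root.rb_node;

	while (n) {
		struct pinned_page *p = rb_entry(n, struct pinned_page, node);

		if (pfn == p->pfn) {
			if (--p->refcount)
				return false;
			rb_erase_cached(&p->node, tree);
			kfree(p);
			return true;
		}
		n = pfn < p->pfn ? n->rb_left : n->rb_right;
	}
	return false;				/* unbalanced put, ignored here */
}

So the pin path would call pinned_tree_get() before adding a page to
mm->pinned_vm, and the unpin path would only subtract when
pinned_tree_put() returns true.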

Thanks

>
> > >
> > > > > You get more scrutiny if you try, for sure,
> > > > > and you need to need to generalize it enough that it looks
> > > > > useful outside the driver. But I guess that's good?
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > > >       dev->mm = NULL;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > > +{
> > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > > +     u32 key;
> > > > > > > > +
> > > > > > > > +     /* No hased table, init one */
> > > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > +                            key) {
> > > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > > +     }
> > > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > > +     if (!rbtree_root)
> > > > > > > > +             return NULL;
> > > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > > +     return vdpa_tree;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > > +{
> > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > +     u32 key;
> > > > > > > > +
> > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > +
> > > > > > > > +     /* No hased table, init one */
> > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > +                            key) {
> > > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > > +                     kfree(rbtree_root);
> > > > > > > > +             }
> > > > > > > > +     }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Caller should have device mutex */
> > > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > >  {
> > > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > > >       if (err)
> > > > > > > >               goto err_cgroup;
> > > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > > +             err = -ENOMEM;
> > > > > > > > +             goto err_cgroup;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       return 0;
> > > > > > > >  err_cgroup:
> > > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > >               dev->worker = NULL;
> > > > > > > >       }
> > > > > > > >  err_worker:
> > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > >       vhost_detach_mm(dev);
> > > > > > > >       dev->kcov_handle = 0;
> > > > > > > >  err_mm:
> > > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > > >               dev->worker = NULL;
> > > > > > > >               dev->kcov_handle = 0;
> > > > > > > >       }
> > > > > > > > +
> > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > >       vhost_detach_mm(dev);
> > > > > > > >  }
> > > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > > >       int byte_weight;
> > > > > > > >       u64 kcov_handle;
> > > > > > > >       bool use_worker;
> > > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > > >  };
> > > > > > > > --
> > > > > > > > 2.34.3
> > > > > > >
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
@ 2022-06-28  6:10                 ` Jason Wang
  0 siblings, 0 replies; 24+ messages in thread
From: Jason Wang @ 2022-06-28  6:10 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: linux-kernel, Cindy Lu, virtualization

On Tue, Jun 28, 2022 at 2:07 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> > On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > > >
> > > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > > >          ret = -ENOMEM;
> > > > > > > >          goto unlock;
> > > > > > > > }
> > > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > > counted and we will not count it again.
> > > > > > > >
> > > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > > >
> > > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > > ---
> > > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > > >  2 files changed, 64 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > @@ -32,6 +32,8 @@
> > > > > > > >  #include <linux/kcov.h>
> > > > > > > >
> > > > > > > >  #include "vhost.h"
> > > > > > > > +#include <linux/hashtable.h>
> > > > > > > > +#include <linux/jhash.h>
> > > > > > > >
> > > > > > > >  static ushort max_mem_regions = 64;
> > > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > > >
> > > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > > +     struct hlist_node node;
> > > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > > +     struct mm_struct *mm_using;
> > > > > > > > +};
> > > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > > +
> > > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > > >  {
> > > > > > >
> > > > > > > Are you trying to save some per-mm information here?
> > > > > > > Can't we just add it to mm_struct?
> > > > > > >
> > > > > > yes, this is per-mm information, but I have checked with Jason before;
> > > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > > so I added a hlist instead
> > > > > > Thanks
> > > > > > Cindy
> > > > >
> > > > > Difficult how?
> > > >
> > > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > > as well, VFIO does pinning/accounting at the container level and it
> > > > has been there for years.
> > >
> > > Yes it's been there, I'm not sure this means it's perfect.
> > > Also, rdma guys might be interested too I guess?
> >
> > It looks to me they plan to go to iommufd as well.
> >
> > >
> > > > vDPA has an implicit "container", the
> > > > mm_struct, but the accounting is done per device right now.
> > > >
> > > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > > iommufd level.
> > >
> > > So is it even worth fixing now?
> >
> > Not sure, but I guess it's better. (Or we need to teach the libvirt to
> > have special care on this).
>
> It already has to for existing kernels.  Let's just move to iommufd
> faster then?

This is fine, Cindy, any idea on this?

>
> > >
> > > > And if we do this in mm, it will bring extra overheads.
> > > >
> > > > Thanks
> > >
> > > Pointer per mm, not too bad ...
> >
> > Unless we enable it unconditionally, it requires a lot of tree
> > operations at least.
> >
> > Thanks
>
> Not sure I understand.

I mean in order to not count pinned pages multiple times we need to
keep track of the pages that are already pinned in an rbtree with
refcounts. This means we need to populate the tree when userspace
pin/unpin pages.
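
Purely as an illustration (nothing below is code from this series;
the names and the simplified overlap handling are made up), such a
refcounted tracker could sit on top of the kernel interval tree
roughly like this:

#include <linux/interval_tree.h>
#include <linux/slab.h>

struct pinned_range {
        struct interval_tree_node it;   /* [start, last] in PFNs */
        int refcnt;                     /* pins covering this range */
};

/*
 * Account only pages that are not yet tracked; returns the number of
 * pages to add to pinned_vm, 0 if the range was already pinned, or
 * -ENOMEM.  Simplified: assumes a new pin is either disjoint from all
 * tracked ranges or repeats an existing range exactly.
 */
static long track_pin(struct rb_root_cached *tree,
                      unsigned long start, unsigned long last)
{
        struct interval_tree_node *it;
        struct pinned_range *p;

        it = interval_tree_iter_first(tree, start, last);
        if (it) {
                p = container_of(it, struct pinned_range, it);
                p->refcnt++;
                return 0;
        }

        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
                return -ENOMEM;
        p->it.start = start;
        p->it.last = last;
        p->refcnt = 1;
        interval_tree_insert(&p->it, tree);
        return last - start + 1;
}

/* Returns the number of pages to subtract from pinned_vm. */
static long track_unpin(struct rb_root_cached *tree,
                        unsigned long start, unsigned long last)
{
        struct interval_tree_node *it;
        struct pinned_range *p;

        it = interval_tree_iter_first(tree, start, last);
        if (!it)
                return 0;
        p = container_of(it, struct pinned_range, it);
        if (--p->refcnt)
                return 0;
        interval_tree_remove(&p->it, tree);
        kfree(p);
        return last - start + 1;
}

A real version would also have to split and merge ranges on partial
overlaps and run under a per-mm lock, which is the extra tree work
mentioned above.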

Thanks

>
> > >
> > > > > You get more scrutiny if you try, for sure,
> > > > > and you need to generalize it enough that it looks
> > > > > useful outside the driver. But I guess that's good?
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > > >       dev->mm = NULL;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > > +{
> > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > > +     u32 key;
> > > > > > > > +
> > > > > > > > +     /* No hased table, init one */
> > > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > +                            key) {
> > > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > > +     }
> > > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > > +     if (!rbtree_root)
> > > > > > > > +             return NULL;
> > > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > > +     return vdpa_tree;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > > +{
> > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > +     u32 key;
> > > > > > > > +
> > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > +
> > > > > > > > +     /* No hased table, init one */
> > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > +                            key) {
> > > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > > +                     kfree(rbtree_root);
> > > > > > > > +             }
> > > > > > > > +     }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Caller should have device mutex */
> > > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > >  {
> > > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > > >       if (err)
> > > > > > > >               goto err_cgroup;
> > > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > > +             err = -ENOMEM;
> > > > > > > > +             goto err_cgroup;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       return 0;
> > > > > > > >  err_cgroup:
> > > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > >               dev->worker = NULL;
> > > > > > > >       }
> > > > > > > >  err_worker:
> > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > >       vhost_detach_mm(dev);
> > > > > > > >       dev->kcov_handle = 0;
> > > > > > > >  err_mm:
> > > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > > >               dev->worker = NULL;
> > > > > > > >               dev->kcov_handle = 0;
> > > > > > > >       }
> > > > > > > > +
> > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > >       vhost_detach_mm(dev);
> > > > > > > >  }
> > > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > > >       int byte_weight;
> > > > > > > >       u64 kcov_handle;
> > > > > > > >       bool use_worker;
> > > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > > >  };
> > > > > > > > --
> > > > > > > > 2.34.3
> > > > > > >
> > > > >
> > >
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  6:10                 ` Jason Wang
@ 2022-06-28  6:17                   ` Michael S. Tsirkin
  -1 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-28  6:17 UTC (permalink / raw)
  To: Jason Wang; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 02:10:38PM +0800, Jason Wang wrote:
> On Tue, Jun 28, 2022 at 2:07 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> > > On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > > > >
> > > > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > > > >          ret = -ENOMEM;
> > > > > > > > >          goto unlock;
> > > > > > > > > }
> > > > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > > > counted and we will not count it again.
> > > > > > > > >
> > > > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > > > ---
> > > > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > > > >  2 files changed, 64 insertions(+)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > @@ -32,6 +32,8 @@
> > > > > > > > >  #include <linux/kcov.h>
> > > > > > > > >
> > > > > > > > >  #include "vhost.h"
> > > > > > > > > +#include <linux/hashtable.h>
> > > > > > > > > +#include <linux/jhash.h>
> > > > > > > > >
> > > > > > > > >  static ushort max_mem_regions = 64;
> > > > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > > > >
> > > > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > > > +     struct hlist_node node;
> > > > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > > > +     struct mm_struct *mm_using;
> > > > > > > > > +};
> > > > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > > > +
> > > > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > > > >  {
> > > > > > > >
> > > > > > > > Are you trying to save some per-mm information here?
> > > > > > > > Can't we just add it to mm_struct?
> > > > > > > >
> > > > > > > yes, this is per-mm information, but I checked with Jason before;
> > > > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > > > so I added a hlist instead
> > > > > > > Thanks
> > > > > > > Cindy
> > > > > >
> > > > > > Difficult how?
> > > > >
> > > > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > > > as well, VFIO does pinning/accounting at the container level and it
> > > > > has been there for years.
> > > >
> > > > Yes it's been there, I'm not sure this means it's perfect.
> > > > Also, rdma guys might be interested too I guess?
> > >
> > > It looks to me they plan to go to iommufd as well.
> > >
> > > >
> > > > > vDPA has an implicit "container", the
> > > > > mm_struct, but the accounting is done per device right now.
> > > > >
> > > > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > > > iommufd level.
> > > >
> > > > So is it even worth fixing now?
> > >
> > > Not sure, but I guess it's better. (Or we need to teach the libvirt to
> > > have special care on this).
> >
> > It already has to for existing kernels.  Let's just move to iommufd
> > faster then?
> 
> This is fine, Cindy, any idea on this?
> 
> >
> > > >
> > > > > And if we do this in mm, it will bring extra overheads.
> > > > >
> > > > > Thanks
> > > >
> > > > Pointer per mm, not too bad ...
> > >
> > > Unless we enable it unconditionally, it requires a lot of tree
> > > operations at least.
> > >
> > > Thanks
> >
> > Not sure I understand.
> 
> I mean in order to not count pinned pages multiple times we need to
> keep track of the pages that are already pinned in an rbtree with
> refcounts. This means we need to populate the tree when userspace
> pin/unpin pages.
> 
> Thanks

Yep. In fact, if you are going to try and fix accounting you should
probably find a way to fix a mix of vdpa and -realtime mlock=on as well
as a mix of vfio and vdpa.
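
To put rough numbers on it (illustrative only, and the exact counter
used differs per path): with a 16G guest and RLIMIT_MEMLOCK at 16G,
-realtime mlock=on charges the guest RAM to locked_vm, a VFIO type1
container charges the same pages to locked_vm again when it pins
them, and every vhost-vDPA device charges them once more to
pinned_vm.  Only 16G of memory is actually pinned, yet each of those
counters is checked against the limit on its own, so in practice the
limit ends up being set to a multiple of the guest size.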


> >
> > > >
> > > > > > You get more scrutiny if you try, for sure,
> > > > > > and you need to generalize it enough that it looks
> > > > > > useful outside the driver. But I guess that's good?
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > > > >       dev->mm = NULL;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > > > +{
> > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > > > +     u32 key;
> > > > > > > > > +
> > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > +                            key) {
> > > > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > +     }
> > > > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > > > +     if (!rbtree_root)
> > > > > > > > > +             return NULL;
> > > > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > +     return vdpa_tree;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > > > +{
> > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > +     u32 key;
> > > > > > > > > +
> > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > +
> > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > +                            key) {
> > > > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > > > +                     kfree(rbtree_root);
> > > > > > > > > +             }
> > > > > > > > > +     }
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /* Caller should have device mutex */
> > > > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > >  {
> > > > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > > > >       if (err)
> > > > > > > > >               goto err_cgroup;
> > > > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > +             goto err_cgroup;
> > > > > > > > > +     }
> > > > > > > > >
> > > > > > > > >       return 0;
> > > > > > > > >  err_cgroup:
> > > > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > >               dev->worker = NULL;
> > > > > > > > >       }
> > > > > > > > >  err_worker:
> > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > >       dev->kcov_handle = 0;
> > > > > > > > >  err_mm:
> > > > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > > > >               dev->worker = NULL;
> > > > > > > > >               dev->kcov_handle = 0;
> > > > > > > > >       }
> > > > > > > > > +
> > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > >  }
> > > > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > > > >       int byte_weight;
> > > > > > > > >       u64 kcov_handle;
> > > > > > > > >       bool use_worker;
> > > > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > > > >  };
> > > > > > > > > --
> > > > > > > > > 2.34.3
> > > > > > > >
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  6:17                   ` Michael S. Tsirkin
@ 2022-06-28  6:18                     ` Jason Wang
  -1 siblings, 0 replies; 24+ messages in thread
From: Jason Wang @ 2022-06-28  6:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 2:17 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 02:10:38PM +0800, Jason Wang wrote:
> > On Tue, Jun 28, 2022 at 2:07 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> > > > On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > > > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > > > > >
> > > > > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > > > > >          ret = -ENOMEM;
> > > > > > > > > >          goto unlock;
> > > > > > > > > > }
> > > > > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > > > > counted and we will not count it again.
> > > > > > > > > >
> > > > > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > > > > >  2 files changed, 64 insertions(+)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > > @@ -32,6 +32,8 @@
> > > > > > > > > >  #include <linux/kcov.h>
> > > > > > > > > >
> > > > > > > > > >  #include "vhost.h"
> > > > > > > > > > +#include <linux/hashtable.h>
> > > > > > > > > > +#include <linux/jhash.h>
> > > > > > > > > >
> > > > > > > > > >  static ushort max_mem_regions = 64;
> > > > > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > > > > >
> > > > > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > > > > +     struct hlist_node node;
> > > > > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > > > > +     struct mm_struct *mm_using;
> > > > > > > > > > +};
> > > > > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > > > > +
> > > > > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > > > > >  {
> > > > > > > > >
> > > > > > > > > Are you trying to save some per-mm information here?
> > > > > > > > > Can't we just add it to mm_struct?
> > > > > > > > >
> > > > > > > > yes, this is per-mm information, but I checked with Jason before;
> > > > > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > > > > so I added a hlist instead
> > > > > > > > Thanks
> > > > > > > > Cindy
> > > > > > >
> > > > > > > Difficult how?
> > > > > >
> > > > > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > > > > as well, VFIO does pinning/accounting at the container level and it
> > > > > > has been there for years.
> > > > >
> > > > > Yes it's been there, I'm not sure this means it's perfect.
> > > > > Also, rdma guys might be interested too I guess?
> > > >
> > > > It looks to me they plan to go to iommufd as well.
> > > >
> > > > >
> > > > > > vDPA has an implicit "container", the
> > > > > > mm_struct, but the accounting is done per device right now.
> > > > > >
> > > > > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > > > > iommufd level.
> > > > >
> > > > > So is it even worth fixing now?
> > > >
> > > > Not sure, but I guess it's better. (Or we need to teach the libvirt to
> > > > have special care on this).
> > >
> > > It already has to for existing kernels.  Let's just move to iommufd
> > > faster then?
> >
> > This is fine, Cindy, any idea on this?
> >
> > >
> > > > >
> > > > > > And if we do this in mm, it will bring extra overheads.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Pointer per mm, not too bad ...
> > > >
> > > > Unless we enable it unconditionally, it requires a lot of tree
> > > > operations at least.
> > > >
> > > > Thanks
> > >
> > > Not sure I understand.
> >
> > I mean in order to not count pinned pages multiple times we need to
> > keep track of the pages that are already pinned in an rbtree with
> > refcounts. This means we need to populate the tree when userspace
> > pin/unpin pages.
> >
> > Thanks
>
> Yep. In fact, if you are going to try and fix accounting you should
> probably find a way to fix a mix of vdpa and -realtime mlock=on as well
> as a mix of vfio and vdpa.

I think iommufd could be a way. E.g. VFIO and vDPA can share an iommufd
address space.

Thanks

>
>
> > >
> > > > >
> > > > > > > You get more scrutiny if you try, for sure,
> > > > > > > and you need to generalize it enough that it looks
> > > > > > > useful outside the driver. But I guess that's good?
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > > > > >       dev->mm = NULL;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > > > > +{
> > > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > > > > +     u32 key;
> > > > > > > > > > +
> > > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > > > > +     }
> > > > > > > > > > +
> > > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > > +                            key) {
> > > > > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > > +     }
> > > > > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > > > > +     if (!rbtree_root)
> > > > > > > > > > +             return NULL;
> > > > > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > > +     return vdpa_tree;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > > > > +{
> > > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > > +     u32 key;
> > > > > > > > > > +
> > > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > > +
> > > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > > +                            key) {
> > > > > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > > > > +                     kfree(rbtree_root);
> > > > > > > > > > +             }
> > > > > > > > > > +     }
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  /* Caller should have device mutex */
> > > > > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > >  {
> > > > > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > > > > >       if (err)
> > > > > > > > > >               goto err_cgroup;
> > > > > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > > +             goto err_cgroup;
> > > > > > > > > > +     }
> > > > > > > > > >
> > > > > > > > > >       return 0;
> > > > > > > > > >  err_cgroup:
> > > > > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > >               dev->worker = NULL;
> > > > > > > > > >       }
> > > > > > > > > >  err_worker:
> > > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > > >       dev->kcov_handle = 0;
> > > > > > > > > >  err_mm:
> > > > > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > > > > >               dev->worker = NULL;
> > > > > > > > > >               dev->kcov_handle = 0;
> > > > > > > > > >       }
> > > > > > > > > > +
> > > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > > >  }
> > > > > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > > > > >       int byte_weight;
> > > > > > > > > >       u64 kcov_handle;
> > > > > > > > > >       bool use_worker;
> > > > > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > > > > >  };
> > > > > > > > > > --
> > > > > > > > > > 2.34.3
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  6:18                     ` Jason Wang
@ 2022-06-28  6:25                       ` Michael S. Tsirkin
  -1 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2022-06-28  6:25 UTC (permalink / raw)
  To: Jason Wang; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 02:18:55PM +0800, Jason Wang wrote:
> On Tue, Jun 28, 2022 at 2:17 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 02:10:38PM +0800, Jason Wang wrote:
> > > On Tue, Jun 28, 2022 at 2:07 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> > > > > On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > > > > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > > > > > >
> > > > > > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > > > > > >          ret = -ENOMEM;
> > > > > > > > > > >          goto unlock;
> > > > > > > > > > > }
> > > > > > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > > > > > counted and we will not count it again.
> > > > > > > > > > >
> > > > > > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > > > > > ---
> > > > > > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > > > > > >  2 files changed, 64 insertions(+)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > > > @@ -32,6 +32,8 @@
> > > > > > > > > > >  #include <linux/kcov.h>
> > > > > > > > > > >
> > > > > > > > > > >  #include "vhost.h"
> > > > > > > > > > > +#include <linux/hashtable.h>
> > > > > > > > > > > +#include <linux/jhash.h>
> > > > > > > > > > >
> > > > > > > > > > >  static ushort max_mem_regions = 64;
> > > > > > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > > > > > >
> > > > > > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > > > > > +     struct hlist_node node;
> > > > > > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > > > > > +     struct mm_struct *mm_using;
> > > > > > > > > > > +};
> > > > > > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > > > > > +
> > > > > > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > > > > > >  {
> > > > > > > > > >
> > > > > > > > > > Are you trying to save some per-mm information here?
> > > > > > > > > > Can't we just add it to mm_struct?
> > > > > > > > > >
> > > > > > > > > yes, this is per-mm information, but I checked with Jason before;
> > > > > > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > > > > > so I added a hlist instead
> > > > > > > > > Thanks
> > > > > > > > > Cindy
> > > > > > > >
> > > > > > > > Difficult how?
> > > > > > >
> > > > > > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > > > > > as well, VFIO does pinning/accounting at the container level and it
> > > > > > > has been there for years.
> > > > > >
> > > > > > Yes it's been there, I'm not sure this means it's perfect.
> > > > > > Also, rdma guys might be interested too I guess?
> > > > >
> > > > > It looks to me they plan to go to iommufd as well.
> > > > >
> > > > > >
> > > > > > > vDPA has an implicit "container", the
> > > > > > > mm_struct, but the accounting is done per device right now.
> > > > > > >
> > > > > > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > > > > > iommufd level.
> > > > > >
> > > > > > So is it even worth fixing now?
> > > > >
> > > > > Not sure, but I guess it's better. (Or we need to teach the libvirt to
> > > > > have special care on this).
> > > >
> > > > It already has to for existing kernels.  Let's just move to iommufd
> > > > faster then?
> > >
> > > This is fine, Cindy, any idea on this?
> > >
> > > >
> > > > > >
> > > > > > > And if we do this in mm, it will bring extra overheads.
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > > Pointer per mm, not too bad ...
> > > > >
> > > > > Unless we enable it unconditionally, it requires a lot of tree
> > > > > operations at least.
> > > > >
> > > > > Thanks
> > > >
> > > > Not sure I understand.
> > >
> > > I mean in order to not count pinned pages multiple times we need to
> > > keep track of the pages that are already pinned in an rbtree with
> > > refcounts. This means we need to populate the tree when userspace
> > > pin/unpin pages.
> > >
> > > Thanks
> >
> > Yep. In fact, if you are going to try and fix accounting you should
> > probably find a way to fix a mix of vdpa and -realtime mlock=on as well
> > as a mix of vfio and vdpa.
> 
> I think iommufd could be a way. E.g. VFIO and vDPA can share an iommufd
> address space.
> 
> Thanks

And maybe -realtime mlock=on could use iommufd too...

> >
> >
> > > >
> > > > > >
> > > > > > > > You get more scrutiny if you try, for sure,
> > > > > > > > and you need to generalize it enough that it looks
> > > > > > > > useful outside the driver. But I guess that's good?
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > > > > > >       dev->mm = NULL;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > > > > > +{
> > > > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > > > > > +     u32 key;
> > > > > > > > > > > +
> > > > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > > > > > +     }
> > > > > > > > > > > +
> > > > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > > > +                            key) {
> > > > > > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > > > +     }
> > > > > > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > > > > > +     if (!rbtree_root)
> > > > > > > > > > > +             return NULL;
> > > > > > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > > > +     return vdpa_tree;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > > > > > +{
> > > > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > > > +     u32 key;
> > > > > > > > > > > +
> > > > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > > > +
> > > > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > > > +                            key) {
> > > > > > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > > > > > +                     kfree(rbtree_root);
> > > > > > > > > > > +             }
> > > > > > > > > > > +     }
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /* Caller should have device mutex */
> > > > > > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > > >  {
> > > > > > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > > > > > >       if (err)
> > > > > > > > > > >               goto err_cgroup;
> > > > > > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > > > +             goto err_cgroup;
> > > > > > > > > > > +     }
> > > > > > > > > > >
> > > > > > > > > > >       return 0;
> > > > > > > > > > >  err_cgroup:
> > > > > > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > > >               dev->worker = NULL;
> > > > > > > > > > >       }
> > > > > > > > > > >  err_worker:
> > > > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > > > >       dev->kcov_handle = 0;
> > > > > > > > > > >  err_mm:
> > > > > > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > > > > > >               dev->worker = NULL;
> > > > > > > > > > >               dev->kcov_handle = 0;
> > > > > > > > > > >       }
> > > > > > > > > > > +
> > > > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > > > >  }
> > > > > > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > > > > > >       int byte_weight;
> > > > > > > > > > >       u64 kcov_handle;
> > > > > > > > > > >       bool use_worker;
> > > > > > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > > > > > >  };
> > > > > > > > > > > --
> > > > > > > > > > > 2.34.3
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem
  2022-06-28  6:25                       ` Michael S. Tsirkin
@ 2022-06-28  6:27                         ` Jason Wang
  -1 siblings, 0 replies; 24+ messages in thread
From: Jason Wang @ 2022-06-28  6:27 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Cindy Lu, virtualization, linux-kernel

On Tue, Jun 28, 2022 at 2:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jun 28, 2022 at 02:18:55PM +0800, Jason Wang wrote:
> > On Tue, Jun 28, 2022 at 2:17 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Jun 28, 2022 at 02:10:38PM +0800, Jason Wang wrote:
> > > > On Tue, Jun 28, 2022 at 2:07 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Tue, Jun 28, 2022 at 02:03:38PM +0800, Jason Wang wrote:
> > > > > > On Tue, Jun 28, 2022 at 1:58 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jun 28, 2022 at 11:54:27AM +0800, Jason Wang wrote:
> > > > > > > > On Mon, Jun 27, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Jun 27, 2022 at 04:12:57PM +0800, Cindy Lu wrote:
> > > > > > > > > > On Sun, Jun 26, 2022 at 6:01 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Sun, Jun 26, 2022 at 05:04:08PM +0800, Cindy Lu wrote:
> > > > > > > > > > > > We count pinned_vm as follow in vhost-vDPA
> > > > > > > > > > > >
> > > > > > > > > > > > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > > > > > > > > > > if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> > > > > > > > > > > >          ret = -ENOMEM;
> > > > > > > > > > > >          goto unlock;
> > > > > > > > > > > > }
> > > > > > > > > > > > This means if we have two vDPA devices for the same VM the pages
> > > > > > > > > > > > would be counted twice. So we add a tree to save the page that
> > > > > > > > > > > > counted and we will not count it again.
> > > > > > > > > > > >
> > > > > > > > > > > > Add vdpa_mem_tree to saved the mem that already counted.
> > > > > > > > > > > > use a hlist to saved the root for vdpa_mem_tree.
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Cindy Lu <lulu@redhat.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > >  drivers/vhost/vhost.c | 63 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > > >  drivers/vhost/vhost.h |  1 +
> > > > > > > > > > > >  2 files changed, 64 insertions(+)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > > > > index 40097826cff0..4ca8b1ed944b 100644
> > > > > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > > > > @@ -32,6 +32,8 @@
> > > > > > > > > > > >  #include <linux/kcov.h>
> > > > > > > > > > > >
> > > > > > > > > > > >  #include "vhost.h"
> > > > > > > > > > > > +#include <linux/hashtable.h>
> > > > > > > > > > > > +#include <linux/jhash.h>
> > > > > > > > > > > >
> > > > > > > > > > > >  static ushort max_mem_regions = 64;
> > > > > > > > > > > >  module_param(max_mem_regions, ushort, 0444);
> > > > > > > > > > > > @@ -49,6 +51,14 @@ enum {
> > > > > > > > > > > >  #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
> > > > > > > > > > > >  #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
> > > > > > > > > > > >
> > > > > > > > > > > > +struct vhost_vdpa_rbtree_node {
> > > > > > > > > > > > +     struct hlist_node node;
> > > > > > > > > > > > +     struct rb_root_cached vdpa_mem_tree;
> > > > > > > > > > > > +     struct mm_struct *mm_using;
> > > > > > > > > > > > +};
> > > > > > > > > > > > +static DECLARE_HASHTABLE(vhost_vdpa_rbtree_hlist, 8);
> > > > > > > > > > > > +int vhost_vdpa_rbtree_hlist_status;
> > > > > > > > > > > > +
> > > > > > > > > > > >  #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> > > > > > > > > > > >  static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
> > > > > > > > > > > >  {
> > > > > > > > > > >
> > > > > > > > > > > Are you trying to save some per-mm information here?
> > > > > > > > > > > Can't we just add it to mm_struct?
> > > > > > > > > > >
> > > > > > > > > > yes, this is per-mm information, but I have checked with Jason before;
> > > > > > > > > > it seems it may be difficult to change mm_struct upstream,
> > > > > > > > > > so I added a hlist instead.
> > > > > > > > > > Thanks
> > > > > > > > > > Cindy
> > > > > > > > >
> > > > > > > > > Difficult how?
> > > > > > > >
> > > > > > > > It is only useful for vDPA probably. Though it could be used by VFIO
> > > > > > > > as well, VFIO does pinning/accounting at the container level and it
> > > > > > > > has been there for years.
> > > > > > >
> > > > > > > Yes it's been there, I'm not sure this means it's perfect.
> > > > > > > Also, rdma guys might be interested too I guess?
> > > > > >
> > > > > > It looks to me they plan to go to iommufd as well.
> > > > > >
> > > > > > >
> > > > > > > > vDPA have an implicit "container" the
> > > > > > > > mm_struct, but the accounting is done per device right now.
> > > > > > > >
> > > > > > > > In the future, when vDPA switches to iommufd, it can be then solved at
> > > > > > > > iommufd level.
> > > > > > >
> > > > > > > So is it even worth fixing now?
> > > > > >
> > > > > > Not sure, but I guess it's better. (Or we need to teach the libvirt to
> > > > > > have special care on this).
> > > > >
> > > > > It already has to for existing kernels.  Let's just move to iommufd
> > > > > faster then?
> > > >
> > > > This is fine, Cindy, any idea on this?
> > > >
> > > > >
> > > > > > >
> > > > > > > > And if we do this in mm, it will bring extra overheads.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > > Pointer per mm, not too bad ...
> > > > > >
> > > > > > Unless we enable it unconditionally, it requires a lot of tree
> > > > > > operations at least.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Not sure I understand.
> > > >
> > > > I mean in order to not count pinned pages multiple times we need to
> > > > keep track of the pages that are already pinned in an rbtree with
> > > > refcounts. This means we need to populate the tree when userspace
> > > > pin/unpin pages.
> > > >
> > > > Thanks
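
For illustration, populating such a tree on pin/unpin might look roughly like
the sketch below.  The names are assumptions, not the posted code; it handles
exact-match ranges only (real code would have to split and merge overlapping
ranges) and leaves out the locking that sharing one tree between devices of
the same mm would require.

#include <linux/interval_tree.h>
#include <linux/mm_types.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct pinned_range {
	struct interval_tree_node it;	/* pinned [it.start, it.last] */
	refcount_t refs;		/* number of devices pinning it */
};

/* After pinning [start, last]: charge mm->pinned_vm only for new ranges. */
static int track_pinned(struct rb_root_cached *tree, struct mm_struct *mm,
			unsigned long start, unsigned long last)
{
	struct interval_tree_node *it;
	struct pinned_range *p;

	it = interval_tree_iter_first(tree, start, last);
	if (it && it->start == start && it->last == last) {
		p = container_of(it, struct pinned_range, it);
		refcount_inc(&p->refs);
		return 0;
	}

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->it.start = start;
	p->it.last = last;
	refcount_set(&p->refs, 1);
	interval_tree_insert(&p->it, tree);
	atomic64_add(last - start + 1, &mm->pinned_vm);
	return 0;
}

/* After unpinning: uncharge only when the last device drops the range. */
static void untrack_pinned(struct rb_root_cached *tree, struct mm_struct *mm,
			   struct pinned_range *p)
{
	if (refcount_dec_and_test(&p->refs)) {
		interval_tree_remove(&p->it, tree);
		atomic64_sub(p->it.last - p->it.start + 1, &mm->pinned_vm);
		kfree(p);
	}
}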
> > >
> > > Yep. In fact, if you are going to try and fix accounting you should
> > > probably find a way to fix a mix of vdpa and -realtime mlock=on as well
> > > as a mix of vfio and vdpa.
> >
> > I think iommufd could be a way. E.g VFIO and vDPA can share an iommufd
> > address space.
> >
> > Thanks
>
> And maybe -realtime mlock=on could use iommufd too ..

Probably, since iommufd is under development we can ask/code for new features.

Thanks

>
> > >
> > >
> > > > >
> > > > > > >
> > > > > > > > > You get more scrutiny if you try, for sure,
> > > > > > > > > and you need to generalize it enough that it looks
> > > > > > > > > useful outside the driver. But I guess that's good?
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > @@ -571,6 +581,51 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> > > > > > > > > > > >       dev->mm = NULL;
> > > > > > > > > > > >  }
> > > > > > > > > > > >
> > > > > > > > > > > > +struct rb_root_cached *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > > > > +     struct rb_root_cached *vdpa_tree;
> > > > > > > > > > > > +     u32 key;
> > > > > > > > > > > > +
> > > > > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > > > > +     if (vhost_vdpa_rbtree_hlist_status == 0) {
> > > > > > > > > > > > +             hash_init(vhost_vdpa_rbtree_hlist);
> > > > > > > > > > > > +             vhost_vdpa_rbtree_hlist_status = 1;
> > > > > > > > > > > > +     }
> > > > > > > > > > > > +
> > > > > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > > > > +                            key) {
> > > > > > > > > > > > +             if (rbtree_root->mm_using == mm)
> > > > > > > > > > > > +                     return &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > > > > +     }
> > > > > > > > > > > > +     rbtree_root = kmalloc(sizeof(*rbtree_root), GFP_KERNEL);
> > > > > > > > > > > > +     if (!rbtree_root)
> > > > > > > > > > > > +             return NULL;
> > > > > > > > > > > > +     rbtree_root->mm_using = mm;
> > > > > > > > > > > > +     rbtree_root->vdpa_mem_tree = RB_ROOT_CACHED;
> > > > > > > > > > > > +     hash_add(vhost_vdpa_rbtree_hlist, &rbtree_root->node, key);
> > > > > > > > > > > > +     vdpa_tree = &(rbtree_root->vdpa_mem_tree);
> > > > > > > > > > > > +     return vdpa_tree;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +void vhost_vdpa_relase_mem_tree(struct mm_struct *mm)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +     struct vhost_vdpa_rbtree_node *rbtree_root = NULL;
> > > > > > > > > > > > +     u32 key;
> > > > > > > > > > > > +
> > > > > > > > > > > > +     key = jhash_1word((u64)mm, JHASH_INITVAL);
> > > > > > > > > > > > +
> > > > > > > > > > > > +     /* No hased table, init one */
> > > > > > > > > > > > +     hash_for_each_possible(vhost_vdpa_rbtree_hlist, rbtree_root, node,
> > > > > > > > > > > > +                            key) {
> > > > > > > > > > > > +             if (rbtree_root->mm_using == mm) {
> > > > > > > > > > > > +                     hash_del(&rbtree_root->node);
> > > > > > > > > > > > +                     kfree(rbtree_root);
> > > > > > > > > > > > +             }
> > > > > > > > > > > > +     }
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > >  /* Caller should have device mutex */
> > > > > > > > > > > >  long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > > > >  {
> > > > > > > > > > > > @@ -605,6 +660,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > > > >       err = vhost_dev_alloc_iovecs(dev);
> > > > > > > > > > > >       if (err)
> > > > > > > > > > > >               goto err_cgroup;
> > > > > > > > > > > > +     dev->vdpa_mem_tree = vhost_vdpa_get_mem_tree(dev->mm);
> > > > > > > > > > > > +     if (dev->vdpa_mem_tree == NULL) {
> > > > > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > > > > +             goto err_cgroup;
> > > > > > > > > > > > +     }
> > > > > > > > > > > >
> > > > > > > > > > > >       return 0;
> > > > > > > > > > > >  err_cgroup:
> > > > > > > > > > > > @@ -613,6 +673,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> > > > > > > > > > > >               dev->worker = NULL;
> > > > > > > > > > > >       }
> > > > > > > > > > > >  err_worker:
> > > > > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > > > > >       dev->kcov_handle = 0;
> > > > > > > > > > > >  err_mm:
> > > > > > > > > > > > @@ -710,6 +771,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> > > > > > > > > > > >               dev->worker = NULL;
> > > > > > > > > > > >               dev->kcov_handle = 0;
> > > > > > > > > > > >       }
> > > > > > > > > > > > +
> > > > > > > > > > > > +     vhost_vdpa_relase_mem_tree(dev->mm);
> > > > > > > > > > > >       vhost_detach_mm(dev);
> > > > > > > > > > > >  }
> > > > > > > > > > > >  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> > > > > > > > > > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > > > > > > > > > index d9109107af08..84de33de3abf 100644
> > > > > > > > > > > > --- a/drivers/vhost/vhost.h
> > > > > > > > > > > > +++ b/drivers/vhost/vhost.h
> > > > > > > > > > > > @@ -160,6 +160,7 @@ struct vhost_dev {
> > > > > > > > > > > >       int byte_weight;
> > > > > > > > > > > >       u64 kcov_handle;
> > > > > > > > > > > >       bool use_worker;
> > > > > > > > > > > > +     struct rb_root_cached *vdpa_mem_tree;
> > > > > > > > > > > >       int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> > > > > > > > > > > >                          struct vhost_iotlb_msg *msg);
> > > > > > > > > > > >  };
> > > > > > > > > > > > --
> > > > > > > > > > > > 2.34.3
> > > > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2022-06-28  6:27 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
2022-06-26  9:04 [PATCH v3 1/2] vhost: Add rbtree vdpa_mem_tree to saved the counted mem Cindy Lu
2022-06-26 10:01 ` Michael S. Tsirkin
2022-06-26 10:01   ` Michael S. Tsirkin
2022-06-27  8:12   ` Cindy Lu
2022-06-27 10:01     ` Michael S. Tsirkin
2022-06-27 10:01       ` Michael S. Tsirkin
2022-06-28  3:54       ` Jason Wang
2022-06-28  3:54         ` Jason Wang
2022-06-28  5:58         ` Michael S. Tsirkin
2022-06-28  5:58           ` Michael S. Tsirkin
2022-06-28  6:03           ` Jason Wang
2022-06-28  6:03             ` Jason Wang
2022-06-28  6:07             ` Michael S. Tsirkin
2022-06-28  6:07               ` Michael S. Tsirkin
2022-06-28  6:10               ` Jason Wang
2022-06-28  6:10                 ` Jason Wang
2022-06-28  6:17                 ` Michael S. Tsirkin
2022-06-28  6:17                   ` Michael S. Tsirkin
2022-06-28  6:18                   ` Jason Wang
2022-06-28  6:18                     ` Jason Wang
2022-06-28  6:25                     ` Michael S. Tsirkin
2022-06-28  6:25                       ` Michael S. Tsirkin
2022-06-28  6:27                       ` Jason Wang
2022-06-28  6:27                         ` Jason Wang
