linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP split issue
@ 2020-03-12  7:49 Hui Zhu
  2020-03-12  7:49 ` [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER Hui Zhu
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Hui Zhu @ 2020-03-12  7:49 UTC (permalink / raw)
  To: mst, jasowang, akpm, pagupta, mojha, david, namit,
	virtualization, linux-kernel, qemu-devel
  Cc: Hui Zhu, Hui Zhu

If the guest kernel has many fragmented pages, using virtio_balloon
will split the THPs of QEMU when it calls madvise(MADV_DONTNEED) to
release the balloon pages.
This is an example in a VM with 1G memory 1CPU:
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:         0 kB

usemem --punch-holes -s -1 800m &

cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:    976896 kB

(qemu) device_add virtio-balloon-pci,id=balloon1
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 624
(qemu) info balloon
balloon: actual=624

cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:    153600 kB

The amount of THP decreased by more than 800M.
The reason is that usemem with the punch-holes option frees every other
page after allocation.  So the 400M of free memory inside the guest
kernel consists of fragmented pages.
The guest kernel uses them to inflate the balloon.  When these
fragmented pages are freed, the THPs will be split.

This commit tries to handle this by adding a new flag,
VIRTIO_BALLOON_F_THP_ORDER.
When this flag is set, the balloon page order is set to the THP order,
so THP pages are freed together in the host.
This is an example in a VM with 1G memory 1CPU:
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:         0 kB

usemem --punch-holes -s -1 800m &

cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:    976896 kB

(qemu) device_add virtio-balloon-pci,id=balloon1,thp-order=on
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 624
(qemu) info balloon
balloon: actual=624

cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:    583680 kB

The amount of THP decreases by only 384M.  This shows that
VIRTIO_BALLOON_F_THP_ORDER can help handle the THP split issue.

Signed-off-by: Hui Zhu <teawaterz@linux.alibaba.com>
---
 drivers/virtio/virtio_balloon.c     | 57 ++++++++++++++++++++++++++-----------
 include/linux/balloon_compaction.h  | 14 ++++++---
 include/uapi/linux/virtio_balloon.h |  4 +++
 3 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 7bfe365..1e1dc76 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -175,18 +175,31 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 	unsigned num_pfns;
 	struct page *page;
 	LIST_HEAD(pages);
+	int page_order = 0;
 
 	/* We can only do one array worth at a time. */
 	num = min(num, ARRAY_SIZE(vb->pfns));
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_THP_ORDER))
+		page_order = VIRTIO_BALLOON_THP_ORDER;
+
 	for (num_pfns = 0; num_pfns < num;
 	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-		struct page *page = balloon_page_alloc();
+		struct page *page;
+
+		if (page_order)
+			page = alloc_pages(__GFP_HIGHMEM |
+					   __GFP_KSWAPD_RECLAIM |
+					   __GFP_RETRY_MAYFAIL |
+					   __GFP_NOWARN | __GFP_NOMEMALLOC,
+					   page_order);
+		else
+			page = balloon_page_alloc();
 
 		if (!page) {
 			dev_info_ratelimited(&vb->vdev->dev,
-					     "Out of puff! Can't get %u pages\n",
-					     VIRTIO_BALLOON_PAGES_PER_PAGE);
+				"Out of puff! Can't get %u pages\n",
+				VIRTIO_BALLOON_PAGES_PER_PAGE << page_order);
 			/* Sleep for at least 1/5 of a second before retry. */
 			msleep(200);
 			break;
@@ -206,7 +219,7 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
 		if (!virtio_has_feature(vb->vdev,
 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
-			adjust_managed_page_count(page, -1);
+			adjust_managed_page_count(page, -(1 << page_order));
 		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
 	}
 
@@ -223,13 +236,20 @@ static void release_pages_balloon(struct virtio_balloon *vb,
 				 struct list_head *pages)
 {
 	struct page *page, *next;
+	int page_order = 0;
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_THP_ORDER))
+		page_order = VIRTIO_BALLOON_THP_ORDER;
 
 	list_for_each_entry_safe(page, next, pages, lru) {
 		if (!virtio_has_feature(vb->vdev,
 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
-			adjust_managed_page_count(page, 1);
+			adjust_managed_page_count(page, 1 << page_order);
 		list_del(&page->lru);
-		put_page(page); /* balloon reference */
+		if (page_order)
+			__free_pages(page, page_order);
+		else
+			put_page(page); /* balloon reference */
 	}
 }
 
@@ -893,19 +913,21 @@ static int virtballoon_probe(struct virtio_device *vdev)
 		goto out_free_vb;
 
 #ifdef CONFIG_BALLOON_COMPACTION
-	balloon_mnt = kern_mount(&balloon_fs);
-	if (IS_ERR(balloon_mnt)) {
-		err = PTR_ERR(balloon_mnt);
-		goto out_del_vqs;
-	}
+	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_THP_ORDER)) {
+		balloon_mnt = kern_mount(&balloon_fs);
+		if (IS_ERR(balloon_mnt)) {
+			err = PTR_ERR(balloon_mnt);
+			goto out_del_vqs;
+		}
 
-	vb->vb_dev_info.migratepage = virtballoon_migratepage;
-	vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
-	if (IS_ERR(vb->vb_dev_info.inode)) {
-		err = PTR_ERR(vb->vb_dev_info.inode);
-		goto out_kern_unmount;
+		vb->vb_dev_info.migratepage = virtballoon_migratepage;
+		vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+		if (IS_ERR(vb->vb_dev_info.inode)) {
+			err = PTR_ERR(vb->vb_dev_info.inode);
+			goto out_kern_unmount;
+		}
+		vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
 	}
-	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
 #endif
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
 		/*
@@ -1058,6 +1080,7 @@ static unsigned int features[] = {
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
 	VIRTIO_BALLOON_F_PAGE_POISON,
+	VIRTIO_BALLOON_F_THP_ORDER,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 338aa27..4c9164e 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -100,8 +100,12 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
 	__SetPageOffline(page);
-	__SetPageMovable(page, balloon->inode->i_mapping);
-	set_page_private(page, (unsigned long)balloon);
+	if (balloon->inode) {
+		__SetPageMovable(page, balloon->inode->i_mapping);
+		set_page_private(page, (unsigned long)balloon);
+	} else {
+		set_page_private(page, 0);
+	}
 	list_add(&page->lru, &balloon->pages);
 }
 
@@ -116,8 +120,10 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 static inline void balloon_page_delete(struct page *page)
 {
 	__ClearPageOffline(page);
-	__ClearPageMovable(page);
-	set_page_private(page, 0);
+	if (page_private(page)) {
+		__ClearPageMovable(page);
+		set_page_private(page, 0);
+	}
 	/*
 	 * No touch page.lru field once @page has been isolated
 	 * because VM is using the field.
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index a1966cd7..a2998a9 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -36,10 +36,14 @@
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
 #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
+#define VIRTIO_BALLOON_F_THP_ORDER	5 /* Balloon page order to thp order */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
+/* The order of the balloon page */
+#define VIRTIO_BALLOON_THP_ORDER 9
+
 #define VIRTIO_BALLOON_CMD_ID_STOP	0
 #define VIRTIO_BALLOON_CMD_ID_DONE	1
 struct virtio_balloon_config {
-- 
2.7.4


^ permalink raw reply	[flat|nested] 32+ messages in thread

* [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER
  2020-03-12  7:49 [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue Hui Zhu
@ 2020-03-12  7:49 ` Hui Zhu
  2020-03-12  8:22   ` no-reply
  2020-03-12  8:25   ` Michael S. Tsirkin
  2020-03-12  8:18 ` [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue Michael S. Tsirkin
  2020-03-12  8:37 ` David Hildenbrand
  2 siblings, 2 replies; 32+ messages in thread
From: Hui Zhu @ 2020-03-12  7:49 UTC (permalink / raw)
  To: mst, jasowang, akpm, pagupta, mojha, david, namit,
	virtualization, linux-kernel, qemu-devel
  Cc: Hui Zhu, Hui Zhu

If the guest kernel has many fragmented pages, using virtio_balloon
will split the THPs of QEMU when it calls madvise(MADV_DONTNEED) to
release the balloon pages.
Setting the option thp-order to on enables the VIRTIO_BALLOON_F_THP_ORDER
feature flag.  The balloon page size is then set to the THP size to
handle the THP split issue.

Signed-off-by: Hui Zhu <teawaterz@linux.alibaba.com>
---
 hw/virtio/virtio-balloon.c                      | 67 ++++++++++++++++---------
 include/standard-headers/linux/virtio_balloon.h |  4 ++
 2 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index a4729f7..cfe86b0 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -340,37 +340,49 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
         while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
             unsigned int p = virtio_ldl_p(vdev, &pfn);
             hwaddr pa;
+            size_t handle_size = BALLOON_PAGE_SIZE;
 
             pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
             offset += 4;
 
-            section = memory_region_find(get_system_memory(), pa,
-                                         BALLOON_PAGE_SIZE);
-            if (!section.mr) {
-                trace_virtio_balloon_bad_addr(pa);
-                continue;
-            }
-            if (!memory_region_is_ram(section.mr) ||
-                memory_region_is_rom(section.mr) ||
-                memory_region_is_romd(section.mr)) {
-                trace_virtio_balloon_bad_addr(pa);
-                memory_region_unref(section.mr);
-                continue;
-            }
+            if (virtio_has_feature(s->host_features,
+                                   VIRTIO_BALLOON_F_THP_ORDER))
+                handle_size = BALLOON_PAGE_SIZE << VIRTIO_BALLOON_THP_ORDER;
+
+            while (handle_size > 0) {
+                section = memory_region_find(get_system_memory(), pa,
+                                             BALLOON_PAGE_SIZE);
+                if (!section.mr) {
+                    trace_virtio_balloon_bad_addr(pa);
+                    continue;
+                }
+                if (!memory_region_is_ram(section.mr) ||
+                    memory_region_is_rom(section.mr) ||
+                    memory_region_is_romd(section.mr)) {
+                    trace_virtio_balloon_bad_addr(pa);
+                    memory_region_unref(section.mr);
+                    continue;
+                }
 
-            trace_virtio_balloon_handle_output(memory_region_name(section.mr),
-                                               pa);
-            if (!qemu_balloon_is_inhibited()) {
-                if (vq == s->ivq) {
-                    balloon_inflate_page(s, section.mr,
-                                         section.offset_within_region, &pbp);
-                } else if (vq == s->dvq) {
-                    balloon_deflate_page(s, section.mr, section.offset_within_region);
-                } else {
-                    g_assert_not_reached();
+                trace_virtio_balloon_handle_output(memory_region_name(section.mr),
+                                                   pa);
+                if (!qemu_balloon_is_inhibited()) {
+                    if (vq == s->ivq) {
+                        balloon_inflate_page(s, section.mr,
+                                             section.offset_within_region,
+                                             &pbp);
+                    } else if (vq == s->dvq) {
+                        balloon_deflate_page(s, section.mr,
+                                             section.offset_within_region);
+                    } else {
+                        g_assert_not_reached();
+                    }
                 }
+                memory_region_unref(section.mr);
+
+                pa += BALLOON_PAGE_SIZE;
+                handle_size -= BALLOON_PAGE_SIZE;
             }
-            memory_region_unref(section.mr);
         }
 
         virtqueue_push(vq, elem, offset);
@@ -693,6 +705,8 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
 
     memcpy(&config, config_data, virtio_balloon_config_size(dev));
     dev->actual = le32_to_cpu(config.actual);
+    if (virtio_has_feature(vdev->host_features, VIRTIO_BALLOON_F_THP_ORDER))
+        dev->actual <<= VIRTIO_BALLOON_THP_ORDER;
     if (dev->actual != oldactual) {
         qapi_event_send_balloon_change(vm_ram_size -
                         ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
@@ -728,6 +742,9 @@ static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
     }
     if (target) {
         dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
+        if (virtio_has_feature(dev->host_features,
+                               VIRTIO_BALLOON_F_THP_ORDER))
+            dev->num_pages >>= VIRTIO_BALLOON_THP_ORDER;
         virtio_notify_config(vdev);
     }
     trace_virtio_balloon_to_target(target, dev->num_pages);
@@ -916,6 +933,8 @@ static Property virtio_balloon_properties[] = {
                     VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
     DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
                     VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
+    DEFINE_PROP_BIT("thp-order", VirtIOBalloon, host_features,
+                    VIRTIO_BALLOON_F_THP_ORDER, false),
     /* QEMU 4.0 accidentally changed the config size even when free-page-hint
      * is disabled, resulting in QEMU 3.1 migration incompatibility.  This
      * property retains this quirk for QEMU 4.1 machine types.
diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
index 9375ca2..f54d613 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -36,10 +36,14 @@
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
 #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
+#define VIRTIO_BALLOON_F_THP_ORDER	5 /* Set balloon page order to thp order */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
+/* The order of the balloon page */
+#define VIRTIO_BALLOON_THP_ORDER 9
+
 #define VIRTIO_BALLOON_CMD_ID_STOP	0
 #define VIRTIO_BALLOON_CMD_ID_DONE	1
 struct virtio_balloon_config {
-- 
2.7.4


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP split issue
  2020-03-12  7:49 [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue Hui Zhu
  2020-03-12  7:49 ` [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER Hui Zhu
@ 2020-03-12  8:18 ` Michael S. Tsirkin
  2020-03-12  8:37 ` David Hildenbrand
  2 siblings, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-12  8:18 UTC (permalink / raw)
  To: Hui Zhu
  Cc: jasowang, akpm, pagupta, mojha, david, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu

On Thu, Mar 12, 2020 at 03:49:54PM +0800, Hui Zhu wrote:
> If the guest kernel has many fragmentation pages, use virtio_balloon
> will split THP of QEMU when it calls MADV_DONTNEED madvise to release
> the balloon pages.
> This is an example in a VM with 1G memory 1CPU:
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:         0 kB
> 
> usemem --punch-holes -s -1 800m &
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    976896 kB
> 
> (qemu) device_add virtio-balloon-pci,id=balloon1
> (qemu) info balloon
> balloon: actual=1024
> (qemu) balloon 624
> (qemu) info balloon
> balloon: actual=624
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    153600 kB
> 
> THP number decreased more than 800M.
> The reason is usemem with punch-holes option will free every other page
> after allocation.  Then 400M free memory inside the guest kernel is
> fragmentation pages.
> The guest kernel will use them to inflate the balloon.  When these
> fragmentation pages are freed, THP will be split.
> 
> This commit tries to handle this with add a new flag
> VIRTIO_BALLOON_F_THP_ORDER.
> When this flag is set, the balloon page order will be set to the THP order.
> Then THP pages will be freed together in the host.
> This is an example in a VM with 1G memory 1CPU:
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:         0 kB
> 
> usemem --punch-holes -s -1 800m &
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    976896 kB
> 
> (qemu) device_add virtio-balloon-pci,id=balloon1,thp-order=on
> (qemu) info balloon
> balloon: actual=1024
> (qemu) balloon 624
> (qemu) info balloon
> balloon: actual=624
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    583680 kB
> 
> The THP number decreases 384M.  This shows that VIRTIO_BALLOON_F_THP_ORDER
> can help handle the THP split issue.
> 
> Signed-off-by: Hui Zhu <teawaterz@linux.alibaba.com>
> ---
>  drivers/virtio/virtio_balloon.c     | 57 ++++++++++++++++++++++++++-----------
>  include/linux/balloon_compaction.h  | 14 ++++++---
>  include/uapi/linux/virtio_balloon.h |  4 +++
>  3 files changed, 54 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 7bfe365..1e1dc76 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -175,18 +175,31 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  	unsigned num_pfns;
>  	struct page *page;
>  	LIST_HEAD(pages);
> +	int page_order = 0;
>  
>  	/* We can only do one array worth at a time. */
>  	num = min(num, ARRAY_SIZE(vb->pfns));
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_THP_ORDER))
> +		page_order = VIRTIO_BALLOON_THP_ORDER;
> +
>  	for (num_pfns = 0; num_pfns < num;
>  	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
> -		struct page *page = balloon_page_alloc();
> +		struct page *page;
> +
> +		if (page_order)
> +			page = alloc_pages(__GFP_HIGHMEM |
> +					   __GFP_KSWAPD_RECLAIM |
> +					   __GFP_RETRY_MAYFAIL |
> +					   __GFP_NOWARN | __GFP_NOMEMALLOC,

The set of flags is inconsistent with balloon_page_alloc.
Please extend that function; do not bypass it.


> +					   page_order);
> +		else
> +			page = balloon_page_alloc();
>  
>  		if (!page) {
>  			dev_info_ratelimited(&vb->vdev->dev,
> -					     "Out of puff! Can't get %u pages\n",
> -					     VIRTIO_BALLOON_PAGES_PER_PAGE);
> +				"Out of puff! Can't get %u pages\n",
> +				VIRTIO_BALLOON_PAGES_PER_PAGE << page_order);
>  			/* Sleep for at least 1/5 of a second before retry. */
>  			msleep(200);
>  			break;

I suggest we do something guest side only for starters: if we need a
power of two pages, try to get them in a single chunk, with no retrying.
If that fails go back to a single page.


> @@ -206,7 +219,7 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
>  		if (!virtio_has_feature(vb->vdev,
>  					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> -			adjust_managed_page_count(page, -1);
> +			adjust_managed_page_count(page, -(1 << page_order));
>  		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
>  	}
>  
> @@ -223,13 +236,20 @@ static void release_pages_balloon(struct virtio_balloon *vb,
>  				 struct list_head *pages)
>  {
>  	struct page *page, *next;
> +	int page_order = 0;
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_THP_ORDER))
> +		page_order = VIRTIO_BALLOON_THP_ORDER;
>  
>  	list_for_each_entry_safe(page, next, pages, lru) {
>  		if (!virtio_has_feature(vb->vdev,
>  					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> -			adjust_managed_page_count(page, 1);
> +			adjust_managed_page_count(page, 1 << page_order);
>  		list_del(&page->lru);
> -		put_page(page); /* balloon reference */
> +		if (page_order)
> +			__free_pages(page, page_order);
> +		else
> +			put_page(page); /* balloon reference */
>  	}
>  }
>  
> @@ -893,19 +913,21 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		goto out_free_vb;
>  
>  #ifdef CONFIG_BALLOON_COMPACTION
> -	balloon_mnt = kern_mount(&balloon_fs);
> -	if (IS_ERR(balloon_mnt)) {
> -		err = PTR_ERR(balloon_mnt);
> -		goto out_del_vqs;
> -	}
> +	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_THP_ORDER)) {
> +		balloon_mnt = kern_mount(&balloon_fs);
> +		if (IS_ERR(balloon_mnt)) {
> +			err = PTR_ERR(balloon_mnt);
> +			goto out_del_vqs;
> +		}
>  
> -	vb->vb_dev_info.migratepage = virtballoon_migratepage;
> -	vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
> -	if (IS_ERR(vb->vb_dev_info.inode)) {
> -		err = PTR_ERR(vb->vb_dev_info.inode);
> -		goto out_kern_unmount;
> +		vb->vb_dev_info.migratepage = virtballoon_migratepage;
> +		vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
> +		if (IS_ERR(vb->vb_dev_info.inode)) {
> +			err = PTR_ERR(vb->vb_dev_info.inode);
> +			goto out_kern_unmount;
> +		}
> +		vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
>  	}
> -	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
>  #endif
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>  		/*


I doubt this fixes all of the code. Anything using
VIRTIO_BALLOON_PAGES_PER_PAGE would be suspect. Also, the result might
not fit in the pfns array.




> @@ -1058,6 +1080,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>  	VIRTIO_BALLOON_F_PAGE_POISON,
> +	VIRTIO_BALLOON_F_THP_ORDER,
>  };
>  
>  static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
> index 338aa27..4c9164e 100644
> --- a/include/linux/balloon_compaction.h
> +++ b/include/linux/balloon_compaction.h
> @@ -100,8 +100,12 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
>  				       struct page *page)
>  {
>  	__SetPageOffline(page);
> -	__SetPageMovable(page, balloon->inode->i_mapping);
> -	set_page_private(page, (unsigned long)balloon);
> +	if (balloon->inode) {
> +		__SetPageMovable(page, balloon->inode->i_mapping);
> +		set_page_private(page, (unsigned long)balloon);
> +	} else {
> +		set_page_private(page, 0);
> +	}
>  	list_add(&page->lru, &balloon->pages);
>  }
>  
> @@ -116,8 +120,10 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
>  static inline void balloon_page_delete(struct page *page)
>  {
>  	__ClearPageOffline(page);
> -	__ClearPageMovable(page);
> -	set_page_private(page, 0);
> +	if (page_private(page)) {
> +		__ClearPageMovable(page);
> +		set_page_private(page, 0);
> +	}
>  	/*
>  	 * No touch page.lru field once @page has been isolated
>  	 * because VM is using the field.
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index a1966cd7..a2998a9 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,10 +36,14 @@
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_THP_ORDER	5 /* Balloon page order to thp order */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> +/* The order of the balloon page */
> +#define VIRTIO_BALLOON_THP_ORDER 9
> +

Why 9?

>  #define VIRTIO_BALLOON_CMD_ID_STOP	0
>  #define VIRTIO_BALLOON_CMD_ID_DONE	1
>  struct virtio_balloon_config {


Assuming the idea is to also allow passing larger chunks to host,
I think we need to switch to using regular virtio S/G for starters.
That involves spec work though.



> -- 
> 2.7.4


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER
  2020-03-12  7:49 ` [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER Hui Zhu
@ 2020-03-12  8:22   ` no-reply
  2020-03-12  8:25   ` Michael S. Tsirkin
  1 sibling, 0 replies; 32+ messages in thread
From: no-reply @ 2020-03-12  8:22 UTC (permalink / raw)
  To: teawater
  Cc: mst, jasowang, akpm, pagupta, mojha, david, namit,
	virtualization, linux-kernel, qemu-devel, teawaterz, teawater

Patchew URL: https://patchew.org/QEMU/1583999395-9131-2-git-send-email-teawater@gmail.com/



Hi,

This series failed the asan build test. Please find the testing commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
export ARCH=x86_64
make docker-image-fedora V=1 NETWORK=1
time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1
=== TEST SCRIPT END ===

PASS 1 fdc-test /x86_64/fdc/cmos
PASS 2 fdc-test /x86_64/fdc/no_media_on_start
PASS 3 fdc-test /x86_64/fdc/read_without_media
==6136==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 4 fdc-test /x86_64/fdc/media_change
PASS 5 fdc-test /x86_64/fdc/sense_interrupt
PASS 6 fdc-test /x86_64/fdc/relative_seek
---
PASS 32 test-opts-visitor /visitor/opts/range/beyond
PASS 33 test-opts-visitor /visitor/opts/dict/unvisited
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-coroutine -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-coroutine" 
==6187==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6187==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7fff659f7000; bottom 0x7f4bef2e8000; size: 0x00b37670f000 (770786258944)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 1 test-coroutine /basic/no-dangling-access
---
PASS 12 test-aio /aio/event/flush
PASS 13 test-aio /aio/event/wait/no-flush-cb
PASS 14 test-aio /aio/timer/schedule
==6202==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 15 test-aio /aio/coroutine/queue-chaining
PASS 16 test-aio /aio-gsource/flush
PASS 17 test-aio /aio-gsource/bh/schedule
---
PASS 12 fdc-test /x86_64/fdc/read_no_dma_19
PASS 13 fdc-test /x86_64/fdc/fuzz-registers
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/ide-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="ide-test" 
==6210==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 28 test-aio /aio-gsource/timer/schedule
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-aio-multithread -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-aio-multithread" 
PASS 1 ide-test /x86_64/ide/identify
==6217==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-aio-multithread /aio/multi/lifecycle
==6219==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 2 ide-test /x86_64/ide/flush
PASS 2 test-aio-multithread /aio/multi/schedule
==6236==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 3 ide-test /x86_64/ide/bmdma/simple_rw
==6247==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 4 ide-test /x86_64/ide/bmdma/trim
PASS 3 test-aio-multithread /aio/multi/mutex/contended
==6253==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 4 test-aio-multithread /aio/multi/mutex/handoff
PASS 5 test-aio-multithread /aio/multi/mutex/mcs
PASS 6 test-aio-multithread /aio/multi/mutex/pthread
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-throttle -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-throttle" 
==6275==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-throttle /throttle/leak_bucket
PASS 2 test-throttle /throttle/compute_wait
PASS 3 test-throttle /throttle/init
---
PASS 15 test-throttle /throttle/config/iops_size
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-thread-pool -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-thread-pool" 
PASS 1 test-thread-pool /thread-pool/submit
==6279==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 2 test-thread-pool /thread-pool/submit-aio
PASS 3 test-thread-pool /thread-pool/submit-co
PASS 4 test-thread-pool /thread-pool/submit-many
==6346==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 5 test-thread-pool /thread-pool/cancel
PASS 6 test-thread-pool /thread-pool/cancel-async
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-hbitmap -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-hbitmap" 
---
PASS 14 test-hbitmap /hbitmap/set/twice
PASS 15 test-hbitmap /hbitmap/set/overlap
PASS 16 test-hbitmap /hbitmap/reset/empty
==6356==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 17 test-hbitmap /hbitmap/reset/general
PASS 18 test-hbitmap /hbitmap/reset/all
PASS 19 test-hbitmap /hbitmap/truncate/nop
---
PASS 31 test-hbitmap /hbitmap/meta/one
PASS 32 test-hbitmap /hbitmap/meta/byte
PASS 33 test-hbitmap /hbitmap/meta/word
==6362==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 34 test-hbitmap /hbitmap/meta/sector
PASS 35 test-hbitmap /hbitmap/serialize/align
PASS 36 test-hbitmap /hbitmap/serialize/basic
---
PASS 44 test-hbitmap /hbitmap/next_dirty_area/next_dirty_area_4
PASS 45 test-hbitmap /hbitmap/next_dirty_area/next_dirty_area_after_truncate
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-bdrv-drain -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-bdrv-drain" 
==6369==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-bdrv-drain /bdrv-drain/nested
PASS 2 test-bdrv-drain /bdrv-drain/multiparent
PASS 3 test-bdrv-drain /bdrv-drain/set_aio_context
---
PASS 41 test-bdrv-drain /bdrv-drain/bdrv_drop_intermediate/poll
PASS 42 test-bdrv-drain /bdrv-drain/replace_child/mid-drain
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-bdrv-graph-mod -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-bdrv-graph-mod" 
==6408==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-bdrv-graph-mod /bdrv-graph-mod/update-perm-tree
PASS 2 test-bdrv-graph-mod /bdrv-graph-mod/should-update-child
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-blockjob -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-blockjob" 
==6412==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-blockjob /blockjob/ids
PASS 2 test-blockjob /blockjob/cancel/created
PASS 3 test-blockjob /blockjob/cancel/running
---
PASS 7 test-blockjob /blockjob/cancel/pending
PASS 8 test-blockjob /blockjob/cancel/concluded
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-blockjob-txn -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-blockjob-txn" 
==6416==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-blockjob-txn /single/success
PASS 2 test-blockjob-txn /single/failure
PASS 3 test-blockjob-txn /single/cancel
---
PASS 6 test-blockjob-txn /pair/cancel
PASS 7 test-blockjob-txn /pair/fail-cancel-race
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-block-backend -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-block-backend" 
==6422==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-block-backend /block-backend/drain_aio_error
PASS 2 test-block-backend /block-backend/drain_all_aio_error
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-block-iothread -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-block-iothread" 
==6419==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6429==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-block-iothread /sync-op/pread
PASS 2 test-block-iothread /sync-op/pwrite
PASS 3 test-block-iothread /sync-op/load_vmstate
---
PASS 15 test-block-iothread /propagate/diamond
PASS 16 test-block-iothread /propagate/mirror
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-image-locking -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-image-locking" 
==6450==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-image-locking /image-locking/basic
PASS 2 test-image-locking /image-locking/set-perm-abort
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-x86-cpuid -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-x86-cpuid" 
---
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-rcu-list -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-rcu-list" 
PASS 1 test-rcu-list /rcu/qlist/single-threaded
PASS 2 test-rcu-list /rcu/qlist/short-few
==6514==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 3 test-rcu-list /rcu/qlist/long-many
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-rcu-simpleq -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-rcu-simpleq" 
PASS 1 test-rcu-simpleq /rcu/qsimpleq/single-threaded
PASS 2 test-rcu-simpleq /rcu/qsimpleq/short-few
PASS 3 test-rcu-simpleq /rcu/qsimpleq/long-many
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-rcu-tailq -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-rcu-tailq" 
==6574==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-rcu-tailq /rcu/qtailq/single-threaded
PASS 2 test-rcu-tailq /rcu/qtailq/short-few
PASS 3 test-rcu-tailq /rcu/qtailq/long-many
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-rcu-slist -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-rcu-slist" 
PASS 1 test-rcu-slist /rcu/qslist/single-threaded
==6619==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 2 test-rcu-slist /rcu/qslist/short-few
PASS 3 test-rcu-slist /rcu/qslist/long-many
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-qdist -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-qdist" 
---
PASS 7 test-qdist /qdist/binning/expand
PASS 8 test-qdist /qdist/binning/shrink
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-qht -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-qht" 
==6659==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 5 ide-test /x86_64/ide/bmdma/various_prdts
==6665==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6665==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7fff4fd7b000; bottom 0x7fb28fffe000; size: 0x004cbfd7d000 (329636106240)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 6 ide-test /x86_64/ide/bmdma/no_busmaster
PASS 7 ide-test /x86_64/ide/flush/nodev
==6676==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 8 ide-test /x86_64/ide/flush/empty_drive
==6681==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 9 ide-test /x86_64/ide/flush/retry_pci
==6687==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 10 ide-test /x86_64/ide/flush/retry_isa
==6693==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 11 ide-test /x86_64/ide/cdrom/pio
==6699==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-qht /qht/mode/default
PASS 12 ide-test /x86_64/ide/cdrom/pio_large
PASS 2 test-qht /qht/mode/resize
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-qht-par -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-qht-par" 
==6705==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 13 ide-test /x86_64/ide/cdrom/dma
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/ahci-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="ahci-test" 
PASS 1 test-qht-par /qht/parallel/2threads-0%updates-1s
==6728==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 ahci-test /x86_64/ahci/sanity
PASS 2 test-qht-par /qht/parallel/2threads-20%updates-1s
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-bitops -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-bitops" 
==6740==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-bitops /bitops/sextract32
PASS 2 test-bitops /bitops/sextract64
PASS 3 test-bitops /bitops/half_shuffle32
---
PASS 1 check-qom-interface /qom/interface/direct_impl
PASS 2 check-qom-interface /qom/interface/intermediate_impl
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/check-qom-proplist -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="check-qom-proplist" 
==6759==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 check-qom-proplist /qom/proplist/createlist
PASS 2 check-qom-proplist /qom/proplist/createv
PASS 3 check-qom-proplist /qom/proplist/createcmdline
---
PASS 3 test-write-threshold /write-threshold/multi-set-get
PASS 4 test-write-threshold /write-threshold/not-trigger
PASS 5 test-write-threshold /write-threshold/trigger
==6784==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-crypto-hash -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-crypto-hash" 
PASS 1 test-crypto-hash /crypto/hash/iov
PASS 2 test-crypto-hash /crypto/hash/alloc
---
PASS 15 test-crypto-secret /crypto/secret/crypt/missingiv
PASS 16 test-crypto-secret /crypto/secret/crypt/badiv
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-crypto-tlscredsx509 -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-crypto-tlscredsx509" 
==6810==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 5 ahci-test /x86_64/ahci/hba_enable
==6824==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/perfectserver
PASS 2 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/perfectclient
PASS 6 ahci-test /x86_64/ahci/identify
==6830==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 7 ahci-test /x86_64/ahci/max
==6836==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 3 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodca1
PASS 4 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodca2
PASS 8 ahci-test /x86_64/ahci/reset
==6842==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6842==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7fffcedf6000; bottom 0x7f5ebdbfe000; size: 0x00a1111f8000 (691777011712)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 5 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodca3
---
PASS 8 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/badca3
PASS 9 ahci-test /x86_64/ahci/io/pio/lba28/simple/zero
PASS 9 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodserver1
==6848==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6848==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7fff87881000; bottom 0x7fe1e9dfe000; size: 0x001d9da83000 (127199096832)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 10 ahci-test /x86_64/ahci/io/pio/lba28/simple/low
PASS 10 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodserver2
PASS 11 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodserver3
==6854==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6854==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffe879cc000; bottom 0x7f9e113fe000; size: 0x0060765ce000 (414302658560)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 11 ahci-test /x86_64/ahci/io/pio/lba28/simple/high
PASS 12 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodserver4
==6860==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6860==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7fff14a10000; bottom 0x7fc38a9fe000; size: 0x003b8a012000 (255718400000)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 12 ahci-test /x86_64/ahci/io/pio/lba28/double/zero
==6866==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6866==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7fff36210000; bottom 0x7f4ef89fe000; size: 0x00b03d812000 (756946116608)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 13 ahci-test /x86_64/ahci/io/pio/lba28/double/low
PASS 13 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodserver5
==6872==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6872==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffebaaca000; bottom 0x7f8ecc1fe000; size: 0x006fee8cc000 (480743571456)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 14 ahci-test /x86_64/ahci/io/pio/lba28/double/high
==6878==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 14 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodserver6
==6878==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffea3d5c000; bottom 0x7fbcdb37c000; size: 0x0041c89e0000 (282538672128)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 15 test-crypto-tlscredsx509 /qcrypto/tlscredsx509/goodserver7
---
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-crypto-tlssession -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-crypto-tlssession" 
PASS 15 ahci-test /x86_64/ahci/io/pio/lba28/long/zero
PASS 1 test-crypto-tlssession /qcrypto/tlssession/psk
==6888==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 2 test-crypto-tlssession /qcrypto/tlssession/basicca
==6888==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffeb04c7000; bottom 0x7f4a26b24000; size: 0x00b4899a3000 (775402696704)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 16 ahci-test /x86_64/ahci/io/pio/lba28/long/low
==6894==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6894==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffc2c698000; bottom 0x7f7437bfe000; size: 0x0087f4a9a000 (583925342208)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 17 ahci-test /x86_64/ahci/io/pio/lba28/long/high
PASS 3 test-crypto-tlssession /qcrypto/tlssession/differentca
==6900==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 18 ahci-test /x86_64/ahci/io/pio/lba28/short/zero
==6906==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 19 ahci-test /x86_64/ahci/io/pio/lba28/short/low
==6912==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 20 ahci-test /x86_64/ahci/io/pio/lba28/short/high
PASS 4 test-crypto-tlssession /qcrypto/tlssession/altname1
==6918==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 5 test-crypto-tlssession /qcrypto/tlssession/altname2
==6918==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7fff9ed92000; bottom 0x7f69c77fe000; size: 0x0095d7594000 (643563077632)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 21 ahci-test /x86_64/ahci/io/pio/lba48/simple/zero
PASS 6 test-crypto-tlssession /qcrypto/tlssession/altname3
==6924==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6924==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffda3fc3000; bottom 0x7f051bffe000; size: 0x00f887fc5000 (1067433349120)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 22 ahci-test /x86_64/ahci/io/pio/lba48/simple/low
PASS 7 test-crypto-tlssession /qcrypto/tlssession/altname4
==6930==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6930==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffdecdbb000; bottom 0x7ff410dfe000; size: 0x0009dbfbd000 (42345418752)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 23 ahci-test /x86_64/ahci/io/pio/lba48/simple/high
PASS 8 test-crypto-tlssession /qcrypto/tlssession/altname5
PASS 9 test-crypto-tlssession /qcrypto/tlssession/altname6
==6936==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6936==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffe016fb000; bottom 0x7f0438ffe000; size: 0x00f9c86fd000 (1072809627648)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 10 test-crypto-tlssession /qcrypto/tlssession/wildcard1
PASS 24 ahci-test /x86_64/ahci/io/pio/lba48/double/zero
==6942==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 11 test-crypto-tlssession /qcrypto/tlssession/wildcard2
==6942==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffed5a28000; bottom 0x7fca249fe000; size: 0x0034b102a000 (226308038656)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 25 ahci-test /x86_64/ahci/io/pio/lba48/double/low
==6948==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6948==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffcefea2000; bottom 0x7ff65cffe000; size: 0x000692ea4000 (28234629120)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 26 ahci-test /x86_64/ahci/io/pio/lba48/double/high
==6954==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 12 test-crypto-tlssession /qcrypto/tlssession/wildcard3
==6954==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffca1c24000; bottom 0x7fdf071fe000; size: 0x001d9aa26000 (127148384256)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 13 test-crypto-tlssession /qcrypto/tlssession/wildcard4
PASS 27 ahci-test /x86_64/ahci/io/pio/lba48/long/zero
PASS 14 test-crypto-tlssession /qcrypto/tlssession/wildcard5
==6960==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==6960==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffc0dfa2000; bottom 0x7f0c47b7c000; size: 0x00efc6426000 (1029823422464)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 28 ahci-test /x86_64/ahci/io/pio/lba48/long/low
==6966==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 15 test-crypto-tlssession /qcrypto/tlssession/wildcard6
==6966==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffcc0eeb000; bottom 0x7fa8f537c000; size: 0x0053cbb6f000 (359900049408)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 16 test-crypto-tlssession /qcrypto/tlssession/cachain
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-qga -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-qga" 
PASS 29 ahci-test /x86_64/ahci/io/pio/lba48/long/high
==6980==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 30 ahci-test /x86_64/ahci/io/pio/lba48/short/zero
PASS 1 test-qga /qga/sync-delimited
PASS 2 test-qga /qga/sync
---
PASS 15 test-qga /qga/invalid-cmd
PASS 16 test-qga /qga/invalid-args
PASS 17 test-qga /qga/fsfreeze-status
==6986==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 31 ahci-test /x86_64/ahci/io/pio/lba48/short/low
PASS 18 test-qga /qga/blacklist
==6995==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 19 test-qga /qga/config
PASS 20 test-qga /qga/guest-exec
PASS 21 test-qga /qga/guest-exec-invalid
PASS 32 ahci-test /x86_64/ahci/io/pio/lba48/short/high
==7013==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 22 test-qga /qga/guest-get-osinfo
PASS 23 test-qga /qga/guest-get-host-name
PASS 24 test-qga /qga/guest-get-timezone
---
PASS 7 test-util-sockets /socket/fd-pass/num/bad
PASS 8 test-util-sockets /socket/fd-pass/num/nocli
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-authz-simple -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-authz-simple" 
==7024==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-authz-simple /authz/simple
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-authz-list -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-authz-list" 
PASS 34 ahci-test /x86_64/ahci/io/dma/lba28/retry
---
PASS 4 test-authz-listfile /auth/list/explicit/deny
PASS 5 test-authz-listfile /auth/list/explicit/allow
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-io-task -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-io-task" 
==7047==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-io-task /crypto/task/complete
PASS 2 test-io-task /crypto/task/datafree
PASS 3 test-io-task /crypto/task/failure
---
PASS 4 test-io-channel-file /io/channel/pipe/sync
PASS 5 test-io-channel-file /io/channel/pipe/async
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-io-channel-tls -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-io-channel-tls" 
==7091==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 36 ahci-test /x86_64/ahci/io/dma/lba28/simple/low
PASS 1 test-io-channel-tls /qio/channel/tls/basic
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-io-channel-command -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-io-channel-command" 
==7121==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-io-channel-command /io/channel/command/fifo/sync
PASS 2 test-io-channel-command /io/channel/command/fifo/async
PASS 3 test-io-channel-command /io/channel/command/echo/sync
---
PASS 3 test-base64 /util/base64/not-nul-terminated
PASS 4 test-base64 /util/base64/invalid-chars
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-crypto-pbkdf -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-crypto-pbkdf" 
==7139==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-crypto-pbkdf /crypto/pbkdf/rfc3962/sha1/iter1
PASS 2 test-crypto-pbkdf /crypto/pbkdf/rfc3962/sha1/iter2
PASS 3 test-crypto-pbkdf /crypto/pbkdf/rfc3962/sha1/iter1200a
---
PASS 17 test-crypto-xts /crypto/xts/t-21-key-32-ptx-31/basic
PASS 18 test-crypto-xts /crypto/xts/t-21-key-32-ptx-31/unaligned
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-crypto-block -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-crypto-block" 
==7156==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-crypto-block /crypto/block/qcow
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-logging -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-logging" 
PASS 1 test-logging /logging/parse_range
---
PASS 4 test-logging /logging/logfile_lock_path
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-replication -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-replication" 
PASS 39 ahci-test /x86_64/ahci/io/dma/lba28/double/low
==7179==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7181==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-replication /replication/primary/read
PASS 2 test-replication /replication/primary/write
PASS 40 ahci-test /x86_64/ahci/io/dma/lba28/double/high
==7189==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 3 test-replication /replication/primary/start
PASS 4 test-replication /replication/primary/stop
PASS 5 test-replication /replication/primary/do_checkpoint
PASS 6 test-replication /replication/primary/get_error_all
==7189==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffe97ab4000; bottom 0x7f8d3e7fd000; size: 0x0071592b7000 (486827323392)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 41 ahci-test /x86_64/ahci/io/dma/lba28/long/zero
==7196==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7196==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffcee2ff000; bottom 0x7f4eb5323000; size: 0x00ae38fdc000 (748280463360)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 7 test-replication /replication/secondary/read
PASS 42 ahci-test /x86_64/ahci/io/dma/lba28/long/low
==7203==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7203==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffde06e6000; bottom 0x7fc35a57b000; size: 0x003a8616b000 (251357736960)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 8 test-replication /replication/secondary/write
PASS 43 ahci-test /x86_64/ahci/io/dma/lba28/long/high
==7210==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 44 ahci-test /x86_64/ahci/io/dma/lba28/short/zero
==7216==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 45 ahci-test /x86_64/ahci/io/dma/lba28/short/low
==7223==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7179==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffc5a310000; bottom 0x7f4406be8000; size: 0x00b853728000 (791673995264)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 9 test-replication /replication/secondary/start
PASS 46 ahci-test /x86_64/ahci/io/dma/lba28/short/high
==7247==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 47 ahci-test /x86_64/ahci/io/dma/lba48/simple/zero
==7253==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 48 ahci-test /x86_64/ahci/io/dma/lba48/simple/low
==7259==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 49 ahci-test /x86_64/ahci/io/dma/lba48/simple/high
PASS 10 test-replication /replication/secondary/stop
==7265==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 50 ahci-test /x86_64/ahci/io/dma/lba48/double/zero
==7271==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 51 ahci-test /x86_64/ahci/io/dma/lba48/double/low
==7277==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 11 test-replication /replication/secondary/continuous_replication
PASS 52 ahci-test /x86_64/ahci/io/dma/lba48/double/high
==7283==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7283==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffeb26d1000; bottom 0x7f9d00ffd000; size: 0x0061b16d4000 (419588554752)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 53 ahci-test /x86_64/ahci/io/dma/lba48/long/zero
==7290==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7290==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffc097f1000; bottom 0x7f694f923000; size: 0x0092b9ece000 (630184534016)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 54 ahci-test /x86_64/ahci/io/dma/lba48/long/low
==7297==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 12 test-replication /replication/secondary/do_checkpoint
==7297==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffc23565000; bottom 0x7f6912123000; size: 0x009311442000 (631649869824)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 55 ahci-test /x86_64/ahci/io/dma/lba48/long/high
==7304==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 13 test-replication /replication/secondary/get_error_all
PASS 56 ahci-test /x86_64/ahci/io/dma/lba48/short/zero
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-bufferiszero -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-bufferiszero" 
==7310==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 57 ahci-test /x86_64/ahci/io/dma/lba48/short/low
==7319==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 58 ahci-test /x86_64/ahci/io/dma/lba48/short/high
==7325==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 59 ahci-test /x86_64/ahci/io/ncq/simple
==7331==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 60 ahci-test /x86_64/ahci/io/ncq/retry
==7337==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 61 ahci-test /x86_64/ahci/flush/simple
==7343==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 62 ahci-test /x86_64/ahci/flush/retry
==7349==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7355==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 63 ahci-test /x86_64/ahci/flush/migrate
==7363==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7369==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 64 ahci-test /x86_64/ahci/migrate/sanity
==7377==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7383==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 65 ahci-test /x86_64/ahci/migrate/dma/simple
==7391==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7397==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 66 ahci-test /x86_64/ahci/migrate/dma/halted
==7405==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7411==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 67 ahci-test /x86_64/ahci/migrate/ncq/simple
==7419==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7425==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 68 ahci-test /x86_64/ahci/migrate/ncq/halted
==7433==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 69 ahci-test /x86_64/ahci/cdrom/eject
==7438==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 70 ahci-test /x86_64/ahci/cdrom/dma/single
==7444==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 71 ahci-test /x86_64/ahci/cdrom/dma/multi
==7450==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 72 ahci-test /x86_64/ahci/cdrom/pio/single
==7456==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7456==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffd43437000; bottom 0x7f71a0dfe000; size: 0x008ba2639000 (599724888064)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 73 ahci-test /x86_64/ahci/cdrom/pio/multi
==7462==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 74 ahci-test /x86_64/ahci/cdrom/pio/bcl
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/hd-geo-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="hd-geo-test" 
PASS 1 hd-geo-test /x86_64/hd-geo/ide/none
==7476==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 2 hd-geo-test /x86_64/hd-geo/ide/drive/cd_0
==7482==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 3 hd-geo-test /x86_64/hd-geo/ide/drive/mbr/blank
==7488==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 4 hd-geo-test /x86_64/hd-geo/ide/drive/mbr/lba
==7494==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 5 hd-geo-test /x86_64/hd-geo/ide/drive/mbr/chs
==7500==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 6 hd-geo-test /x86_64/hd-geo/ide/device/mbr/blank
PASS 1 test-bufferiszero /cutils/bufferiszero
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-uuid -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-uuid" 
==7506==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 1 test-uuid /uuid/is_null
PASS 2 test-uuid /uuid/generate
PASS 3 test-uuid /uuid/parse
---
PASS 1 test-qapi-util /qapi/util/qapi_enum_parse
PASS 2 test-qapi-util /qapi/util/parse_qapi_name
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  tests/test-qgraph -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-qgraph" 
==7518==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 8 hd-geo-test /x86_64/hd-geo/ide/device/mbr/chs
PASS 1 test-qgraph /qgraph/init_nop
PASS 2 test-qgraph /qgraph/test_machine
---
PASS 21 test-qgraph /qgraph/test_two_test_same_interface
PASS 22 test-qgraph /qgraph/test_test_in_path
PASS 23 test-qgraph /qgraph/test_double_edge
==7531==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 9 hd-geo-test /x86_64/hd-geo/ide/device/user/chs
==7536==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 10 hd-geo-test /x86_64/hd-geo/ide/device/user/chst
==7542==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7546==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7550==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7554==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7558==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7562==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7566==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7570==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7573==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 11 hd-geo-test /x86_64/hd-geo/override/ide
==7580==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7584==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7588==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7592==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7596==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7600==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7604==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7608==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7611==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 12 hd-geo-test /x86_64/hd-geo/override/scsi
==7618==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7622==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7626==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7630==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7634==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7638==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7642==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7646==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7649==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 13 hd-geo-test /x86_64/hd-geo/override/scsi_2_controllers
==7656==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7660==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7664==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7668==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7671==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 14 hd-geo-test /x86_64/hd-geo/override/virtio_blk
==7678==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7682==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7685==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 15 hd-geo-test /x86_64/hd-geo/override/zero_chs
==7692==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7696==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7700==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7704==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7707==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 16 hd-geo-test /x86_64/hd-geo/override/scsi_hot_unplug
==7714==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7718==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7722==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7726==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
==7729==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 17 hd-geo-test /x86_64/hd-geo/override/virtio_hot_unplug
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/boot-order-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="boot-order-test" 
PASS 1 boot-order-test /x86_64/boot-order/pc
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7798==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP'
Using expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7804==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP'
Using expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7810==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP.bridge'
Looking for expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7816==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP.ipmikcs'
Looking for expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7822==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP.cphp'
Looking for expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7829==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP.memhp'
Looking for expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7835==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP.numamem'
Looking for expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7841==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP.dimmpxm'
Looking for expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7850==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/pc/FACP.acpihmat'
Looking for expected file 'tests/data/acpi/pc/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7857==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.bridge'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7863==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.mmio64'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7869==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.ipmibt'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7875==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.cphp'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7882==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.memhp'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7888==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.numamem'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7894==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.dimmpxm'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
Could not access KVM kernel module: No such file or directory
qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or directory
qemu-system-x86_64: falling back to tcg
==7903==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!

Looking for expected file 'tests/data/acpi/q35/FACP.acpihmat'
Looking for expected file 'tests/data/acpi/q35/FACP'
---
PASS 1 i440fx-test /x86_64/i440fx/defaults
PASS 2 i440fx-test /x86_64/i440fx/pam
PASS 3 i440fx-test /x86_64/i440fx/firmware/bios
==7995==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 4 i440fx-test /x86_64/i440fx/firmware/pflash
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/fw_cfg-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="fw_cfg-test" 
PASS 1 fw_cfg-test /x86_64/fw_cfg/signature
---
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/drive_del-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="drive_del-test" 
PASS 1 drive_del-test /x86_64/drive_del/without-dev
PASS 2 drive_del-test /x86_64/drive_del/after_failed_device_add
==8088==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!
PASS 3 drive_del-test /x86_64/blockdev/drive_del_device_del
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/wdt_ib700-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="wdt_ib700-test" 
PASS 1 wdt_ib700-test /x86_64/wdt_ib700/pause
---
dbus-daemon[8258]: Could not get password database information for UID of current process: User "???" unknown or no memory to allocate password entry

**
ERROR:/tmp/qemu-test/src/tests/qtest/dbus-vmstate-test.c:114:get_connection: assertion failed (err == NULL): The connection is closed (g-io-error-quark, 18)
cleaning up pid 8258
ERROR - Bail out! ERROR:/tmp/qemu-test/src/tests/qtest/dbus-vmstate-test.c:114:get_connection: assertion failed (err == NULL): The connection is closed (g-io-error-quark, 18)
make: *** [/tmp/qemu-test/src/tests/Makefile.include:632: check-qtest-x86_64] Error 1
make: *** Waiting for unfinished jobs....
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in <module>
---
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=097891ecdb724e63ab99fd5fa3b8b179', '-u', '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=x86_64-softmmu', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-logmabgq/src/docker-src.2020-03-12-03.54.30.26791:/var/tmp/qemu:z,ro', 'qemu:fedora', '/var/tmp/qemu/run', 'test-debug']' returned non-zero exit status 2.
filter=--filter=label=com.qemu.instance.uuid=097891ecdb724e63ab99fd5fa3b8b179
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-logmabgq/src'
make: *** [docker-run-test-debug@fedora] Error 2

real    27m51.719s
user    0m9.162s


The full log is available at
http://patchew.org/logs/1583999395-9131-2-git-send-email-teawater@gmail.com/testing.asan/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-devel@redhat.com

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER
  2020-03-12  7:49 ` [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER Hui Zhu
  2020-03-12  8:22   ` no-reply
@ 2020-03-12  8:25   ` Michael S. Tsirkin
  2020-03-17 10:13     ` teawater
  1 sibling, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-12  8:25 UTC (permalink / raw)
  To: Hui Zhu
  Cc: jasowang, akpm, pagupta, mojha, david, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu

On Thu, Mar 12, 2020 at 03:49:55PM +0800, Hui Zhu wrote:
> If the guest kernel has many fragmentation pages, use virtio_balloon
> will split THP of QEMU when it calls MADV_DONTNEED madvise to release
> the balloon pages.
> Set option thp-order to on will open flags VIRTIO_BALLOON_F_THP_ORDER.
> It will set balloon size to THP size to handle the THP split issue.
> 
> Signed-off-by: Hui Zhu <teawaterz@linux.alibaba.com>

What's wrong with just using the PartiallyBalloonedPage machinery
instead? That would make it guest transparent.

> ---
>  hw/virtio/virtio-balloon.c                      | 67 ++++++++++++++++---------
>  include/standard-headers/linux/virtio_balloon.h |  4 ++
>  2 files changed, 47 insertions(+), 24 deletions(-)
> 
> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> index a4729f7..cfe86b0 100644
> --- a/hw/virtio/virtio-balloon.c
> +++ b/hw/virtio/virtio-balloon.c
> @@ -340,37 +340,49 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>          while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
>              unsigned int p = virtio_ldl_p(vdev, &pfn);
>              hwaddr pa;
> +            size_t handle_size = BALLOON_PAGE_SIZE;
>  
>              pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
>              offset += 4;
>  
> -            section = memory_region_find(get_system_memory(), pa,
> -                                         BALLOON_PAGE_SIZE);
> -            if (!section.mr) {
> -                trace_virtio_balloon_bad_addr(pa);
> -                continue;
> -            }
> -            if (!memory_region_is_ram(section.mr) ||
> -                memory_region_is_rom(section.mr) ||
> -                memory_region_is_romd(section.mr)) {
> -                trace_virtio_balloon_bad_addr(pa);
> -                memory_region_unref(section.mr);
> -                continue;
> -            }
> +            if (virtio_has_feature(s->host_features,
> +                                   VIRTIO_BALLOON_F_THP_ORDER))
> +                handle_size = BALLOON_PAGE_SIZE << VIRTIO_BALLOON_THP_ORDER;
> +
> +            while (handle_size > 0) {
> +                section = memory_region_find(get_system_memory(), pa,
> +                                             BALLOON_PAGE_SIZE);
> +                if (!section.mr) {
> +                    trace_virtio_balloon_bad_addr(pa);
> +                    continue;
> +                }
> +                if (!memory_region_is_ram(section.mr) ||
> +                    memory_region_is_rom(section.mr) ||
> +                    memory_region_is_romd(section.mr)) {
> +                    trace_virtio_balloon_bad_addr(pa);
> +                    memory_region_unref(section.mr);
> +                    continue;
> +                }
>  
> -            trace_virtio_balloon_handle_output(memory_region_name(section.mr),
> -                                               pa);
> -            if (!qemu_balloon_is_inhibited()) {
> -                if (vq == s->ivq) {
> -                    balloon_inflate_page(s, section.mr,
> -                                         section.offset_within_region, &pbp);
> -                } else if (vq == s->dvq) {
> -                    balloon_deflate_page(s, section.mr, section.offset_within_region);
> -                } else {
> -                    g_assert_not_reached();
> +                trace_virtio_balloon_handle_output(memory_region_name(section.mr),
> +                                                   pa);
> +                if (!qemu_balloon_is_inhibited()) {
> +                    if (vq == s->ivq) {
> +                        balloon_inflate_page(s, section.mr,
> +                                             section.offset_within_region,
> +                                             &pbp);
> +                    } else if (vq == s->dvq) {
> +                        balloon_deflate_page(s, section.mr,
> +                                             section.offset_within_region);
> +                    } else {
> +                        g_assert_not_reached();
> +                    }
>                  }
> +                memory_region_unref(section.mr);
> +
> +                pa += BALLOON_PAGE_SIZE;
> +                handle_size -= BALLOON_PAGE_SIZE;
>              }
> -            memory_region_unref(section.mr);
>          }
>  
>          virtqueue_push(vq, elem, offset);
> @@ -693,6 +705,8 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
>  
>      memcpy(&config, config_data, virtio_balloon_config_size(dev));
>      dev->actual = le32_to_cpu(config.actual);
> +    if (virtio_has_feature(vdev->host_features, VIRTIO_BALLOON_F_THP_ORDER))
> +        dev->actual <<= VIRTIO_BALLOON_THP_ORDER;
>      if (dev->actual != oldactual) {
>          qapi_event_send_balloon_change(vm_ram_size -
>                          ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
> @@ -728,6 +742,9 @@ static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
>      }
>      if (target) {
>          dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
> +        if (virtio_has_feature(dev->host_features,
> +                               VIRTIO_BALLOON_F_THP_ORDER))
> +            dev->num_pages >>= VIRTIO_BALLOON_THP_ORDER;
>          virtio_notify_config(vdev);
>      }
>      trace_virtio_balloon_to_target(target, dev->num_pages);
> @@ -916,6 +933,8 @@ static Property virtio_balloon_properties[] = {
>                      VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
>      DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
>                      VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
> +    DEFINE_PROP_BIT("thp-order", VirtIOBalloon, host_features,
> +                    VIRTIO_BALLOON_F_THP_ORDER, false),
>      /* QEMU 4.0 accidentally changed the config size even when free-page-hint
>       * is disabled, resulting in QEMU 3.1 migration incompatibility.  This
>       * property retains this quirk for QEMU 4.1 machine types.
> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
> index 9375ca2..f54d613 100644
> --- a/include/standard-headers/linux/virtio_balloon.h
> +++ b/include/standard-headers/linux/virtio_balloon.h
> @@ -36,10 +36,14 @@
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_THP_ORDER	5 /* Set balloon page order to thp order */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> +/* The order of the balloon page */
> +#define VIRTIO_BALLOON_THP_ORDER 9
> +
>  #define VIRTIO_BALLOON_CMD_ID_STOP	0
>  #define VIRTIO_BALLOON_CMD_ID_DONE	1
>  struct virtio_balloon_config {
> -- 
> 2.7.4


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-12  7:49 [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue Hui Zhu
  2020-03-12  7:49 ` [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER Hui Zhu
  2020-03-12  8:18 ` [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue Michael S. Tsirkin
@ 2020-03-12  8:37 ` David Hildenbrand
  2020-03-12  8:47   ` Michael S. Tsirkin
  2 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-03-12  8:37 UTC (permalink / raw)
  To: Hui Zhu, mst, jasowang, akpm, pagupta, mojha, namit,
	virtualization, linux-kernel, qemu-devel
  Cc: Hui Zhu, Alexander Duyck

On 12.03.20 08:49, Hui Zhu wrote:
> If the guest kernel has many fragmentation pages, use virtio_balloon
> will split THP of QEMU when it calls MADV_DONTNEED madvise to release
> the balloon pages.
> This is an example in a VM with 1G memory 1CPU:
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:         0 kB
> 
> usemem --punch-holes -s -1 800m &
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    976896 kB
> 
> (qemu) device_add virtio-balloon-pci,id=balloon1
> (qemu) info balloon
> balloon: actual=1024
> (qemu) balloon 624
> (qemu) info balloon
> balloon: actual=624
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    153600 kB
> 
> THP number decreased more than 800M.
> The reason is usemem with punch-holes option will free every other page
> after allocation.  Then 400M free memory inside the guest kernel is
> fragmentation pages.
> The guest kernel will use them to inflate the balloon.  When these
> fragmentation pages are freed, THP will be split.
> 
> This commit tries to handle this with add a new flag
> VIRTIO_BALLOON_F_THP_ORDER.
> When this flag is set, the balloon page order will be set to the THP order.
> Then THP pages will be freed together in the host.
> This is an example in a VM with 1G memory 1CPU:
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:         0 kB
> 
> usemem --punch-holes -s -1 800m &
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    976896 kB
> 
> (qemu) device_add virtio-balloon-pci,id=balloon1,thp-order=on
> (qemu) info balloon
> balloon: actual=1024
> (qemu) balloon 624
> (qemu) info balloon
> balloon: actual=624
> 
> cat /proc/meminfo | grep AnonHugePages:
> AnonHugePages:    583680 kB
> 
> The THP number decreases 384M.  This shows that VIRTIO_BALLOON_F_THP_ORDER
> can help handle the THP split issue.


Multiple things:

I recently had a similar discussion with Alex [1] and I think this needs
more thought.

My thoughts:

1. You most certainly want to fall back to allocating pages in a smaller
granularity once you run out of bigger allocations. Sacrifice
performance for memory inflation, which has always been the case and
which is what people expect to happen. (e.g., to shrink the page cache
properly)

2. You are essentially stealing THPs in the guest. So the fastest
mapping (THP in guest and host) is gone. The guest won't be able to make
use of THP where it previously was able to. I can imagine this implies a
performance degradation for some workloads. This needs a proper
performance evaluation.

3. The pages you allocate are not migratable, e.g., for memory
offlining or alloc_contig_range() users like gigantic pages or soon
virtio-mem. I strongly dislike that. This is IMHO a step backwards. We
want to be able to migrate or even split-up and migrate such pages.

Assume the guest could make good use of a THP somewhere. Who says it
wouldn't be better to sacrifice a huge balloon page to be able to use
THP both in the guest and the host for that mapping? I am not convinced
that stealing possible THPs in the guest and not being able to split them
up is really what we want performance-wise.


4. I think we also want a better mechanism to directly inflate/deflate
higher-order pages and not reuse the 4k inflate/deflate queues.

5. I think we don't want to hard code such THP values but let the host
tell us the THP size instead, which can easily differ between guest and
host.

Also, I do wonder if balloon compaction in the guest will already result
in more THP getting used again long term. Assume the guest compacts
balloon pages into a single THP again. This will result in a bunch of
DONTNEED/WILLNEED in the hypervisor due to inflation/deflation. I wonder
if the WILLNEED on the sub-pages of a candidate THP in the host will
allow a THP to be used in the host again.


[1]
https://lore.kernel.org/linux-mm/939de9de-d82a-aed2-6a51-57a55d81cbff@redhat.com/

> 
> Signed-off-by: Hui Zhu <teawaterz@linux.alibaba.com>
> ---
>  drivers/virtio/virtio_balloon.c     | 57 ++++++++++++++++++++++++++-----------
>  include/linux/balloon_compaction.h  | 14 ++++++---
>  include/uapi/linux/virtio_balloon.h |  4 +++
>  3 files changed, 54 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 7bfe365..1e1dc76 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -175,18 +175,31 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  	unsigned num_pfns;
>  	struct page *page;
>  	LIST_HEAD(pages);
> +	int page_order = 0;
>  
>  	/* We can only do one array worth at a time. */
>  	num = min(num, ARRAY_SIZE(vb->pfns));
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_THP_ORDER))
> +		page_order = VIRTIO_BALLOON_THP_ORDER;
> +
>  	for (num_pfns = 0; num_pfns < num;
>  	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
> -		struct page *page = balloon_page_alloc();
> +		struct page *page;
> +
> +		if (page_order)
> +			page = alloc_pages(__GFP_HIGHMEM |
> +					   __GFP_KSWAPD_RECLAIM |
> +					   __GFP_RETRY_MAYFAIL |
> +					   __GFP_NOWARN | __GFP_NOMEMALLOC,
> +					   page_order);
> +		else
> +			page = balloon_page_alloc();
>  
>  		if (!page) {
>  			dev_info_ratelimited(&vb->vdev->dev,
> -					     "Out of puff! Can't get %u pages\n",
> -					     VIRTIO_BALLOON_PAGES_PER_PAGE);
> +				"Out of puff! Can't get %u pages\n",
> +				VIRTIO_BALLOON_PAGES_PER_PAGE << page_order);
>  			/* Sleep for at least 1/5 of a second before retry. */
>  			msleep(200);
>  			break;
> @@ -206,7 +219,7 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
>  		if (!virtio_has_feature(vb->vdev,
>  					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> -			adjust_managed_page_count(page, -1);
> +			adjust_managed_page_count(page, -(1 << page_order));
>  		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
>  	}
>  
> @@ -223,13 +236,20 @@ static void release_pages_balloon(struct virtio_balloon *vb,
>  				 struct list_head *pages)
>  {
>  	struct page *page, *next;
> +	int page_order = 0;
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_THP_ORDER))
> +		page_order = VIRTIO_BALLOON_THP_ORDER;
>  
>  	list_for_each_entry_safe(page, next, pages, lru) {
>  		if (!virtio_has_feature(vb->vdev,
>  					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> -			adjust_managed_page_count(page, 1);
> +			adjust_managed_page_count(page, 1 << page_order);
>  		list_del(&page->lru);
> -		put_page(page); /* balloon reference */
> +		if (page_order)
> +			__free_pages(page, page_order);
> +		else
> +			put_page(page); /* balloon reference */
>  	}
>  }
>  
> @@ -893,19 +913,21 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		goto out_free_vb;
>  
>  #ifdef CONFIG_BALLOON_COMPACTION
> -	balloon_mnt = kern_mount(&balloon_fs);
> -	if (IS_ERR(balloon_mnt)) {
> -		err = PTR_ERR(balloon_mnt);
> -		goto out_del_vqs;
> -	}
> +	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_THP_ORDER)) {
> +		balloon_mnt = kern_mount(&balloon_fs);
> +		if (IS_ERR(balloon_mnt)) {
> +			err = PTR_ERR(balloon_mnt);
> +			goto out_del_vqs;
> +		}
>  
> -	vb->vb_dev_info.migratepage = virtballoon_migratepage;
> -	vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
> -	if (IS_ERR(vb->vb_dev_info.inode)) {
> -		err = PTR_ERR(vb->vb_dev_info.inode);
> -		goto out_kern_unmount;
> +		vb->vb_dev_info.migratepage = virtballoon_migratepage;
> +		vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
> +		if (IS_ERR(vb->vb_dev_info.inode)) {
> +			err = PTR_ERR(vb->vb_dev_info.inode);
> +			goto out_kern_unmount;
> +		}
> +		vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
>  	}
> -	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
>  #endif
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>  		/*
> @@ -1058,6 +1080,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>  	VIRTIO_BALLOON_F_PAGE_POISON,
> +	VIRTIO_BALLOON_F_THP_ORDER,
>  };
>  
>  static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
> index 338aa27..4c9164e 100644
> --- a/include/linux/balloon_compaction.h
> +++ b/include/linux/balloon_compaction.h
> @@ -100,8 +100,12 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
>  				       struct page *page)
>  {
>  	__SetPageOffline(page);
> -	__SetPageMovable(page, balloon->inode->i_mapping);
> -	set_page_private(page, (unsigned long)balloon);
> +	if (balloon->inode) {
> +		__SetPageMovable(page, balloon->inode->i_mapping);
> +		set_page_private(page, (unsigned long)balloon);
> +	} else {
> +		set_page_private(page, 0);
> +	}
>  	list_add(&page->lru, &balloon->pages);
>  }
>  
> @@ -116,8 +120,10 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
>  static inline void balloon_page_delete(struct page *page)
>  {
>  	__ClearPageOffline(page);
> -	__ClearPageMovable(page);
> -	set_page_private(page, 0);
> +	if (page_private(page)) {
> +		__ClearPageMovable(page);
> +		set_page_private(page, 0);
> +	}
>  	/*
>  	 * No touch page.lru field once @page has been isolated
>  	 * because VM is using the field.
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index a1966cd7..a2998a9 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,10 +36,14 @@
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_THP_ORDER	5 /* Balloon page order to thp order */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> +/* The order of the balloon page */
> +#define VIRTIO_BALLOON_THP_ORDER 9
> +
>  #define VIRTIO_BALLOON_CMD_ID_STOP	0
>  #define VIRTIO_BALLOON_CMD_ID_DONE	1
>  struct virtio_balloon_config {
> 


-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-12  8:37 ` David Hildenbrand
@ 2020-03-12  8:47   ` Michael S. Tsirkin
  2020-03-12  8:51     ` David Hildenbrand
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-12  8:47 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> 2. You are essentially stealing THPs in the guest. So the fastest
> mapping (THP in guest and host) is gone. The guest won't be able to make
> use of THP where it previously was able to. I can imagine this implies a
> performance degradation for some workloads. This needs a proper
> performance evaluation.

I think the problem is more with the alloc_pages API.
That gives you exactly the given order, and if there's
a larger chunk available, it will split it up.

But for balloon - and, I suspect, lots of other users -
we do not want to stress the system, but if a large
chunk is available anyway, then we could handle
that more optimally by getting it all in one go.


So if we want to address this, IMHO this calls for a new API.
Along the lines of

	struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
					unsigned int max_order, unsigned int *order)

the idea would then be to return a number of pages in the given
range.

What do you think? Want to try implementing that?

-- 
MST


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-12  8:47   ` Michael S. Tsirkin
@ 2020-03-12  8:51     ` David Hildenbrand
  2020-03-26  7:10       ` Michael S. Tsirkin
  2020-03-26  7:20       ` Michael S. Tsirkin
  0 siblings, 2 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-03-12  8:51 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On 12.03.20 09:47, Michael S. Tsirkin wrote:
> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>> 2. You are essentially stealing THPs in the guest. So the fastest
>> mapping (THP in guest and host) is gone. The guest won't be able to make
>> use of THP where it previously was able to. I can imagine this implies a
>> performance degradation for some workloads. This needs a proper
>> performance evaluation.
> 
> I think the problem is more with the alloc_pages API.
> That gives you exactly the given order, and if there's
> a larger chunk available, it will split it up.
> 
> But for balloon - I suspect lots of other users,
> we do not want to stress the system but if a large
> chunk is available anyway, then we could handle
> that more optimally by getting it all in one go.
> 
> 
> So if we want to address this, IMHO this calls for a new API.
> Along the lines of
> 
> 	struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> 					unsigned int max_order, unsigned int *order)
> 
> the idea would then be to return at a number of pages in the given
> range.
> 
> What do you think? Want to try implementing that?

You can just start with the highest order and decrement the order until
your allocation succeeds using alloc_pages(), which would be enough for
a first version. At least I don't see the immediate need for a new
kernel API.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER
  2020-03-12  8:25   ` Michael S. Tsirkin
@ 2020-03-17 10:13     ` teawater
  2020-03-26  7:07       ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: teawater @ 2020-03-17 10:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, david, namit,
	virtualization, linux-kernel, qemu-devel



> 2020年3月12日 16:25,Michael S. Tsirkin <mst@redhat.com> 写道:
> 
> On Thu, Mar 12, 2020 at 03:49:55PM +0800, Hui Zhu wrote:
>> If the guest kernel has many fragmentation pages, use virtio_balloon
>> will split THP of QEMU when it calls MADV_DONTNEED madvise to release
>> the balloon pages.
>> Set option thp-order to on will open flags VIRTIO_BALLOON_F_THP_ORDER.
>> It will set balloon size to THP size to handle the THP split issue.
>> 
>> Signed-off-by: Hui Zhu <teawaterz@linux.alibaba.com>
> 
> What's wrong with just using the PartiallyBalloonedPage machinery
> instead? That would make it guest transparent.

In balloon_inflate_page:
    rb_page_size = qemu_ram_pagesize(rb);

    if (rb_page_size == BALLOON_PAGE_SIZE) {
        /* Easy case */

It seems that PartiallyBalloonedPage is only used when rb_page_size is greater than BALLOON_PAGE_SIZE.
Do you mean I should modify the working mechanism of the balloon_inflate_page function?

Thanks,
Hui

> 
>> ---
>> hw/virtio/virtio-balloon.c                      | 67 ++++++++++++++++---------
>> include/standard-headers/linux/virtio_balloon.h |  4 ++
>> 2 files changed, 47 insertions(+), 24 deletions(-)
>> 
>> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
>> index a4729f7..cfe86b0 100644
>> --- a/hw/virtio/virtio-balloon.c
>> +++ b/hw/virtio/virtio-balloon.c
>> @@ -340,37 +340,49 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>>         while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
>>             unsigned int p = virtio_ldl_p(vdev, &pfn);
>>             hwaddr pa;
>> +            size_t handle_size = BALLOON_PAGE_SIZE;
>> 
>>             pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
>>             offset += 4;
>> 
>> -            section = memory_region_find(get_system_memory(), pa,
>> -                                         BALLOON_PAGE_SIZE);
>> -            if (!section.mr) {
>> -                trace_virtio_balloon_bad_addr(pa);
>> -                continue;
>> -            }
>> -            if (!memory_region_is_ram(section.mr) ||
>> -                memory_region_is_rom(section.mr) ||
>> -                memory_region_is_romd(section.mr)) {
>> -                trace_virtio_balloon_bad_addr(pa);
>> -                memory_region_unref(section.mr);
>> -                continue;
>> -            }
>> +            if (virtio_has_feature(s->host_features,
>> +                                   VIRTIO_BALLOON_F_THP_ORDER))
>> +                handle_size = BALLOON_PAGE_SIZE << VIRTIO_BALLOON_THP_ORDER;
>> +
>> +            while (handle_size > 0) {
>> +                section = memory_region_find(get_system_memory(), pa,
>> +                                             BALLOON_PAGE_SIZE);
>> +                if (!section.mr) {
>> +                    trace_virtio_balloon_bad_addr(pa);
>> +                    continue;
>> +                }
>> +                if (!memory_region_is_ram(section.mr) ||
>> +                    memory_region_is_rom(section.mr) ||
>> +                    memory_region_is_romd(section.mr)) {
>> +                    trace_virtio_balloon_bad_addr(pa);
>> +                    memory_region_unref(section.mr);
>> +                    continue;
>> +                }
>> 
>> -            trace_virtio_balloon_handle_output(memory_region_name(section.mr),
>> -                                               pa);
>> -            if (!qemu_balloon_is_inhibited()) {
>> -                if (vq == s->ivq) {
>> -                    balloon_inflate_page(s, section.mr,
>> -                                         section.offset_within_region, &pbp);
>> -                } else if (vq == s->dvq) {
>> -                    balloon_deflate_page(s, section.mr, section.offset_within_region);
>> -                } else {
>> -                    g_assert_not_reached();
>> +                trace_virtio_balloon_handle_output(memory_region_name(section.mr),
>> +                                                   pa);
>> +                if (!qemu_balloon_is_inhibited()) {
>> +                    if (vq == s->ivq) {
>> +                        balloon_inflate_page(s, section.mr,
>> +                                             section.offset_within_region,
>> +                                             &pbp);
>> +                    } else if (vq == s->dvq) {
>> +                        balloon_deflate_page(s, section.mr,
>> +                                             section.offset_within_region);
>> +                    } else {
>> +                        g_assert_not_reached();
>> +                    }
>>                 }
>> +                memory_region_unref(section.mr);
>> +
>> +                pa += BALLOON_PAGE_SIZE;
>> +                handle_size -= BALLOON_PAGE_SIZE;
>>             }
>> -            memory_region_unref(section.mr);
>>         }
>> 
>>         virtqueue_push(vq, elem, offset);
>> @@ -693,6 +705,8 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
>> 
>>     memcpy(&config, config_data, virtio_balloon_config_size(dev));
>>     dev->actual = le32_to_cpu(config.actual);
>> +    if (virtio_has_feature(vdev->host_features, VIRTIO_BALLOON_F_THP_ORDER))
>> +        dev->actual <<= VIRTIO_BALLOON_THP_ORDER;
>>     if (dev->actual != oldactual) {
>>         qapi_event_send_balloon_change(vm_ram_size -
>>                         ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
>> @@ -728,6 +742,9 @@ static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
>>     }
>>     if (target) {
>>         dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
>> +        if (virtio_has_feature(dev->host_features,
>> +                               VIRTIO_BALLOON_F_THP_ORDER))
>> +            dev->num_pages >>= VIRTIO_BALLOON_THP_ORDER;
>>         virtio_notify_config(vdev);
>>     }
>>     trace_virtio_balloon_to_target(target, dev->num_pages);
>> @@ -916,6 +933,8 @@ static Property virtio_balloon_properties[] = {
>>                     VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
>>     DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
>>                     VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
>> +    DEFINE_PROP_BIT("thp-order", VirtIOBalloon, host_features,
>> +                    VIRTIO_BALLOON_F_THP_ORDER, false),
>>     /* QEMU 4.0 accidentally changed the config size even when free-page-hint
>>      * is disabled, resulting in QEMU 3.1 migration incompatibility.  This
>>      * property retains this quirk for QEMU 4.1 machine types.
>> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
>> index 9375ca2..f54d613 100644
>> --- a/include/standard-headers/linux/virtio_balloon.h
>> +++ b/include/standard-headers/linux/virtio_balloon.h
>> @@ -36,10 +36,14 @@
>> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>> #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
>> +#define VIRTIO_BALLOON_F_THP_ORDER	5 /* Set balloon page order to thp order */
>> 
>> /* Size of a PFN in the balloon interface. */
>> #define VIRTIO_BALLOON_PFN_SHIFT 12
>> 
>> +/* The order of the balloon page */
>> +#define VIRTIO_BALLOON_THP_ORDER 9
>> +
>> #define VIRTIO_BALLOON_CMD_ID_STOP	0
>> #define VIRTIO_BALLOON_CMD_ID_DONE	1
>> struct virtio_balloon_config {
>> -- 
>> 2.7.4


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER
  2020-03-17 10:13     ` teawater
@ 2020-03-26  7:07       ` Michael S. Tsirkin
  0 siblings, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-26  7:07 UTC (permalink / raw)
  To: teawater
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, david, namit,
	virtualization, linux-kernel, qemu-devel

On Tue, Mar 17, 2020 at 06:13:32PM +0800, teawater wrote:
> 
> 
> > 2020年3月12日 16:25,Michael S. Tsirkin <mst@redhat.com> 写道:
> > 
> > On Thu, Mar 12, 2020 at 03:49:55PM +0800, Hui Zhu wrote:
> >> If the guest kernel has many fragmentation pages, use virtio_balloon
> >> will split THP of QEMU when it calls MADV_DONTNEED madvise to release
> >> the balloon pages.
> >> Set option thp-order to on will open flags VIRTIO_BALLOON_F_THP_ORDER.
> >> It will set balloon size to THP size to handle the THP split issue.
> >> 
> >> Signed-off-by: Hui Zhu <teawaterz@linux.alibaba.com>
> > 
> > What's wrong with just using the PartiallyBalloonedPage machinery
> > instead? That would make it guest transparent.
> 
> In balloon_inflate_page:
>     rb_page_size = qemu_ram_pagesize(rb);
> 
>     if (rb_page_size == BALLOON_PAGE_SIZE) {
>         /* Easy case */
> 
> It seems that PartiallyBalloonedPage is only used when rb_page_size is greater than BALLOON_PAGE_SIZE.
> Do you mean I should modify the working mechanism of balloon_inflate_page function?
> 
> Thanks,
> Hui

Yes, we can tweak it to unconditionally combine pages to
a huge page.


> > 
> >> ---
> >> hw/virtio/virtio-balloon.c                      | 67 ++++++++++++++++---------
> >> include/standard-headers/linux/virtio_balloon.h |  4 ++
> >> 2 files changed, 47 insertions(+), 24 deletions(-)
> >> 
> >> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> >> index a4729f7..cfe86b0 100644
> >> --- a/hw/virtio/virtio-balloon.c
> >> +++ b/hw/virtio/virtio-balloon.c
> >> @@ -340,37 +340,49 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
> >>         while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
> >>             unsigned int p = virtio_ldl_p(vdev, &pfn);
> >>             hwaddr pa;
> >> +            size_t handle_size = BALLOON_PAGE_SIZE;
> >> 
> >>             pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
> >>             offset += 4;
> >> 
> >> -            section = memory_region_find(get_system_memory(), pa,
> >> -                                         BALLOON_PAGE_SIZE);
> >> -            if (!section.mr) {
> >> -                trace_virtio_balloon_bad_addr(pa);
> >> -                continue;
> >> -            }
> >> -            if (!memory_region_is_ram(section.mr) ||
> >> -                memory_region_is_rom(section.mr) ||
> >> -                memory_region_is_romd(section.mr)) {
> >> -                trace_virtio_balloon_bad_addr(pa);
> >> -                memory_region_unref(section.mr);
> >> -                continue;
> >> -            }
> >> +            if (virtio_has_feature(s->host_features,
> >> +                                   VIRTIO_BALLOON_F_THP_ORDER))
> >> +                handle_size = BALLOON_PAGE_SIZE << VIRTIO_BALLOON_THP_ORDER;
> >> +
> >> +            while (handle_size > 0) {
> >> +                section = memory_region_find(get_system_memory(), pa,
> >> +                                             BALLOON_PAGE_SIZE);
> >> +                if (!section.mr) {
> >> +                    trace_virtio_balloon_bad_addr(pa);
> >> +                    continue;
> >> +                }
> >> +                if (!memory_region_is_ram(section.mr) ||
> >> +                    memory_region_is_rom(section.mr) ||
> >> +                    memory_region_is_romd(section.mr)) {
> >> +                    trace_virtio_balloon_bad_addr(pa);
> >> +                    memory_region_unref(section.mr);
> >> +                    continue;
> >> +                }
> >> 
> >> -            trace_virtio_balloon_handle_output(memory_region_name(section.mr),
> >> -                                               pa);
> >> -            if (!qemu_balloon_is_inhibited()) {
> >> -                if (vq == s->ivq) {
> >> -                    balloon_inflate_page(s, section.mr,
> >> -                                         section.offset_within_region, &pbp);
> >> -                } else if (vq == s->dvq) {
> >> -                    balloon_deflate_page(s, section.mr, section.offset_within_region);
> >> -                } else {
> >> -                    g_assert_not_reached();
> >> +                trace_virtio_balloon_handle_output(memory_region_name(section.mr),
> >> +                                                   pa);
> >> +                if (!qemu_balloon_is_inhibited()) {
> >> +                    if (vq == s->ivq) {
> >> +                        balloon_inflate_page(s, section.mr,
> >> +                                             section.offset_within_region,
> >> +                                             &pbp);
> >> +                    } else if (vq == s->dvq) {
> >> +                        balloon_deflate_page(s, section.mr,
> >> +                                             section.offset_within_region);
> >> +                    } else {
> >> +                        g_assert_not_reached();
> >> +                    }
> >>                 }
> >> +                memory_region_unref(section.mr);
> >> +
> >> +                pa += BALLOON_PAGE_SIZE;
> >> +                handle_size -= BALLOON_PAGE_SIZE;
> >>             }
> >> -            memory_region_unref(section.mr);
> >>         }
> >> 
> >>         virtqueue_push(vq, elem, offset);
> >> @@ -693,6 +705,8 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
> >> 
> >>     memcpy(&config, config_data, virtio_balloon_config_size(dev));
> >>     dev->actual = le32_to_cpu(config.actual);
> >> +    if (virtio_has_feature(vdev->host_features, VIRTIO_BALLOON_F_THP_ORDER))
> >> +        dev->actual <<= VIRTIO_BALLOON_THP_ORDER;
> >>     if (dev->actual != oldactual) {
> >>         qapi_event_send_balloon_change(vm_ram_size -
> >>                         ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
> >> @@ -728,6 +742,9 @@ static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
> >>     }
> >>     if (target) {
> >>         dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
> >> +        if (virtio_has_feature(dev->host_features,
> >> +                               VIRTIO_BALLOON_F_THP_ORDER))
> >> +            dev->num_pages >>= VIRTIO_BALLOON_THP_ORDER;
> >>         virtio_notify_config(vdev);
> >>     }
> >>     trace_virtio_balloon_to_target(target, dev->num_pages);
> >> @@ -916,6 +933,8 @@ static Property virtio_balloon_properties[] = {
> >>                     VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
> >>     DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
> >>                     VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
> >> +    DEFINE_PROP_BIT("thp-order", VirtIOBalloon, host_features,
> >> +                    VIRTIO_BALLOON_F_THP_ORDER, false),
> >>     /* QEMU 4.0 accidentally changed the config size even when free-page-hint
> >>      * is disabled, resulting in QEMU 3.1 migration incompatibility.  This
> >>      * property retains this quirk for QEMU 4.1 machine types.
> >> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
> >> index 9375ca2..f54d613 100644
> >> --- a/include/standard-headers/linux/virtio_balloon.h
> >> +++ b/include/standard-headers/linux/virtio_balloon.h
> >> @@ -36,10 +36,14 @@
> >> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> >> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
> >> #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> >> +#define VIRTIO_BALLOON_F_THP_ORDER	5 /* Set balloon page order to thp order */
> >> 
> >> /* Size of a PFN in the balloon interface. */
> >> #define VIRTIO_BALLOON_PFN_SHIFT 12
> >> 
> >> +/* The order of the balloon page */
> >> +#define VIRTIO_BALLOON_THP_ORDER 9
> >> +
> >> #define VIRTIO_BALLOON_CMD_ID_STOP	0
> >> #define VIRTIO_BALLOON_CMD_ID_DONE	1
> >> struct virtio_balloon_config {
> >> -- 
> >> 2.7.4


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-12  8:51     ` David Hildenbrand
@ 2020-03-26  7:10       ` Michael S. Tsirkin
  2020-03-26  7:20       ` Michael S. Tsirkin
  1 sibling, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-26  7:10 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
> On 12.03.20 09:47, Michael S. Tsirkin wrote:
> > On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> >> 2. You are essentially stealing THPs in the guest. So the fastest
> >> mapping (THP in guest and host) is gone. The guest won't be able to make
> >> use of THP where it previously was able to. I can imagine this implies a
> >> performance degradation for some workloads. This needs a proper
> >> performance evaluation.
> > 
> > I think the problem is more with the alloc_pages API.
> > That gives you exactly the given order, and if there's
> > a larger chunk available, it will split it up.
> > 
> > But for balloon - I suspect lots of other users,
> > we do not want to stress the system but if a large
> > chunk is available anyway, then we could handle
> > that more optimally by getting it all in one go.
> > 
> > 
> > So if we want to address this, IMHO this calls for a new API.
> > Along the lines of
> > 
> > 	struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> > 					unsigned int max_order, unsigned int *order)
> > 
> > the idea would then be to return at a number of pages in the given
> > range.
> > 
> > What do you think? Want to try implementing that?
> 
> You can just start with the highest order and decrement the order until
> your allocation succeeds using alloc_pages(), which would be enough for
> a first version. At least I don't see the immediate need for a new
> kernel API.

Well there's still a chance of splitting a big page if one
becomes available meanwhile. But OK.

> -- 
> Thanks,
> 
> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-12  8:51     ` David Hildenbrand
  2020-03-26  7:10       ` Michael S. Tsirkin
@ 2020-03-26  7:20       ` Michael S. Tsirkin
  2020-03-26  7:54         ` David Hildenbrand
  1 sibling, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-26  7:20 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
> On 12.03.20 09:47, Michael S. Tsirkin wrote:
> > On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> >> 2. You are essentially stealing THPs in the guest. So the fastest
> >> mapping (THP in guest and host) is gone. The guest won't be able to make
> >> use of THP where it previously was able to. I can imagine this implies a
> >> performance degradation for some workloads. This needs a proper
> >> performance evaluation.
> > 
> > I think the problem is more with the alloc_pages API.
> > That gives you exactly the given order, and if there's
> > a larger chunk available, it will split it up.
> > 
> > But for balloon - I suspect lots of other users,
> > we do not want to stress the system but if a large
> > chunk is available anyway, then we could handle
> > that more optimally by getting it all in one go.
> > 
> > 
> > So if we want to address this, IMHO this calls for a new API.
> > Along the lines of
> > 
> > 	struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> > 					unsigned int max_order, unsigned int *order)
> > 
> > the idea would then be to return at a number of pages in the given
> > range.
> > 
> > What do you think? Want to try implementing that?
> 
> You can just start with the highest order and decrement the order until
> your allocation succeeds using alloc_pages(), which would be enough for
> a first version. At least I don't see the immediate need for a new
> kernel API.

OK I remember now.  The problem is with reclaim. Unless reclaim is
completely disabled, any of these calls can sleep. After it wakes up,
we would like to get the larger order that has become available
meanwhile.


> -- 
> Thanks,
> 
> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-26  7:20       ` Michael S. Tsirkin
@ 2020-03-26  7:54         ` David Hildenbrand
  2020-03-26  9:49           ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-03-26  7:54 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Hildenbrand, Hui Zhu, jasowang, akpm, pagupta, mojha,
	namit, virtualization, linux-kernel, qemu-devel, Hui Zhu,
	Alexander Duyck



> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
> 
> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>> use of THP where it previously was able to. I can imagine this implies a
>>>> performance degradation for some workloads. This needs a proper
>>>> performance evaluation.
>>> 
>>> I think the problem is more with the alloc_pages API.
>>> That gives you exactly the given order, and if there's
>>> a larger chunk available, it will split it up.
>>> 
>>> But for balloon - I suspect lots of other users,
>>> we do not want to stress the system but if a large
>>> chunk is available anyway, then we could handle
>>> that more optimally by getting it all in one go.
>>> 
>>> 
>>> So if we want to address this, IMHO this calls for a new API.
>>> Along the lines of
>>> 
>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>                    unsigned int max_order, unsigned int *order)
>>> 
>>> the idea would then be to return at a number of pages in the given
>>> range.
>>> 
>>> What do you think? Want to try implementing that?
>> 
>> You can just start with the highest order and decrement the order until
>> your allocation succeeds using alloc_pages(), which would be enough for
>> a first version. At least I don't see the immediate need for a new
>> kernel API.
> 
> OK I remember now.  The problem is with reclaim. Unless reclaim is
> completely disabled, any of these calls can sleep. After it wakes up,
> we would like to get the larger order that has become available
> meanwhile.
> 

Yes, but that's a pure optimization IMHO.

So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.

> 
>> -- 
>> Thanks,
>> 
>> David / dhildenb
> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-26  7:54         ` David Hildenbrand
@ 2020-03-26  9:49           ` Michael S. Tsirkin
  2020-03-31 10:35             ` David Hildenbrand
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-26  9:49 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
> 
> 
> > Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
> > 
> > On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
> >>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
> >>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> >>>> 2. You are essentially stealing THPs in the guest. So the fastest
> >>>> mapping (THP in guest and host) is gone. The guest won't be able to make
> >>>> use of THP where it previously was able to. I can imagine this implies a
> >>>> performance degradation for some workloads. This needs a proper
> >>>> performance evaluation.
> >>> 
> >>> I think the problem is more with the alloc_pages API.
> >>> That gives you exactly the given order, and if there's
> >>> a larger chunk available, it will split it up.
> >>> 
> >>> But for balloon - I suspect lots of other users,
> >>> we do not want to stress the system but if a large
> >>> chunk is available anyway, then we could handle
> >>> that more optimally by getting it all in one go.
> >>> 
> >>> 
> >>> So if we want to address this, IMHO this calls for a new API.
> >>> Along the lines of
> >>> 
> >>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> >>>                    unsigned int max_order, unsigned int *order)
> >>> 
> >>> the idea would then be to return at a number of pages in the given
> >>> range.
> >>> 
> >>> What do you think? Want to try implementing that?
> >> 
> >> You can just start with the highest order and decrement the order until
> >> your allocation succeeds using alloc_pages(), which would be enough for
> >> a first version. At least I don't see the immediate need for a new
> >> kernel API.
> > 
> > OK I remember now.  The problem is with reclaim. Unless reclaim is
> > completely disabled, any of these calls can sleep. After it wakes up,
> > we would like to get the larger order that has become available
> > meanwhile.
> > 
> 
> Yes, but that‘s a pure optimization IMHO.
> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
> 

Well, how do you propose to implement the necessary semantics?
I think we are both agreed that alloc_page_range is more or
less what's necessary anyway - so how would you approximate it
on top of existing APIs?


> > 
> >> -- 
> >> Thanks,
> >> 
> >> David / dhildenb
> > 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-26  9:49           ` Michael S. Tsirkin
@ 2020-03-31 10:35             ` David Hildenbrand
  2020-03-31 13:24               ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-03-31 10:35 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On 26.03.20 10:49, Michael S. Tsirkin wrote:
> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>
>>
>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>
>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>> performance degradation for some workloads. This needs a proper
>>>>>> performance evaluation.
>>>>>
>>>>> I think the problem is more with the alloc_pages API.
>>>>> That gives you exactly the given order, and if there's
>>>>> a larger chunk available, it will split it up.
>>>>>
>>>>> But for balloon - I suspect lots of other users,
>>>>> we do not want to stress the system but if a large
>>>>> chunk is available anyway, then we could handle
>>>>> that more optimally by getting it all in one go.
>>>>>
>>>>>
>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>> Along the lines of
>>>>>
>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>                    unsigned int max_order, unsigned int *order)
>>>>>
>>>>> the idea would then be to return at a number of pages in the given
>>>>> range.
>>>>>
>>>>> What do you think? Want to try implementing that?
>>>>
>>>> You can just start with the highest order and decrement the order until
>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>> a first version. At least I don't see the immediate need for a new
>>>> kernel API.
>>>
>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>> completely disabled, any of these calls can sleep. After it wakes up,
>>> we would like to get the larger order that has become available
>>> meanwhile.
>>>
>>
>> Yes, but that‘s a pure optimization IMHO.
>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>
> 
> Well how do you propose implement the necessary semantics?
> I think we are both agreed that alloc_page_range is more or
> less what's necessary anyway - so how would you approximate it
> on top of existing APIs?

Looking at drivers/misc/vmw_balloon.c:vmballoon_inflate(), it first
tries to allocate huge pages using

	alloc_pages(__GFP_HIGHMEM|__GFP_NOWARN| __GFP_NOMEMALLOC, 
                    VMW_BALLOON_2M_ORDER)

And then falls back to 4k allocations (balloon_page_alloc()) in case
allocation fails.

I'm roughly thinking of something like the following, but with an
optimized reporting interface/bigger pfn array so we can report >
1MB at a time. Also, it might make sense to remember the order that
succeeded across some fill_balloon() calls.

Don't even expect it to compile ...



From 4305f989672ccca4be9293e6d4167e929f3e299b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 31 Mar 2020 12:28:07 +0200
Subject: [PATCH RFC] tmp

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/virtio/virtio_balloon.c    | 38 ++++++++++++++++++--------
 include/linux/balloon_compaction.h |  7 ++++-
 mm/balloon_compaction.c            | 43 +++++++++++++++++++++++-------
 3 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8511d258dbb4..0660b1b988f0 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -187,7 +187,7 @@ int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
 }
 
 static void set_page_pfns(struct virtio_balloon *vb,
-			  __virtio32 pfns[], struct page *page)
+			  __virtio32 pfns[], struct page *page, int order)
 {
 	unsigned int i;
 
@@ -197,7 +197,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
 	 * Set balloon pfns pointing at this page.
 	 * Note that the first pfn points at start of the page.
 	 */
-	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
+	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order); i++)
 		pfns[i] = cpu_to_virtio32(vb->vdev,
 					  page_to_balloon_pfn(page) + i);
 }
@@ -205,6 +205,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 {
 	unsigned num_allocated_pages;
+	int order = MAX_ORDER - 1;
 	unsigned num_pfns;
 	struct page *page;
 	LIST_HEAD(pages);
@@ -212,9 +213,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 	/* We can only do one array worth at a time. */
 	num = min(num, ARRAY_SIZE(vb->pfns));
 
+	/*
+	 * Note: we will currently never allocate more than 1MB due to the
+	 * pfn array size, so we will not allocate MAX_ORDER - 1 ...
+	 */
+
 	for (num_pfns = 0; num_pfns < num;
-	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-		struct page *page = balloon_page_alloc();
+	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order)) {
+		const unsigned long remaining = num - num_pfns;
+
+		order = MIN(order,
+			    get_order(remaining << VIRTIO_BALLOON_PFN_SHIFT));
+		if ((1 << order) * VIRTIO_BALLOON_PAGES_PER_PAGE > remaining)
+			order--;
+		page = balloon_pages_alloc(order);
 
 		if (!page) {
 			dev_info_ratelimited(&vb->vdev->dev,
@@ -225,6 +237,8 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 			break;
 		}
 
+		/* Continue with the actual order that succeeded. */
+		order = page_private(page);
 		balloon_page_push(&pages, page);
 	}
 
@@ -233,14 +247,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 	vb->num_pfns = 0;
 
 	while ((page = balloon_page_pop(&pages))) {
+		order = page_order(page);
+		/* enqueuing will split the page and clear the order */
 		balloon_page_enqueue(&vb->vb_dev_info, page);
 
-		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
-		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
+		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, order);
+		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
 		if (!virtio_has_feature(vb->vdev,
 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
-			adjust_managed_page_count(page, -1);
-		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
+			adjust_managed_page_count(page, -1 * (1 << order));
+		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
 	}
 
 	num_allocated_pages = vb->num_pfns;
@@ -284,7 +300,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 		page = balloon_page_dequeue(vb_dev_info);
 		if (!page)
 			break;
-		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, 0);
 		list_add(&page->lru, &pages);
 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
 	}
@@ -786,7 +802,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 	__count_vm_event(BALLOON_MIGRATE);
 	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
 	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
-	set_page_pfns(vb, vb->pfns, newpage);
+	set_page_pfns(vb, vb->pfns, newpage, 0);
 	tell_host(vb, vb->inflate_vq);
 
 	/* balloon's page migration 2nd step -- deflate "page" */
@@ -794,7 +810,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 	balloon_page_delete(page);
 	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
 	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
-	set_page_pfns(vb, vb->pfns, page);
+	set_page_pfns(vb, vb->pfns, page, 0);
 	tell_host(vb, vb->deflate_vq);
 
 	mutex_unlock(&vb->balloon_lock);
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 338aa27e4773..ed93fe5704d1 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -60,7 +60,7 @@ struct balloon_dev_info {
 	struct inode *inode;
 };
 
-extern struct page *balloon_page_alloc(void);
+extern struct page *balloon_pages_alloc(int order);
 extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
 				 struct page *page);
 extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
@@ -78,6 +78,11 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
 	balloon->inode = NULL;
 }
 
+static inline struct page *balloon_page_alloc(void)
+{
+	return balloon_pages_alloc(0);
+}
+
 #ifdef CONFIG_BALLOON_COMPACTION
 extern const struct address_space_operations balloon_aops;
 extern bool balloon_page_isolate(struct page *page,
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 26de020aae7b..067810b32813 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
 EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
 
 /*
- * balloon_page_alloc - allocates a new page for insertion into the balloon
- *			page list.
+ * balloon_pages_alloc - allocates a new page (of at most the given order)
+ * 			 for insertion into the balloon page list.
  *
  * Driver must call this function to properly allocate a new balloon page.
  * Driver must call balloon_page_enqueue before definitively removing the page
  * from the guest system.
  *
+ * Will fall back to smaller orders if allocation fails. The order of the
+ * allocated page is stored in page->private.
+ *
  * Return: struct page for the allocated page or NULL on allocation failure.
  */
-struct page *balloon_page_alloc(void)
+struct page *balloon_pages_alloc(int order)
 {
-	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
-				       __GFP_NOMEMALLOC | __GFP_NORETRY |
-				       __GFP_NOWARN);
-	return page;
+	struct page *page;
+
+	while (order >= 0) {
+		page = alloc_pages(balloon_mapping_gfp_mask() |
+				   __GFP_NOMEMALLOC | __GFP_NORETRY |
+				   __GFP_NOWARN, order);
+		if (page) {
+			set_page_private(page, order);
+			return page;
+		}
+		order--;
+	}
+	return NULL;
 }
-EXPORT_SYMBOL_GPL(balloon_page_alloc);
+EXPORT_SYMBOL_GPL(balloon_pages_alloc);
 
 /*
  * balloon_page_enqueue - inserts a new page into the balloon page list.
@@ -146,10 +158,23 @@ EXPORT_SYMBOL_GPL(balloon_page_alloc);
 void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
 			  struct page *page)
 {
+	const int order = page_private(page);
 	unsigned long flags;
+	int i;
+
+	/*
+	 * We can only migrate single pages - and even if we could migrate
+	 * bigger ones, we would want to split them on demand instead of
+	 * trying to move around big chunks.
+	 */
+	if (order > 0)
+		split_page(page, order);
+	set_page_private(page, order);
 
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	balloon_page_enqueue_one(b_dev_info, page);
+	for (i = 0; i < (1 << order); i++)
+		balloon_page_enqueue_one(b_dev_info, page + i);
+
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 }
 EXPORT_SYMBOL_GPL(balloon_page_enqueue);
-- 
2.25.1

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 10:35             ` David Hildenbrand
@ 2020-03-31 13:24               ` Michael S. Tsirkin
  2020-03-31 13:32                 ` David Hildenbrand
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-31 13:24 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
> On 26.03.20 10:49, Michael S. Tsirkin wrote:
> > On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
> >>
> >>
> >>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
> >>>
> >>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
> >>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
> >>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> >>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
> >>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
> >>>>>> use of THP where it previously was able to. I can imagine this implies a
> >>>>>> performance degradation for some workloads. This needs a proper
> >>>>>> performance evaluation.
> >>>>>
> >>>>> I think the problem is more with the alloc_pages API.
> >>>>> That gives you exactly the given order, and if there's
> >>>>> a larger chunk available, it will split it up.
> >>>>>
> >>>>> But for balloon - I suspect lots of other users,
> >>>>> we do not want to stress the system but if a large
> >>>>> chunk is available anyway, then we could handle
> >>>>> that more optimally by getting it all in one go.
> >>>>>
> >>>>>
> >>>>> So if we want to address this, IMHO this calls for a new API.
> >>>>> Along the lines of
> >>>>>
> >>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> >>>>>                    unsigned int max_order, unsigned int *order)
> >>>>>
> >>>>> the idea would then be to return a number of pages in the given
> >>>>> range.
> >>>>>
> >>>>> What do you think? Want to try implementing that?
> >>>>
> >>>> You can just start with the highest order and decrement the order until
> >>>> your allocation succeeds using alloc_pages(), which would be enough for
> >>>> a first version. At least I don't see the immediate need for a new
> >>>> kernel API.
> >>>
> >>> OK I remember now.  The problem is with reclaim. Unless reclaim is
> >>> completely disabled, any of these calls can sleep. After it wakes up,
> >>> we would like to get the larger order that has become available
> >>> meanwhile.
> >>>
> >>
> >> Yes, but that's a pure optimization IMHO.
> >> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
> >>
> > 
> > Well, how do you propose to implement the necessary semantics?
> > I think we are both agreed that alloc_page_range is more or
> > less what's necessary anyway - so how would you approximate it
> > on top of existing APIs?
> 
> Looking at drivers/misc/vmw_balloon.c:vmballoon_inflate(), it first
> tries to allocate huge pages using
> 
> 	alloc_pages(__GFP_HIGHMEM|__GFP_NOWARN| __GFP_NOMEMALLOC, 
>                     VMW_BALLOON_2M_ORDER)
> 
> And then falls back to 4k allocations (balloon_page_alloc()) in case
> allocation fails.
> 
> I'm roughly thinking of something like the following, but with an
> optimized reporting interface/bigger pfn array so we can report >
> 1MB at a time. Also, it might make sense to remember the order that
> succeeded across some fill_balloon() calls.
> 
> Don't even expect it to compile ...
> 
> 
> 
> >From 4305f989672ccca4be9293e6d4167e929f3e299b Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Tue, 31 Mar 2020 12:28:07 +0200
> Subject: [PATCH RFC] tmp
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  drivers/virtio/virtio_balloon.c    | 38 ++++++++++++++++++--------
>  include/linux/balloon_compaction.h |  7 ++++-
>  mm/balloon_compaction.c            | 43 +++++++++++++++++++++++-------
>  3 files changed, 67 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 8511d258dbb4..0660b1b988f0 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -187,7 +187,7 @@ int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
>  }
>  
>  static void set_page_pfns(struct virtio_balloon *vb,
> -			  __virtio32 pfns[], struct page *page)
> +			  __virtio32 pfns[], struct page *page, int order)
>  {
>  	unsigned int i;
>  
> @@ -197,7 +197,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>  	 * Set balloon pfns pointing at this page.
>  	 * Note that the first pfn points at start of the page.
>  	 */
> -	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
> +	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order); i++)
>  		pfns[i] = cpu_to_virtio32(vb->vdev,
>  					  page_to_balloon_pfn(page) + i);
>  }
> @@ -205,6 +205,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>  static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  {
>  	unsigned num_allocated_pages;
> +	int order = MAX_ORDER - 1;
>  	unsigned num_pfns;
>  	struct page *page;
>  	LIST_HEAD(pages);
> @@ -212,9 +213,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  	/* We can only do one array worth at a time. */
>  	num = min(num, ARRAY_SIZE(vb->pfns));
>  
> +	/*
> +	 * Note: we will currently never allocate more than 1MB due to the
> +	 * pfn array size, so we will not allocate MAX_ORDER - 1 ...
> +	 */
> +
>  	for (num_pfns = 0; num_pfns < num;
> -	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
> -		struct page *page = balloon_page_alloc();
> +	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order)) {
> +		const unsigned long remaining = num - num_pfns;
> +
> +		order = MIN(order,
> +			    get_order(remaining << VIRTIO_BALLOON_PFN_SHIFT));
> +		if ((1 << order) * VIRTIO_BALLOON_PAGES_PER_PAGE > remaining)
> +			order--;
> +		page = balloon_pages_alloc(order);
>  
>  		if (!page) {
>  			dev_info_ratelimited(&vb->vdev->dev,
> @@ -225,6 +237,8 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  			break;
>  		}
>  
> +		/* Continue with the actual order that succeeded. */
> +		order = page_private(page);
>  		balloon_page_push(&pages, page);
>  	}
>  
> @@ -233,14 +247,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>  	vb->num_pfns = 0;
>  
>  	while ((page = balloon_page_pop(&pages))) {
> +		order = page_order(page);
> +		/* enqueuing will split the page and clear the order */
>  		balloon_page_enqueue(&vb->vb_dev_info, page);
>  
> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
> -		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, order);
> +		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>  		if (!virtio_has_feature(vb->vdev,
>  					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> -			adjust_managed_page_count(page, -1);
> -		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
> +			adjust_managed_page_count(page, -1 * (1 << order));
> +		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>  	}
>  
>  	num_allocated_pages = vb->num_pfns;
> @@ -284,7 +300,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
>  		page = balloon_page_dequeue(vb_dev_info);
>  		if (!page)
>  			break;
> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, 0);
>  		list_add(&page->lru, &pages);
>  		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
>  	}
> @@ -786,7 +802,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>  	__count_vm_event(BALLOON_MIGRATE);
>  	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>  	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
> -	set_page_pfns(vb, vb->pfns, newpage);
> +	set_page_pfns(vb, vb->pfns, newpage, 0);
>  	tell_host(vb, vb->inflate_vq);
>  
>  	/* balloon's page migration 2nd step -- deflate "page" */
> @@ -794,7 +810,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>  	balloon_page_delete(page);
>  	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>  	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
> -	set_page_pfns(vb, vb->pfns, page);
> +	set_page_pfns(vb, vb->pfns, page, 0);
>  	tell_host(vb, vb->deflate_vq);
>  
>  	mutex_unlock(&vb->balloon_lock);
> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
> index 338aa27e4773..ed93fe5704d1 100644
> --- a/include/linux/balloon_compaction.h
> +++ b/include/linux/balloon_compaction.h
> @@ -60,7 +60,7 @@ struct balloon_dev_info {
>  	struct inode *inode;
>  };
>  
> -extern struct page *balloon_page_alloc(void);
> +extern struct page *balloon_pages_alloc(int order);
>  extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
>  				 struct page *page);
>  extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
> @@ -78,6 +78,11 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
>  	balloon->inode = NULL;
>  }
>  
> +static inline struct page *balloon_page_alloc(void)
> +{
> +	return balloon_pages_alloc(0);
> +}
> +
>  #ifdef CONFIG_BALLOON_COMPACTION
>  extern const struct address_space_operations balloon_aops;
>  extern bool balloon_page_isolate(struct page *page,
> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
> index 26de020aae7b..067810b32813 100644
> --- a/mm/balloon_compaction.c
> +++ b/mm/balloon_compaction.c
> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>  EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>  
>  /*
> - * balloon_page_alloc - allocates a new page for insertion into the balloon
> - *			page list.
> + * balloon_pages_alloc - allocates a new page (of at most the given order)
> + * 			 for insertion into the balloon page list.
>   *
>   * Driver must call this function to properly allocate a new balloon page.
>   * Driver must call balloon_page_enqueue before definitively removing the page
>   * from the guest system.
>   *
> + * Will fall back to smaller orders if allocation fails. The order of the
> + * allocated page is stored in page->private.
> + *
>   * Return: struct page for the allocated page or NULL on allocation failure.
>   */
> -struct page *balloon_page_alloc(void)
> +struct page *balloon_pages_alloc(int order)
>  {
> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
> -				       __GFP_NOWARN);
> -	return page;
> +	struct page *page;
> +
> +	while (order >= 0) {
> +		page = alloc_pages(balloon_mapping_gfp_mask() |
> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
> +				   __GFP_NOWARN, order);
> +		if (page) {
> +			set_page_private(page, order);
> +			return page;
> +		}
> +		order--;
> +	}
> +	return NULL;
>  }
> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>  
>  /*
>   * balloon_page_enqueue - inserts a new page into the balloon page list.


I think this will try to invoke direct reclaim from the first iteration
to free up the max order.

> @@ -146,10 +158,23 @@ EXPORT_SYMBOL_GPL(balloon_page_alloc);
>  void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
>  			  struct page *page)
>  {
> +	const int order = page_private(page);
>  	unsigned long flags;
> +	int i;
> +
> +	/*
> +	 * We can only migrate single pages - and even if we could migrate
> +	 * bigger ones, we would want to split them on demand instead of
> +	 * trying to move around big chunks.
> +	 */
> +	if (order > 0)
> +		split_page(page, order);
> +	set_page_private(page, order);
>  
>  	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
> -	balloon_page_enqueue_one(b_dev_info, page);
> +	for (i = 0; i < (1 << order); i++)
> +		balloon_page_enqueue_one(b_dev_info, page + i);
> +
>  	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
>  }
>  EXPORT_SYMBOL_GPL(balloon_page_enqueue);
> -- 
> 2.25.1
> 
> -- 
> Thanks,
> 
> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 13:24               ` Michael S. Tsirkin
@ 2020-03-31 13:32                 ` David Hildenbrand
  2020-03-31 13:37                   ` Michael S. Tsirkin
  2020-03-31 16:27                   ` Nadav Amit
  0 siblings, 2 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-03-31 13:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On 31.03.20 15:24, Michael S. Tsirkin wrote:
> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>
>>>>
>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>
>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>> performance evaluation.
>>>>>>>
>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>> That gives you exactly the given order, and if there's
>>>>>>> a larger chunk available, it will split it up.
>>>>>>>
>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>> we do not want to stress the system but if a large
>>>>>>> chunk is available anyway, then we could handle
>>>>>>> that more optimally by getting it all in one go.
>>>>>>>
>>>>>>>
>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>> Along the lines of
>>>>>>>
>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>                    unsigned int max_order, unsigned int *order)
>>>>>>>
>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>> range.
>>>>>>>
>>>>>>> What do you think? Want to try implementing that?
>>>>>>
>>>>>> You can just start with the highest order and decrement the order until
>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>> kernel API.
>>>>>
>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>> we would like to get the larger order that has become available
>>>>> meanwhile.
>>>>>
>>>>
>>>> Yes, but that‘s a pure optimization IMHO.
>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>
>>>
>>> Well how do you propose implement the necessary semantics?
>>> I think we are both agreed that alloc_page_range is more or
>>> less what's necessary anyway - so how would you approximate it
>>> on top of existing APIs?
>>
>> Looking at drivers/misc/vmw_balloon.c:vmballoon_inflate(), it first
>> tries to allocate huge pages using
>>
>> 	alloc_pages(__GFP_HIGHMEM|__GFP_NOWARN| __GFP_NOMEMALLOC, 
>>                     VMW_BALLOON_2M_ORDER)
>>
>> And then falls back to 4k allocations (balloon_page_alloc()) in case
>> allocation fails.
>>
>> I'm roughly thinking of something like the following, but with an
>> optimized reporting interface/bigger pfn array so we can report >
>> 1MB at a time. Also, it might make sense to remember the order that
>> succeeded across some fill_balloon() calls.
>>
>> Don't even expect it to compile ...
>>
>>
>>
>> >From 4305f989672ccca4be9293e6d4167e929f3e299b Mon Sep 17 00:00:00 2001
>> From: David Hildenbrand <david@redhat.com>
>> Date: Tue, 31 Mar 2020 12:28:07 +0200
>> Subject: [PATCH RFC] tmp
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>  drivers/virtio/virtio_balloon.c    | 38 ++++++++++++++++++--------
>>  include/linux/balloon_compaction.h |  7 ++++-
>>  mm/balloon_compaction.c            | 43 +++++++++++++++++++++++-------
>>  3 files changed, 67 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>> index 8511d258dbb4..0660b1b988f0 100644
>> --- a/drivers/virtio/virtio_balloon.c
>> +++ b/drivers/virtio/virtio_balloon.c
>> @@ -187,7 +187,7 @@ int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
>>  }
>>  
>>  static void set_page_pfns(struct virtio_balloon *vb,
>> -			  __virtio32 pfns[], struct page *page)
>> +			  __virtio32 pfns[], struct page *page, int order)
>>  {
>>  	unsigned int i;
>>  
>> @@ -197,7 +197,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>>  	 * Set balloon pfns pointing at this page.
>>  	 * Note that the first pfn points at start of the page.
>>  	 */
>> -	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
>> +	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order); i++)
>>  		pfns[i] = cpu_to_virtio32(vb->vdev,
>>  					  page_to_balloon_pfn(page) + i);
>>  }
>> @@ -205,6 +205,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>>  static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>  {
>>  	unsigned num_allocated_pages;
>> +	int order = MAX_ORDER - 1;
>>  	unsigned num_pfns;
>>  	struct page *page;
>>  	LIST_HEAD(pages);
>> @@ -212,9 +213,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>  	/* We can only do one array worth at a time. */
>>  	num = min(num, ARRAY_SIZE(vb->pfns));
>>  
>> +	/*
>> +	 * Note: we will currently never allocate more than 1MB due to the
>> +	 * pfn array size, so we will not allocate MAX_ORDER - 1 ...
>> +	 */
>> +
>>  	for (num_pfns = 0; num_pfns < num;
>> -	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
>> -		struct page *page = balloon_page_alloc();
>> +	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order)) {
>> +		const unsigned long remaining = num - num_pfns;
>> +
>> +		order = MIN(order,
>> +			    get_order(remaining << VIRTIO_BALLOON_PFN_SHIFT));
>> +		if ((1 << order) * VIRTIO_BALLOON_PAGES_PER_PAGE > remaining)
>> +			order--;
>> +		page = balloon_pages_alloc(order);
>>  
>>  		if (!page) {
>>  			dev_info_ratelimited(&vb->vdev->dev,
>> @@ -225,6 +237,8 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>  			break;
>>  		}
>>  
>> +		/* Continue with the actual order that succeeded. */
>> +		order = page_private(page);
>>  		balloon_page_push(&pages, page);
>>  	}
>>  
>> @@ -233,14 +247,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>  	vb->num_pfns = 0;
>>  
>>  	while ((page = balloon_page_pop(&pages))) {
>> +		order = page_order(page);
>> +		/* enqueuing will split the page and clear the order */
>>  		balloon_page_enqueue(&vb->vb_dev_info, page);
>>  
>> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
>> -		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
>> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, order);
>> +		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>>  		if (!virtio_has_feature(vb->vdev,
>>  					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>> -			adjust_managed_page_count(page, -1);
>> -		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
>> +			adjust_managed_page_count(page, -1 * (1 << order));
>> +		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>>  	}
>>  
>>  	num_allocated_pages = vb->num_pfns;
>> @@ -284,7 +300,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
>>  		page = balloon_page_dequeue(vb_dev_info);
>>  		if (!page)
>>  			break;
>> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
>> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, 0);
>>  		list_add(&page->lru, &pages);
>>  		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
>>  	}
>> @@ -786,7 +802,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>>  	__count_vm_event(BALLOON_MIGRATE);
>>  	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>>  	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
>> -	set_page_pfns(vb, vb->pfns, newpage);
>> +	set_page_pfns(vb, vb->pfns, newpage, 0);
>>  	tell_host(vb, vb->inflate_vq);
>>  
>>  	/* balloon's page migration 2nd step -- deflate "page" */
>> @@ -794,7 +810,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>>  	balloon_page_delete(page);
>>  	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>>  	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
>> -	set_page_pfns(vb, vb->pfns, page);
>> +	set_page_pfns(vb, vb->pfns, page, 0);
>>  	tell_host(vb, vb->deflate_vq);
>>  
>>  	mutex_unlock(&vb->balloon_lock);
>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>> index 338aa27e4773..ed93fe5704d1 100644
>> --- a/include/linux/balloon_compaction.h
>> +++ b/include/linux/balloon_compaction.h
>> @@ -60,7 +60,7 @@ struct balloon_dev_info {
>>  	struct inode *inode;
>>  };
>>  
>> -extern struct page *balloon_page_alloc(void);
>> +extern struct page *balloon_pages_alloc(int order);
>>  extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
>>  				 struct page *page);
>>  extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
>> @@ -78,6 +78,11 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
>>  	balloon->inode = NULL;
>>  }
>>  
>> +static inline struct page *balloon_page_alloc(void)
>> +{
>> +	return balloon_pages_alloc(0);
>> +}
>> +
>>  #ifdef CONFIG_BALLOON_COMPACTION
>>  extern const struct address_space_operations balloon_aops;
>>  extern bool balloon_page_isolate(struct page *page,
>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>> index 26de020aae7b..067810b32813 100644
>> --- a/mm/balloon_compaction.c
>> +++ b/mm/balloon_compaction.c
>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>  EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>  
>>  /*
>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>> - *			page list.
>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>> + * 			 for insertion into the balloon page list.
>>   *
>>   * Driver must call this function to properly allocate a new balloon page.
>>   * Driver must call balloon_page_enqueue before definitively removing the page
>>   * from the guest system.
>>   *
>> + * Will fall back to smaller orders if allocation fails. The order of the
>> + * allocated page is stored in page->private.
>> + *
>>   * Return: struct page for the allocated page or NULL on allocation failure.
>>   */
>> -struct page *balloon_page_alloc(void)
>> +struct page *balloon_pages_alloc(int order)
>>  {
>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>> -				       __GFP_NOWARN);
>> -	return page;
>> +	struct page *page;
>> +
>> +	while (order >= 0) {
>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>> +				   __GFP_NOWARN, order);
>> +		if (page) {
>> +			set_page_private(page, order);
>> +			return page;
>> +		}
>> +		order--;
>> +	}
>> +	return NULL;
>>  }
>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>  
>>  /*
>>   * balloon_page_enqueue - inserts a new page into the balloon page list.
> 
> 
> I think this will try to invoke direct reclaim from the first iteration
> to free up the max order.

%__GFP_NORETRY: The VM implementation will try only very lightweight
memory direct reclaim to get some memory under memory pressure (thus it
can sleep). It will avoid disruptive actions like OOM killer.

Certainly good enough for a first version I would say, no? Looking at
the vmware balloon, they don't even set __GFP_NORETRY.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 13:32                 ` David Hildenbrand
@ 2020-03-31 13:37                   ` Michael S. Tsirkin
  2020-03-31 14:03                     ` David Hildenbrand
  2020-03-31 16:27                   ` Nadav Amit
  1 sibling, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-31 13:37 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
> On 31.03.20 15:24, Michael S. Tsirkin wrote:
> > On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
> >> On 26.03.20 10:49, Michael S. Tsirkin wrote:
> >>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
> >>>>
> >>>>
> >>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
> >>>>>
> >>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
> >>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
> >>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> >>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
> >>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
> >>>>>>>> use of THP where it previously was able to. I can imagine this implies a
> >>>>>>>> performance degradation for some workloads. This needs a proper
> >>>>>>>> performance evaluation.
> >>>>>>>
> >>>>>>> I think the problem is more with the alloc_pages API.
> >>>>>>> That gives you exactly the given order, and if there's
> >>>>>>> a larger chunk available, it will split it up.
> >>>>>>>
> >>>>>>> But for balloon - I suspect lots of other users,
> >>>>>>> we do not want to stress the system but if a large
> >>>>>>> chunk is available anyway, then we could handle
> >>>>>>> that more optimally by getting it all in one go.
> >>>>>>>
> >>>>>>>
> >>>>>>> So if we want to address this, IMHO this calls for a new API.
> >>>>>>> Along the lines of
> >>>>>>>
> >>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> >>>>>>>                    unsigned int max_order, unsigned int *order)
> >>>>>>>
> >>>>>>> the idea would then be to return at a number of pages in the given
> >>>>>>> range.
> >>>>>>>
> >>>>>>> What do you think? Want to try implementing that?
> >>>>>>
> >>>>>> You can just start with the highest order and decrement the order until
> >>>>>> your allocation succeeds using alloc_pages(), which would be enough for
> >>>>>> a first version. At least I don't see the immediate need for a new
> >>>>>> kernel API.
> >>>>>
> >>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
> >>>>> completely disabled, any of these calls can sleep. After it wakes up,
> >>>>> we would like to get the larger order that has become available
> >>>>> meanwhile.
> >>>>>
> >>>>
> >>>> Yes, but that‘s a pure optimization IMHO.
> >>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
> >>>>
> >>>
> >>> Well how do you propose implement the necessary semantics?
> >>> I think we are both agreed that alloc_page_range is more or
> >>> less what's necessary anyway - so how would you approximate it
> >>> on top of existing APIs?
> >> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h

.....


> >> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
> >> index 26de020aae7b..067810b32813 100644
> >> --- a/mm/balloon_compaction.c
> >> +++ b/mm/balloon_compaction.c
> >> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
> >>  EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
> >>  
> >>  /*
> >> - * balloon_page_alloc - allocates a new page for insertion into the balloon
> >> - *			page list.
> >> + * balloon_pages_alloc - allocates a new page (of at most the given order)
> >> + * 			 for insertion into the balloon page list.
> >>   *
> >>   * Driver must call this function to properly allocate a new balloon page.
> >>   * Driver must call balloon_page_enqueue before definitively removing the page
> >>   * from the guest system.
> >>   *
> >> + * Will fall back to smaller orders if allocation fails. The order of the
> >> + * allocated page is stored in page->private.
> >> + *
> >>   * Return: struct page for the allocated page or NULL on allocation failure.
> >>   */
> >> -struct page *balloon_page_alloc(void)
> >> +struct page *balloon_pages_alloc(int order)
> >>  {
> >> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
> >> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
> >> -				       __GFP_NOWARN);
> >> -	return page;
> >> +	struct page *page;
> >> +
> >> +	while (order >= 0) {
> >> +		page = alloc_pages(balloon_mapping_gfp_mask() |
> >> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
> >> +				   __GFP_NOWARN, order);
> >> +		if (page) {
> >> +			set_page_private(page, order);
> >> +			return page;
> >> +		}
> >> +		order--;
> >> +	}
> >> +	return NULL;
> >>  }
> >> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
> >> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
> >>  
> >>  /*
> >>   * balloon_page_enqueue - inserts a new page into the balloon page list.
> > 
> > 
> > I think this will try to invoke direct reclaim from the first iteration
> > to free up the max order.
> 
> %__GFP_NORETRY: The VM implementation will try only very lightweight
> memory direct reclaim to get some memory under memory pressure (thus it
> can sleep). It will avoid disruptive actions like OOM killer.
> 
> Certainly good enough for a first version I would say, no?

Frankly how well that behaves would depend a lot on the workload.
Can regress just as well.

For the 1st version I'd prefer something that is the least disruptive,
and that IMHO means we only trigger reclaim at all in the same configuration
as now - when we can't satisfy the lowest order allocation.

Anything else would be a huge amount of testing with all kinds of
workloads.

-- 
MST


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 13:37                   ` Michael S. Tsirkin
@ 2020-03-31 14:03                     ` David Hildenbrand
  2020-03-31 14:07                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-03-31 14:03 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On 31.03.20 15:37, Michael S. Tsirkin wrote:
> On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>>
>>>>>>
>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>>>
>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>>> performance evaluation.
>>>>>>>>>
>>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>>>
>>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>>> we do not want to stress the system but if a large
>>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>> Along the lines of
>>>>>>>>>
>>>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>                    unsigned int max_order, unsigned int *order)
>>>>>>>>>
>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>> range.
>>>>>>>>>
>>>>>>>>> What do you think? Want to try implementing that?
>>>>>>>>
>>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>>> kernel API.
>>>>>>>
>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>>> we would like to get the larger order that has become available
>>>>>>> meanwhile.
>>>>>>>
>>>>>>
>>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>>>
>>>>>
>>>>> Well how do you propose implement the necessary semantics?
>>>>> I think we are both agreed that alloc_page_range is more or
>>>>> less what's necessary anyway - so how would you approximate it
>>>>> on top of existing APIs?
>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
> 
> .....
> 
> 
>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>>> index 26de020aae7b..067810b32813 100644
>>>> --- a/mm/balloon_compaction.c
>>>> +++ b/mm/balloon_compaction.c
>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>>>  EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>>>  
>>>>  /*
>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>>> - *			page list.
>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>>> + * 			 for insertion into the balloon page list.
>>>>   *
>>>>   * Driver must call this function to properly allocate a new balloon page.
>>>>   * Driver must call balloon_page_enqueue before definitively removing the page
>>>>   * from the guest system.
>>>>   *
>>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>>> + * allocated page is stored in page->private.
>>>> + *
>>>>   * Return: struct page for the allocated page or NULL on allocation failure.
>>>>   */
>>>> -struct page *balloon_page_alloc(void)
>>>> +struct page *balloon_pages_alloc(int order)
>>>>  {
>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>> -				       __GFP_NOWARN);
>>>> -	return page;
>>>> +	struct page *page;
>>>> +
>>>> +	while (order >= 0) {
>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>> +				   __GFP_NOWARN, order);
>>>> +		if (page) {
>>>> +			set_page_private(page, order);
>>>> +			return page;
>>>> +		}
>>>> +		order--;
>>>> +	}
>>>> +	return NULL;
>>>>  }
>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>>>  
>>>>  /*
>>>>   * balloon_page_enqueue - inserts a new page into the balloon page list.
>>>
>>>
>>> I think this will try to invoke direct reclaim from the first iteration
>>> to free up the max order.
>>
>> %__GFP_NORETRY: The VM implementation will try only very lightweight
>> memory direct reclaim to get some memory under memory pressure (thus it
>> can sleep). It will avoid disruptive actions like OOM killer.
>>
>> Certainly good enough for a first version I would say, no?
> 
> Frankly how well that behaves would depend a lot on the workload.
> Can regress just as well.
> 
> For the 1st version I'd prefer something that is the least disruptive,
> and that IMHO means we only trigger reclaim at all in the same configuration
> as now - when we can't satisfy the lowest order allocation.

Agreed.

> 
> Anything else would be a huge amount of testing with all kind of
> workloads.
> 

So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
GFP_TRANSHUGE_LIGHT)

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:03                     ` David Hildenbrand
@ 2020-03-31 14:07                       ` Michael S. Tsirkin
  2020-03-31 14:09                         ` David Hildenbrand
  2020-04-02  8:00                         ` teawater
  0 siblings, 2 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-31 14:07 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Tue, Mar 31, 2020 at 04:03:18PM +0200, David Hildenbrand wrote:
> On 31.03.20 15:37, Michael S. Tsirkin wrote:
> > On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
> >> On 31.03.20 15:24, Michael S. Tsirkin wrote:
> >>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
> >>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
> >>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
> >>>>>>
> >>>>>>
> >>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
> >>>>>>>
> >>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
> >>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
> >>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> >>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
> >>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
> >>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
> >>>>>>>>>> performance degradation for some workloads. This needs a proper
> >>>>>>>>>> performance evaluation.
> >>>>>>>>>
> >>>>>>>>> I think the problem is more with the alloc_pages API.
> >>>>>>>>> That gives you exactly the given order, and if there's
> >>>>>>>>> a larger chunk available, it will split it up.
> >>>>>>>>>
> >>>>>>>>> But for balloon - I suspect lots of other users,
> >>>>>>>>> we do not want to stress the system but if a large
> >>>>>>>>> chunk is available anyway, then we could handle
> >>>>>>>>> that more optimally by getting it all in one go.
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> So if we want to address this, IMHO this calls for a new API.
> >>>>>>>>> Along the lines of
> >>>>>>>>>
> >>>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> >>>>>>>>>                    unsigned int max_order, unsigned int *order)
> >>>>>>>>>
> >>>>>>>>> the idea would then be to return at a number of pages in the given
> >>>>>>>>> range.
> >>>>>>>>>
> >>>>>>>>> What do you think? Want to try implementing that?
> >>>>>>>>
> >>>>>>>> You can just start with the highest order and decrement the order until
> >>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
> >>>>>>>> a first version. At least I don't see the immediate need for a new
> >>>>>>>> kernel API.
> >>>>>>>
> >>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
> >>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
> >>>>>>> we would like to get the larger order that has become available
> >>>>>>> meanwhile.
> >>>>>>>
> >>>>>>
> >>>>>> Yes, but that‘s a pure optimization IMHO.
> >>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
> >>>>>>
> >>>>>
> >>>>> Well how do you propose implement the necessary semantics?
> >>>>> I think we are both agreed that alloc_page_range is more or
> >>>>> less what's necessary anyway - so how would you approximate it
> >>>>> on top of existing APIs?
> >>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
> > 
> > .....
> > 
> > 
> >>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
> >>>> index 26de020aae7b..067810b32813 100644
> >>>> --- a/mm/balloon_compaction.c
> >>>> +++ b/mm/balloon_compaction.c
> >>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
> >>>>  EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
> >>>>  
> >>>>  /*
> >>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
> >>>> - *			page list.
> >>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
> >>>> + * 			 for insertion into the balloon page list.
> >>>>   *
> >>>>   * Driver must call this function to properly allocate a new balloon page.
> >>>>   * Driver must call balloon_page_enqueue before definitively removing the page
> >>>>   * from the guest system.
> >>>>   *
> >>>> + * Will fall back to smaller orders if allocation fails. The order of the
> >>>> + * allocated page is stored in page->private.
> >>>> + *
> >>>>   * Return: struct page for the allocated page or NULL on allocation failure.
> >>>>   */
> >>>> -struct page *balloon_page_alloc(void)
> >>>> +struct page *balloon_pages_alloc(int order)
> >>>>  {
> >>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
> >>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
> >>>> -				       __GFP_NOWARN);
> >>>> -	return page;
> >>>> +	struct page *page;
> >>>> +
> >>>> +	while (order >= 0) {
> >>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
> >>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
> >>>> +				   __GFP_NOWARN, order);
> >>>> +		if (page) {
> >>>> +			set_page_private(page, order);
> >>>> +			return page;
> >>>> +		}
> >>>> +		order--;
> >>>> +	}
> >>>> +	return NULL;
> >>>>  }
> >>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
> >>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
> >>>>  
> >>>>  /*
> >>>>   * balloon_page_enqueue - inserts a new page into the balloon page list.
> >>>
> >>>
> >>> I think this will try to invoke direct reclaim from the first iteration
> >>> to free up the max order.
> >>
> >> %__GFP_NORETRY: The VM implementation will try only very lightweight
> >> memory direct reclaim to get some memory under memory pressure (thus it
> >> can sleep). It will avoid disruptive actions like OOM killer.
> >>
> >> Certainly good enough for a first version I would say, no?
> > 
> > Frankly how well that behaves would depend a lot on the workload.
> > Can regress just as well.
> > 
> > For the 1st version I'd prefer something that is the least disruptive,
> > and that IMHO means we only trigger reclaim at all in the same configuration
> > as now - when we can't satisfy the lowest order allocation.
> 
> Agreed.
> 
> > 
> > Anything else would be a huge amount of testing with all kind of
> > workloads.
> > 
> 
> So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
> GFP_TRANSHUGE_LIGHT)

That will improve the situation when reclaim is not needed, but leave
the problem in place for when it's needed: if reclaim does trigger, we
can get a huge free page and immediately break it up.

So it's ok as a first step but it will make the second step harder as
we'll need to test with reclaim :).


> -- 
> Thanks,
> 
> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:07                       ` Michael S. Tsirkin
@ 2020-03-31 14:09                         ` David Hildenbrand
  2020-03-31 14:18                           ` Michael S. Tsirkin
  2020-03-31 16:37                           ` Nadav Amit
  2020-04-02  8:00                         ` teawater
  1 sibling, 2 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-03-31 14:09 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On 31.03.20 16:07, Michael S. Tsirkin wrote:
> On Tue, Mar 31, 2020 at 04:03:18PM +0200, David Hildenbrand wrote:
>> On 31.03.20 15:37, Michael S. Tsirkin wrote:
>>> On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
>>>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>>>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>>>>>
>>>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>>>>> performance evaluation.
>>>>>>>>>>>
>>>>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>>>>>
>>>>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>>>>> we do not want to stress the system but if a large
>>>>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>>>> Along the lines of
>>>>>>>>>>>
>>>>>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>>>                    unsigned int max_order, unsigned int *order)
>>>>>>>>>>>
>>>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>>>> range.
>>>>>>>>>>>
>>>>>>>>>>> What do you think? Want to try implementing that?
>>>>>>>>>>
>>>>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>>>>> kernel API.
>>>>>>>>>
>>>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>>>>> we would like to get the larger order that has become available
>>>>>>>>> meanwhile.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>>>>>
>>>>>>>
>>>>>>> Well how do you propose implement the necessary semantics?
>>>>>>> I think we are both agreed that alloc_page_range is more or
>>>>>>> less what's necessary anyway - so how would you approximate it
>>>>>>> on top of existing APIs?
>>>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>>>
>>> .....
>>>
>>>
>>>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>>>>> index 26de020aae7b..067810b32813 100644
>>>>>> --- a/mm/balloon_compaction.c
>>>>>> +++ b/mm/balloon_compaction.c
>>>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>>>>>  EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>>>>>  
>>>>>>  /*
>>>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>>>>> - *			page list.
>>>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>>>>> + * 			 for insertion into the balloon page list.
>>>>>>   *
>>>>>>   * Driver must call this function to properly allocate a new balloon page.
>>>>>>   * Driver must call balloon_page_enqueue before definitively removing the page
>>>>>>   * from the guest system.
>>>>>>   *
>>>>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>>>>> + * allocated page is stored in page->private.
>>>>>> + *
>>>>>>   * Return: struct page for the allocated page or NULL on allocation failure.
>>>>>>   */
>>>>>> -struct page *balloon_page_alloc(void)
>>>>>> +struct page *balloon_pages_alloc(int order)
>>>>>>  {
>>>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>> -				       __GFP_NOWARN);
>>>>>> -	return page;
>>>>>> +	struct page *page;
>>>>>> +
>>>>>> +	while (order >= 0) {
>>>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>> +				   __GFP_NOWARN, order);
>>>>>> +		if (page) {
>>>>>> +			set_page_private(page, order);
>>>>>> +			return page;
>>>>>> +		}
>>>>>> +		order--;
>>>>>> +	}
>>>>>> +	return NULL;
>>>>>>  }
>>>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>>>>>  
>>>>>>  /*
>>>>>>   * balloon_page_enqueue - inserts a new page into the balloon page list.
>>>>>
>>>>>
>>>>> I think this will try to invoke direct reclaim from the first iteration
>>>>> to free up the max order.
>>>>
>>>> %__GFP_NORETRY: The VM implementation will try only very lightweight
>>>> memory direct reclaim to get some memory under memory pressure (thus it
>>>> can sleep). It will avoid disruptive actions like OOM killer.
>>>>
>>>> Certainly good enough for a first version I would say, no?
>>>
>>> Frankly how well that behaves would depend a lot on the workload.
>>> Can regress just as well.
>>>
>>> For the 1st version I'd prefer something that is the least disruptive,
>>> and that IMHO means we only trigger reclaim at all in the same configuration
>>> as now - when we can't satisfy the lowest order allocation.
>>
>> Agreed.
>>
>>>
>>> Anything else would be a huge amount of testing with all kind of
>>> workloads.
>>>
>>
>> So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
>> GFP_TRANSHUGE_LIGHT)
> 
> That will improve the situation when reclaim is not needed, but leave
> the problem in place for when it's needed: if reclaim does trigger, we
> can get a huge free page and immediately break it up.
> 
> So it's ok as a first step but it will make the second step harder as
> we'll need to test with reclaim :).

I expect the whole "steal huge pages from your guest" to be problematic,
as I already mentioned to Alex. This needs a performance evaluation.

This all smells like a lot of workload-dependent fine-tuning. :)

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:09                         ` David Hildenbrand
@ 2020-03-31 14:18                           ` Michael S. Tsirkin
  2020-03-31 14:29                             ` David Hildenbrand
  2020-03-31 16:37                           ` Nadav Amit
  1 sibling, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-31 14:18 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Tue, Mar 31, 2020 at 04:09:59PM +0200, David Hildenbrand wrote:

...

> >>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
> >>>>>>>>>>> Along the lines of
> >>>>>>>>>>>
> >>>>>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> >>>>>>>>>>>                    unsigned int max_order, unsigned int *order)
> >>>>>>>>>>>
> >>>>>>>>>>> the idea would then be to return at a number of pages in the given
> >>>>>>>>>>> range.
> >>>>>>>>>>>
> >>>>>>>>>>> What do you think? Want to try implementing that?

..

> I expect the whole "steal huge pages from your guest" to be problematic,
> as I already mentioned to Alex. This needs a performance evaluation.
> 
> This all smells like a lot of workload dependent fine-tuning. :)


So that's why I proposed the API above.

The idea is that *if we are allocating a huge page anyway*,
rather than break it up let's send it whole to the device.
If we have smaller pages, return smaller pages.

That seems like it would always be an improvement, whatever the
workload.

-- 
MST


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:18                           ` Michael S. Tsirkin
@ 2020-03-31 14:29                             ` David Hildenbrand
  2020-03-31 14:34                               ` David Hildenbrand
  0 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-03-31 14:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On 31.03.20 16:18, Michael S. Tsirkin wrote:
> On Tue, Mar 31, 2020 at 04:09:59PM +0200, David Hildenbrand wrote:
> 
> ...
> 
>>>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>>>>>> Along the lines of
>>>>>>>>>>>>>
>>>>>>>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>>>>>                    unsigned int max_order, unsigned int *order)
>>>>>>>>>>>>>
>>>>>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>>>>>> range.
>>>>>>>>>>>>>
>>>>>>>>>>>>> What do you think? Want to try implementing that?
> 
> ..
> 
>> I expect the whole "steal huge pages from your guest" to be problematic,
>> as I already mentioned to Alex. This needs a performance evaluation.
>>
>> This all smells like a lot of workload dependent fine-tuning. :)
> 
> 
> So that's why I proposed the API above.
> 
> The idea is that *if we are allocating a huge page anyway*,
> rather than break it up let's send it whole to the device.
> If we have smaller pages, return smaller pages.
> 

Sorry, I still fail to see why you cannot do that with my version of
balloon_pages_alloc(). But maybe I haven't understood the magic you
expect to happen in alloc_page_range() :)

It's just going via a different inflate queue once we have that page, as
I stated in front of my draft patch "but with an
optimized reporting interface".

> That seems like it would always be an improvement, whatever the
> workload.
> 

Don't think so. Assume there are plenty of 4k pages lying around. It
might actually be *bad* for guest performance if you take a huge page
instead of all the leftover 4k pages that cannot be merged. Only at the
point where you would want to break a bigger page up and report it in
pieces would it definitely make no difference.

I guess Hui Zhu now has something to look into/work on :)

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:29                             ` David Hildenbrand
@ 2020-03-31 14:34                               ` David Hildenbrand
  2020-03-31 15:28                                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-03-31 14:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On 31.03.20 16:29, David Hildenbrand wrote:
> On 31.03.20 16:18, Michael S. Tsirkin wrote:
>> On Tue, Mar 31, 2020 at 04:09:59PM +0200, David Hildenbrand wrote:
>>
>> ...
>>
>>>>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>>>>>>> Along the lines of
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>>>>>>                    unsigned int max_order, unsigned int *order)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>>>>>>> range.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> What do you think? Want to try implementing that?
>>
>> ..
>>
>>> I expect the whole "steal huge pages from your guest" to be problematic,
>>> as I already mentioned to Alex. This needs a performance evaluation.
>>>
>>> This all smells like a lot of workload dependent fine-tuning. :)
>>
>>
>> So that's why I proposed the API above.
>>
>> The idea is that *if we are allocating a huge page anyway*,
>> rather than break it up let's send it whole to the device.
>> If we have smaller pages, return smaller pages.
>>
> 
> Sorry, I still fail to see why you cannot do that with my version of
> balloon_pages_alloc(). But maybe I haven't understood the magic you
> expect to happen in alloc_page_range() :)
> 
> It's just going via a different inflate queue once we have that page, as
> I stated in front of my draft patch "but with an
> optimized reporting interface".
> 
>> That seems like it would always be an improvement, whatever the
>> workload.
>>
> 
> Don't think so. Assume there are plenty of 4k pages lying around. It
> might actually be *bad* for guest performance if you take a huge page
> instead of all the leftover 4k pages that cannot be merged. Only at the
> point where you would want to break a bigger page up and report it in
> pieces, where it would definitely make no difference.

I just understood what you mean :) and now it makes sense - it avoids
exactly that. Basically

1. Try to allocate order-0. No split necessary? return the page
2. Try to allocate order-1. No split necessary? return the page
...

up to MAX_ORDER - 1.

Yeah, I guess this will need a new kernel API.


-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:34                               ` David Hildenbrand
@ 2020-03-31 15:28                                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-03-31 15:28 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Hui Zhu, jasowang, akpm, pagupta, mojha, namit, virtualization,
	linux-kernel, qemu-devel, Hui Zhu, Alexander Duyck

On Tue, Mar 31, 2020 at 04:34:48PM +0200, David Hildenbrand wrote:
> On 31.03.20 16:29, David Hildenbrand wrote:
> > On 31.03.20 16:18, Michael S. Tsirkin wrote:
> >> On Tue, Mar 31, 2020 at 04:09:59PM +0200, David Hildenbrand wrote:
> >>
> >> ...
> >>
> >>>>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
> >>>>>>>>>>>>>> Along the lines of
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>    struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> >>>>>>>>>>>>>>                    unsigned int max_order, unsigned int *order)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> the idea would then be to return at a number of pages in the given
> >>>>>>>>>>>>>> range.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> What do you think? Want to try implementing that?
> >>
> >> ..
> >>
> >>> I expect the whole "steal huge pages from your guest" to be problematic,
> >>> as I already mentioned to Alex. This needs a performance evaluation.
> >>>
> >>> This all smells like a lot of workload dependent fine-tuning. :)
> >>
> >>
> >> So that's why I proposed the API above.
> >>
> >> The idea is that *if we are allocating a huge page anyway*,
> >> rather than break it up let's send it whole to the device.
> >> If we have smaller pages, return smaller pages.
> >>
> > 
> > Sorry, I still fail to see why you cannot do that with my version of
> > balloon_pages_alloc(). But maybe I haven't understood the magic you
> > expect to happen in alloc_page_range() :)
> > 
> > It's just going via a different inflate queue once we have that page, as
> > I stated in front of my draft patch "but with an
> > optimized reporting interface".
> > 
> >> That seems like it would always be an improvement, whatever the
> >> workload.
> >>
> > 
> > Don't think so. Assume there are plenty of 4k pages lying around. It
> > might actually be *bad* for guest performance if you take a huge page
> > instead of all the leftover 4k pages that cannot be merged. Only at the
> > point where you would want to break a bigger page up and report it in
> > pieces, where it would definitely make no difference.
> 
> I just understood what you mean :) and now it makes sense - it avoids
> exactly that. Basically
> 
> 1. Try to allocate order-0. No split necessary? return the page
> 2. Try to allocate order-1. No split necessary? return the page
> ...
> 
> up to MAX_ORDER - 1.
> 
> Yeah, I guess this will need a new kernel API.

Exactly what I meant. And whenever we fail and block for reclaim, we
restart this.

> 
> -- 
> Thanks,
> 
> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 13:32                 ` David Hildenbrand
  2020-03-31 13:37                   ` Michael S. Tsirkin
@ 2020-03-31 16:27                   ` Nadav Amit
  2020-04-01 11:21                     ` David Hildenbrand
  1 sibling, 1 reply; 32+ messages in thread
From: Nadav Amit @ 2020-03-31 16:27 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Michael S. Tsirkin, pagupta, Alexander Duyck, qemu-devel, mojha,
	LKML, Linux Virtualization, Hui Zhu, akpm, jasowang, Hui Zhu

> On Mar 31, 2020, at 6:32 AM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>> 
>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>> performance evaluation.
>>>>>>>> 
>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>> 
>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>> we do not want to stress the system but if a large
>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>> 
>>>>>>>> 
>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>> Along the lines of
>>>>>>>> 
>>>>>>>>   struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>                   unsigned int max_order, unsigned int *order)
>>>>>>>> 
>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>> range.
>>>>>>>> 
>>>>>>>> What do you think? Want to try implementing that?
>>>>>>> 
>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>> kernel API.
>>>>>> 
>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>> we would like to get the larger order that has become available
>>>>>> meanwhile.
>>>>> 
>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>> 
>>>> Well how do you propose implement the necessary semantics?
>>>> I think we are both agreed that alloc_page_range is more or
>>>> less what's necessary anyway - so how would you approximate it
>>>> on top of existing APIs?
>>> 
>>> Looking at drivers/misc/vmw_balloon.c:vmballoon_inflate(), it first
>>> tries to allocate huge pages using
>>> 
>>> 	alloc_pages(__GFP_HIGHMEM|__GFP_NOWARN| __GFP_NOMEMALLOC, 
>>>                    VMW_BALLOON_2M_ORDER)
>>> 
>>> And then falls back to 4k allocations (balloon_page_alloc()) in case
>>> allocation fails.
>>> 
>>> I'm roughly thinking of something like the following, but with an
>>> optimized reporting interface/bigger pfn array so we can report >
>>> 1MB at a time. Also, it might make sense to remember the order that
>>> succeeded across some fill_balloon() calls.
>>> 
>>> Don't even expect it to compile ...
>>> 
>>> 
>>> 
>>>> From 4305f989672ccca4be9293e6d4167e929f3e299b Mon Sep 17 00:00:00 2001
>>> From: David Hildenbrand <david@redhat.com>
>>> Date: Tue, 31 Mar 2020 12:28:07 +0200
>>> Subject: [PATCH RFC] tmp
>>> 
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>> ---
>>> drivers/virtio/virtio_balloon.c    | 38 ++++++++++++++++++--------
>>> include/linux/balloon_compaction.h |  7 ++++-
>>> mm/balloon_compaction.c            | 43 +++++++++++++++++++++++-------
>>> 3 files changed, 67 insertions(+), 21 deletions(-)
>>> 
>>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>>> index 8511d258dbb4..0660b1b988f0 100644
>>> --- a/drivers/virtio/virtio_balloon.c
>>> +++ b/drivers/virtio/virtio_balloon.c
>>> @@ -187,7 +187,7 @@ int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
>>> }
>>> 
>>> static void set_page_pfns(struct virtio_balloon *vb,
>>> -			  __virtio32 pfns[], struct page *page)
>>> +			  __virtio32 pfns[], struct page *page, int order)
>>> {
>>> 	unsigned int i;
>>> 
>>> @@ -197,7 +197,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>>> 	 * Set balloon pfns pointing at this page.
>>> 	 * Note that the first pfn points at start of the page.
>>> 	 */
>>> -	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
>>> +	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order); i++)
>>> 		pfns[i] = cpu_to_virtio32(vb->vdev,
>>> 					  page_to_balloon_pfn(page) + i);
>>> }
>>> @@ -205,6 +205,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>>> static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>> {
>>> 	unsigned num_allocated_pages;
>>> +	int order = MAX_ORDER - 1;
>>> 	unsigned num_pfns;
>>> 	struct page *page;
>>> 	LIST_HEAD(pages);
>>> @@ -212,9 +213,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>> 	/* We can only do one array worth at a time. */
>>> 	num = min(num, ARRAY_SIZE(vb->pfns));
>>> 
>>> +	/*
>>> +	 * Note: we will currently never allocate more than 1MB due to the
>>> +	 * pfn array size, so we will not allocate MAX_ORDER - 1 ...
>>> +	 */
>>> +
>>> 	for (num_pfns = 0; num_pfns < num;
>>> -	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
>>> -		struct page *page = balloon_page_alloc();
>>> +	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order)) {
>>> +		const unsigned long remaining = num - num_pfns;
>>> +
>>> +		order = MIN(order,
>>> +			    get_order(remaining << VIRTIO_BALLOON_PFN_SHIFT));
>>> +		if ((1 << order) * VIRTIO_BALLOON_PAGES_PER_PAGE > remaining)
>>> +			order--;
>>> +		page = balloon_pages_alloc(order);
>>> 
>>> 		if (!page) {
>>> 			dev_info_ratelimited(&vb->vdev->dev,
>>> @@ -225,6 +237,8 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>> 			break;
>>> 		}
>>> 
>>> +		/* Continue with the actual order that succeeded. */
>>> +		order = page_private(page);
>>> 		balloon_page_push(&pages, page);
>>> 	}
>>> 
>>> @@ -233,14 +247,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>> 	vb->num_pfns = 0;
>>> 
>>> 	while ((page = balloon_page_pop(&pages))) {
>>> +		order = page_order(page);
>>> +		/* enqueuing will split the page and clear the order */
>>> 		balloon_page_enqueue(&vb->vb_dev_info, page);
>>> 
>>> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
>>> -		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
>>> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, order);
>>> +		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>>> 		if (!virtio_has_feature(vb->vdev,
>>> 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>>> -			adjust_managed_page_count(page, -1);
>>> -		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
>>> +			adjust_managed_page_count(page, -1 * (1 << order));
>>> +		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>>> 	}
>>> 
>>> 	num_allocated_pages = vb->num_pfns;
>>> @@ -284,7 +300,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
>>> 		page = balloon_page_dequeue(vb_dev_info);
>>> 		if (!page)
>>> 			break;
>>> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
>>> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, 0);
>>> 		list_add(&page->lru, &pages);
>>> 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
>>> 	}
>>> @@ -786,7 +802,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>>> 	__count_vm_event(BALLOON_MIGRATE);
>>> 	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>>> 	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
>>> -	set_page_pfns(vb, vb->pfns, newpage);
>>> +	set_page_pfns(vb, vb->pfns, newpage, 0);
>>> 	tell_host(vb, vb->inflate_vq);
>>> 
>>> 	/* balloon's page migration 2nd step -- deflate "page" */
>>> @@ -794,7 +810,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>>> 	balloon_page_delete(page);
>>> 	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>>> 	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
>>> -	set_page_pfns(vb, vb->pfns, page);
>>> +	set_page_pfns(vb, vb->pfns, page, 0);
>>> 	tell_host(vb, vb->deflate_vq);
>>> 
>>> 	mutex_unlock(&vb->balloon_lock);
>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>>> index 338aa27e4773..ed93fe5704d1 100644
>>> --- a/include/linux/balloon_compaction.h
>>> +++ b/include/linux/balloon_compaction.h
>>> @@ -60,7 +60,7 @@ struct balloon_dev_info {
>>> 	struct inode *inode;
>>> };
>>> 
>>> -extern struct page *balloon_page_alloc(void);
>>> +extern struct page *balloon_pages_alloc(int order);
>>> extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
>>> 				 struct page *page);
>>> extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
>>> @@ -78,6 +78,11 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
>>> 	balloon->inode = NULL;
>>> }
>>> 
>>> +static inline struct page *balloon_page_alloc(void)
>>> +{
>>> +	return balloon_pages_alloc(0);
>>> +}
>>> +
>>> #ifdef CONFIG_BALLOON_COMPACTION
>>> extern const struct address_space_operations balloon_aops;
>>> extern bool balloon_page_isolate(struct page *page,
>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>> index 26de020aae7b..067810b32813 100644
>>> --- a/mm/balloon_compaction.c
>>> +++ b/mm/balloon_compaction.c
>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>> EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>> 
>>> /*
>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>> - *			page list.
>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>> + * 			 for insertion into the balloon page list.
>>>  *
>>>  * Driver must call this function to properly allocate a new balloon page.
>>>  * Driver must call balloon_page_enqueue before definitively removing the page
>>>  * from the guest system.
>>>  *
>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>> + * allocated page is stored in page->private.
>>> + *
>>>  * Return: struct page for the allocated page or NULL on allocation failure.
>>>  */
>>> -struct page *balloon_page_alloc(void)
>>> +struct page *balloon_pages_alloc(int order)
>>> {
>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>> -				       __GFP_NOWARN);
>>> -	return page;
>>> +	struct page *page;
>>> +
>>> +	while (order >= 0) {
>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>> +				   __GFP_NOWARN, order);
>>> +		if (page) {
>>> +			set_page_private(page, order);
>>> +			return page;
>>> +		}
>>> +		order--;
>>> +	}
>>> +	return NULL;
>>> }
>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>> 
>>> /*
>>>  * balloon_page_enqueue - inserts a new page into the balloon page list.
>> 
>> 
>> I think this will try to invoke direct reclaim from the first iteration
>> to free up the max order.
> 
> %__GFP_NORETRY: The VM implementation will try only very lightweight
> memory direct reclaim to get some memory under memory pressure (thus it
> can sleep). It will avoid disruptive actions like OOM killer.
> 
> Certainly good enough for a first version I would say, no? Looking at
> the vmware balloon, they don't even set __GFP_NORETRY.

Yes, it does seem that we are missing __GFP_NORETRY. I really do not know
what I was thinking when I did not add it for huge-pages allocation. I will
send a patch. Thanks for noticing :)

In regard to your patch, I would be happy to consolidate the allocation
mechanisms, so VMware balloon driver would also use your code. In general
your code looks good, apart from some style issues.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:09                         ` David Hildenbrand
  2020-03-31 14:18                           ` Michael S. Tsirkin
@ 2020-03-31 16:37                           ` Nadav Amit
  2020-04-01  9:48                             ` David Hildenbrand
  1 sibling, 1 reply; 32+ messages in thread
From: Nadav Amit @ 2020-03-31 16:37 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Michael S. Tsirkin, pagupta, Alexander Duyck, qemu-devel, mojha,
	LKML, Linux Virtualization, Hui Zhu, Andrew Morton, jasowang,
	Hui Zhu

> On Mar 31, 2020, at 7:09 AM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 31.03.20 16:07, Michael S. Tsirkin wrote:
>> On Tue, Mar 31, 2020 at 04:03:18PM +0200, David Hildenbrand wrote:
>>> On 31.03.20 15:37, Michael S. Tsirkin wrote:
>>>> On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
>>>>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>>>>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>>>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>>>>>> 
>>>>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>>>>>> performance evaluation.
>>>>>>>>>>>> 
>>>>>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>>>>>> 
>>>>>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>>>>>> we do not want to stress the system but if a large
>>>>>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>>>>>> 
>>>>>>>>>>>> 
>>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>>>>> Along the lines of
>>>>>>>>>>>> 
>>>>>>>>>>>>   struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>>>>                   unsigned int max_order, unsigned int *order)
>>>>>>>>>>>> 
>>>>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>>>>> range.
>>>>>>>>>>>> 
>>>>>>>>>>>> What do you think? Want to try implementing that?
>>>>>>>>>>> 
>>>>>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>>>>>> kernel API.
>>>>>>>>>> 
>>>>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>>>>>> we would like to get the larger order that has become available
>>>>>>>>>> meanwhile.
>>>>>>>>> 
>>>>>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>>>>> 
>>>>>>>> Well how do you propose implement the necessary semantics?
>>>>>>>> I think we are both agreed that alloc_page_range is more or
>>>>>>>> less what's necessary anyway - so how would you approximate it
>>>>>>>> on top of existing APIs?
>>>>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>>>> 
>>>> .....
>>>> 
>>>> 
>>>>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>>>>>> index 26de020aae7b..067810b32813 100644
>>>>>>> --- a/mm/balloon_compaction.c
>>>>>>> +++ b/mm/balloon_compaction.c
>>>>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>>>>>> EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>>>>>> 
>>>>>>> /*
>>>>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>>>>>> - *			page list.
>>>>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>>>>>> + * 			 for insertion into the balloon page list.
>>>>>>>  *
>>>>>>>  * Driver must call this function to properly allocate a new balloon page.
>>>>>>>  * Driver must call balloon_page_enqueue before definitively removing the page
>>>>>>>  * from the guest system.
>>>>>>>  *
>>>>>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>>>>>> + * allocated page is stored in page->private.
>>>>>>> + *
>>>>>>>  * Return: struct page for the allocated page or NULL on allocation failure.
>>>>>>>  */
>>>>>>> -struct page *balloon_page_alloc(void)
>>>>>>> +struct page *balloon_pages_alloc(int order)
>>>>>>> {
>>>>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>>>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>>> -				       __GFP_NOWARN);
>>>>>>> -	return page;
>>>>>>> +	struct page *page;
>>>>>>> +
>>>>>>> +	while (order >= 0) {
>>>>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>>>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>>> +				   __GFP_NOWARN, order);
>>>>>>> +		if (page) {
>>>>>>> +			set_page_private(page, order);
>>>>>>> +			return page;
>>>>>>> +		}
>>>>>>> +		order--;
>>>>>>> +	}
>>>>>>> +	return NULL;
>>>>>>> }
>>>>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>>>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>>>>>> 
>>>>>>> /*
>>>>>>>  * balloon_page_enqueue - inserts a new page into the balloon page list.
>>>>>> 
>>>>>> 
>>>>>> I think this will try to invoke direct reclaim from the first iteration
>>>>>> to free up the max order.
>>>>> 
>>>>> %__GFP_NORETRY: The VM implementation will try only very lightweight
>>>>> memory direct reclaim to get some memory under memory pressure (thus it
>>>>> can sleep). It will avoid disruptive actions like OOM killer.
>>>>> 
>>>>> Certainly good enough for a first version I would say, no?
>>>> 
>>>> Frankly how well that behaves would depend a lot on the workload.
>>>> Can regress just as well.
>>>> 
>>>> For the 1st version I'd prefer something that is the least disruptive,
>>>> and that IMHO means we only trigger reclaim at all in the same configuration
>>>> as now - when we can't satisfy the lowest order allocation.
>>> 
>>> Agreed.
>>> 
>>>> Anything else would be a huge amount of testing with all kind of
>>>> workloads.
>>> 
>>> So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
>>> GFP_TRANSHUGE_LIGHT)
>> 
>> That will improve the situation when reclaim is not needed, but leave
>> the problem in place for when it's needed: if reclaim does trigger, we
>> can get a huge free page and immediately break it up.
>> 
>> So it's ok as a first step but it will make the second step harder as
>> we'll need to test with reclaim :).
> 
> I expect the whole "steal huge pages from your guest" to be problematic,
> as I already mentioned to Alex. This needs a performance evaluation.
> 
> This all smells like a lot of workload dependent fine-tuning. :)

AFAIK the hardware overheads of keeping huge-pages in the guest and backing
them with 4KB pages are non-negligible. Did you take those into account?


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 16:37                           ` Nadav Amit
@ 2020-04-01  9:48                             ` David Hildenbrand
  2020-04-02  4:02                               ` teawater
  0 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-04-01  9:48 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Michael S. Tsirkin, pagupta, Alexander Duyck, qemu-devel, mojha,
	LKML, Linux Virtualization, Hui Zhu, Andrew Morton, jasowang,
	Hui Zhu

On 31.03.20 18:37, Nadav Amit wrote:
>> On Mar 31, 2020, at 7:09 AM, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 31.03.20 16:07, Michael S. Tsirkin wrote:
>>> On Tue, Mar 31, 2020 at 04:03:18PM +0200, David Hildenbrand wrote:
>>>> On 31.03.20 15:37, Michael S. Tsirkin wrote:
>>>>> On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
>>>>>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>>>>>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>>>>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>>>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>>>>>>>
>>>>>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>>>>>>> performance evaluation.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>>>>>>>
>>>>>>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>>>>>>> we do not want to stress the system but if a large
>>>>>>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>>>>>> Along the lines of
>>>>>>>>>>>>>
>>>>>>>>>>>>>   struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>>>>>                   unsigned int max_order, unsigned int *order)
>>>>>>>>>>>>>
>>>>>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>>>>>> range.
>>>>>>>>>>>>>
>>>>>>>>>>>>> What do you think? Want to try implementing that?
>>>>>>>>>>>>
>>>>>>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>>>>>>> kernel API.
>>>>>>>>>>>
>>>>>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>>>>>>> we would like to get the larger order that has become available
>>>>>>>>>>> meanwhile.
>>>>>>>>>>
>>>>>>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>>>>>>
>>>>>>>>> Well how do you propose implement the necessary semantics?
>>>>>>>>> I think we are both agreed that alloc_page_range is more or
>>>>>>>>> less what's necessary anyway - so how would you approximate it
>>>>>>>>> on top of existing APIs?
>>>>>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>>>>>
>>>>> .....
>>>>>
>>>>>
>>>>>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>>>>>>> index 26de020aae7b..067810b32813 100644
>>>>>>>> --- a/mm/balloon_compaction.c
>>>>>>>> +++ b/mm/balloon_compaction.c
>>>>>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>>>>>>> EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>>>>>>>
>>>>>>>> /*
>>>>>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>>>>>>> - *			page list.
>>>>>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>>>>>>> + * 			 for insertion into the balloon page list.
>>>>>>>>  *
>>>>>>>>  * Driver must call this function to properly allocate a new balloon page.
>>>>>>>>  * Driver must call balloon_page_enqueue before definitively removing the page
>>>>>>>>  * from the guest system.
>>>>>>>>  *
>>>>>>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>>>>>>> + * allocated page is stored in page->private.
>>>>>>>> + *
>>>>>>>>  * Return: struct page for the allocated page or NULL on allocation failure.
>>>>>>>>  */
>>>>>>>> -struct page *balloon_page_alloc(void)
>>>>>>>> +struct page *balloon_pages_alloc(int order)
>>>>>>>> {
>>>>>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>>>>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>>>> -				       __GFP_NOWARN);
>>>>>>>> -	return page;
>>>>>>>> +	struct page *page;
>>>>>>>> +
>>>>>>>> +	while (order >= 0) {
>>>>>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>>>>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>>>> +				   __GFP_NOWARN, order);
>>>>>>>> +		if (page) {
>>>>>>>> +			set_page_private(page, order);
>>>>>>>> +			return page;
>>>>>>>> +		}
>>>>>>>> +		order--;
>>>>>>>> +	}
>>>>>>>> +	return NULL;
>>>>>>>> }
>>>>>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>>>>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>>>>>>>
>>>>>>>> /*
>>>>>>>>  * balloon_page_enqueue - inserts a new page into the balloon page list.
>>>>>>>
>>>>>>>
>>>>>>> I think this will try to invoke direct reclaim from the first iteration
>>>>>>> to free up the max order.
>>>>>>
>>>>>> %__GFP_NORETRY: The VM implementation will try only very lightweight
>>>>>> memory direct reclaim to get some memory under memory pressure (thus it
>>>>>> can sleep). It will avoid disruptive actions like OOM killer.
>>>>>>
>>>>>> Certainly good enough for a first version I would say, no?
>>>>>
>>>>> Frankly how well that behaves would depend a lot on the workload.
>>>>> Can regress just as well.
>>>>>
>>>>> For the 1st version I'd prefer something that is the least disruptive,
>>>>> and that IMHO means we only trigger reclaim at all in the same configuration
>>>>> as now - when we can't satisfy the lowest order allocation.
>>>>
>>>> Agreed.
>>>>
>>>>> Anything else would be a huge amount of testing with all kind of
>>>>> workloads.
>>>>
>>>> So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
>>>> GFP_TRANSHUGE_LIGHT)
>>>
>>> That will improve the situation when reclaim is not needed, but leave
>>> the problem in place for when it's needed: if reclaim does trigger, we
>>> can get a huge free page and immediately break it up.
>>>
>>> So it's ok as a first step but it will make the second step harder as
>>> we'll need to test with reclaim :).
>>
>> I expect the whole "steal huge pages from your guest" to be problematic,
>> as I already mentioned to Alex. This needs a performance evaluation.
>>
>> This all smells like a lot of workload dependent fine-tuning. :)
> 
> AFAIK the hardware overheads of keeping huge-pages in the guest and backing
> them with 4KB pages are non-negligible. Did you take those into account?

Of course, the fastest mapping will be huge pages in host and guest.
Having huge pages in your guest but not in your host cannot really be
solved using ballooning AFAIK. Hopefully THP in the host will be doing
its job properly :)

... however, so far, we haven't done any performance comparisons at all.
The only numbers from Hui Zhu that I can spot are the number of THPs in the
host, which does not really express actual guest performance IMHO. That
definitely has to be done to evaluate the different optimizations we
might want to try out.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 16:27                   ` Nadav Amit
@ 2020-04-01 11:21                     ` David Hildenbrand
  0 siblings, 0 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-04-01 11:21 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Michael S. Tsirkin, pagupta, Alexander Duyck, qemu-devel, mojha,
	LKML, Linux Virtualization, Hui Zhu, akpm, jasowang, Hui Zhu

On 31.03.20 18:27, Nadav Amit wrote:
>> On Mar 31, 2020, at 6:32 AM, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>>>
>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>>> performance evaluation.
>>>>>>>>>
>>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>>>
>>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>>> we do not want to stress the system but if a large
>>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>> Along the lines of
>>>>>>>>>
>>>>>>>>>   struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>                   unsigned int max_order, unsigned int *order)
>>>>>>>>>
>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>> range.
>>>>>>>>>
>>>>>>>>> What do you think? Want to try implementing that?
>>>>>>>>
>>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>>> kernel API.
>>>>>>>
>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>>> we would like to get the larger order that has become available
>>>>>>> meanwhile.
>>>>>>
>>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>>
>>>>> Well how do you propose implement the necessary semantics?
>>>>> I think we are both agreed that alloc_page_range is more or
>>>>> less what's necessary anyway - so how would you approximate it
>>>>> on top of existing APIs?
>>>>
>>>> Looking at drivers/misc/vmw_balloon.c:vmballoon_inflate(), it first
>>>> tries to allocate huge pages using
>>>>
>>>> 	alloc_pages(__GFP_HIGHMEM|__GFP_NOWARN| __GFP_NOMEMALLOC, 
>>>>                    VMW_BALLOON_2M_ORDER)
>>>>
>>>> And then falls back to 4k allocations (balloon_page_alloc()) in case
>>>> allocation fails.
>>>>
>>>> I'm roughly thinking of something like the following, but with an
>>>> optimized reporting interface/bigger pfn array so we can report >
>>>> 1MB at a time. Also, it might make sense to remember the order that
>>>> succeeded across some fill_balloon() calls.
>>>>
>>>> Don't even expect it to compile ...
>>>>
>>>>
>>>>
>>>>> From 4305f989672ccca4be9293e6d4167e929f3e299b Mon Sep 17 00:00:00 2001
>>>> From: David Hildenbrand <david@redhat.com>
>>>> Date: Tue, 31 Mar 2020 12:28:07 +0200
>>>> Subject: [PATCH RFC] tmp
>>>>
>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>> ---
>>>> drivers/virtio/virtio_balloon.c    | 38 ++++++++++++++++++--------
>>>> include/linux/balloon_compaction.h |  7 ++++-
>>>> mm/balloon_compaction.c            | 43 +++++++++++++++++++++++-------
>>>> 3 files changed, 67 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>>>> index 8511d258dbb4..0660b1b988f0 100644
>>>> --- a/drivers/virtio/virtio_balloon.c
>>>> +++ b/drivers/virtio/virtio_balloon.c
>>>> @@ -187,7 +187,7 @@ int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
>>>> }
>>>>
>>>> static void set_page_pfns(struct virtio_balloon *vb,
>>>> -			  __virtio32 pfns[], struct page *page)
>>>> +			  __virtio32 pfns[], struct page *page, int order)
>>>> {
>>>> 	unsigned int i;
>>>>
>>>> @@ -197,7 +197,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>>>> 	 * Set balloon pfns pointing at this page.
>>>> 	 * Note that the first pfn points at start of the page.
>>>> 	 */
>>>> -	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
>>>> +	for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order); i++)
>>>> 		pfns[i] = cpu_to_virtio32(vb->vdev,
>>>> 					  page_to_balloon_pfn(page) + i);
>>>> }
>>>> @@ -205,6 +205,7 @@ static void set_page_pfns(struct virtio_balloon *vb,
>>>> static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>>> {
>>>> 	unsigned num_allocated_pages;
>>>> +	int order = MAX_ORDER - 1;
>>>> 	unsigned num_pfns;
>>>> 	struct page *page;
>>>> 	LIST_HEAD(pages);
>>>> @@ -212,9 +213,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>>> 	/* We can only do one array worth at a time. */
>>>> 	num = min(num, ARRAY_SIZE(vb->pfns));
>>>>
>>>> +	/*
>>>> +	 * Note: we will currently never allocate more than 1MB due to the
>>>> +	 * pfn array size, so we will not allocate MAX_ORDER - 1 ...
>>>> +	 */
>>>> +
>>>> 	for (num_pfns = 0; num_pfns < num;
>>>> -	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
>>>> -		struct page *page = balloon_page_alloc();
>>>> +	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order)) {
>>>> +		const unsigned long remaining = num - num_pfns;
>>>> +
>>>> +		order = MIN(order,
>>>> +			    get_order(remaining << VIRTIO_BALLOON_PFN_SHIFT));
>>>> +		if ((1 << order) * VIRTIO_BALLOON_PAGES_PER_PAGE > remaining)
>>>> +			order--;
>>>> +		page = balloon_pages_alloc(order);
>>>>
>>>> 		if (!page) {
>>>> 			dev_info_ratelimited(&vb->vdev->dev,
>>>> @@ -225,6 +237,8 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>>> 			break;
>>>> 		}
>>>>
>>>> +		/* Continue with the actual order that succeeded. */
>>>> +		order = page_private(page);
>>>> 		balloon_page_push(&pages, page);
>>>> 	}
>>>>
>>>> @@ -233,14 +247,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
>>>> 	vb->num_pfns = 0;
>>>>
>>>> 	while ((page = balloon_page_pop(&pages))) {
>>>> +		order = page_order(page);
>>>> +		/* enqueuing will split the page and clear the order */
>>>> 		balloon_page_enqueue(&vb->vb_dev_info, page);
>>>>
>>>> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
>>>> -		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
>>>> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, order);
>>>> +		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>>>> 		if (!virtio_has_feature(vb->vdev,
>>>> 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>>>> -			adjust_managed_page_count(page, -1);
>>>> -		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
>>>> +			adjust_managed_page_count(page, -1 * (1 << order));
>>>> +		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE * (1 << order);
>>>> 	}
>>>>
>>>> 	num_allocated_pages = vb->num_pfns;
>>>> @@ -284,7 +300,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
>>>> 		page = balloon_page_dequeue(vb_dev_info);
>>>> 		if (!page)
>>>> 			break;
>>>> -		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
>>>> +		set_page_pfns(vb, vb->pfns + vb->num_pfns, page, 0);
>>>> 		list_add(&page->lru, &pages);
>>>> 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
>>>> 	}
>>>> @@ -786,7 +802,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>>>> 	__count_vm_event(BALLOON_MIGRATE);
>>>> 	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>>>> 	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
>>>> -	set_page_pfns(vb, vb->pfns, newpage);
>>>> +	set_page_pfns(vb, vb->pfns, newpage, 0);
>>>> 	tell_host(vb, vb->inflate_vq);
>>>>
>>>> 	/* balloon's page migration 2nd step -- deflate "page" */
>>>> @@ -794,7 +810,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
>>>> 	balloon_page_delete(page);
>>>> 	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
>>>> 	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
>>>> -	set_page_pfns(vb, vb->pfns, page);
>>>> +	set_page_pfns(vb, vb->pfns, page, 0);
>>>> 	tell_host(vb, vb->deflate_vq);
>>>>
>>>> 	mutex_unlock(&vb->balloon_lock);
>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>>>> index 338aa27e4773..ed93fe5704d1 100644
>>>> --- a/include/linux/balloon_compaction.h
>>>> +++ b/include/linux/balloon_compaction.h
>>>> @@ -60,7 +60,7 @@ struct balloon_dev_info {
>>>> 	struct inode *inode;
>>>> };
>>>>
>>>> -extern struct page *balloon_page_alloc(void);
>>>> +extern struct page *balloon_pages_alloc(int order);
>>>> extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
>>>> 				 struct page *page);
>>>> extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
>>>> @@ -78,6 +78,11 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
>>>> 	balloon->inode = NULL;
>>>> }
>>>>
>>>> +static inline struct page *balloon_page_alloc(void)
>>>> +{
>>>> +	return balloon_pages_alloc(0);
>>>> +}
>>>> +
>>>> #ifdef CONFIG_BALLOON_COMPACTION
>>>> extern const struct address_space_operations balloon_aops;
>>>> extern bool balloon_page_isolate(struct page *page,
>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>>> index 26de020aae7b..067810b32813 100644
>>>> --- a/mm/balloon_compaction.c
>>>> +++ b/mm/balloon_compaction.c
>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>>> EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>>>
>>>> /*
>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>>> - *			page list.
>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>>> + * 			 for insertion into the balloon page list.
>>>>  *
>>>>  * Driver must call this function to properly allocate a new balloon page.
>>>>  * Driver must call balloon_page_enqueue before definitively removing the page
>>>>  * from the guest system.
>>>>  *
>>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>>> + * allocated page is stored in page->private.
>>>> + *
>>>>  * Return: struct page for the allocated page or NULL on allocation failure.
>>>>  */
>>>> -struct page *balloon_page_alloc(void)
>>>> +struct page *balloon_pages_alloc(int order)
>>>> {
>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>> -				       __GFP_NOWARN);
>>>> -	return page;
>>>> +	struct page *page;
>>>> +
>>>> +	while (order >= 0) {
>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>> +				   __GFP_NOWARN, order);
>>>> +		if (page) {
>>>> +			set_page_private(page, order);
>>>> +			return page;
>>>> +		}
>>>> +		order--;
>>>> +	}
>>>> +	return NULL;
>>>> }
>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>>>
>>>> /*
>>>>  * balloon_page_enqueue - inserts a new page into the balloon page list.
>>>
>>>
>>> I think this will try to invoke direct reclaim from the first iteration
>>> to free up the max order.
>>
>> %__GFP_NORETRY: The VM implementation will try only very lightweight
>> memory direct reclaim to get some memory under memory pressure (thus it
>> can sleep). It will avoid disruptive actions like OOM killer.
>>
>> Certainly good enough for a first version I would say, no? Looking at
>> the vmware balloon, they don't even set __GFP_NORETRY.
> 
> Yes, it does seem that we are missing __GFP_NORETRY. I really do not know
> what I was thinking when I did not add it for huge-pages allocation. I will
> send a patch. Thanks for noticing :)
> 
> In regard to your patch, I would be happy to consolidate the allocation
> mechanisms, so VMware balloon driver would also use your code. In general
> your code looks good, take-away some style issues.

Yeah, let's see in which direction we'll be bringing
balloon_page_alloc(), I think there are still some questions to be
answered (mostly performance implications).

Cheers!

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-04-01  9:48                             ` David Hildenbrand
@ 2020-04-02  4:02                               ` teawater
  0 siblings, 0 replies; 32+ messages in thread
From: teawater @ 2020-04-02  4:02 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Nadav Amit, Michael S. Tsirkin, pagupta, Alexander Duyck,
	qemu-devel, mojha, LKML, Linux Virtualization, Andrew Morton,
	jasowang, Hui Zhu



> 2020年4月1日 17:48,David Hildenbrand <david@redhat.com> 写道:
> 
> On 31.03.20 18:37, Nadav Amit wrote:
>>> On Mar 31, 2020, at 7:09 AM, David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> On 31.03.20 16:07, Michael S. Tsirkin wrote:
>>>> On Tue, Mar 31, 2020 at 04:03:18PM +0200, David Hildenbrand wrote:
>>>>> On 31.03.20 15:37, Michael S. Tsirkin wrote:
>>>>>> On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
>>>>>>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>>>>>>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>>>>>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>>>>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>>>>>>>> 
>>>>>>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>>>>>>>> performance evaluation.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>>>>>>>> we do not want to stress the system but if a large
>>>>>>>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>>>>>>> Along the lines of
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>>  struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>>>>>>                  unsigned int max_order, unsigned int *order)
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>>>>>>> range.
>>>>>>>>>>>>>> 
>>>>>>>>>>>>>> What do you think? Want to try implementing that?
>>>>>>>>>>>>> 
>>>>>>>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>>>>>>>> kernel API.
>>>>>>>>>>>> 
>>>>>>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>>>>>>>> we would like to get the larger order that has become available
>>>>>>>>>>>> meanwhile.
>>>>>>>>>>> 
>>>>>>>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>>>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>>>>>>> 
>>>>>>>>>> Well how do you propose implement the necessary semantics?
>>>>>>>>>> I think we are both agreed that alloc_page_range is more or
>>>>>>>>>> less what's necessary anyway - so how would you approximate it
>>>>>>>>>> on top of existing APIs?
>>>>>>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>>>>>> 
>>>>>> .....
>>>>>> 
>>>>>> 
>>>>>>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>>>>>>>> index 26de020aae7b..067810b32813 100644
>>>>>>>>> --- a/mm/balloon_compaction.c
>>>>>>>>> +++ b/mm/balloon_compaction.c
>>>>>>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>>>>>>>> EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>>>>>>>> 
>>>>>>>>> /*
>>>>>>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>>>>>>>> - *			page list.
>>>>>>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>>>>>>>> + * 			 for insertion into the balloon page list.
>>>>>>>>> *
>>>>>>>>> * Driver must call this function to properly allocate a new balloon page.
>>>>>>>>> * Driver must call balloon_page_enqueue before definitively removing the page
>>>>>>>>> * from the guest system.
>>>>>>>>> *
>>>>>>>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>>>>>>>> + * allocated page is stored in page->private.
>>>>>>>>> + *
>>>>>>>>> * Return: struct page for the allocated page or NULL on allocation failure.
>>>>>>>>> */
>>>>>>>>> -struct page *balloon_page_alloc(void)
>>>>>>>>> +struct page *balloon_pages_alloc(int order)
>>>>>>>>> {
>>>>>>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>>>>>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>>>>> -				       __GFP_NOWARN);
>>>>>>>>> -	return page;
>>>>>>>>> +	struct page *page;
>>>>>>>>> +
>>>>>>>>> +	while (order >= 0) {
>>>>>>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>>>>>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>>>>> +				   __GFP_NOWARN, order);
>>>>>>>>> +		if (page) {
>>>>>>>>> +			set_page_private(page, order);
>>>>>>>>> +			return page;
>>>>>>>>> +		}
>>>>>>>>> +		order--;
>>>>>>>>> +	}
>>>>>>>>> +	return NULL;
>>>>>>>>> }
>>>>>>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>>>>>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>>>>>>>> 
>>>>>>>>> /*
>>>>>>>>> * balloon_page_enqueue - inserts a new page into the balloon page list.
>>>>>>>> 
>>>>>>>> 
>>>>>>>> I think this will try to invoke direct reclaim from the first iteration
>>>>>>>> to free up the max order.
>>>>>>> 
>>>>>>> %__GFP_NORETRY: The VM implementation will try only very lightweight
>>>>>>> memory direct reclaim to get some memory under memory pressure (thus it
>>>>>>> can sleep). It will avoid disruptive actions like OOM killer.
>>>>>>> 
>>>>>>> Certainly good enough for a first version I would say, no?
>>>>>> 
>>>>>> Frankly how well that behaves would depend a lot on the workload.
>>>>>> Can regress just as well.
>>>>>> 
>>>>>> For the 1st version I'd prefer something that is the least disruptive,
>>>>>> and that IMHO means we only trigger reclaim at all in the same configuration
>>>>>> as now - when we can't satisfy the lowest order allocation.
>>>>> 
>>>>> Agreed.
>>>>> 
>>>>>> Anything else would be a huge amount of testing with all kind of
>>>>>> workloads.
>>>>> 
>>>>> So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
>>>>> GFP_TRANSHUGE_LIGHT)
>>>> 
>>>> That will improve the situation when reclaim is not needed, but leave
>>>> the problem in place for when it's needed: if reclaim does trigger, we
>>>> can get a huge free page and immediately break it up.
>>>> 
>>>> So it's ok as a first step but it will make the second step harder as
>>>> we'll need to test with reclaim :).
>>> 
>>> I expect the whole "steal huge pages from your guest" to be problematic,
>>> as I already mentioned to Alex. This needs a performance evaluation.
>>> 
>>> This all smells like a lot of workload dependent fine-tuning. :)
>> 
>> AFAIK the hardware overheads of keeping huge-pages in the guest and backing
>> them with 4KB pages are non-negligible. Did you take those into account?
> 
> Of course, the fastest mapping will be huge pages in host and guest.
> Having huge pages in your guest but not in your host cannot really be
> solved using ballooning AFAIKs. Hopefully THP in the host will be doing
> its job properly :)
> 
> ... however, so far, we haven't done any performance comparisons at all.
> The only numbers from Hui Zhu that I can spot are number of THP in the
> host, which is not really expressing actual guest performance IMHO. That
> definitely has to be done to evaluate the different optimizations we
> might want to try out.
> 

I did some tests with vm-scalability on Monday comparing their performance in VM:
//4 processes random r/w
usemem -R -a -Z  -n 4 1g

write:
hugepage: 146367 KB/s
thp:	  133550 KB/s
normal:   124248 KB/s

read:
hugepage: 103969 KB/s
thp:	  100622 KB/s
normal:   88755 KB/s

Best,
Hui


> -- 
> Thanks,
> 
> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-03-31 14:07                       ` Michael S. Tsirkin
  2020-03-31 14:09                         ` David Hildenbrand
@ 2020-04-02  8:00                         ` teawater
  2020-04-02 12:37                           ` Michael S. Tsirkin
  1 sibling, 1 reply; 32+ messages in thread
From: teawater @ 2020-04-02  8:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Hildenbrand, Hui Zhu, Jason Wang, Andrew Morton, pagupta,
	mojha, namit, virtualization, linux-kernel, qemu-devel,
	Alexander Duyck



> 2020年3月31日 22:07,Michael S. Tsirkin <mst@redhat.com> 写道:
> 
> On Tue, Mar 31, 2020 at 04:03:18PM +0200, David Hildenbrand wrote:
>> On 31.03.20 15:37, Michael S. Tsirkin wrote:
>>> On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
>>>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
>>>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
>>>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
>>>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
>>>>>>>>> 
>>>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
>>>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
>>>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
>>>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
>>>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
>>>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
>>>>>>>>>>>> performance degradation for some workloads. This needs a proper
>>>>>>>>>>>> performance evaluation.
>>>>>>>>>>> 
>>>>>>>>>>> I think the problem is more with the alloc_pages API.
>>>>>>>>>>> That gives you exactly the given order, and if there's
>>>>>>>>>>> a larger chunk available, it will split it up.
>>>>>>>>>>> 
>>>>>>>>>>> But for balloon - I suspect lots of other users,
>>>>>>>>>>> we do not want to stress the system but if a large
>>>>>>>>>>> chunk is available anyway, then we could handle
>>>>>>>>>>> that more optimally by getting it all in one go.
>>>>>>>>>>> 
>>>>>>>>>>> 
>>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
>>>>>>>>>>> Along the lines of
>>>>>>>>>>> 
>>>>>>>>>>> struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
>>>>>>>>>>>                 unsigned int max_order, unsigned int *order)
>>>>>>>>>>> 
>>>>>>>>>>> the idea would then be to return at a number of pages in the given
>>>>>>>>>>> range.
>>>>>>>>>>> 
>>>>>>>>>>> What do you think? Want to try implementing that?
>>>>>>>>>> 
>>>>>>>>>> You can just start with the highest order and decrement the order until
>>>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
>>>>>>>>>> a first version. At least I don't see the immediate need for a new
>>>>>>>>>> kernel API.
>>>>>>>>> 
>>>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
>>>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
>>>>>>>>> we would like to get the larger order that has become available
>>>>>>>>> meanwhile.
>>>>>>>>> 
>>>>>>>> 
>>>>>>>> Yes, but that‘s a pure optimization IMHO.
>>>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
>>>>>>>> 
>>>>>>> 
>>>>>>> Well how do you propose implement the necessary semantics?
>>>>>>> I think we are both agreed that alloc_page_range is more or
>>>>>>> less what's necessary anyway - so how would you approximate it
>>>>>>> on top of existing APIs?
>>>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
>>> 
>>> .....
>>> 
>>> 
>>>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
>>>>>> index 26de020aae7b..067810b32813 100644
>>>>>> --- a/mm/balloon_compaction.c
>>>>>> +++ b/mm/balloon_compaction.c
>>>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
>>>>>> EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
>>>>>> 
>>>>>> /*
>>>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
>>>>>> - *			page list.
>>>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
>>>>>> + * 			 for insertion into the balloon page list.
>>>>>> *
>>>>>> * Driver must call this function to properly allocate a new balloon page.
>>>>>> * Driver must call balloon_page_enqueue before definitively removing the page
>>>>>> * from the guest system.
>>>>>> *
>>>>>> + * Will fall back to smaller orders if allocation fails. The order of the
>>>>>> + * allocated page is stored in page->private.
>>>>>> + *
>>>>>> * Return: struct page for the allocated page or NULL on allocation failure.
>>>>>> */
>>>>>> -struct page *balloon_page_alloc(void)
>>>>>> +struct page *balloon_pages_alloc(int order)
>>>>>> {
>>>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
>>>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>> -				       __GFP_NOWARN);
>>>>>> -	return page;
>>>>>> +	struct page *page;
>>>>>> +
>>>>>> +	while (order >= 0) {
>>>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
>>>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
>>>>>> +				   __GFP_NOWARN, order);
>>>>>> +		if (page) {
>>>>>> +			set_page_private(page, order);
>>>>>> +			return page;
>>>>>> +		}
>>>>>> +		order--;
>>>>>> +	}
>>>>>> +	return NULL;
>>>>>> }
>>>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
>>>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
>>>>>> 
>>>>>> /*
>>>>>> * balloon_page_enqueue - inserts a new page into the balloon page list.
>>>>> 
>>>>> 
>>>>> I think this will try to invoke direct reclaim from the first iteration
>>>>> to free up the max order.
>>>> 
>>>> %__GFP_NORETRY: The VM implementation will try only very lightweight
>>>> memory direct reclaim to get some memory under memory pressure (thus it
>>>> can sleep). It will avoid disruptive actions like OOM killer.
>>>> 
>>>> Certainly good enough for a first version I would say, no?
>>> 
>>> Frankly how well that behaves would depend a lot on the workload.
>>> Can regress just as well.
>>> 
>>> For the 1st version I'd prefer something that is the least disruptive,
>>> and that IMHO means we only trigger reclaim at all in the same configuration
>>> as now - when we can't satisfy the lowest order allocation.
>> 
>> Agreed.
>> 
>>> 
>>> Anything else would be a huge amount of testing with all kind of
>>> workloads.
>>> 
>> 
>> So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
>> GFP_TRANSHUGE_LIGHT)
> 
> That will improve the situation when reclaim is not needed, but leave
> the problem in place for when it's needed: if reclaim does trigger, we
> can get a huge free page and immediately break it up.
> 
> So it's ok as a first step but it will make the second step harder as
> we'll need to test with reclaim :).


I worry that this will increase the allocation failure rate for large pages.

I tried allocating 2M of memory without __GFP_RECLAIM when I wrote the first version of VIRTIO_BALLOON_F_THP_ORDER.
The allocation fails when I use the usemem punch-holes function to generate 400m of fragmented pages in the guest kernel.

What about adding another option to the balloon to control whether __GFP_RECLAIM is used or not?

Best,
Hui

> 
> 
>> -- 
>> Thanks,
>> 
>> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue
  2020-04-02  8:00                         ` teawater
@ 2020-04-02 12:37                           ` Michael S. Tsirkin
  0 siblings, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2020-04-02 12:37 UTC (permalink / raw)
  To: teawater
  Cc: David Hildenbrand, Hui Zhu, Jason Wang, Andrew Morton, pagupta,
	mojha, namit, virtualization, linux-kernel, qemu-devel,
	Alexander Duyck

On Thu, Apr 02, 2020 at 04:00:05PM +0800, teawater wrote:
> 
> 
> > 2020年3月31日 22:07,Michael S. Tsirkin <mst@redhat.com> 写道:
> > 
> > On Tue, Mar 31, 2020 at 04:03:18PM +0200, David Hildenbrand wrote:
> >> On 31.03.20 15:37, Michael S. Tsirkin wrote:
> >>> On Tue, Mar 31, 2020 at 03:32:05PM +0200, David Hildenbrand wrote:
> >>>> On 31.03.20 15:24, Michael S. Tsirkin wrote:
> >>>>> On Tue, Mar 31, 2020 at 12:35:24PM +0200, David Hildenbrand wrote:
> >>>>>> On 26.03.20 10:49, Michael S. Tsirkin wrote:
> >>>>>>> On Thu, Mar 26, 2020 at 08:54:04AM +0100, David Hildenbrand wrote:
> >>>>>>>> 
> >>>>>>>> 
> >>>>>>>>> Am 26.03.2020 um 08:21 schrieb Michael S. Tsirkin <mst@redhat.com>:
> >>>>>>>>> 
> >>>>>>>>> On Thu, Mar 12, 2020 at 09:51:25AM +0100, David Hildenbrand wrote:
> >>>>>>>>>>> On 12.03.20 09:47, Michael S. Tsirkin wrote:
> >>>>>>>>>>> On Thu, Mar 12, 2020 at 09:37:32AM +0100, David Hildenbrand wrote:
> >>>>>>>>>>>> 2. You are essentially stealing THPs in the guest. So the fastest
> >>>>>>>>>>>> mapping (THP in guest and host) is gone. The guest won't be able to make
> >>>>>>>>>>>> use of THP where it previously was able to. I can imagine this implies a
> >>>>>>>>>>>> performance degradation for some workloads. This needs a proper
> >>>>>>>>>>>> performance evaluation.
> >>>>>>>>>>> 
> >>>>>>>>>>> I think the problem is more with the alloc_pages API.
> >>>>>>>>>>> That gives you exactly the given order, and if there's
> >>>>>>>>>>> a larger chunk available, it will split it up.
> >>>>>>>>>>> 
> >>>>>>>>>>> But for balloon - I suspect lots of other users,
> >>>>>>>>>>> we do not want to stress the system but if a large
> >>>>>>>>>>> chunk is available anyway, then we could handle
> >>>>>>>>>>> that more optimally by getting it all in one go.
> >>>>>>>>>>> 
> >>>>>>>>>>> 
> >>>>>>>>>>> So if we want to address this, IMHO this calls for a new API.
> >>>>>>>>>>> Along the lines of
> >>>>>>>>>>> 
> >>>>>>>>>>> struct page *alloc_page_range(gfp_t gfp, unsigned int min_order,
> >>>>>>>>>>>                 unsigned int max_order, unsigned int *order)
> >>>>>>>>>>> 
> >>>>>>>>>>> the idea would then be to return a number of pages in the given
> >>>>>>>>>>> range.
> >>>>>>>>>>> 
> >>>>>>>>>>> What do you think? Want to try implementing that?
> >>>>>>>>>> 
> >>>>>>>>>> You can just start with the highest order and decrement the order until
> >>>>>>>>>> your allocation succeeds using alloc_pages(), which would be enough for
> >>>>>>>>>> a first version. At least I don't see the immediate need for a new
> >>>>>>>>>> kernel API.
> >>>>>>>>> 
> >>>>>>>>> OK I remember now.  The problem is with reclaim. Unless reclaim is
> >>>>>>>>> completely disabled, any of these calls can sleep. After it wakes up,
> >>>>>>>>> we would like to get the larger order that has become available
> >>>>>>>>> meanwhile.
> >>>>>>>>> 
> >>>>>>>> 
> >>>>>>>> Yes, but that's a pure optimization IMHO.
> >>>>>>>> So I think we should do a trivial implementation first and then see what we gain from a new allocator API. Then we might also be able to justify it using real numbers.
> >>>>>>>> 
> >>>>>>> 
> >>>>>>> Well, how do you propose to implement the necessary semantics?
> >>>>>>> I think we are both agreed that alloc_page_range is more or
> >>>>>>> less what's necessary anyway - so how would you approximate it
> >>>>>>> on top of existing APIs?
> >>>>>> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
> >>> 
> >>> .....
> >>> 
> >>> 
> >>>>>> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
> >>>>>> index 26de020aae7b..067810b32813 100644
> >>>>>> --- a/mm/balloon_compaction.c
> >>>>>> +++ b/mm/balloon_compaction.c
> >>>>>> @@ -112,23 +112,35 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
> >>>>>> EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
> >>>>>> 
> >>>>>> /*
> >>>>>> - * balloon_page_alloc - allocates a new page for insertion into the balloon
> >>>>>> - *			page list.
> >>>>>> + * balloon_pages_alloc - allocates a new page (of at most the given order)
> >>>>>> + * 			 for insertion into the balloon page list.
> >>>>>> *
> >>>>>> * Driver must call this function to properly allocate a new balloon page.
> >>>>>> * Driver must call balloon_page_enqueue before definitively removing the page
> >>>>>> * from the guest system.
> >>>>>> *
> >>>>>> + * Will fall back to smaller orders if allocation fails. The order of the
> >>>>>> + * allocated page is stored in page->private.
> >>>>>> + *
> >>>>>> * Return: struct page for the allocated page or NULL on allocation failure.
> >>>>>> */
> >>>>>> -struct page *balloon_page_alloc(void)
> >>>>>> +struct page *balloon_pages_alloc(int order)
> >>>>>> {
> >>>>>> -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
> >>>>>> -				       __GFP_NOMEMALLOC | __GFP_NORETRY |
> >>>>>> -				       __GFP_NOWARN);
> >>>>>> -	return page;
> >>>>>> +	struct page *page;
> >>>>>> +
> >>>>>> +	while (order >= 0) {
> >>>>>> +		page = alloc_pages(balloon_mapping_gfp_mask() |
> >>>>>> +				   __GFP_NOMEMALLOC | __GFP_NORETRY |
> >>>>>> +				   __GFP_NOWARN, order);
> >>>>>> +		if (page) {
> >>>>>> +			set_page_private(page, order);
> >>>>>> +			return page;
> >>>>>> +		}
> >>>>>> +		order--;
> >>>>>> +	}
> >>>>>> +	return NULL;
> >>>>>> }
> >>>>>> -EXPORT_SYMBOL_GPL(balloon_page_alloc);
> >>>>>> +EXPORT_SYMBOL_GPL(balloon_pages_alloc);
> >>>>>> 
> >>>>>> /*
> >>>>>> * balloon_page_enqueue - inserts a new page into the balloon page list.
> >>>>> 
> >>>>> 
> >>>>> I think this will try to invoke direct reclaim from the first iteration
> >>>>> to free up the max order.
> >>>> 
> >>>> %__GFP_NORETRY: The VM implementation will try only very lightweight
> >>>> memory direct reclaim to get some memory under memory pressure (thus it
> >>>> can sleep). It will avoid disruptive actions like OOM killer.
> >>>> 
> >>>> Certainly good enough for a first version I would say, no?
> >>> 
> >>> Frankly how well that behaves would depend a lot on the workload.
> >>> Can regress just as well.
> >>> 
> >>> For the 1st version I'd prefer something that is the least disruptive,
> >>> and that IMHO means we only trigger reclaim at all in the same configuration
> >>> as now - when we can't satisfy the lowest order allocation.
> >> 
> >> Agreed.
> >> 
> >>> 
> >>> Anything else would be a huge amount of testing with all kind of
> >>> workloads.
> >>> 
> >> 
> >> So doing a "& ~__GFP_RECLAIM" in case order > 0? (as done in
> >> GFP_TRANSHUGE_LIGHT)
> > 
> > That will improve the situation when reclaim is not needed, but leave
> > the problem in place for when it's needed: if reclaim does trigger, we
> > can get a huge free page and immediately break it up.
> > 
> > So it's ok as a first step but it will make the second step harder as
> > we'll need to test with reclaim :).
> 
> 
> I worry that this will increase the allocation failure rate for large pages.
> 
> I tried allocating 2M of memory without __GFP_RECLAIM when I wrote the first version of VIRTIO_BALLOON_F_THP_ORDER.
> It fails when I use the usemem punch-holes function to generate 400M of fragmented pages in the guest kernel.
> 
> What about adding another option to the balloon to control whether __GFP_RECLAIM is used or not?
> 
> Best,
> Hui

That is why I suggested a new API so we do not fragment memory.

> > 
> > 
> >> -- 
> >> Thanks,
> >> 
> >> David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2020-04-02 12:37 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-12  7:49 [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue Hui Zhu
2020-03-12  7:49 ` [RFC for QEMU] virtio-balloon: Add option thp-order to set VIRTIO_BALLOON_F_THP_ORDER Hui Zhu
2020-03-12  8:22   ` no-reply
2020-03-12  8:25   ` Michael S. Tsirkin
2020-03-17 10:13     ` teawater
2020-03-26  7:07       ` Michael S. Tsirkin
2020-03-12  8:18 ` [RFC for Linux] virtio_balloon: Add VIRTIO_BALLOON_F_THP_ORDER to handle THP spilt issue Michael S. Tsirkin
2020-03-12  8:37 ` David Hildenbrand
2020-03-12  8:47   ` Michael S. Tsirkin
2020-03-12  8:51     ` David Hildenbrand
2020-03-26  7:10       ` Michael S. Tsirkin
2020-03-26  7:20       ` Michael S. Tsirkin
2020-03-26  7:54         ` David Hildenbrand
2020-03-26  9:49           ` Michael S. Tsirkin
2020-03-31 10:35             ` David Hildenbrand
2020-03-31 13:24               ` Michael S. Tsirkin
2020-03-31 13:32                 ` David Hildenbrand
2020-03-31 13:37                   ` Michael S. Tsirkin
2020-03-31 14:03                     ` David Hildenbrand
2020-03-31 14:07                       ` Michael S. Tsirkin
2020-03-31 14:09                         ` David Hildenbrand
2020-03-31 14:18                           ` Michael S. Tsirkin
2020-03-31 14:29                             ` David Hildenbrand
2020-03-31 14:34                               ` David Hildenbrand
2020-03-31 15:28                                 ` Michael S. Tsirkin
2020-03-31 16:37                           ` Nadav Amit
2020-04-01  9:48                             ` David Hildenbrand
2020-04-02  4:02                               ` teawater
2020-04-02  8:00                         ` teawater
2020-04-02 12:37                           ` Michael S. Tsirkin
2020-03-31 16:27                   ` Nadav Amit
2020-04-01 11:21                     ` David Hildenbrand

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).