linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 1/7] Make place for common balloon code
       [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
@ 2022-10-05  9:01 ` Alexander Atanasov
  2022-10-05  9:01 ` [PATCH v4 2/7] Enable balloon drivers to report inflated memory Alexander Atanasov
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-05  9:01 UTC (permalink / raw)
  To: Michael Ellerman, Nicholas Piggin, Christophe Leroy, Nadav Amit,
	VMware PV-Drivers Reviewers, Arnd Bergmann, Greg Kroah-Hartman,
	Michael S. Tsirkin, David Hildenbrand, Jason Wang, Andrew Morton
  Cc: kernel, Alexander Atanasov, linux-kernel, linuxppc-dev,
	virtualization, linux-mm

File already contains code that is common along balloon
drivers so rename it to reflect its contents.
mm/balloon_compaction.c -> mm/balloon.c
include/linux/balloon_compaction.h -> include/linux/balloon.h
Remove it from files which do not actually use it.
Drop externs from function delcarations.

Signed-off-by: Alexander Atanasov <alexander.atanasov@virtuozzo.com>
Acked-by: Nadav Amit <namit@vmware.com>
---
 MAINTAINERS                                       |  4 ++--
 arch/powerpc/platforms/pseries/cmm.c              |  2 +-
 drivers/misc/vmw_balloon.c                        |  2 +-
 drivers/virtio/virtio_balloon.c                   |  2 +-
 include/linux/{balloon_compaction.h => balloon.h} | 12 +++++-------
 mm/Makefile                                       |  2 +-
 mm/{balloon_compaction.c => balloon.c}            |  4 +---
 mm/migrate.c                                      |  1 -
 mm/vmscan.c                                       |  1 -
 9 files changed, 12 insertions(+), 18 deletions(-)
 rename include/linux/{balloon_compaction.h => balloon.h} (93%)
 rename mm/{balloon_compaction.c => balloon.c} (99%)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8656ab794786..745eb0ada366 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21572,8 +21572,8 @@ L:	virtualization@lists.linux-foundation.org
 S:	Maintained
 F:	drivers/virtio/virtio_balloon.c
 F:	include/uapi/linux/virtio_balloon.h
-F:	include/linux/balloon_compaction.h
-F:	mm/balloon_compaction.c
+F:	include/linux/balloon.h
+F:	mm/balloon.c
 
 VIRTIO CRYPTO DRIVER
 M:	Gonglei <arei.gonglei@huawei.com>
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 5f4037c1d7fe..1d40f6416d6a 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -19,7 +19,7 @@
 #include <linux/stringify.h>
 #include <linux/swap.h>
 #include <linux/device.h>
-#include <linux/balloon_compaction.h>
+#include <linux/balloon.h>
 #include <asm/firmware.h>
 #include <asm/hvcall.h>
 #include <asm/mmu.h>
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 61a2be712bf7..91d4d2a285c5 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -29,7 +29,7 @@
 #include <linux/rwsem.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/balloon_compaction.h>
+#include <linux/balloon.h>
 #include <linux/vmw_vmci_defs.h>
 #include <linux/vmw_vmci_api.h>
 #include <asm/hypervisor.h>
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 3f78a3a1eb75..d0c27c680721 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -13,7 +13,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/balloon_compaction.h>
+#include <linux/balloon.h>
 #include <linux/oom.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon.h
similarity index 93%
rename from include/linux/balloon_compaction.h
rename to include/linux/balloon.h
index 5ca2d5699620..46ac8f61f607 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * include/linux/balloon_compaction.h
- *
  * Common interface definitions for making balloon pages movable by compaction.
  *
  * Balloon page migration makes use of the general non-lru movable page
@@ -59,13 +57,13 @@ struct balloon_dev_info {
 			struct page *page, enum migrate_mode mode);
 };
 
-extern struct page *balloon_page_alloc(void);
-extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
+struct page *balloon_page_alloc(void);
+void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
 				 struct page *page);
-extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
-extern size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
+size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
 				      struct list_head *pages);
-extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
+size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
 				     struct list_head *pages, size_t n_req_pages);
 
 static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
diff --git a/mm/Makefile b/mm/Makefile
index 9a564f836403..550cb0663f50 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -112,7 +112,7 @@ obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_Z3FOLD)	+= z3fold.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
-obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_MEMORY_BALLOON) += balloon.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/balloon_compaction.c b/mm/balloon.c
similarity index 99%
rename from mm/balloon_compaction.c
rename to mm/balloon.c
index 22c96fed70b5..22b3e876bc78 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * mm/balloon_compaction.c
- *
  * Common interface for making balloon pages movable by compaction.
  *
  * Copyright (C) 2012, Red Hat, Inc.  Rafael Aquini <aquini@redhat.com>
@@ -9,7 +7,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <linux/balloon_compaction.h>
+#include <linux/balloon.h>
 
 static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
 				     struct page *page)
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a1597c92261..a4c8bb334dde 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -41,7 +41,6 @@
 #include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
-#include <linux/balloon_compaction.h>
 #include <linux/page_idle.h>
 #include <linux/page_owner.h>
 #include <linux/sched/mm.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 382dbe97329f..0274e2cb8430 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,7 +54,6 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
-#include <linux/balloon_compaction.h>
 #include <linux/sched/sysctl.h>
 
 #include "internal.h"
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v4 2/7] Enable balloon drivers to report inflated memory
       [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
  2022-10-05  9:01 ` [PATCH v4 1/7] Make place for common balloon code Alexander Atanasov
@ 2022-10-05  9:01 ` Alexander Atanasov
  2022-10-05 17:25   ` Nadav Amit
  2022-10-05  9:01 ` [PATCH v4 3/7] Display inflated memory to users Alexander Atanasov
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-05  9:01 UTC (permalink / raw)
  To: Michael S. Tsirkin, David Hildenbrand, Andrew Morton
  Cc: kernel, Alexander Atanasov, virtualization, linux-kernel, linux-mm

Add counters to be updated by the balloon drivers.
Create balloon notifier to propagate changes.

Signed-off-by: Alexander Atanasov <alexander.atanasov@virtuozzo.com>
---
 include/linux/balloon.h | 18 ++++++++++++++++++
 mm/balloon.c            | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/include/linux/balloon.h b/include/linux/balloon.h
index 46ac8f61f607..59657da77d95 100644
--- a/include/linux/balloon.h
+++ b/include/linux/balloon.h
@@ -57,6 +57,24 @@ struct balloon_dev_info {
 			struct page *page, enum migrate_mode mode);
 };
 
+extern atomic_long_t mem_balloon_inflated_total_kb;
+extern atomic_long_t mem_balloon_inflated_free_kb;
+
+void balloon_set_inflated_total(long inflated_kb);
+void balloon_set_inflated_free(long inflated_kb);
+
+#define BALLOON_CHANGED_TOTAL 0
+#define BALLOON_CHANGED_FREE  1
+
+int register_balloon_notifier(struct notifier_block *nb);
+void unregister_balloon_notifier(struct notifier_block *nb);
+
+#define balloon_notifier(fn, pri) ({						\
+	static struct notifier_block fn##_mem_nb __meminitdata =\
+		{ .notifier_call = fn, .priority = pri };			\
+	register_balloon_notifier(&fn##_mem_nb);				\
+})
+
 struct page *balloon_page_alloc(void);
 void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
 				 struct page *page);
diff --git a/mm/balloon.c b/mm/balloon.c
index 22b3e876bc78..8e1d5855fef8 100644
--- a/mm/balloon.c
+++ b/mm/balloon.c
@@ -7,8 +7,44 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/notifier.h>
 #include <linux/balloon.h>
 
+atomic_long_t mem_balloon_inflated_total_kb = ATOMIC_LONG_INIT(0);
+atomic_long_t mem_balloon_inflated_free_kb = ATOMIC_LONG_INIT(0);
+SRCU_NOTIFIER_HEAD_STATIC(balloon_chain);
+
+int register_balloon_notifier(struct notifier_block *nb)
+{
+	return srcu_notifier_chain_register(&balloon_chain, nb);
+}
+EXPORT_SYMBOL(register_balloon_notifier);
+
+void unregister_balloon_notifier(struct notifier_block *nb)
+{
+	srcu_notifier_chain_unregister(&balloon_chain, nb);
+}
+EXPORT_SYMBOL(unregister_balloon_notifier);
+
+static int balloon_notify(unsigned long val)
+{
+	return srcu_notifier_call_chain(&balloon_chain, val, NULL);
+}
+
+void balloon_set_inflated_total(long inflated_kb)
+{
+	atomic_long_set(&mem_balloon_inflated_total_kb, inflated_kb);
+	balloon_notify(BALLOON_CHANGED_TOTAL);
+}
+EXPORT_SYMBOL(balloon_set_inflated_total);
+
+void balloon_set_inflated_free(long inflated_kb)
+{
+	atomic_long_set(&mem_balloon_inflated_free_kb, inflated_kb);
+	balloon_notify(BALLOON_CHANGED_FREE);
+}
+EXPORT_SYMBOL(balloon_set_inflated_free);
+
 static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
 				     struct page *page)
 {
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v4 3/7] Display inflated memory to users
       [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
  2022-10-05  9:01 ` [PATCH v4 1/7] Make place for common balloon code Alexander Atanasov
  2022-10-05  9:01 ` [PATCH v4 2/7] Enable balloon drivers to report inflated memory Alexander Atanasov
@ 2022-10-05  9:01 ` Alexander Atanasov
  2022-10-05  9:01 ` [PATCH v4 4/7] drivers: virtio: balloon - update inflated memory Alexander Atanasov
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-05  9:01 UTC (permalink / raw)
  To: Jonathan Corbet
  Cc: kernel, Alexander Atanasov, linux-kernel, linux-fsdevel, linux-doc

Add InflatedTotal and InflatedFree to /proc/meminfo

Signed-off-by: Alexander Atanasov <alexander.atanasov@virtuozzo.com>
---
 Documentation/filesystems/proc.rst |  6 ++++++
 fs/proc/meminfo.c                  | 10 ++++++++++
 2 files changed, 16 insertions(+)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index e7aafc82be99..690e1b90ffee 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -991,6 +991,8 @@ Example output. You may not have all of these fields.
     VmallocUsed:       40444 kB
     VmallocChunk:          0 kB
     Percpu:            29312 kB
+    InflatedTotal:   2097152 kB
+    InflatedFree:          0 kB
     HardwareCorrupted:     0 kB
     AnonHugePages:   4149248 kB
     ShmemHugePages:        0 kB
@@ -1138,6 +1140,10 @@ VmallocChunk
 Percpu
               Memory allocated to the percpu allocator used to back percpu
               allocations. This stat excludes the cost of metadata.
+InflatedTotal and InflatedFree
+               Amount of memory that is inflated by the balloon driver.
+               Due to differences among the drivers inflated memory
+               is subtracted from TotalRam or from MemFree.
 HardwareCorrupted
               The amount of RAM/memory in KB, the kernel identifies as
               corrupted.
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6e89f0e2fd20..7182886efdbf 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -16,6 +16,9 @@
 #ifdef CONFIG_CMA
 #include <linux/cma.h>
 #endif
+#ifdef CONFIG_MEMORY_BALLOON
+#include <linux/balloon.h>
+#endif
 #include <asm/page.h>
 #include "internal.h"
 
@@ -153,6 +156,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		    global_zone_page_state(NR_FREE_CMA_PAGES));
 #endif
 
+#ifdef CONFIG_MEMORY_BALLOON
+	seq_printf(m,  "InflatedTotal:  %8ld kB\n",
+		atomic_long_read(&mem_balloon_inflated_total_kb));
+	seq_printf(m,  "InflatedFree:   %8ld kB\n",
+		atomic_long_read(&mem_balloon_inflated_free_kb));
+#endif
+
 	hugetlb_report_meminfo(m);
 
 	arch_report_meminfo(m);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v4 4/7] drivers: virtio: balloon - update inflated memory
       [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
                   ` (2 preceding siblings ...)
  2022-10-05  9:01 ` [PATCH v4 3/7] Display inflated memory to users Alexander Atanasov
@ 2022-10-05  9:01 ` Alexander Atanasov
  2022-10-05  9:01 ` [PATCH v4 5/7] Display inflated memory in logs Alexander Atanasov
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-05  9:01 UTC (permalink / raw)
  To: Michael S. Tsirkin, David Hildenbrand, Jason Wang
  Cc: kernel, Alexander Atanasov, virtualization, linux-kernel

Update the inflated memory when it changes.

Signed-off-by: Alexander Atanasov <alexander.atanasov@virtuozzo.com>
---
 drivers/virtio/virtio_balloon.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index d0c27c680721..e9c3642eef17 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -450,10 +450,15 @@ static void virtballoon_changed(struct virtio_device *vdev)
 static void update_balloon_size(struct virtio_balloon *vb)
 {
 	u32 actual = vb->num_pages;
+	long inflated_kb = actual << (VIRTIO_BALLOON_PFN_SHIFT - 10);
 
 	/* Legacy balloon config space is LE, unlike all other devices. */
 	virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, actual,
 			 &actual);
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+		balloon_set_inflated_free(inflated_kb);
+	else
+		balloon_set_inflated_total(inflated_kb);
 }
 
 static void update_balloon_stats_func(struct work_struct *work)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v4 5/7] Display inflated memory in logs
       [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
                   ` (3 preceding siblings ...)
  2022-10-05  9:01 ` [PATCH v4 4/7] drivers: virtio: balloon - update inflated memory Alexander Atanasov
@ 2022-10-05  9:01 ` Alexander Atanasov
  2022-10-05  9:01 ` [PATCH v4 6/7] drivers: vmware: balloon - report inflated memory Alexander Atanasov
  2022-10-05  9:01 ` [PATCH v4 7/7] drivers: hyperv: " Alexander Atanasov
  6 siblings, 0 replies; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-05  9:01 UTC (permalink / raw)
  Cc: kernel, Alexander Atanasov, linux-kernel

Add InflatedTotal and InflatedFree to show_mem(...) so
it is seen in the logs.

Signed-off-by: Alexander Atanasov <alexander.atanasov@virtuozzo.com>
---
 lib/show_mem.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/show_mem.c b/lib/show_mem.c
index 1c26c14ffbb9..1c022b9d7ac5 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -7,6 +7,9 @@
 
 #include <linux/mm.h>
 #include <linux/cma.h>
+#ifdef CONFIG_MEMORY_BALLOON
+#include <linux/balloon.h>
+#endif
 
 void show_mem(unsigned int filter, nodemask_t *nodemask)
 {
@@ -41,4 +44,9 @@ void show_mem(unsigned int filter, nodemask_t *nodemask)
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
+#ifdef CONFIG_MEMORY_BALLOON
+	printk("Balloon InflatedTotal:%ldkB InflatedFree:%ldkB\n",
+		atomic_long_read(&mem_balloon_inflated_total_kb),
+		atomic_long_read(&mem_balloon_inflated_free_kb));
+#endif
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v4 6/7] drivers: vmware: balloon - report inflated memory
       [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
                   ` (4 preceding siblings ...)
  2022-10-05  9:01 ` [PATCH v4 5/7] Display inflated memory in logs Alexander Atanasov
@ 2022-10-05  9:01 ` Alexander Atanasov
  2022-10-05  9:01 ` [PATCH v4 7/7] drivers: hyperv: " Alexander Atanasov
  6 siblings, 0 replies; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-05  9:01 UTC (permalink / raw)
  To: Nadav Amit, VMware PV-Drivers Reviewers, Arnd Bergmann,
	Greg Kroah-Hartman
  Cc: kernel, Alexander Atanasov, linux-kernel

Propagate inflated memory changes to mm.

Signed-off-by: Alexander Atanasov <alexander.atanasov@virtuozzo.com>
---
 drivers/misc/vmw_balloon.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 91d4d2a285c5..ce581e3d101c 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1161,6 +1161,8 @@ static void vmballoon_inflate(struct vmballoon *b)
 
 		cond_resched();
 	}
+	/* Update core after inflation is done - send a single notification */
+	balloon_set_inflated_free(atomic64_read(&b->size) << 2);
 
 	/*
 	 * Release pages that were allocated while attempting to inflate the
@@ -1285,6 +1287,8 @@ static unsigned long vmballoon_deflate(struct vmballoon *b, uint64_t n_frames,
 
 		cond_resched();
 	}
+	/* Update core after deflation is done - send a single notification */
+	balloon_set_inflated_free(atomic64_read(&b->size) << 2);
 
 	return deflated_frames;
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v4 7/7] drivers: hyperv: balloon - report inflated memory
       [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
                   ` (5 preceding siblings ...)
  2022-10-05  9:01 ` [PATCH v4 6/7] drivers: vmware: balloon - report inflated memory Alexander Atanasov
@ 2022-10-05  9:01 ` Alexander Atanasov
  6 siblings, 0 replies; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-05  9:01 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger, Wei Liu, Dexuan Cui
  Cc: kernel, Alexander Atanasov, linux-hyperv, linux-kernel

Propagate inflated memory changes to mm.

Signed-off-by: Alexander Atanasov <alexander.atanasov@virtuozzo.com>
---
 drivers/hv/hv_balloon.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index fdf6decacf06..b4c828c462a4 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -24,6 +24,7 @@
 #include <linux/notifier.h>
 #include <linux/percpu_counter.h>
 #include <linux/page_reporting.h>
+#include <linux/balloon.h>
 
 #include <linux/hyperv.h>
 #include <asm/hyperv-tlfs.h>
@@ -1280,6 +1281,13 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
 	return i * alloc_unit;
 }
 
+static void report_ballooned_pages(struct hv_dynmem_device *dm)
+{
+	u32 actual = dm->num_pages_ballooned;
+	long inflated_kb = actual << (HV_HYP_PAGE_SHIFT - 10);
+
+	balloon_set_inflated_total(inflated_kb);
+}
+
 static void balloon_up(struct work_struct *dummy)
 {
 	unsigned int num_pages = dm_device.balloon_wrk.num_pages;
@@ -1368,6 +1376,7 @@ static void balloon_up(struct work_struct *dummy)
 		}
 	}
 
+	report_ballooned_pages(&dm_device);
 }
 
 static void balloon_down(struct hv_dynmem_device *dm,
@@ -1387,6 +1396,8 @@ static void balloon_down(struct hv_dynmem_device *dm,
 	pr_debug("Freed %u ballooned pages.\n",
 		prev_pages_ballooned - dm->num_pages_ballooned);
 
+	report_ballooned_pages(dm);
+
 	if (req->more_pages == 1)
 		return;
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-05  9:01 ` [PATCH v4 2/7] Enable balloon drivers to report inflated memory Alexander Atanasov
@ 2022-10-05 17:25   ` Nadav Amit
  2022-10-06  7:34     ` Alexander Atanasov
  0 siblings, 1 reply; 21+ messages in thread
From: Nadav Amit @ 2022-10-05 17:25 UTC (permalink / raw)
  To: Alexander Atanasov
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, linux-mm

On Oct 5, 2022, at 2:01 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:

> Add counters to be updated by the balloon drivers.
> Create balloon notifier to propagate changes.

I missed the other patches before (including this one). Sorry, but next
time, please cc me.

I was looking through the series and I did not see actual users of the
notifier. Usually, it is not great to build an API without users.

[snip] 

> +
> +static int balloon_notify(unsigned long val)
> +{
> +	return srcu_notifier_call_chain(&balloon_chain, val, NULL);

Since you know the inflated_kb value here, why not to use it as an argument
to the callback? I think casting to (void *) and back is best. But you can
also provide pointer to the value. Doesn’t it sound better than having
potentially different notifiers reading different values?

Anyhow, without users (actual notifiers) it’s kind of hard to know how
reasonable it all is. For instance, is it balloon_notify() supposed to
prevent further balloon inflating/deflating until the notifier completes?
Accordingly, are callers to balloon_notify() expected to relinquish locks
before calling balloon_notify() to prevent deadlocks and high latency?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-05 17:25   ` Nadav Amit
@ 2022-10-06  7:34     ` Alexander Atanasov
  2022-10-06 21:07       ` Nadav Amit
  0 siblings, 1 reply; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-06  7:34 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, linux-mm

Hello,


On 5.10.22 20:25, Nadav Amit wrote:
> On Oct 5, 2022, at 2:01 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:
> 
>> Add counters to be updated by the balloon drivers.
>> Create balloon notifier to propagate changes.
> 
> I missed the other patches before (including this one). Sorry, but next
> time, please cc me.

You are CCed in the cover letter since the version. I will add CC to you
in the individual patches if you want so.

> 
> I was looking through the series and I did not see actual users of the
> notifier. Usually, it is not great to build an API without users.


You are right. I hope to get some feedback/interest from potential users 
that i mentioned in the cover letter. I will probably split the notifier
in separate series. To make it usefull it will require more changes.
See bellow more about them.


> [snip]
> 
>> +
>> +static int balloon_notify(unsigned long val)
>> +{
>> +	return srcu_notifier_call_chain(&balloon_chain, val, NULL);
> 
> Since you know the inflated_kb value here, why not to use it as an argument
> to the callback? I think casting to (void *) and back is best. But you can
> also provide pointer to the value. Doesn’t it sound better than having
> potentially different notifiers reading different values?

My current idea is to have a struct with current and previous value,
may be change in percents. The actual value does not matter to anyone
but the size of change does. When a user gets notified it can act upon
the change - if it is small it can ignore it , if it is above some 
threshold it can act - if it makes sense for some receiver  is can 
accumulate changes from several notification. Other option/addition is 
to have si_meminfo_current(..) and totalram_pages_current(..) that 
return values adjusted with the balloon values.

Going further - there are few places that calculate something based on 
available memory that do not have sysfs/proc interface for setting 
limits. Most of them work in percents so they can be converted to do 
calculations when they get notification.

The one that have interface for configuration but use memory values can 
be handled in two ways - convert to use percents of what is available or 
extend the notifier to notify userspace which in turn to do calculations 
and update configuration.

> Anyhow, without users (actual notifiers) it’s kind of hard to know how
> reasonable it all is. For instance, is it balloon_notify() supposed to
> prevent further balloon inflating/deflating until the notifier completes?

No, we must avoid that at any cost.

> Accordingly, are callers to balloon_notify() expected to relinquish locks
> before calling balloon_notify() to prevent deadlocks and high latency?

My goal is to avoid any possible impact on performance. Drivers are free 
to delay notifications if they get in the way. (I see that i need to 
move the notification after the semaphore in the vmw driver - i missed 
that - will fix in the next iterration.)
Deadlocks - depends on the users but a few to none will possibly have to 
deal with common locks.


-- 
Regards,
Alexander Atanasov


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-06  7:34     ` Alexander Atanasov
@ 2022-10-06 21:07       ` Nadav Amit
  2022-10-07 10:58         ` RFC " Alexander Atanasov
  0 siblings, 1 reply; 21+ messages in thread
From: Nadav Amit @ 2022-10-06 21:07 UTC (permalink / raw)
  To: Alexander Atanasov
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, linux-mm

On Oct 6, 2022, at 12:34 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:

> Hello,
> 
> 
> On 5.10.22 20:25, Nadav Amit wrote:
>> On Oct 5, 2022, at 2:01 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:
>>> Add counters to be updated by the balloon drivers.
>>> Create balloon notifier to propagate changes.
>> I missed the other patches before (including this one). Sorry, but next
>> time, please cc me.
> 
> You are CCed in the cover letter since the version. I will add CC to you
> in the individual patches if you want so.

Thanks.

Just to clarify - I am not attacking you. It’s more of me making an excuse
for not addressing some issues in earlier versions.

>> I was looking through the series and I did not see actual users of the
>> notifier. Usually, it is not great to build an API without users.
> 
> 
> You are right. I hope to get some feedback/interest from potential users that i mentioned in the cover letter. I will probably split the notifier
> in separate series. To make it usefull it will require more changes.
> See bellow more about them.

Fair, but this is something that is more suitable for RFC. Otherwise, more
likely than not - your patches would go in as is.

>> [snip]
>>> +
>>> +static int balloon_notify(unsigned long val)
>>> +{
>>> +	return srcu_notifier_call_chain(&balloon_chain, val, NULL);
>> Since you know the inflated_kb value here, why not to use it as an argument
>> to the callback? I think casting to (void *) and back is best. But you can
>> also provide pointer to the value. Doesn’t it sound better than having
>> potentially different notifiers reading different values?
> 
> My current idea is to have a struct with current and previous value,
> may be change in percents. The actual value does not matter to anyone
> but the size of change does. When a user gets notified it can act upon
> the change - if it is small it can ignore it , if it is above some threshold it can act - if it makes sense for some receiver  is can accumulate changes from several notification. Other option/addition is to have si_meminfo_current(..) and totalram_pages_current(..) that return values adjusted with the balloon values.
> 
> Going further - there are few places that calculate something based on available memory that do not have sysfs/proc interface for setting limits. Most of them work in percents so they can be converted to do calculations when they get notification.
> 
> The one that have interface for configuration but use memory values can be handled in two ways - convert to use percents of what is available or extend the notifier to notify userspace which in turn to do calculations and update configuration.

I really need to see code to fully understand what you have in mind.
Division, as you know, is not something that we really want to do very
frequently.

>> Anyhow, without users (actual notifiers) it’s kind of hard to know how
>> reasonable it all is. For instance, is it balloon_notify() supposed to
>> prevent further balloon inflating/deflating until the notifier completes?
> 
> No, we must avoid that at any cost.
> 
>> Accordingly, are callers to balloon_notify() expected to relinquish locks
>> before calling balloon_notify() to prevent deadlocks and high latency?
> 
> My goal is to avoid any possible impact on performance. Drivers are free to delay notifications if they get in the way. (I see that i need to move the notification after the semaphore in the vmw driver - i missed that - will fix in the next iterration.)
> Deadlocks - depends on the users but a few to none will possibly have to deal with common locks.

I will need to see the next version to give better feedback. One more thing
that comes to mind though is whether saving the balloon size in multiple
places (both mem_balloon_inflated_total_kb and each balloon’s accounting) is
the right way. It does not sounds very clean.

Two other options is to move *all* the accounting to your new
mem_balloon_inflated_total_kb-like interface or expose some per-balloon
function to get the balloon size (indirect-function-call would likely have
some overhead though).

Anyhow, I am not crazy about having the same data replicated. Even from
reading the code stand-of-view it is not intuitive.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-06 21:07       ` Nadav Amit
@ 2022-10-07 10:58         ` Alexander Atanasov
  2022-10-10  6:18           ` Nadav Amit
  0 siblings, 1 reply; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-07 10:58 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, linux-mm

On 7.10.22 0:07, Nadav Amit wrote:
> On Oct 6, 2022, at 12:34 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:
> 
>> Hello,
>>
>>
>> On 5.10.22 20:25, Nadav Amit wrote:
>>> On Oct 5, 2022, at 2:01 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:
>>>> Add counters to be updated by the balloon drivers.
>>>> Create balloon notifier to propagate changes.
>>> I missed the other patches before (including this one). Sorry, but next
>>> time, please cc me.
>>
>> You are CCed in the cover letter since the version. I will add CC to you
>> in the individual patches if you want so.
> 
> Thanks.
> 
> Just to clarify - I am not attacking you. It’s more of me making an excuse
> for not addressing some issues in earlier versions.

ok, i am glad that you did it now but i do not think you need to excuse.

>>> I was looking through the series and I did not see actual users of the
>>> notifier. Usually, it is not great to build an API without users.
>>
>>
>> You are right. I hope to get some feedback/interest from potential users that i mentioned in the cover letter. I will probably split the notifier
>> in separate series. To make it usefull it will require more changes.
>> See bellow more about them.
> 
> Fair, but this is something that is more suitable for RFC. Otherwise, more
> likely than not - your patches would go in as is.

Yes, i will remove the notifier and resend both as RFC. I think that 
every patch is an RFC and RFC tag is used for more general changes that 
could affect unexpected areas, change functionality, change design and 
in general can lead to bigger impact. In the case with this it adds 
functionality that is missing and it could hardly affect anything else.
In essence it provides information that you can not get without it.
But i will take your advice and push everything thru RFC from now on.

>>> [snip]
>>>> +
>>>> +static int balloon_notify(unsigned long val)
>>>> +{
>>>> +	return srcu_notifier_call_chain(&balloon_chain, val, NULL);
>>> Since you know the inflated_kb value here, why not to use it as an argument
>>> to the callback? I think casting to (void *) and back is best. But you can
>>> also provide pointer to the value. Doesn’t it sound better than having
>>> potentially different notifiers reading different values?
>>
>> My current idea is to have a struct with current and previous value,
>> may be change in percents. The actual value does not matter to anyone
>> but the size of change does. When a user gets notified it can act upon
>> the change - if it is small it can ignore it , if it is above some threshold it can act - if it makes sense for some receiver  is can accumulate changes from several notification. Other option/addition is to have si_meminfo_current(..) and totalram_pages_current(..) that return values adjusted with the balloon values.
>>
>> Going further - there are few places that calculate something based on available memory that do not have sysfs/proc interface for setting limits. Most of them work in percents so they can be converted to do calculations when they get notification.
>>
>> The one that have interface for configuration but use memory values can be handled in two ways - convert to use percents of what is available or extend the notifier to notify userspace which in turn to do calculations and update configuration.
> 
> I really need to see code to fully understand what you have in mind.

Sure - you can check some of the users with git grep totalram_pages - 
shows self explanatory results of usage like:
fs/f2fs/node.c:bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, 
int type) - calculations in percents - one good example
fs/ceph/super.h:        congestion_kb = (16*int_sqrt(totalram_pages())) 
<< (PAGE_SHIFT-10);
fs/fuse/inode.c:                *limit = ((totalram_pages() << 
PAGE_SHIFT) >> 13) / 392;
fs/nfs/write.c: nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << 
(PAGE_SHIFT-10);
fs/nfsd/nfscache.c:     unsigned long low_pages = totalram_pages() - 
totalhigh_pages()
mm/oom_kill.c:  oc->totalpages = totalram_pages() + total_swap_pages;


So all balloon drivers give large amount of RAM on boot , then inflate 
the balloon. But this places have already been initiallized and they 
know that the system have given amount of totalram which is not true the 
moment they start to operate. the result is that too much space gets 
used and it degrades the userspace performance.
example - fs/eventpoll.c:static int __init eventpoll_init(void) - 4% of 
ram for eventpool - when you inflate half of the ram it becomes 8% of 
the ram - do you really need 8% of your ram to be used for eventpool?

To solve this you need to register and when notified update - cache 
size, limits and for whatever is the calculated amount of memory used.

The difference is here:

mm/zswap.c:     return totalram_pages() * zswap_max_pool_percent / 100 <
mm/zswap.c:     return totalram_pages() * zswap_accept_thr_percent / 100
uses percents and you can recalculate easy with

+static inline unsigned long totalram_pages_current(void)
+{
+       unsigned long inflated = 0;
+#ifdef CONFIG_MEMORY_BALLOON
+       extern atomic_long_t mem_balloon_inflated_free_kb;
+       inflated = atomic_long_read(&mem_balloon_inflated_free_kb);
+       inflated >>= (PAGE_SHIFT - 10);
+#endif
+       return (unsigned long)atomic_long_read(&_totalram_pages) - inflated;
+}

And you are good when you switch to _current version - 
si_meminfo_current is alike .

On init (probably) all use some kind of fractions to calculate but when 
there is a set value via /proc/sys/net/ipv4/tcp_wmem for example it is 
just a value and you can not recalculate it. And here, please, share 
your ideas how to solve this.


> Division, as you know, is not something that we really want to do very
> frequently.

Yes, thats true but in the actual implementation there are a lot of ways 
to avoid it. But it is easier to explain with division.

Even if you do have to do a division you can limit the recalculations
struct balloon_notify {
    unsigned long last_inflated_free_kb;
    unsigned long last_inflated_free_kb;
    unsigned long inflated_free_kb;
    unsigned long inflated_used_kb;
}

So you can do it only if change is more then 1GB and you do nothing if 
change is 1MB.

> 
>>> Anyhow, without users (actual notifiers) it’s kind of hard to know how
>>> reasonable it all is. For instance, is it balloon_notify() supposed to
>>> prevent further balloon inflating/deflating until the notifier completes?
>>
>> No, we must avoid that at any cost.
>>
>>> Accordingly, are callers to balloon_notify() expected to relinquish locks
>>> before calling balloon_notify() to prevent deadlocks and high latency?
>>
>> My goal is to avoid any possible impact on performance. Drivers are free to delay notifications if they get in the way. (I see that i need to move the notification after the semaphore in the vmw driver - i missed that - will fix in the next iterration.)
>> Deadlocks - depends on the users but a few to none will possibly have to deal with common locks.
> 
> I will need to see the next version to give better feedback. One more thing
> that comes to mind though is whether saving the balloon size in multiple
> places (both mem_balloon_inflated_total_kb and each balloon’s accounting) is
> the right way. It does not sounds very clean.
> 
> Two other options is to move *all* the accounting to your new
> mem_balloon_inflated_total_kb-like interface or expose some per-balloon
> function to get the balloon size (indirect-function-call would likely have
> some overhead though).
> 
> Anyhow, I am not crazy about having the same data replicated. Even from
> reading the code stand-of-view it is not intuitive.

If such interface existed before the drivers it would ideally be like 
that all in one place. But keeping the internal (for the driver) 
representation which may differ from the system and the external 
(system) representation separate is a a good option. if a driver can 
convert and use only the system counters they can do so.

-- 
Regards,
Alexander Atanasov


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-07 10:58         ` RFC " Alexander Atanasov
@ 2022-10-10  6:18           ` Nadav Amit
  2022-10-10  7:24             ` Alexander Atanasov
  0 siblings, 1 reply; 21+ messages in thread
From: Nadav Amit @ 2022-10-10  6:18 UTC (permalink / raw)
  To: Alexander Atanasov
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, Linux MM

On Oct 7, 2022, at 3:58 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:

> On 7.10.22 0:07, Nadav Amit wrote:
>>>> 
>>>> I was looking through the series and I did not see actual users of the
>>>> notifier. Usually, it is not great to build an API without users.
>>> 
>>> 
>>> You are right. I hope to get some feedback/interest from potential users that i mentioned in the cover letter. I will probably split the notifier
>>> in separate series. To make it usefull it will require more changes.
>>> See bellow more about them.
>> Fair, but this is something that is more suitable for RFC. Otherwise, more
>> likely than not - your patches would go in as is.
> 
> Yes, i will remove the notifier and resend both as RFC. I think that every patch is an RFC and RFC tag is used for more general changes that could affect unexpected areas, change functionality, change design and in general can lead to bigger impact. In the case with this it adds functionality that is missing and it could hardly affect anything else.
> In essence it provides information that you can not get without it.
> But i will take your advice and push everything thru RFC from now on.

Jus keep the version numbers as you had before. That’s fine and better to
prevent confusion.

>>>> [snip]
>>>>> +
>>>>> +static int balloon_notify(unsigned long val)
>>>>> +{
>>>>> +	return srcu_notifier_call_chain(&balloon_chain, val, NULL);
>>>> Since you know the inflated_kb value here, why not to use it as an argument
>>>> to the callback? I think casting to (void *) and back is best. But you can
>>>> also provide pointer to the value. Doesn’t it sound better than having
>>>> potentially different notifiers reading different values?
>>> 
>>> My current idea is to have a struct with current and previous value,
>>> may be change in percents. The actual value does not matter to anyone
>>> but the size of change does. When a user gets notified it can act upon
>>> the change - if it is small it can ignore it , if it is above some threshold it can act - if it makes sense for some receiver  is can accumulate changes from several notification. Other option/addition is to have si_meminfo_current(..) and totalram_pages_current(..) that return values adjusted with the balloon values.
>>> 
>>> Going further - there are few places that calculate something based on available memory that do not have sysfs/proc interface for setting limits. Most of them work in percents so they can be converted to do calculations when they get notification.
>>> 
>>> The one that have interface for configuration but use memory values can be handled in two ways - convert to use percents of what is available or extend the notifier to notify userspace which in turn to do calculations and update configuration.
>> I really need to see code to fully understand what you have in mind.
> 
> Sure - you can check some of the users with git grep totalram_pages - shows self explanatory results of usage like:
> fs/f2fs/node.c:bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) - calculations in percents - one good example
> fs/ceph/super.h:        congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
> fs/fuse/inode.c:                *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392;
> fs/nfs/write.c: nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
> fs/nfsd/nfscache.c:     unsigned long low_pages = totalram_pages() - totalhigh_pages()
> mm/oom_kill.c:  oc->totalpages = totalram_pages() + total_swap_pages;
> 
> 
> So all balloon drivers give large amount of RAM on boot , then inflate the balloon. But this places have already been initiallized and they know that the system have given amount of totalram which is not true the moment they start to operate. the result is that too much space gets used and it degrades the userspace performance.
> example - fs/eventpoll.c:static int __init eventpoll_init(void) - 4% of ram for eventpool - when you inflate half of the ram it becomes 8% of the ram - do you really need 8% of your ram to be used for eventpool?
> 
> To solve this you need to register and when notified update - cache size, limits and for whatever is the calculated amount of memory used.

Hmm.. Not sure about all of that. Most balloon drivers are manually managed,
and call adjust_managed_page_count(), and tas a result might want to redo
all the calculations that are based on totalram_pages().

Side-note: That’s not the case for VMware balloon. I actually considered
calling adjust_managed_page_count() just to conform with other balloon
drivers. But since we use totalram_pages() to communicate to the hypervisor
the total-ram, this would create endless (and wrong) feedback loop. I am not
claiming it is not possible to VMware balloon driver to call
adjust_managed_page_count(), but the chances are that it would create more
harm than good.

Back to the matter at hand. It seems that you wish that the notifiers would
be called following any changes that would be reflected in totalram_pages().
So, doesn't it make more sense to call it from adjust_managed_page_count() ?

> The difference is here:
> 
> mm/zswap.c:     return totalram_pages() * zswap_max_pool_percent / 100 <
> mm/zswap.c:     return totalram_pages() * zswap_accept_thr_percent / 100
> uses percents and you can recalculate easy with
> 
> +static inline unsigned long totalram_pages_current(void)
> +{
> +       unsigned long inflated = 0;
> +#ifdef CONFIG_MEMORY_BALLOON
> +       extern atomic_long_t mem_balloon_inflated_free_kb;
> +       inflated = atomic_long_read(&mem_balloon_inflated_free_kb);
> +       inflated >>= (PAGE_SHIFT - 10);
> +#endif
> +       return (unsigned long)atomic_long_read(&_totalram_pages) - inflated;
> +}
> 
> And you are good when you switch to _current version - si_meminfo_current is alike .
> 
> On init (probably) all use some kind of fractions to calculate but when there is a set value via /proc/sys/net/ipv4/tcp_wmem for example it is just a value and you can not recalculate it. And here, please, share your ideas how to solve this.

I don’t get all of that. Now that you provided some more explanations, it
sounds that what you want is adjust_managed_page_count(), which we already
have and affects the output of totalram_pages(). Therefore, totalram_pages()
anyhow accounts for the balloon memory (excluding VMware’s). So why do we
need to take mem_balloon_inflated_free_kb into account?

Sounds to me that all you want is some notifier to be called from
adjust_managed_page_count(). What am I missing?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-10  6:18           ` Nadav Amit
@ 2022-10-10  7:24             ` Alexander Atanasov
  2022-10-10 14:47               ` Nadav Amit
  0 siblings, 1 reply; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-10  7:24 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, Linux MM

Hello,

On 10.10.22 9:18, Nadav Amit wrote:
> On Oct 7, 2022, at 3:58 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:
> 
>> On 7.10.22 0:07, Nadav Amit wrote:
>>>>>
>>>>> I was looking through the series and I did not see actual users of the
>>>>> notifier. Usually, it is not great to build an API without users.
>>>>
>>>>
>>>> You are right. I hope to get some feedback/interest from potential users that i mentioned in the cover letter. I will probably split the notifier
>>>> in separate series. To make it usefull it will require more changes.
>>>> See bellow more about them.
>>> Fair, but this is something that is more suitable for RFC. Otherwise, more
>>> likely than not - your patches would go in as is.
>>
>> Yes, i will remove the notifier and resend both as RFC. I think that every patch is an RFC and RFC tag is used for more general changes that could affect unexpected areas, change functionality, change design and in general can lead to bigger impact. In the case with this it adds functionality that is missing and it could hardly affect anything else.
>> In essence it provides information that you can not get without it.
>> But i will take your advice and push everything thru RFC from now on.
> 
> Jus keep the version numbers as you had before. That’s fine and better to
> prevent confusion.

Sure, i will.

>>>>> [snip]
>>>>>> +
>>>>>> +static int balloon_notify(unsigned long val)
>>>>>> +{
>>>>>> +	return srcu_notifier_call_chain(&balloon_chain, val, NULL);
>>>>> Since you know the inflated_kb value here, why not to use it as an argument
>>>>> to the callback? I think casting to (void *) and back is best. But you can
>>>>> also provide pointer to the value. Doesn’t it sound better than having
>>>>> potentially different notifiers reading different values?
>>>>
>>>> My current idea is to have a struct with current and previous value,
>>>> may be change in percents. The actual value does not matter to anyone
>>>> but the size of change does. When a user gets notified it can act upon
>>>> the change - if it is small it can ignore it , if it is above some threshold it can act - if it makes sense for some receiver  is can accumulate changes from several notification. Other option/addition is to have si_meminfo_current(..) and totalram_pages_current(..) that return values adjusted with the balloon values.
>>>>
>>>> Going further - there are few places that calculate something based on available memory that do not have sysfs/proc interface for setting limits. Most of them work in percents so they can be converted to do calculations when they get notification.
>>>>
>>>> The one that have interface for configuration but use memory values can be handled in two ways - convert to use percents of what is available or extend the notifier to notify userspace which in turn to do calculations and update configuration.
>>> I really need to see code to fully understand what you have in mind.
>>
>> Sure - you can check some of the users with git grep totalram_pages - shows self explanatory results of usage like:
>> fs/f2fs/node.c:bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) - calculations in percents - one good example
>> fs/ceph/super.h:        congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
>> fs/fuse/inode.c:                *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392;
>> fs/nfs/write.c: nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
>> fs/nfsd/nfscache.c:     unsigned long low_pages = totalram_pages() - totalhigh_pages()
>> mm/oom_kill.c:  oc->totalpages = totalram_pages() + total_swap_pages;
>>
>>
>> So all balloon drivers give large amount of RAM on boot , then inflate the balloon. But this places have already been initiallized and they know that the system have given amount of totalram which is not true the moment they start to operate. the result is that too much space gets used and it degrades the userspace performance.
>> example - fs/eventpoll.c:static int __init eventpoll_init(void) - 4% of ram for eventpool - when you inflate half of the ram it becomes 8% of the ram - do you really need 8% of your ram to be used for eventpool?
>>
>> To solve this you need to register and when notified update - cache size, limits and for whatever is the calculated amount of memory used.
> 
> Hmm.. Not sure about all of that. Most balloon drivers are manually managed,
> and call adjust_managed_page_count(), and tas a result might want to redo
> all the calculations that are based on totalram_pages().

Yes, i will say that it looks like mixed manual - for large changes and 
automatic for small changes. VMWare and HyperV have automatic and 
manual/not sure exactly what you can change on a running VM but i guess 
you can/ - Virtio is only manual. I do not know about dlpar / xen.

Scenario is like this start a VM with 4GB ram, reduce to 2GB with 
balloon - vm can be upgraded.

All we are talking about relates to memory hotplug/unplug /where unplug 
is  close to nonexistant hence the balloons are used./

All values should be recalculated on memory hotplug too, so you can use 
the newly available RAM.

RAM is the most valuable resource of all so i consider using it 
optimally to be of a great importance.

> Side-note: That’s not the case for VMware balloon. I actually considered
> calling adjust_managed_page_count() just to conform with other balloon
> drivers. But since we use totalram_pages() to communicate to the hypervisor
> the total-ram, this would create endless (and wrong) feedback loop. I am not
> claiming it is not possible to VMware balloon driver to call
> adjust_managed_page_count(), but the chances are that it would create more
> harm than good.

Virtio does both - depending on the deflate on OOM option. I suggested 
already to unify all drivers to inflate the used memory as it seems more 
logical to me since  no body expects the totalram_pages() to change but 
the current state is that both ways are accepted and if changed can 
break existing users.
See  discussion here 
https://lore.kernel.org/lkml/20220809095358.2203355-1-alexander.atanasov@virtuozzo.com/.


> Back to the matter at hand. It seems that you wish that the notifiers would
> be called following any changes that would be reflected in totalram_pages().
> So, doesn't it make more sense to call it from adjust_managed_page_count() ?

It will hurt performance - all drivers work page by page , i.e. they 
update by +1/-1 and they do so under locks which as you already noted 
can lead to bad things. The notifier will accumulate the change and let 
its user know how much changed, so the can decide if they have to 
recalculate - it even can do so async in order to not disturb the drivers.

>> The difference is here:
>>
>> mm/zswap.c:     return totalram_pages() * zswap_max_pool_percent / 100 <
>> mm/zswap.c:     return totalram_pages() * zswap_accept_thr_percent / 100
>> uses percents and you can recalculate easy with
>>
>> +static inline unsigned long totalram_pages_current(void)
>> +{
>> +       unsigned long inflated = 0;
>> +#ifdef CONFIG_MEMORY_BALLOON
>> +       extern atomic_long_t mem_balloon_inflated_free_kb;
>> +       inflated = atomic_long_read(&mem_balloon_inflated_free_kb);
>> +       inflated >>= (PAGE_SHIFT - 10);
>> +#endif
>> +       return (unsigned long)atomic_long_read(&_totalram_pages) - inflated;
>> +}
>>
>> And you are good when you switch to _current version - si_meminfo_current is alike .
>>
>> On init (probably) all use some kind of fractions to calculate but when there is a set value via /proc/sys/net/ipv4/tcp_wmem for example it is just a value and you can not recalculate it. And here, please, share your ideas how to solve this.
> 
> I don’t get all of that. Now that you provided some more explanations, it
> sounds that what you want is adjust_managed_page_count(), which we already
> have and affects the output of totalram_pages(). Therefore, totalram_pages()
> anyhow accounts for the balloon memory (excluding VMware’s). So why do we
> need to take mem_balloon_inflated_free_kb into account?
Ok, you have this:
                                    / totalram
|----used----|b1|----free------|b2|

drivers can inflate both b1 and b2 - b1 free gets smaller, b2 totalram 
pages get smaller. so when you need totalram_pages() to do a calculation 
you need to adjust it with the pages that are inflated in free/used 
(b1). VMWare is not exception , Virtio does the same.
And according to to mst and davidh it is okay like this.
So i am proposing a way to handle both cases.

> Sounds to me that all you want is some notifier to be called from
> adjust_managed_page_count(). What am I missing?

Notifier will act as an accumulator to report size of change and it will 
make things easier for the drivers and users wrt locking.
Notifier is similar to the memory hotplug notifier.

-- 
Regards,
Alexander Atanasov


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-10  7:24             ` Alexander Atanasov
@ 2022-10-10 14:47               ` Nadav Amit
  2022-10-11  9:07                 ` Alexander Atanasov
  0 siblings, 1 reply; 21+ messages in thread
From: Nadav Amit @ 2022-10-10 14:47 UTC (permalink / raw)
  To: Alexander Atanasov
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, Linux MM

On Oct 10, 2022, at 12:24 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:

> Hello,
> 
> On 10.10.22 9:18, Nadav Amit wrote:
>> On Oct 7, 2022, at 3:58 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:
>>> So all balloon drivers give large amount of RAM on boot , then inflate the balloon. But this places have already been initiallized and they know that the system have given amount of totalram which is not true the moment they start to operate. the result is that too much space gets used and it degrades the userspace performance.
>>> example - fs/eventpoll.c:static int __init eventpoll_init(void) - 4% of ram for eventpool - when you inflate half of the ram it becomes 8% of the ram - do you really need 8% of your ram to be used for eventpool?
>>> 
>>> To solve this you need to register and when notified update - cache size, limits and for whatever is the calculated amount of memory used.
>> Hmm.. Not sure about all of that. Most balloon drivers are manually managed,
>> and call adjust_managed_page_count(), and tas a result might want to redo
>> all the calculations that are based on totalram_pages().
> 
> Yes, i will say that it looks like mixed manual - for large changes and automatic for small changes. VMWare and HyperV have automatic and manual/not sure exactly what you can change on a running VM but i guess you can/ - Virtio is only manual. I do not know about dlpar / xen.
> 
> Scenario is like this start a VM with 4GB ram, reduce to 2GB with balloon - vm can be upgraded.
> 
> All we are talking about relates to memory hotplug/unplug /where unplug is  close to nonexistant hence the balloons are used./
> 
> All values should be recalculated on memory hotplug too, so you can use the newly available RAM.
> 
> RAM is the most valuable resource of all so i consider using it optimally to be of a great importance.
> 
>> Side-note: That’s not the case for VMware balloon. I actually considered
>> calling adjust_managed_page_count() just to conform with other balloon
>> drivers. But since we use totalram_pages() to communicate to the hypervisor
>> the total-ram, this would create endless (and wrong) feedback loop. I am not
>> claiming it is not possible to VMware balloon driver to call
>> adjust_managed_page_count(), but the chances are that it would create more
>> harm than good.
> 
> Virtio does both - depending on the deflate on OOM option. I suggested already to unify all drivers to inflate the used memory as it seems more logical to me since  no body expects the totalram_pages() to change but the current state is that both ways are accepted and if changed can break existing users.
> See  discussion here https://lore.kernel.org/lkml/20220809095358.2203355-1-alexander.atanasov@virtuozzo.com/.

Thanks for the reminder. I wish you can somehow summarize all of that into the
cover-letter and/or the commit messages for these patches.

> 
> 
>> Back to the matter at hand. It seems that you wish that the notifiers would
>> be called following any changes that would be reflected in totalram_pages().
>> So, doesn't it make more sense to call it from adjust_managed_page_count() ?
> 
> It will hurt performance - all drivers work page by page , i.e. they update by +1/-1 and they do so under locks which as you already noted can lead to bad things. The notifier will accumulate the change and let its user know how much changed, so the can decide if they have to recalculate - it even can do so async in order to not disturb the drivers.

So updating the counters by 1 is ok (using atomic operation, which is not
free)? And the reason it is (relatively) cheap is because nobody actually
looks at the value (i.e., nobody actually acts on the value)?

If nobody considers the value, then doesn’t it make sense just to update it
less frequently, and then call the notifiers?

>>> The difference is here:
>>> 
>>> mm/zswap.c:     return totalram_pages() * zswap_max_pool_percent / 100 <
>>> mm/zswap.c:     return totalram_pages() * zswap_accept_thr_percent / 100
>>> uses percents and you can recalculate easy with
>>> 
>>> +static inline unsigned long totalram_pages_current(void)
>>> +{
>>> +       unsigned long inflated = 0;
>>> +#ifdef CONFIG_MEMORY_BALLOON
>>> +       extern atomic_long_t mem_balloon_inflated_free_kb;
>>> +       inflated = atomic_long_read(&mem_balloon_inflated_free_kb);
>>> +       inflated >>= (PAGE_SHIFT - 10);
>>> +#endif
>>> +       return (unsigned long)atomic_long_read(&_totalram_pages) - inflated;
>>> +}

So we have here two values and it appears there is a hidden assumption that
they are both updated atomically. Otherwise, it appears, inflated
theoretically might be greater that _totalram_pages dn we get negative value
and all hell breaks loose.

But _totalram_pages and mem_balloon_inflated_free_kb are not updated
atomically together (each one is, but not together).

>>> And you are good when you switch to _current version - si_meminfo_current is alike .
>>> 
>>> On init (probably) all use some kind of fractions to calculate but when there is a set value via /proc/sys/net/ipv4/tcp_wmem for example it is just a value and you can not recalculate it. And here, please, share your ideas how to solve this.
>> I don’t get all of that. Now that you provided some more explanations, it
>> sounds that what you want is adjust_managed_page_count(), which we already
>> have and affects the output of totalram_pages(). Therefore, totalram_pages()
>> anyhow accounts for the balloon memory (excluding VMware’s). So why do we
>> need to take mem_balloon_inflated_free_kb into account?
> Ok, you have this:
>                                   / totalram
> |----used----|b1|----free------|b2|
> 
> drivers can inflate both b1 and b2 - b1 free gets smaller, b2 totalram pages get smaller. so when you need totalram_pages() to do a calculation you need to adjust it with the pages that are inflated in free/used (b1). VMWare is not exception , Virtio does the same.
> And according to to mst and davidh it is okay like this.
> So i am proposing a way to handle both cases.

Ugh. What about BALLOON_INFLATE and BALLOON_DEFLATE vm-events? Can’t this
information be used instead of yet another counter? Unless, of course, you
get the atomicity that I mentioned before.

>> Sounds to me that all you want is some notifier to be called from
>> adjust_managed_page_count(). What am I missing?
> 
> Notifier will act as an accumulator to report size of change and it will make things easier for the drivers and users wrt locking.
> Notifier is similar to the memory hotplug notifier.

Overall, I am not convinced that there is any value of separating the value
and the notifier. You can batch both or not batch both. In addition, as I
mentioned, having two values seems racy.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-10 14:47               ` Nadav Amit
@ 2022-10-11  9:07                 ` Alexander Atanasov
  2022-10-11  9:23                   ` David Hildenbrand
  0 siblings, 1 reply; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-11  9:07 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Michael S. Tsirkin, David Hildenbrand, Andrew Morton, kernel,
	Linux Virtualization, LKML, Linux MM

Hello,

On 10.10.22 17:47, Nadav Amit wrote:
> On Oct 10, 2022, at 12:24 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:
> 
>> Hello,
>>
>> On 10.10.22 9:18, Nadav Amit wrote:
>>> On Oct 7, 2022, at 3:58 AM, Alexander Atanasov <alexander.atanasov@virtuozzo.com> wrote:>

[snip]

>>> Side-note: That’s not the case for VMware balloon. I actually considered
>>> calling adjust_managed_page_count() just to conform with other balloon
>>> drivers. But since we use totalram_pages() to communicate to the hypervisor
>>> the total-ram, this would create endless (and wrong) feedback loop. I am not
>>> claiming it is not possible to VMware balloon driver to call
>>> adjust_managed_page_count(), but the chances are that it would create more
>>> harm than good.
>>
>> Virtio does both - depending on the deflate on OOM option. I suggested already to unify all drivers to inflate the used memory as it seems more logical to me since  no body expects the totalram_pages() to change but the current state is that both ways are accepted and if changed can break existing users.
>> See  discussion here https://lore.kernel.org/lkml/20220809095358.2203355-1-alexander.atanasov@virtuozzo.com/.
> 
> Thanks for the reminder. I wish you can somehow summarize all of that into the
> cover-letter and/or the commit messages for these patches.


I will put excerpts in the next versions and relevant links in the next 
versions. I see that the more i dig into it the deeper it becomes so it 
needs more explanations.


>>
>>
>>> Back to the matter at hand. It seems that you wish that the notifiers would
>>> be called following any changes that would be reflected in totalram_pages().
>>> So, doesn't it make more sense to call it from adjust_managed_page_count() ?
>>
>> It will hurt performance - all drivers work page by page , i.e. they update by +1/-1 and they do so under locks which as you already noted can lead to bad things. The notifier will accumulate the change and let its user know how much changed, so the can decide if they have to recalculate - it even can do so async in order to not disturb the drivers.
> 
> So updating the counters by 1 is ok (using atomic operation, which is not
> free)? And the reason it is (relatively) cheap is because nobody actually
> looks at the value (i.e., nobody actually acts on the value)?
> 
> If nobody considers the value, then doesn’t it make sense just to update it
> less frequently, and then call the notifiers?

That's my point too.
The drivers update managed page count by 1.
My goal is  when they are done to fire the notifier.

All drivers are similiar and work like this:
HV sends request inflate up/down
   driver up/down
     lock
       get_page()/put_page()
       optionally - adjust_managed_page_count(... +-1);
     unlock
    update_core and notify_balloon_changed

>>>> The difference is here:
>>>>
>>>> mm/zswap.c:     return totalram_pages() * zswap_max_pool_percent / 100 <
>>>> mm/zswap.c:     return totalram_pages() * zswap_accept_thr_percent / 100
>>>> uses percents and you can recalculate easy with
>>>>
>>>> +static inline unsigned long totalram_pages_current(void)
>>>> +{
>>>> +       unsigned long inflated = 0;
>>>> +#ifdef CONFIG_MEMORY_BALLOON
>>>> +       extern atomic_long_t mem_balloon_inflated_free_kb;
>>>> +       inflated = atomic_long_read(&mem_balloon_inflated_free_kb);
>>>> +       inflated >>= (PAGE_SHIFT - 10);
>>>> +#endif
>>>> +       return (unsigned long)atomic_long_read(&_totalram_pages) - inflated;
>>>> +}
> 
> So we have here two values and it appears there is a hidden assumption that
> they are both updated atomically. Otherwise, it appears, inflated
> theoretically might be greater that _totalram_pages dn we get negative value
> and all hell breaks loose.
> 
> But _totalram_pages and mem_balloon_inflated_free_kb are not updated
> atomically together (each one is, but not together).
> 

I do not think that can happen - in that case totalram_pages() is not 
adjusted and you can never inflate more than total ram.

Yes, they are not set atomic but see the use cases:

- a driver that does calculations on init.
   It will use notifier to redo the calculations.
   The notifier will bring the values and the size of change to help the 
driver decide if it needs to recalculate.

- a user of totalram_pages() that does calculations at run time -
   i have to research are there any users that could be affected by not 
setting the two values atomicaly - assuming there can be a slight 
difference. I.e. do we need precise calculations or they are calculating 
fractions.


>>>> And you are good when you switch to _current version - si_meminfo_current is alike .
>>>>
>>>> On init (probably) all use some kind of fractions to calculate but when there is a set value via /proc/sys/net/ipv4/tcp_wmem for example it is just a value and you can not recalculate it. And here, please, share your ideas how to solve this.
>>> I don’t get all of that. Now that you provided some more explanations, it
>>> sounds that what you want is adjust_managed_page_count(), which we already
>>> have and affects the output of totalram_pages(). Therefore, totalram_pages()
>>> anyhow accounts for the balloon memory (excluding VMware’s). So why do we
>>> need to take mem_balloon_inflated_free_kb into account?
>> Ok, you have this:
>>                                    / totalram
>> |----used----|b1|----free------|b2|
>>
>> drivers can inflate both b1 and b2 - b1 free gets smaller, b2 totalram pages get smaller. so when you need totalram_pages() to do a calculation you need to adjust it with the pages that are inflated in free/used (b1). VMWare is not exception , Virtio does the same.
>> And according to to mst and davidh it is okay like this.
>> So i am proposing a way to handle both cases.
> 
> Ugh. What about BALLOON_INFLATE and BALLOON_DEFLATE vm-events? Can’t this
> information be used instead of yet another counter? Unless, of course, you
> get the atomicity that I mentioned before.

What do you mean by vm-events ?


>>> Sounds to me that all you want is some notifier to be called from
>>> adjust_managed_page_count(). What am I missing?
>>
>> Notifier will act as an accumulator to report size of change and it will make things easier for the drivers and users wrt locking.
>> Notifier is similar to the memory hotplug notifier.
> 
> Overall, I am not convinced that there is any value of separating the value
> and the notifier. You can batch both or not batch both. In addition, as I
> mentioned, having two values seems racy.

I have identified two users so far above - may be more to come.
One type needs the value to adjust. Also having the value is necessary 
to report it to users and oom. There are options with callbacks and so 
on but it will complicate things with no real gain. You are right about 
the atomicity but i guess if that's a problem for some user it could 
find a way to ensure it. i am yet to find such place.

-- 
Regards,
Alexander Atanasov


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-11  9:07                 ` Alexander Atanasov
@ 2022-10-11  9:23                   ` David Hildenbrand
  2022-10-14 12:50                     ` Alexander Atanasov
  0 siblings, 1 reply; 21+ messages in thread
From: David Hildenbrand @ 2022-10-11  9:23 UTC (permalink / raw)
  To: Alexander Atanasov, Nadav Amit
  Cc: Michael S. Tsirkin, Andrew Morton, kernel, Linux Virtualization,
	LKML, Linux MM

>>>> Sounds to me that all you want is some notifier to be called from
>>>> adjust_managed_page_count(). What am I missing?
>>>
>>> Notifier will act as an accumulator to report size of change and it will make things easier for the drivers and users wrt locking.
>>> Notifier is similar to the memory hotplug notifier.
>>
>> Overall, I am not convinced that there is any value of separating the value
>> and the notifier. You can batch both or not batch both. In addition, as I
>> mentioned, having two values seems racy.
> 
> I have identified two users so far above - may be more to come.
> One type needs the value to adjust. Also having the value is necessary
> to report it to users and oom. There are options with callbacks and so
> on but it will complicate things with no real gain. You are right about
> the atomicity but i guess if that's a problem for some user it could
> find a way to ensure it. i am yet to find such place.
> 

I haven't followed the whole discussion, but I just wanted to raise that 
having a generic mechanism to notify on such changes could be valuable.

For example, virtio-mem also uses adjust_managed_page_count() and might 
sometimes not trigger memory hotplug notifiers when adding more memory 
(essentially, when it fake-adds memory part of an already added Linux 
memory block).

What might make sense is schedule some kind of deferred notification on 
adjust_managed_page_count() changes. This way, we could notify without 
caring about locking and would naturally batch notifications.

adjust_managed_page_count() users would not require changes.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-11  9:23                   ` David Hildenbrand
@ 2022-10-14 12:50                     ` Alexander Atanasov
  2022-10-14 13:01                       ` David Hildenbrand
  0 siblings, 1 reply; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-14 12:50 UTC (permalink / raw)
  To: David Hildenbrand, Nadav Amit
  Cc: Michael S. Tsirkin, Andrew Morton, kernel, Linux Virtualization,
	LKML, Linux MM

Hello,

On 11.10.22 12:23, David Hildenbrand wrote:
>>>>> Sounds to me that all you want is some notifier to be called from
>>>>> adjust_managed_page_count(). What am I missing?
>>>>
>>>> Notifier will act as an accumulator to report size of change and it 
>>>> will make things easier for the drivers and users wrt locking.
>>>> Notifier is similar to the memory hotplug notifier.
>>>
>>> Overall, I am not convinced that there is any value of separating the 
>>> value
>>> and the notifier. You can batch both or not batch both. In addition, 
>>> as I
>>> mentioned, having two values seems racy.
>>
>> I have identified two users so far above - may be more to come.
>> One type needs the value to adjust. Also having the value is necessary
>> to report it to users and oom. There are options with callbacks and so
>> on but it will complicate things with no real gain. You are right about
>> the atomicity but i guess if that's a problem for some user it could
>> find a way to ensure it. i am yet to find such place.
>>
> 
> I haven't followed the whole discussion, but I just wanted to raise that 
> having a generic mechanism to notify on such changes could be valuable.
> 
> For example, virtio-mem also uses adjust_managed_page_count() and might 
> sometimes not trigger memory hotplug notifiers when adding more memory 
> (essentially, when it fake-adds memory part of an already added Linux 
> memory block).
> 
> What might make sense is schedule some kind of deferred notification on 
> adjust_managed_page_count() changes. This way, we could notify without 
> caring about locking and would naturally batch notifications.
> 
> adjust_managed_page_count() users would not require changes.

Making it deferred will bring issues for both the users of the 
adjust_managed_page_count and the receivers of the notification - 
locking as first. And it is hard to know when the adjustment will 
finish, some of the drivers wait and retry in blocks. It will bring 
complexity and it will not be possible to convert users in small steps.

Other problem is that there are drivers that do not use 
adjust_managed_page_count().

Extending the current memory hotplug notifier is not an option too much 
things can break.

I think some new option CONFIG_DYN_MEM_CONSTRAINTS or a better name.
Add a dynmem_notifier that bring size changes to the subscribers.
CONFIG_DYN_MEM_CONSTRAINTS to be able to convert current __init 
functions without growing kernel in size when not using it.

One idea about the absolut values in sysfs/proc is to check if the 
option is set by the user or it is the same as was calculated at init 
time - if it is not set by the user it can be adjusted up/down by 
percent of the change in memory - it may require min/max value boundaries.

Does anyone know - if it is possible to know if a sysfs/proc file has 
been written to since boot?


-- 
Regards,
Alexander Atanasov


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-14 12:50                     ` Alexander Atanasov
@ 2022-10-14 13:01                       ` David Hildenbrand
  2022-10-14 13:33                         ` Alexander Atanasov
  0 siblings, 1 reply; 21+ messages in thread
From: David Hildenbrand @ 2022-10-14 13:01 UTC (permalink / raw)
  To: Alexander Atanasov, Nadav Amit
  Cc: Michael S. Tsirkin, Andrew Morton, kernel, Linux Virtualization,
	LKML, Linux MM

On 14.10.22 14:50, Alexander Atanasov wrote:
> Hello,
> 
> On 11.10.22 12:23, David Hildenbrand wrote:
>>>>>> Sounds to me that all you want is some notifier to be called from
>>>>>> adjust_managed_page_count(). What am I missing?
>>>>>
>>>>> Notifier will act as an accumulator to report size of change and it
>>>>> will make things easier for the drivers and users wrt locking.
>>>>> Notifier is similar to the memory hotplug notifier.
>>>>
>>>> Overall, I am not convinced that there is any value of separating the
>>>> value
>>>> and the notifier. You can batch both or not batch both. In addition,
>>>> as I
>>>> mentioned, having two values seems racy.
>>>
>>> I have identified two users so far above - may be more to come.
>>> One type needs the value to adjust. Also having the value is necessary
>>> to report it to users and oom. There are options with callbacks and so
>>> on but it will complicate things with no real gain. You are right about
>>> the atomicity but i guess if that's a problem for some user it could
>>> find a way to ensure it. i am yet to find such place.
>>>
>>
>> I haven't followed the whole discussion, but I just wanted to raise that
>> having a generic mechanism to notify on such changes could be valuable.
>>
>> For example, virtio-mem also uses adjust_managed_page_count() and might
>> sometimes not trigger memory hotplug notifiers when adding more memory
>> (essentially, when it fake-adds memory part of an already added Linux
>> memory block).
>>
>> What might make sense is schedule some kind of deferred notification on
>> adjust_managed_page_count() changes. This way, we could notify without
>> caring about locking and would naturally batch notifications.
>>
>> adjust_managed_page_count() users would not require changes.
> 
> Making it deferred will bring issues for both the users of the
> adjust_managed_page_count and the receivers of the notification -
> locking as first. And it is hard to know when the adjustment will
> finish, some of the drivers wait and retry in blocks. It will bring
> complexity and it will not be possible to convert users in small steps.

What exactly is the issue about handling that deferred? Who needs an 
immediate, 100% precise notification?

Locking from a separate workqueue shouldn't be too hard, or what am i 
missing?

> 
> Other problem is that there are drivers that do not use
> adjust_managed_page_count().

Which ones? Do we care?

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-14 13:01                       ` David Hildenbrand
@ 2022-10-14 13:33                         ` Alexander Atanasov
  2022-10-14 13:40                           ` David Hildenbrand
  0 siblings, 1 reply; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-14 13:33 UTC (permalink / raw)
  To: David Hildenbrand, Nadav Amit
  Cc: Michael S. Tsirkin, Andrew Morton, kernel, Linux Virtualization,
	LKML, Linux MM

On 14.10.22 16:01, David Hildenbrand wrote:
> On 14.10.22 14:50, Alexander Atanasov wrote:
>> Hello,
>>
>> On 11.10.22 12:23, David Hildenbrand wrote:
>>>>>>> Sounds to me that all you want is some notifier to be called from
>>>>>>> adjust_managed_page_count(). What am I missing?
>>>>>>
>>>>>> Notifier will act as an accumulator to report size of change and it
>>>>>> will make things easier for the drivers and users wrt locking.
>>>>>> Notifier is similar to the memory hotplug notifier.
>>>>>
>>>>> Overall, I am not convinced that there is any value of separating the
>>>>> value
>>>>> and the notifier. You can batch both or not batch both. In addition,
>>>>> as I
>>>>> mentioned, having two values seems racy.
>>>>
>>>> I have identified two users so far above - may be more to come.
>>>> One type needs the value to adjust. Also having the value is necessary
>>>> to report it to users and oom. There are options with callbacks and so
>>>> on but it will complicate things with no real gain. You are right about
>>>> the atomicity but i guess if that's a problem for some user it could
>>>> find a way to ensure it. i am yet to find such place.
>>>>
>>>
>>> I haven't followed the whole discussion, but I just wanted to raise that
>>> having a generic mechanism to notify on such changes could be valuable.
>>>
>>> For example, virtio-mem also uses adjust_managed_page_count() and might
>>> sometimes not trigger memory hotplug notifiers when adding more memory
>>> (essentially, when it fake-adds memory part of an already added Linux
>>> memory block).
>>>
>>> What might make sense is schedule some kind of deferred notification on
>>> adjust_managed_page_count() changes. This way, we could notify without
>>> caring about locking and would naturally batch notifications.
>>>
>>> adjust_managed_page_count() users would not require changes.
>>
>> Making it deferred will bring issues for both the users of the
>> adjust_managed_page_count and the receivers of the notification -
>> locking as first. And it is hard to know when the adjustment will
>> finish, some of the drivers wait and retry in blocks. It will bring
>> complexity and it will not be possible to convert users in small steps.
> 
> What exactly is the issue about handling that deferred? Who needs an 
> immediate, 100% precise notification? >
> Locking from a separate workqueue shouldn't be too hard, or what am i 
> missing?
>

We do not need immediate but most of the current callers of 
adjust_managed_page_count work in +1/-1 updates - so we want to defer 
the notification until they are done with changes. Deferring to a wq is 
not the problem, it would need to be done most likely.


>>
>> Other problem is that there are drivers that do not use
>> adjust_managed_page_count().
> 
> Which ones? Do we care?

VMWare and Virtio balloon drivers. I recently proposed to unify them and 
the objection was that it would break existing users - which is valid so 
we must care i guess.

-- 
Regards,
Alexander Atanasov


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-14 13:33                         ` Alexander Atanasov
@ 2022-10-14 13:40                           ` David Hildenbrand
  2022-10-14 14:10                             ` Alexander Atanasov
  0 siblings, 1 reply; 21+ messages in thread
From: David Hildenbrand @ 2022-10-14 13:40 UTC (permalink / raw)
  To: Alexander Atanasov, Nadav Amit
  Cc: Michael S. Tsirkin, Andrew Morton, kernel, Linux Virtualization,
	LKML, Linux MM

>>>
>>> Other problem is that there are drivers that do not use
>>> adjust_managed_page_count().
>>
>> Which ones? Do we care?
> 
> VMWare and Virtio balloon drivers. I recently proposed to unify them and
> the objection was that it would break existing users - which is valid so
> we must care i guess.

I'm confused, I think we care about actual adjustment of the total pages 
available here, that we want to notify the system about. These 
approaches (vmware, virtio-balloon with deflate-on-oom) don't adjust 
totalpages, because the assumption is that we can get back the inflated 
memory any time we really need it automatically.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: RFC [PATCH v4 2/7] Enable balloon drivers to report inflated memory
  2022-10-14 13:40                           ` David Hildenbrand
@ 2022-10-14 14:10                             ` Alexander Atanasov
  0 siblings, 0 replies; 21+ messages in thread
From: Alexander Atanasov @ 2022-10-14 14:10 UTC (permalink / raw)
  To: David Hildenbrand, Nadav Amit
  Cc: Michael S. Tsirkin, Andrew Morton, kernel, Linux Virtualization,
	LKML, Linux MM

Hello,

On 14.10.22 16:40, David Hildenbrand wrote:
>>>>
>>>> Other problem is that there are drivers that do not use
>>>> adjust_managed_page_count().
>>>
>>> Which ones? Do we care?
>>
>> VMWare and Virtio balloon drivers. I recently proposed to unify them and
>> the objection was that it would break existing users - which is valid so
>> we must care i guess.
> 
> I'm confused, I think we care about actual adjustment of the total pages 
> available here, that we want to notify the system about. These 
> approaches (vmware, virtio-balloon with deflate-on-oom) don't adjust 
> totalpages, because the assumption is that we can get back the inflated 
> memory any time we really need it automatically.

We want to notify about the actual adjustment of available pages no 
matter where they are accounted free or total. Users don't care where 
the ram came from or has gone. They need the total change, so they can 
decided if they need to recalculate.

The example i wrote earlier:
Kernel boots with 4GB.
Balloon takes back 2GB.
epoll_events allocated 4% of the total memory at boot.
For simpler math after total ram is reduced to 2GB, that 4% become 
really 8% of the total ram.
We want to tell epoll that there is 2GB change in total ram, so it can 
update to really use 4%.

Reverse direction is true too - you hot plug 4GB and the 4% become just 
2% so you are not using newly available ram optimally.

And it is not only about epoll.

When not recalculated/updated this allocations/limits/etc the balance of 
memory usage between userspace and kernel gets a bit off, and i think 
not a bit but a way off.

About the assumption drivers can get back the ram at anytime - if they 
use the oom_notifier - they can update the totalpages without problem. 
oom_killer doesn't check totalram_pages() but tries to free memory with 
the notifier and only if it fails totalram_pages() is consulted.

-- 
Regards,
Alexander Atanasov


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2022-10-14 14:10 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20221005090158.2801592-1-alexander.atanasov@virtuozzo.com>
2022-10-05  9:01 ` [PATCH v4 1/7] Make place for common balloon code Alexander Atanasov
2022-10-05  9:01 ` [PATCH v4 2/7] Enable balloon drivers to report inflated memory Alexander Atanasov
2022-10-05 17:25   ` Nadav Amit
2022-10-06  7:34     ` Alexander Atanasov
2022-10-06 21:07       ` Nadav Amit
2022-10-07 10:58         ` RFC " Alexander Atanasov
2022-10-10  6:18           ` Nadav Amit
2022-10-10  7:24             ` Alexander Atanasov
2022-10-10 14:47               ` Nadav Amit
2022-10-11  9:07                 ` Alexander Atanasov
2022-10-11  9:23                   ` David Hildenbrand
2022-10-14 12:50                     ` Alexander Atanasov
2022-10-14 13:01                       ` David Hildenbrand
2022-10-14 13:33                         ` Alexander Atanasov
2022-10-14 13:40                           ` David Hildenbrand
2022-10-14 14:10                             ` Alexander Atanasov
2022-10-05  9:01 ` [PATCH v4 3/7] Display inflated memory to users Alexander Atanasov
2022-10-05  9:01 ` [PATCH v4 4/7] drivers: virtio: balloon - update inflated memory Alexander Atanasov
2022-10-05  9:01 ` [PATCH v4 5/7] Display inflated memory in logs Alexander Atanasov
2022-10-05  9:01 ` [PATCH v4 6/7] drivers: vmware: balloon - report inflated memory Alexander Atanasov
2022-10-05  9:01 ` [PATCH v4 7/7] drivers: hyperv: " Alexander Atanasov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).