From: Roman Gushchin <guro@fb.com>
To: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>, Christoph Lameter <cl@linux.com>,
Andrew Morton <akpm@linux-foundation.org>, <linux-mm@kvack.org>,
<linux-kernel@vger.kernel.org>, Roman Gushchin <guro@fb.com>
Subject: [PATCH rfc 3/4] percpu: on demand chunk depopulation
Date: Wed, 24 Mar 2021 12:06:25 -0700 [thread overview]
Message-ID: <20210324190626.564297-4-guro@fb.com> (raw)
In-Reply-To: <20210324190626.564297-1-guro@fb.com>
To return unused memory to the system, schedule an async
depopulation of percpu chunks.
To balance between scanning too much (which creates overhead due to
pcpu_lock contention) and scanning too little, track the number of
chunks to scan and mark chunks which are potentially good targets
for depopulation with a new boolean flag. The async depopulation
work clears the flag after trying to depopulate a chunk
(successfully or not).
This commit uses the following logic: a chunk is a good target
for depopulation if it
1) has more than 1/4 of its total pages free and populated,
2) isn't the reserved chunk,
3) isn't entirely free,
4) isn't alone in the corresponding slot.
If there are two or more such chunks, an async depopulation
is scheduled.
Because chunk population and depopulation are opposite processes
which make little sense together, split out the growing part of
pcpu_balance_populated() into pcpu_grow_populated() and make
pcpu_balance_populated() call into pcpu_grow_populated() or
pcpu_shrink_populated() conditionally.
Signed-off-by: Roman Gushchin <guro@fb.com>
---
mm/percpu-internal.h | 1 +
mm/percpu.c | 111 ++++++++++++++++++++++++++++++++-----------
2 files changed, 85 insertions(+), 27 deletions(-)
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 18b768ac7dca..1c5b92af02eb 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -67,6 +67,7 @@ struct pcpu_chunk {
void *data; /* chunk data */
bool immutable; /* no [de]population allowed */
+ bool depopulate; /* depopulation hint */
int start_offset; /* the overlap with the previous
region to have a page aligned
base_addr */
diff --git a/mm/percpu.c b/mm/percpu.c
index 015d076893f5..148137f0fc0b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -178,6 +178,12 @@ static LIST_HEAD(pcpu_map_extend_chunks);
*/
int pcpu_nr_empty_pop_pages;
+/*
+ * Track the number of chunks with a lot of free memory.
+ * It's used to release unused pages to the system.
+ */
+static int pcpu_nr_chunks_to_depopulate;
+
/*
* The number of populated pages in use by the allocator, protected by
* pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets
@@ -1955,6 +1961,11 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
+ if (chunk->depopulate) {
+ chunk->depopulate = false;
+ pcpu_nr_chunks_to_depopulate--;
+ }
+
list_move(&chunk->list, &to_free);
}
@@ -1976,7 +1987,7 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
}
/**
- * pcpu_balance_populated - manage the amount of populated pages
+ * pcpu_grow_populated - populate chunk(s) to satisfy atomic allocations
* @type: chunk type
*
* Maintain a certain amount of populated pages to satisfy atomic allocations.
@@ -1985,35 +1996,15 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
* allocation causes the failure as it is possible that requests can be
* serviced from already backed regions.
*/
-static void pcpu_balance_populated(enum pcpu_chunk_type type)
+static void pcpu_grow_populated(enum pcpu_chunk_type type, int nr_to_pop)
{
/* gfp flags passed to underlying allocators */
const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
struct list_head *pcpu_slot = pcpu_chunk_list(type);
struct pcpu_chunk *chunk;
- int slot, nr_to_pop, ret;
+ int slot, ret;
- /*
- * Ensure there are certain number of free populated pages for
- * atomic allocs. Fill up from the most packed so that atomic
- * allocs don't increase fragmentation. If atomic allocation
- * failed previously, always populate the maximum amount. This
- * should prevent atomic allocs larger than PAGE_SIZE from keeping
- * failing indefinitely; however, large atomic allocs are not
- * something we support properly and can be highly unreliable and
- * inefficient.
- */
retry_pop:
- if (pcpu_atomic_alloc_failed) {
- nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
- /* best effort anyway, don't worry about synchronization */
- pcpu_atomic_alloc_failed = false;
- } else {
- nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
- pcpu_nr_empty_pop_pages,
- 0, PCPU_EMPTY_POP_PAGES_HIGH);
- }
-
for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
unsigned int nr_unpop = 0, rs, re;
@@ -2084,9 +2075,18 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
bool isolated = false;
- if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH)
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH ||
+ pcpu_nr_chunks_to_depopulate < 1)
break;
+ /*
+ * Don't try to depopulate a chunk again and again.
+ */
+ if (!chunk->depopulate)
+ continue;
+ chunk->depopulate = false;
+ pcpu_nr_chunks_to_depopulate--;
+
for (i = 0, start = -1; i < chunk->nr_pages; i++) {
if (!chunk->nr_empty_pop_pages)
break;
@@ -2153,6 +2153,41 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
spin_unlock_irq(&pcpu_lock);
}
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ * @type: chunk type
+ *
+ * Populate or depopulate chunks to maintain a certain amount
+ * of free pages to satisfy atomic allocations, but not waste
+ * large amounts of memory.
+ */
+static void pcpu_balance_populated(enum pcpu_chunk_type type)
+{
+ int nr_to_pop;
+
+ /*
+ * Ensure there are certain number of free populated pages for
+ * atomic allocs. Fill up from the most packed so that atomic
+ * allocs don't increase fragmentation. If atomic allocation
+ * failed previously, always populate the maximum amount. This
+ * should prevent atomic allocs larger than PAGE_SIZE from keeping
+ * failing indefinitely; however, large atomic allocs are not
+ * something we support properly and can be highly unreliable and
+ * inefficient.
+ */
+ if (pcpu_atomic_alloc_failed) {
+ nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+ /* best effort anyway, don't worry about synchronization */
+ pcpu_atomic_alloc_failed = false;
+ pcpu_grow_populated(type, nr_to_pop);
+ } else if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+ nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages;
+ pcpu_grow_populated(type, nr_to_pop);
+ } else if (pcpu_nr_chunks_to_depopulate > 0) {
+ pcpu_shrink_populated(type);
+ }
+}
+
/**
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
@@ -2188,6 +2223,7 @@ void free_percpu(void __percpu *ptr)
int size, off;
bool need_balance = false;
struct list_head *pcpu_slot;
+ struct pcpu_chunk *pos;
if (!ptr)
return;
@@ -2207,15 +2243,36 @@ void free_percpu(void __percpu *ptr)
pcpu_memcg_free_hook(chunk, off, size);
- /* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_bytes == pcpu_unit_size) {
- struct pcpu_chunk *pos;
-
+ /*
+ * If there are more than one fully free chunks,
+ * wake up grim reaper.
+ */
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
if (pos != chunk) {
need_balance = true;
break;
}
+
+ } else if (chunk->nr_empty_pop_pages > chunk->nr_pages / 4) {
+ /*
+ * If there is more than one chunk in the slot and
+ * at least 1/4 of its pages are empty, mark the chunk
+ * as a target for the depopulation. If there is more
+ * than one chunk like this, schedule an async balancing.
+ */
+ int nslot = pcpu_chunk_slot(chunk);
+
+ list_for_each_entry(pos, &pcpu_slot[nslot], list)
+ if (pos != chunk && !chunk->depopulate &&
+ !chunk->immutable) {
+ chunk->depopulate = true;
+ pcpu_nr_chunks_to_depopulate++;
+ break;
+ }
+
+ if (pcpu_nr_chunks_to_depopulate > 1)
+ need_balance = true;
}
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
--
2.30.2
2021-03-24 19:06 [PATCH rfc 0/4] percpu: partial chunk depopulation Roman Gushchin
2021-03-24 19:06 ` [PATCH rfc 1/4] percpu: implement " Roman Gushchin
2021-03-29 17:20 ` Dennis Zhou
2021-03-29 18:29 ` Roman Gushchin
2021-03-29 19:28 ` Dennis Zhou
2021-03-29 19:40 ` Roman Gushchin
2021-03-24 19:06 ` [PATCH rfc 2/4] percpu: split __pcpu_balance_workfn() Roman Gushchin
2021-03-29 17:28 ` Dennis Zhou
2021-03-29 18:20 ` Roman Gushchin
2021-03-24 19:06 ` Roman Gushchin [this message]
2021-03-29 8:37 ` [percpu] 28c9dada65: invoked_oom-killer:gfp_mask=0x kernel test robot
2021-03-29 18:19 ` Roman Gushchin
2021-03-29 19:21 ` [PATCH rfc 3/4] percpu: on demand chunk depopulation Dennis Zhou
2021-03-29 20:10 ` Roman Gushchin
2021-03-29 23:12 ` Dennis Zhou
2021-03-30 1:04 ` Roman Gushchin
2021-03-24 19:06 ` [PATCH rfc 4/4] percpu: fix a comment about the chunks ordering Roman Gushchin