* [PATCH v2 1/2] mm/rmap: provide folio_referenced with the options to try_lock or lock
2024-03-08 3:11 [PATCH v2 0/2] reclaim contended folios asynchronously instead of promoting them lipeifeng
@ 2024-03-08 3:11 ` lipeifeng
2024-03-08 3:11 ` [PATCH v2 2/2] mm: vmscan: reclaim contended folios asynchronously instead of promoting them lipeifeng
2024-03-08 4:56 ` [PATCH v2 0/2] " Matthew Wilcox
2 siblings, 0 replies; 7+ messages in thread
From: lipeifeng @ 2024-03-08 3:11 UTC (permalink / raw)
To: lipeifeng, 21cnbao, akpm, david, osalvador, willy
Cc: linux-mm, linux-kernel, Barry Song
From: Peifeng Li <lipeifeng@oppo.com>
The commit 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path")
unconditionally switches to try_lock to avoid lock contention. This patch
introduces a parameter to allow folio_referenced to genuinely wait and
hold the lock in certain scenarios.
Until a caller that needs to wait for the lock is introduced, every call
site sets rw_try_lock to true, so this patch does not change current behavior.
Signed-off-by: Peifeng Li <lipeifeng@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
include/linux/rmap.h | 5 +++--
mm/rmap.c | 5 +++--
mm/vmscan.c | 16 ++++++++++++++--
3 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b7944a833668..846b2617a9f2 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -623,7 +623,8 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
* Called from mm/vmscan.c to handle paging out
*/
int folio_referenced(struct folio *, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags);
+ struct mem_cgroup *memcg, unsigned long *vm_flags,
+ unsigned int rw_try_lock);
void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);
@@ -739,7 +740,7 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
static inline int folio_referenced(struct folio *folio, int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags, unsigned int rw_try_lock)
{
*vm_flags = 0;
return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index 3746a5531018..7d01f81ca587 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -952,6 +952,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
* @is_locked: Caller holds lock on the folio.
* @memcg: target memory cgroup
* @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
+ * @rw_try_lock: nonzero to only trylock the rmap lock in rmap_walk, zero to wait
*
* Quick test_and_clear_referenced for all mappings of a folio,
*
@@ -959,7 +960,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
* the function bailed out due to rmap lock contention.
*/
int folio_referenced(struct folio *folio, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags)
+ struct mem_cgroup *memcg, unsigned long *vm_flags, unsigned int rw_try_lock)
{
int we_locked = 0;
struct folio_referenced_arg pra = {
@@ -970,7 +971,7 @@ int folio_referenced(struct folio *folio, int is_locked,
.rmap_one = folio_referenced_one,
.arg = (void *)&pra,
.anon_lock = folio_lock_anon_vma_read,
- .try_lock = true,
+ .try_lock = rw_try_lock ? true : false,
.invalid_vma = invalid_folio_referenced_vma,
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a0e53999a865..509b5e0dffd3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,6 +147,9 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
+ /* if try_lock in rmap_walk */
+ unsigned int rw_try_lock:1;
+
/* Allocation order */
s8 order;
@@ -850,7 +853,7 @@ static enum folio_references folio_check_references(struct folio *folio,
unsigned long vm_flags;
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
- &vm_flags);
+ &vm_flags, sc->rw_try_lock);
referenced_folio = folio_test_clear_referenced(folio);
/*
@@ -1522,6 +1525,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
+ .rw_try_lock = 1,
};
struct reclaim_stat stat;
unsigned int nr_reclaimed;
@@ -2059,7 +2063,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
/* Referenced or rmap lock contention: rotate */
if (folio_referenced(folio, 0, sc->target_mem_cgroup,
- &vm_flags) != 0) {
+ &vm_flags, sc->rw_try_lock) != 0) {
/*
* Identify referenced, file-backed active folios and
* give them one more trip around the active list. So
@@ -2114,6 +2118,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.may_unmap = 1,
.may_swap = 1,
.no_demotion = 1,
+ .rw_try_lock = 1,
};
nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
@@ -5459,6 +5464,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.may_swap = true,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
+ .rw_try_lock = 1,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -6436,6 +6442,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
+ .rw_try_lock = 1,
};
/*
@@ -6481,6 +6488,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.may_unmap = 1,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
+ .rw_try_lock = 1,
};
WARN_ON_ONCE(!current->reclaim_state);
@@ -6527,6 +6535,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .rw_try_lock = 1,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -6788,6 +6797,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.gfp_mask = GFP_KERNEL,
.order = order,
.may_unmap = 1,
+ .rw_try_lock = 1,
};
set_task_reclaim_state(current, &sc.reclaim_state);
@@ -7257,6 +7267,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.may_unmap = 1,
.may_swap = 1,
.hibernation_mode = 1,
+ .rw_try_lock = 1,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
unsigned long nr_reclaimed;
@@ -7415,6 +7426,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
+ .rw_try_lock = 1,
};
unsigned long pflags;
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v2 2/2] mm: vmscan: reclaim contended folios asynchronously instead of promoting them
2024-03-08 3:11 [PATCH v2 0/2] reclaim contended folios asynchronously instead of promoting them lipeifeng
2024-03-08 3:11 ` [PATCH v2 1/2] mm/rmap: provide folio_referenced with the options to try_lock or lock lipeifeng
@ 2024-03-08 3:11 ` lipeifeng
2024-03-08 4:56 ` [PATCH v2 0/2] " Matthew Wilcox
2 siblings, 0 replies; 7+ messages in thread
From: lipeifeng @ 2024-03-08 3:11 UTC (permalink / raw)
To: lipeifeng, 21cnbao, akpm, david, osalvador, willy
Cc: linux-mm, linux-kernel, Barry Song
From: Peifeng Li <lipeifeng@oppo.com>
Commit 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path")
prevents the reclaim path from becoming stuck on the rmap lock. However,
it reinserts those folios at the head of the LRU during shrink_folio_list,
even if those folios are very cold.
While running an Android phone with 6GiB memory for 2 hours, I observed
that 321728 folios were incorrectly placed back at the head of the
inactive LRU due to lock contention, which amounts to approximately 44
folios per second. Similarly, the same test conducted on 4GiB phones
shows that 106 folios are improperly promoted per second. This can
have a detrimental effect on performance by increasing refaults and
the likelihood of OOM (Out of Memory) killing.
For this reason, the patch introduces a separate list for contended folios
and wakes up a new kernel thread, kshrinkd, to reclaim them asynchronously,
thus preventing excessive violations of LRU rules. This new thread will
set try_lock to false and always wait until it holds the lock.
Below is some data collected from two phones running monkey for two
hours (lower is better):
Phone with 6GiB memory:
w/o patch w/patch delta
workingset_refault 1451043114 1408015823 -2.9%
lmkd count 9231 9009 -2.4%
Phone with 4GiB memory:
w/o patch w/patch delta
workingset_refault 2674649801 2581150132 -3.4%
lmkd count 13800 13061 -5.3%
The Monkey is a program that runs on your emulator or device and generates
pseudo-random streams of user events such as clicks, touches, or gestures,
as well as a number of system-level events.
The Android low memory killer daemon (lmkd) process monitors the memory
state of a running Android system and reacts to high memory pressure by
killing the least essential processes to keep the system performing at
acceptable levels.
Signed-off-by: Peifeng Li <lipeifeng@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
include/linux/mmzone.h | 6 ++
include/linux/swap.h | 3 +
include/linux/vm_event_item.h | 2 +
mm/memory_hotplug.c | 2 +
mm/vmscan.c | 189 +++++++++++++++++++++++++++++++++-
mm/vmstat.c | 2 +
6 files changed, 201 insertions(+), 3 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..19acacf92cc9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1332,6 +1332,12 @@ typedef struct pglist_data {
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+ struct list_head kshrinkd_folios; /* rmap_walk contended folios list*/
+ spinlock_t kf_lock; /* Protect kshrinkd_folios list*/
+
+ struct task_struct *kshrinkd; /* reclaim kshrinkd_folios*/
+ wait_queue_head_t kshrinkd_wait;
+
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2955f7a78d8d..6d15b577b6a3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -438,6 +438,9 @@ void check_move_unevictable_folios(struct folio_batch *fbatch);
extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);
+extern void kshrinkd_run(int nid);
+extern void kshrinkd_stop(int nid);
+
#ifdef CONFIG_SWAP
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..ee95ab138c87 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,9 +38,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGLAZYFREED,
PGREFILL,
PGREUSE,
+ PGSTEAL_KSHRINKD,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
+ PGSCAN_KSHRINKD,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a444e2d7dd2b..5e1c326a8bde 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1218,6 +1218,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ kshrinkd_run(nid);
writeback_set_ratelimit();
@@ -2098,6 +2099,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
}
if (arg.status_change_nid >= 0) {
+ kshrinkd_stop(node);
kcompactd_stop(node);
kswapd_stop(node);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 509b5e0dffd3..ef540a520b47 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -150,6 +150,9 @@ struct scan_control {
/* if try_lock in rmap_walk */
unsigned int rw_try_lock:1;
+ /* hand folios to kshrinkd if the rmap_walk trylock is contended */
+ unsigned int need_kshrinkd:1;
+
/* Allocation order */
s8 order;
@@ -201,6 +204,17 @@ struct scan_control {
*/
int vm_swappiness = 60;
+/*
+ * Wake up kshrinkd to reclaim the folios whose rmap lock was contended
+ * in rmap_walk during shrink_folio_list, instead of putting them back
+ * at the head of the LRU, to avoid breaking the rules of LRU.
+ */
+static void wakeup_kshrinkd(struct pglist_data *pgdat)
+{
+ if (likely(pgdat->kshrinkd))
+ wake_up_interruptible(&pgdat->kshrinkd_wait);
+}
+
#ifdef CONFIG_MEMCG
/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
@@ -844,6 +858,7 @@ enum folio_references {
FOLIOREF_RECLAIM_CLEAN,
FOLIOREF_KEEP,
FOLIOREF_ACTIVATE,
+ FOLIOREF_LOCK_CONTENDED,
};
static enum folio_references folio_check_references(struct folio *folio,
@@ -864,8 +879,12 @@ static enum folio_references folio_check_references(struct folio *folio,
return FOLIOREF_ACTIVATE;
/* rmap lock contention: rotate */
- if (referenced_ptes == -1)
- return FOLIOREF_KEEP;
+ if (referenced_ptes == -1) {
+ if (sc->need_kshrinkd && folio_pgdat(folio)->kshrinkd)
+ return FOLIOREF_LOCK_CONTENDED;
+ else
+ return FOLIOREF_KEEP;
+ }
if (referenced_ptes) {
/*
@@ -1035,6 +1054,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
+ LIST_HEAD(contended_folios);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
@@ -1052,6 +1072,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
enum folio_references references = FOLIOREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
+ bool lock_contended = false;
cond_resched();
@@ -1193,6 +1214,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages;
goto keep_locked;
+ case FOLIOREF_LOCK_CONTENDED:
+ lock_contended = true;
+ goto keep_locked;
case FOLIOREF_RECLAIM:
case FOLIOREF_RECLAIM_CLEAN:
; /* try to reclaim the folio below */
@@ -1470,7 +1494,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
keep_locked:
folio_unlock(folio);
keep:
- list_add(&folio->lru, &ret_folios);
+ if (unlikely(lock_contended))
+ list_add(&folio->lru, &contended_folios);
+ else
+ list_add(&folio->lru, &ret_folios);
VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
folio_test_unevictable(folio), folio);
}
@@ -1512,6 +1539,14 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
free_unref_folios(&free_folios);
list_splice(&ret_folios, folio_list);
+
+ if (!list_empty(&contended_folios)) {
+ spin_lock_irq(&pgdat->kf_lock);
+ list_splice(&contended_folios, &pgdat->kshrinkd_folios);
+ spin_unlock_irq(&pgdat->kf_lock);
+ wakeup_kshrinkd(pgdat);
+ }
+
count_vm_events(PGACTIVATE, pgactivate);
if (plug)
@@ -1526,6 +1561,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
struct reclaim_stat stat;
unsigned int nr_reclaimed;
@@ -2119,6 +2155,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.may_swap = 1,
.no_demotion = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
@@ -5465,6 +5502,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -6443,6 +6481,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_unmap = 1,
.may_swap = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 1,
};
/*
@@ -6489,6 +6528,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
WARN_ON_ONCE(!current->reclaim_state);
@@ -6536,6 +6576,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -6798,6 +6839,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.order = order,
.may_unmap = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 1,
};
set_task_reclaim_state(current, &sc.reclaim_state);
@@ -7268,6 +7310,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.may_swap = 1,
.hibernation_mode = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
unsigned long nr_reclaimed;
@@ -7338,6 +7381,145 @@ static int __init kswapd_init(void)
module_init(kswapd_init)
+static int kshrinkd_should_run(pg_data_t *pgdat)
+{
+ int should_run;
+
+ spin_lock_irq(&pgdat->kf_lock);
+ should_run = !list_empty(&pgdat->kshrinkd_folios);
+ spin_unlock_irq(&pgdat->kf_lock);
+
+ return should_run;
+}
+
+static unsigned long kshrinkd_reclaim_folios(struct list_head *folio_list,
+ struct pglist_data *pgdat)
+{
+ struct reclaim_stat dummy_stat;
+ unsigned int nr_reclaimed = 0;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .no_demotion = 1,
+ .rw_try_lock = 0,
+ .need_kshrinkd = 0,
+ };
+
+ if (list_empty(folio_list))
+ return nr_reclaimed;
+
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+
+ return nr_reclaimed;
+}
+
+/*
+ * The background kshrink daemon, started as a kernel thread
+ * from the init process.
+ *
+ * kshrinkd reclaims the folios whose rmap lock was contended in
+ * rmap_walk during shrink_folio_list, instead of putting them back
+ * at the head of the LRU directly, to avoid breaking the rules of LRU.
+ */
+
+static int kshrinkd(void *p)
+{
+ pg_data_t *pgdat;
+ LIST_HEAD(tmp_contended_folios);
+
+ pgdat = (pg_data_t *)p;
+
+ current->flags |= PF_MEMALLOC | PF_KSWAPD;
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_putback = 0;
+
+ wait_event_freezable(pgdat->kshrinkd_wait,
+ kshrinkd_should_run(pgdat));
+
+ /* splice rmap_walk contended folios to tmp-list */
+ spin_lock_irq(&pgdat->kf_lock);
+ list_splice(&pgdat->kshrinkd_folios, &tmp_contended_folios);
+ INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
+ spin_unlock_irq(&pgdat->kf_lock);
+
+ /* reclaim rmap_walk contended folios */
+ nr_reclaimed = kshrinkd_reclaim_folios(&tmp_contended_folios, pgdat);
+ __count_vm_events(PGSTEAL_KSHRINKD, nr_reclaimed);
+
+ /* putback the folios which failed to reclaim to lru */
+ while (!list_empty(&tmp_contended_folios)) {
+ struct folio *folio = lru_to_folio(&tmp_contended_folios);
+
+ nr_putback += folio_nr_pages(folio);
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ }
+
+ __count_vm_events(PGSCAN_KSHRINKD, nr_reclaimed + nr_putback);
+ }
+
+ current->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
+
+ return 0;
+}
+
+/*
+ * This kshrinkd start function will be called by init and node-hot-add.
+ */
+void kshrinkd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (pgdat->kshrinkd)
+ return;
+
+ pgdat->kshrinkd = kthread_run(kshrinkd, pgdat, "kshrinkd%d", nid);
+ if (IS_ERR(pgdat->kshrinkd)) {
+ /* failure to start kshrinkd */
+ WARN_ON_ONCE(system_state < SYSTEM_RUNNING);
+ pr_err("Failed to start kshrinkd on node %d\n", nid);
+ pgdat->kshrinkd = NULL;
+ }
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * be holding mem_hotplug_begin/done().
+ */
+void kshrinkd_stop(int nid)
+{
+ struct task_struct *kshrinkd = NODE_DATA(nid)->kshrinkd;
+
+ if (kshrinkd) {
+ kthread_stop(kshrinkd);
+ NODE_DATA(nid)->kshrinkd = NULL;
+ }
+}
+
+static int __init kshrinkd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ spin_lock_init(&pgdat->kf_lock);
+ init_waitqueue_head(&pgdat->kshrinkd_wait);
+ INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
+
+ kshrinkd_run(nid);
+ }
+
+ return 0;
+}
+
+module_init(kshrinkd_init)
+
#ifdef CONFIG_NUMA
/*
* Node reclaim mode
@@ -7427,6 +7609,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
.rw_try_lock = 1,
+ .need_kshrinkd = 1,
};
unsigned long pflags;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db79935e4a54..76d8a3b2d1a8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1279,9 +1279,11 @@ const char * const vmstat_text[] = {
"pgrefill",
"pgreuse",
+ "pgsteal_kshrinkd",
"pgsteal_kswapd",
"pgsteal_direct",
"pgsteal_khugepaged",
+ "pgscan_kshrinkd",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_khugepaged",
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread