* [RFC v1] mm: add page preemption
@ 2019-10-20 13:43 Hillf Danton
  2019-10-20 15:24 ` Matthew Wilcox
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Hillf Danton @ 2019-10-20 13:43 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, linux-kernel, Michal Hocko, Johannes Weiner,
	Shakeel Butt, Minchan Kim, Mel Gorman, Vladimir Davydov,
	Jan Kara, Hillf Danton


Unlike cpu preemption, page preemption would have been a two-edged
option for quite a while. It is implemented by preventing tasks from
reclaiming as many lru pages as possible from other tasks of
higher priority.

Who needs pp?
Users who want to manage/control jitters in lru pages under memory
pressure. In a way parallel to scheduling taking a task's memory
footprint into account, pp makes task prio a part of page reclaiming.

[Off topic: prio can also be defined and added in the memory controller
and then play a role in memcg reclaiming, for example by checking prio
when selecting the victim memcg.]

First, on the page side, page->prio is added to mirror the prio of
the page owner task, along with a couple of helpers for setting,
copying and comparing page->prio when adding pages to the lru.
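
To make the encoding concrete, here is a sketch (not part of the patch),
assuming the usual MAX_PRIO of 140 and a CFS task at the default prio
of 120:

	set_page_prio(page, 120);	/* stores 120 + MAX_PRIO + 1 = 261 */
	page_prio_valid(page);		/* true, since 261 > MAX_PRIO */
	page_prio(page);		/* recovers 261 - MAX_PRIO - 1 = 120 */

Because set_page_prio() writes only while page_prio_valid() is false,
the value stored by the first caller sticks and is later only
propagated via copy_page_prio().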

Then, on the reclaimer side, pgdat->kswapd_prio is added to mirror
the prio of tasks that wake up the kthread, and it is updated
every time kswapd raises its reclaiming priority.

Finally, priorities on both sides are compared when deactivating lru
pages, and a page is skipped if it is higher on prio.

V1 is based on next-20191018.

Changes since v0
- s/page->nice/page->prio/
- drop the role of kswapd's reclaiming priority in prio comparison
- add pgdat->kswapd_prio

Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Hillf Danton <hdanton@sina.com>
---

--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -281,6 +281,15 @@ config VIRT_TO_BUS
 	  should probably not select this.
 
 
+config PAGE_PREEMPTION
+	bool
+	help
+	  This makes a task unable to reclaim as many lru pages as
+	  possible from other tasks of higher priorities.
+
+	  Say N if unsure.
+
+
 config MMU_NOTIFIER
 	bool
 	select SRCU
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -14,6 +14,7 @@
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
+#include <linux/sched/prio.h>
 
 #include <asm/mmu.h>
 
@@ -197,6 +198,10 @@ struct page {
 	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
 	atomic_t _refcount;
 
+#ifdef CONFIG_PAGE_PREEMPTION
+	int prio; /* mirror page owner task->prio */
+#endif
+
 #ifdef CONFIG_MEMCG
 	struct mem_cgroup *mem_cgroup;
 #endif
@@ -232,6 +237,54 @@ struct page {
 #define page_private(page)		((page)->private)
 #define set_page_private(page, v)	((page)->private = (v))
 
+#ifdef CONFIG_PAGE_PREEMPTION
+static inline bool page_prio_valid(struct page *p)
+{
+	return p->prio > MAX_PRIO;
+}
+
+static inline void set_page_prio(struct page *p, int task_prio)
+{
+	/* store page prio low enough to help khugepaged add lru pages */
+	if (!page_prio_valid(p))
+		p->prio = task_prio + MAX_PRIO + 1;
+}
+
+static inline void copy_page_prio(struct page *to, struct page *from)
+{
+	to->prio = from->prio;
+}
+
+static inline int page_prio(struct page *p)
+{
+	return p->prio - MAX_PRIO - 1;
+}
+
+static inline bool page_prio_higher(struct page *p, int prio)
+{
+	return page_prio(p) < prio;
+}
+#else
+static inline bool page_prio_valid(struct page *p)
+{
+	return true;
+}
+static inline void set_page_prio(struct page *p, int task_prio)
+{
+}
+static inline void copy_page_prio(struct page *to, struct page *from)
+{
+}
+static inline int page_prio(struct page *p)
+{
+	return MAX_PRIO + 1;
+}
+static inline bool page_prio_higher(struct page *p, int prio)
+{
+	return false;
+}
+#endif
+
 struct page_frag_cache {
 	void * va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -402,6 +402,7 @@ static void __lru_cache_add(struct page
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 
 	get_page(page);
+	set_page_prio(page, current->prio);
 	if (!pagevec_add(pvec, page) || PageCompound(page))
 		__pagevec_lru_add(pvec);
 	put_cpu_var(lru_add_pvec);
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -647,6 +647,7 @@ void migrate_page_states(struct page *ne
 		end_page_writeback(newpage);
 
 	copy_page_owner(page, newpage);
+	copy_page_prio(newpage, page);
 
 	mem_cgroup_migrate(page, newpage);
 }
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1575,6 +1575,7 @@ static int shmem_replace_page(struct pag
 
 	get_page(newpage);
 	copy_highpage(newpage, oldpage);
+	copy_page_prio(newpage, oldpage);
 	flush_dcache_page(newpage);
 
 	__SetPageLocked(newpage);
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -671,6 +671,7 @@ static void __collapse_huge_page_copy(pt
 			}
 		} else {
 			src_page = pte_page(pteval);
+			copy_page_prio(page, src_page);
 			copy_user_highpage(page, src_page, address, vma);
 			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
 			release_pte_page(src_page);
@@ -1723,6 +1724,7 @@ xa_unlocked:
 				clear_highpage(new_page + (index % HPAGE_PMD_NR));
 				index++;
 			}
+			copy_page_prio(new_page, page);
 			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
 					page);
 			list_del(&page->lru);
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -738,6 +738,9 @@ typedef struct pglist_data {
 	int kswapd_order;
 	enum zone_type kswapd_classzone_idx;
 
+#ifdef CONFIG_PAGE_PREEMPTION
+	int kswapd_prio; /* mirror task->prio waking up kswapd */
+#endif
 	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
 
 #ifdef CONFIG_COMPACTION
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -110,6 +110,10 @@ struct scan_control {
 	/* The highest zone to isolate pages for reclaim from */
 	s8 reclaim_idx;
 
+#ifdef CONFIG_PAGE_PREEMPTION
+	s8 __pad;
+	int reclaimer_prio; /* mirror task->prio */
+#endif
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -1710,11 +1714,18 @@ static unsigned long isolate_lru_pages(u
 		total_scan += nr_pages;
 
 		if (page_zonenum(page) > sc->reclaim_idx) {
+next_page:
 			list_move(&page->lru, &pages_skipped);
 			nr_skipped[page_zonenum(page)] += nr_pages;
 			continue;
 		}
 
+		if (IS_ENABLED(CONFIG_PAGE_PREEMPTION) &&
+		    is_active_lru(lru) &&
+		    global_reclaim(sc) &&
+		    page_prio_higher(page, sc->reclaimer_prio))
+			goto next_page;
+
 		/*
 		 * Do not count skipped pages because that makes the function
 		 * return with no isolated pages if the LRU mostly contains
@@ -3260,6 +3271,9 @@ unsigned long try_to_free_pages(struct z
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+#ifdef CONFIG_PAGE_PREEMPTION
+		.reclaimer_prio = current->prio,
+#endif
 		.gfp_mask = current_gfp_context(gfp_mask),
 		.reclaim_idx = gfp_zone(gfp_mask),
 		.order = order,
@@ -3586,6 +3600,9 @@ static int balance_pgdat(pg_data_t *pgda
 	bool boosted;
 	struct zone *zone;
 	struct scan_control sc = {
+#ifdef CONFIG_PAGE_PREEMPTION
+		.reclaimer_prio = pgdat->kswapd_prio,
+#endif
 		.gfp_mask = GFP_KERNEL,
 		.order = order,
 		.may_unmap = 1,
@@ -3739,6 +3756,9 @@ restart:
 		if (nr_boost_reclaim && !nr_reclaimed)
 			break;
 
+		if (IS_ENABLED(CONFIG_PAGE_PREEMPTION))
+			sc.reclaimer_prio = pgdat->kswapd_prio;
+
 		if (raise_priority || !nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
@@ -3831,6 +3851,10 @@ static void kswapd_try_to_sleep(pg_data_
 		 */
 		wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
 
+		if (IS_ENABLED(CONFIG_PAGE_PREEMPTION)) {
+			pgdat->kswapd_prio = MAX_PRIO + 1;
+			smp_wmb();
+		}
 		remaining = schedule_timeout(HZ/10);
 
 		/*
@@ -3865,8 +3889,13 @@ static void kswapd_try_to_sleep(pg_data_
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
 
-		if (!kthread_should_stop())
+		if (!kthread_should_stop()) {
+			if (IS_ENABLED(CONFIG_PAGE_PREEMPTION)) {
+				pgdat->kswapd_prio = MAX_PRIO + 1;
+				smp_wmb();
+			}
 			schedule();
+		}
 
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
@@ -3917,6 +3946,8 @@ static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
+	if (IS_ENABLED(CONFIG_PAGE_PREEMPTION))
+		pgdat->kswapd_prio = MAX_PRIO + 1;
 	pgdat->kswapd_order = 0;
 	pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
 	for ( ; ; ) {
@@ -3985,6 +4016,17 @@ void wakeup_kswapd(struct zone *zone, gf
 		return;
 	pgdat = zone->zone_pgdat;
 
+	if (IS_ENABLED(CONFIG_PAGE_PREEMPTION)) {
+		int prio = current->prio;
+
+		if (pgdat->kswapd_prio < prio) {
+			smp_rmb();
+			return;
+		}
+		pgdat->kswapd_prio = prio;
+		smp_wmb();
+	}
+
 	if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
 		pgdat->kswapd_classzone_idx = classzone_idx;
 	else
--




* Re: [RFC v1] mm: add page preemption
  2019-10-20 13:43 [RFC v1] mm: add page preemption Hillf Danton
@ 2019-10-20 15:24 ` Matthew Wilcox
  2019-10-21 12:27 ` Michal Hocko
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2019-10-20 15:24 UTC (permalink / raw)
  To: Hillf Danton
  Cc: linux-mm, Andrew Morton, linux-kernel, Michal Hocko,
	Johannes Weiner, Shakeel Butt, Minchan Kim, Mel Gorman,
	Vladimir Davydov, Jan Kara

On Sun, Oct 20, 2019 at 09:43:04PM +0800, Hillf Danton wrote:
> First on the page side, page->prio that is used to mirror the prio
> of page owner tasks is added, and a couple of helpers for setting,
> copying and comparing page->prio to help to add pages to lru.

Um, no.  struct page is 64 bytes and shall remain so without a very very
good reason.

> @@ -197,6 +198,10 @@ struct page {
>  	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
>  	atomic_t _refcount;
>  
> +#ifdef CONFIG_PAGE_PREEMPTION
> +	int prio; /* mirror page owner task->prio */
> +#endif
> +
>  #ifdef CONFIG_MEMCG
>  	struct mem_cgroup *mem_cgroup;
>  #endif


* Re: [RFC v1] mm: add page preemption
  2019-10-20 13:43 [RFC v1] mm: add page preemption Hillf Danton
  2019-10-20 15:24 ` Matthew Wilcox
@ 2019-10-21 12:27 ` Michal Hocko
  2019-10-22  1:41 ` Hillf Danton
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Michal Hocko @ 2019-10-21 12:27 UTC (permalink / raw)
  To: Hillf Danton
  Cc: linux-mm, Andrew Morton, linux-kernel, Johannes Weiner,
	Shakeel Butt, Minchan Kim, Mel Gorman, Vladimir Davydov,
	Jan Kara

On Sun 20-10-19 21:43:04, Hillf Danton wrote:
> 
> Unlike cpu preemption, page preemption would have been a two-edge
> option for quite a while. It is added by preventing tasks from
> reclaiming as many lru pages as possible from other tasks of
> higher priorities.

This really begs for more explanation. I would start with what page
preemption actually is. Why do we care and which workloads would benefit
and how much. And last but not least why the existing infrastructure
doesn't help (e.g. if you have clearly defined workloads with different
memory consumption requirements then why don't you use memory cgroups to
reflect the priority).
 
> Who need pp?
> Users who want to manage/control jitters in lru pages under memory
> pressure. Way in parallel to scheduling with task's memory footprint
> taken into account, pp makes task prio a part of page reclaiming.

How do you assign priority to generally shared pages?

> [Off topic: prio can also be defined and added in memory controller
> and then plays a role in memcg reclaiming, for example check prio
> when selecting victim memcg.]
> 
> First on the page side, page->prio that is used to mirror the prio
> of page owner tasks is added, and a couple of helpers for setting,
> copying and comparing page->prio to help to add pages to lru.
> 
> Then on the reclaimer side, pgdat->kswapd_prio is added to mirror
> the prio of tasks that wake up the kthread, and it is updated
> every time kswapd raises its reclaiming priority.

This sounds like a very bad design to me. You essentially hand over
to a completely detached context while you want to handle priority
inversion problems (or at least that is what I think you want).

> Finally priorities on both sides are compared when deactivating lru
> pages, and skip page if it is higher on prio.
> 
> V1 is based on next-20191018.
> 
> Changes since v0
> - s/page->nice/page->prio/
> - drop the role of kswapd's reclaiming prioirty in prio comparison
> - add pgdat->kswapd_prio
> 
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> Cc: Jan Kara <jack@suse.cz>
> Signed-off-by: Hillf Danton <hdanton@sina.com>
> ---
> 
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -281,6 +281,15 @@ config VIRT_TO_BUS
>  	  should probably not select this.
>  
>  
> +config PAGE_PREEMPTION
> +	bool
> +	help
> +	  This makes a task unable to reclaim as many lru pages as
> +	  possible from other tasks of higher priorities.
> +
> +	  Say N if unsure.
> +
> +
>  config MMU_NOTIFIER
>  	bool
>  	select SRCU
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -14,6 +14,7 @@
>  #include <linux/uprobes.h>
>  #include <linux/page-flags-layout.h>
>  #include <linux/workqueue.h>
> +#include <linux/sched/prio.h>
>  
>  #include <asm/mmu.h>
>  
> @@ -197,6 +198,10 @@ struct page {
>  	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
>  	atomic_t _refcount;
>  
> +#ifdef CONFIG_PAGE_PREEMPTION
> +	int prio; /* mirror page owner task->prio */
> +#endif
> +
>  #ifdef CONFIG_MEMCG
>  	struct mem_cgroup *mem_cgroup;
>  #endif
> @@ -232,6 +237,54 @@ struct page {
>  #define page_private(page)		((page)->private)
>  #define set_page_private(page, v)	((page)->private = (v))
>  
> +#ifdef CONFIG_PAGE_PREEMPTION
> +static inline bool page_prio_valid(struct page *p)
> +{
> +	return p->prio > MAX_PRIO;
> +}
> +
> +static inline void set_page_prio(struct page *p, int task_prio)
> +{
> +	/* store page prio low enough to help khugepaged add lru pages */
> +	if (!page_prio_valid(p))
> +		p->prio = task_prio + MAX_PRIO + 1;
> +}
> +
> +static inline void copy_page_prio(struct page *to, struct page *from)
> +{
> +	to->prio = from->prio;
> +}
> +
> +static inline int page_prio(struct page *p)
> +{
> +	return p->prio - MAX_PRIO - 1;
> +}
> +
> +static inline bool page_prio_higher(struct page *p, int prio)
> +{
> +	return page_prio(p) < prio;
> +}
> +#else
> +static inline bool page_prio_valid(struct page *p)
> +{
> +	return true;
> +}
> +static inline void set_page_prio(struct page *p, int task_prio)
> +{
> +}
> +static inline void copy_page_prio(struct page *to, struct page *from)
> +{
> +}
> +static inline int page_prio(struct page *p)
> +{
> +	return MAX_PRIO + 1;
> +}
> +static inline bool page_prio_higher(struct page *p, int prio)
> +{
> +	return false;
> +}
> +#endif
> +
>  struct page_frag_cache {
>  	void * va;
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -402,6 +402,7 @@ static void __lru_cache_add(struct page
>  	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
>  
>  	get_page(page);
> +	set_page_prio(page, current->prio);
>  	if (!pagevec_add(pvec, page) || PageCompound(page))
>  		__pagevec_lru_add(pvec);
>  	put_cpu_var(lru_add_pvec);
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -647,6 +647,7 @@ void migrate_page_states(struct page *ne
>  		end_page_writeback(newpage);
>  
>  	copy_page_owner(page, newpage);
> +	copy_page_prio(newpage, page);
>  
>  	mem_cgroup_migrate(page, newpage);
>  }
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1575,6 +1575,7 @@ static int shmem_replace_page(struct pag
>  
>  	get_page(newpage);
>  	copy_highpage(newpage, oldpage);
> +	copy_page_prio(newpage, oldpage);
>  	flush_dcache_page(newpage);
>  
>  	__SetPageLocked(newpage);
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -671,6 +671,7 @@ static void __collapse_huge_page_copy(pt
>  			}
>  		} else {
>  			src_page = pte_page(pteval);
> +			copy_page_prio(page, src_page);
>  			copy_user_highpage(page, src_page, address, vma);
>  			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>  			release_pte_page(src_page);
> @@ -1723,6 +1724,7 @@ xa_unlocked:
>  				clear_highpage(new_page + (index % HPAGE_PMD_NR));
>  				index++;
>  			}
> +			copy_page_prio(new_page, page);
>  			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
>  					page);
>  			list_del(&page->lru);
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -738,6 +738,9 @@ typedef struct pglist_data {
>  	int kswapd_order;
>  	enum zone_type kswapd_classzone_idx;
>  
> +#ifdef CONFIG_PAGE_PREEMPTION
> +	int kswapd_prio; /* mirror task->prio waking up kswapd */
> +#endif
>  	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
>  
>  #ifdef CONFIG_COMPACTION
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -110,6 +110,10 @@ struct scan_control {
>  	/* The highest zone to isolate pages for reclaim from */
>  	s8 reclaim_idx;
>  
> +#ifdef CONFIG_PAGE_PREEMPTION
> +	s8 __pad;
> +	int reclaimer_prio; /* mirror task->prio */
> +#endif
>  	/* This context's GFP mask */
>  	gfp_t gfp_mask;
>  
> @@ -1710,11 +1714,18 @@ static unsigned long isolate_lru_pages(u
>  		total_scan += nr_pages;
>  
>  		if (page_zonenum(page) > sc->reclaim_idx) {
> +next_page:
>  			list_move(&page->lru, &pages_skipped);
>  			nr_skipped[page_zonenum(page)] += nr_pages;
>  			continue;
>  		}
>  
> +		if (IS_ENABLED(CONFIG_PAGE_PREEMPTION) &&
> +		    is_active_lru(lru) &&
> +		    global_reclaim(sc) &&
> +		    page_prio_higher(page, sc->reclaimer_prio))
> +			goto next_page;
> +
>  		/*
>  		 * Do not count skipped pages because that makes the function
>  		 * return with no isolated pages if the LRU mostly contains
> @@ -3260,6 +3271,9 @@ unsigned long try_to_free_pages(struct z
>  	unsigned long nr_reclaimed;
>  	struct scan_control sc = {
>  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
> +#ifdef CONFIG_PAGE_PREEMPTION
> +		.reclaimer_prio = current->prio,
> +#endif
>  		.gfp_mask = current_gfp_context(gfp_mask),
>  		.reclaim_idx = gfp_zone(gfp_mask),
>  		.order = order,
> @@ -3586,6 +3600,9 @@ static int balance_pgdat(pg_data_t *pgda
>  	bool boosted;
>  	struct zone *zone;
>  	struct scan_control sc = {
> +#ifdef CONFIG_PAGE_PREEMPTION
> +		.reclaimer_prio = pgdat->kswapd_prio,
> +#endif
>  		.gfp_mask = GFP_KERNEL,
>  		.order = order,
>  		.may_unmap = 1,
> @@ -3739,6 +3756,9 @@ restart:
>  		if (nr_boost_reclaim && !nr_reclaimed)
>  			break;
>  
> +		if (IS_ENABLED(CONFIG_PAGE_PREEMPTION))
> +			sc.reclaimer_prio = pgdat->kswapd_prio;
> +
>  		if (raise_priority || !nr_reclaimed)
>  			sc.priority--;
>  	} while (sc.priority >= 1);
> @@ -3831,6 +3851,10 @@ static void kswapd_try_to_sleep(pg_data_
>  		 */
>  		wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
>  
> +		if (IS_ENABLED(CONFIG_PAGE_PREEMPTION)) {
> +			pgdat->kswapd_prio = MAX_PRIO + 1;
> +			smp_wmb();
> +		}
>  		remaining = schedule_timeout(HZ/10);
>  
>  		/*
> @@ -3865,8 +3889,13 @@ static void kswapd_try_to_sleep(pg_data_
>  		 */
>  		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
>  
> -		if (!kthread_should_stop())
> +		if (!kthread_should_stop()) {
> +			if (IS_ENABLED(CONFIG_PAGE_PREEMPTION)) {
> +				pgdat->kswapd_prio = MAX_PRIO + 1;
> +				smp_wmb();
> +			}
>  			schedule();
> +		}
>  
>  		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
>  	} else {
> @@ -3917,6 +3946,8 @@ static int kswapd(void *p)
>  	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
>  	set_freezable();
>  
> +	if (IS_ENABLED(CONFIG_PAGE_PREEMPTION))
> +		pgdat->kswapd_prio = MAX_PRIO + 1;
>  	pgdat->kswapd_order = 0;
>  	pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
>  	for ( ; ; ) {
> @@ -3985,6 +4016,17 @@ void wakeup_kswapd(struct zone *zone, gf
>  		return;
>  	pgdat = zone->zone_pgdat;
>  
> +	if (IS_ENABLED(CONFIG_PAGE_PREEMPTION)) {
> +		int prio = current->prio;
> +
> +		if (pgdat->kswapd_prio < prio) {
> +			smp_rmb();
> +			return;
> +		}
> +		pgdat->kswapd_prio = prio;
> +		smp_wmb();
> +	}
> +
>  	if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
>  		pgdat->kswapd_classzone_idx = classzone_idx;
>  	else
> --
> 

-- 
Michal Hocko
SUSE Labs


* Re: [RFC v1] mm: add page preemption
  2019-10-20 13:43 [RFC v1] mm: add page preemption Hillf Danton
  2019-10-20 15:24 ` Matthew Wilcox
  2019-10-21 12:27 ` Michal Hocko
@ 2019-10-22  1:41 ` Hillf Danton
  2019-10-22  5:40 ` kbuild test robot
  2019-10-22 12:14 ` Hillf Danton
  4 siblings, 0 replies; 11+ messages in thread
From: Hillf Danton @ 2019-10-22  1:41 UTC (permalink / raw)
  To: Matthew Wilcox, Hillf Danton
  Cc: linux-mm, Andrew Morton, linux-kernel, Michal Hocko,
	Johannes Weiner, Shakeel Butt, Minchan Kim, Mel Gorman,
	Vladimir Davydov, Jan Kara


On Sun, 20 Oct 2019 08:24:00 -0700 Matthew Wilcox wrote:
> On Sun, Oct 20, 2019 at 09:43:04PM +0800, Hillf Danton wrote:
> > First on the page side, page->prio that is used to mirror the prio
> > of page owner tasks is added, and a couple of helpers for setting,
> > copying and comparing page->prio to help to add pages to lru.
> 
> Um, no.  struct page is 64 bytes and shall remain so without a very very
> good reason.

Queued on todo list. Thanks:)

Hillf




* Re: [RFC v1] mm: add page preemption
  2019-10-20 13:43 [RFC v1] mm: add page preemption Hillf Danton
                   ` (2 preceding siblings ...)
  2019-10-22  1:41 ` Hillf Danton
@ 2019-10-22  5:40 ` kbuild test robot
  2019-10-22 12:14 ` Hillf Danton
  4 siblings, 0 replies; 11+ messages in thread
From: kbuild test robot @ 2019-10-22  5:40 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 10948 bytes --]

Hi Hillf,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on mmotm/master]

url:    https://github.com/0day-ci/linux/commits/Hillf-Danton/mm-add-page-preemption/20191022-083630
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: x86_64-randconfig-f003-201942 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-14) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/asm-generic/bug.h:5:0,
                    from arch/x86/include/asm/bug.h:83,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:9,
                    from mm/vmscan.c:17:
   mm/vmscan.c: In function 'isolate_lru_pages':
   mm/vmscan.c:1726:34: error: 'struct scan_control' has no member named 'reclaimer_prio'; did you mean 'reclaim_idx'?
          page_prio_higher(page, sc->reclaimer_prio))
                                     ^
   include/linux/compiler.h:58:52: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                       ^~~~
>> mm/vmscan.c:1723:3: note: in expansion of macro 'if'
      if (IS_ENABLED(CONFIG_PAGE_PREEMPTION) &&
      ^~
   mm/vmscan.c:1726:34: error: 'struct scan_control' has no member named 'reclaimer_prio'; did you mean 'reclaim_idx'?
          page_prio_higher(page, sc->reclaimer_prio))
                                     ^
   include/linux/compiler.h:58:61: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                                ^~~~
>> mm/vmscan.c:1723:3: note: in expansion of macro 'if'
      if (IS_ENABLED(CONFIG_PAGE_PREEMPTION) &&
      ^~
   mm/vmscan.c:1726:34: error: 'struct scan_control' has no member named 'reclaimer_prio'; did you mean 'reclaim_idx'?
          page_prio_higher(page, sc->reclaimer_prio))
                                     ^
   include/linux/compiler.h:69:3: note: in definition of macro '__trace_if_value'
     (cond) ?     \
      ^~~~
   include/linux/compiler.h:56:28: note: in expansion of macro '__trace_if_var'
    #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                               ^~~~~~~~~~~~~~
>> mm/vmscan.c:1723:3: note: in expansion of macro 'if'
      if (IS_ENABLED(CONFIG_PAGE_PREEMPTION) &&
      ^~
   mm/vmscan.c: In function 'balance_pgdat':
   mm/vmscan.c:3759:7: error: 'struct scan_control' has no member named 'reclaimer_prio'; did you mean 'reclaim_idx'?
       sc.reclaimer_prio = pgdat->kswapd_prio;
          ^~~~~~~~~~~~~~
          reclaim_idx
   mm/vmscan.c:3759:31: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
       sc.reclaimer_prio = pgdat->kswapd_prio;
                                  ^~~~~~~~~~~
                                  kswapd_wait
   mm/vmscan.c: In function 'kswapd_try_to_sleep':
   mm/vmscan.c:3854:11: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
       pgdat->kswapd_prio = MAX_PRIO + 1;
              ^~~~~~~~~~~
              kswapd_wait
   mm/vmscan.c:3893:12: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
        pgdat->kswapd_prio = MAX_PRIO + 1;
               ^~~~~~~~~~~
               kswapd_wait
   mm/vmscan.c: In function 'kswapd':
   mm/vmscan.c:3949:10: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
      pgdat->kswapd_prio = MAX_PRIO + 1;
             ^~~~~~~~~~~
             kswapd_wait
   In file included from include/asm-generic/bug.h:5:0,
                    from arch/x86/include/asm/bug.h:83,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:9,
                    from mm/vmscan.c:17:
   mm/vmscan.c: In function 'wakeup_kswapd':
   mm/vmscan.c:4021:14: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
      if (pgdat->kswapd_prio < prio) {
                 ^
   include/linux/compiler.h:58:52: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                       ^~~~
   mm/vmscan.c:4021:3: note: in expansion of macro 'if'
      if (pgdat->kswapd_prio < prio) {
      ^~
   mm/vmscan.c:4021:14: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
      if (pgdat->kswapd_prio < prio) {
                 ^
   include/linux/compiler.h:58:61: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                                ^~~~
   mm/vmscan.c:4021:3: note: in expansion of macro 'if'
      if (pgdat->kswapd_prio < prio) {
      ^~
   mm/vmscan.c:4021:14: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
      if (pgdat->kswapd_prio < prio) {
                 ^
   include/linux/compiler.h:69:3: note: in definition of macro '__trace_if_value'
     (cond) ?     \
      ^~~~
   include/linux/compiler.h:56:28: note: in expansion of macro '__trace_if_var'
    #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                               ^~~~~~~~~~~~~~
   mm/vmscan.c:4021:3: note: in expansion of macro 'if'
      if (pgdat->kswapd_prio < prio) {
      ^~
   mm/vmscan.c:4025:10: error: 'pg_data_t {aka struct pglist_data}' has no member named 'kswapd_prio'; did you mean 'kswapd_wait'?
      pgdat->kswapd_prio = prio;
             ^~~~~~~~~~~
             kswapd_wait

vim +/if +1723 mm/vmscan.c

  1668	
  1669	/**
  1670	 * pgdat->lru_lock is heavily contended.  Some of the functions that
  1671	 * shrink the lists perform better by taking out a batch of pages
  1672	 * and working on them outside the LRU lock.
  1673	 *
  1674	 * For pagecache intensive workloads, this function is the hottest
  1675	 * spot in the kernel (apart from copy_*_user functions).
  1676	 *
  1677	 * Appropriate locks must be held before calling this function.
  1678	 *
  1679	 * @nr_to_scan:	The number of eligible pages to look through on the list.
  1680	 * @lruvec:	The LRU vector to pull pages from.
  1681	 * @dst:	The temp list to put pages on to.
  1682	 * @nr_scanned:	The number of pages that were scanned.
  1683	 * @sc:		The scan_control struct for this reclaim session
  1684	 * @mode:	One of the LRU isolation modes
  1685	 * @lru:	LRU list id for isolating
  1686	 *
  1687	 * returns how many pages were moved onto *@dst.
  1688	 */
  1689	static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  1690			struct lruvec *lruvec, struct list_head *dst,
  1691			unsigned long *nr_scanned, struct scan_control *sc,
  1692			enum lru_list lru)
  1693	{
  1694		struct list_head *src = &lruvec->lists[lru];
  1695		unsigned long nr_taken = 0;
  1696		unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
  1697		unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
  1698		unsigned long skipped = 0;
  1699		unsigned long scan, total_scan, nr_pages;
  1700		LIST_HEAD(pages_skipped);
  1701		isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
  1702	
  1703		total_scan = 0;
  1704		scan = 0;
  1705		while (scan < nr_to_scan && !list_empty(src)) {
  1706			struct page *page;
  1707	
  1708			page = lru_to_page(src);
  1709			prefetchw_prev_lru_page(page, src, flags);
  1710	
  1711			VM_BUG_ON_PAGE(!PageLRU(page), page);
  1712	
  1713			nr_pages = compound_nr(page);
  1714			total_scan += nr_pages;
  1715	
  1716			if (page_zonenum(page) > sc->reclaim_idx) {
  1717	next_page:
  1718				list_move(&page->lru, &pages_skipped);
  1719				nr_skipped[page_zonenum(page)] += nr_pages;
  1720				continue;
  1721			}
  1722	
> 1723			if (IS_ENABLED(CONFIG_PAGE_PREEMPTION) &&
  1724			    is_active_lru(lru) &&
  1725			    global_reclaim(sc) &&
  1726			    page_prio_higher(page, sc->reclaimer_prio))
  1727				goto next_page;
  1728	
  1729			/*
  1730			 * Do not count skipped pages because that makes the function
  1731			 * return with no isolated pages if the LRU mostly contains
  1732			 * ineligible pages.  This causes the VM to not reclaim any
  1733			 * pages, triggering a premature OOM.
  1734			 *
  1735			 * Account all tail pages of THP.  This would not cause
  1736			 * premature OOM since __isolate_lru_page() returns -EBUSY
  1737			 * only when the page is being freed somewhere else.
  1738			 */
  1739			scan += nr_pages;
  1740			switch (__isolate_lru_page(page, mode)) {
  1741			case 0:
  1742				nr_taken += nr_pages;
  1743				nr_zone_taken[page_zonenum(page)] += nr_pages;
  1744				list_move(&page->lru, dst);
  1745				break;
  1746	
  1747			case -EBUSY:
  1748				/* else it is being freed elsewhere */
  1749				list_move(&page->lru, src);
  1750				continue;
  1751	
  1752			default:
  1753				BUG();
  1754			}
  1755		}
  1756	
  1757		/*
  1758		 * Splice any skipped pages to the start of the LRU list. Note that
  1759		 * this disrupts the LRU order when reclaiming for lower zones but
  1760		 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
  1761		 * scanning would soon rescan the same pages to skip and put the
  1762		 * system at risk of premature OOM.
  1763		 */
  1764		if (!list_empty(&pages_skipped)) {
  1765			int zid;
  1766	
  1767			list_splice(&pages_skipped, src);
  1768			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  1769				if (!nr_skipped[zid])
  1770					continue;
  1771	
  1772				__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
  1773				skipped += nr_skipped[zid];
  1774			}
  1775		}
  1776		*nr_scanned = total_scan;
  1777		trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
  1778					    total_scan, skipped, nr_taken, mode, lru);
  1779		update_lru_sizes(lruvec, lru, nr_zone_taken);
  1780		return nr_taken;
  1781	}
  1782	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 32269 bytes --]


* Re: [RFC v1] mm: add page preemption
  2019-10-20 13:43 [RFC v1] mm: add page preemption Hillf Danton
                   ` (3 preceding siblings ...)
  2019-10-22  5:40 ` kbuild test robot
@ 2019-10-22 12:14 ` Hillf Danton
  2019-10-22 12:42   ` Michal Hocko
  2019-10-22 14:28   ` Hillf Danton
  4 siblings, 2 replies; 11+ messages in thread
From: Hillf Danton @ 2019-10-22 12:14 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Hillf Danton, linux-mm, Andrew Morton, linux-kernel,
	Johannes Weiner, Shakeel Butt, Minchan Kim, Mel Gorman,
	Vladimir Davydov, Jan Kara


On Mon, 21 Oct 2019 14:27:28 +0200 Michal Hocko wrote:
> 
> On Sun 20-10-19 21:43:04, Hillf Danton wrote:
> > 
> > Unlike cpu preemption, page preemption would have been a two-edge
> > option for quite a while. It is added by preventing tasks from
> > reclaiming as many lru pages as possible from other tasks of
> > higher priorities.
> 
> This really begs for more explanation.
> I would start with what page preemption actually is.

Page preemption is the system-provided facility that makes a task
able to preempt other tasks of lower priority for pages in the
same manner as for cpu.

> Why do we care and which workloads would benefit and how much.

Page preemption, disabled by default, should be turned on by those
who wish the performance of their workloads to survive memory
pressure to a certain extent.

The number of pp users is supposed to be near the number of people
who change the nice value of their apps either to -1 or higher at
least once a week, fewer than vi users among UK's undergraduates.

> And last but not least why the existing infrastructure doesn't help
> (e.g. if you have clearly defined workloads with different
> memory consumption requirements then why don't you use memory cgroups to
> reflect the priority).

Good question:)

Though pp is implemented by preventing any task from reclaiming as many
pages as possible from other tasks that are higher on priority, it is
trying to introduce prio into page reclaiming, to add a feature.

Page and memcg are different objects after all; pp is being added at
the page granularity. It should be an option available in environments
without memcg enabled.

What is very different from the protections offered by the memory cgroup
is that pages protected by memcg:min/low can't be reclaimed regardless
of memory pressure. Such a guarantee is not available under pp, as it
only suggests an extra factor to consider when deactivating lru pages.

Adding prio in the memory controller is another good topic, already
queued after pp and memcg lru on the todo list.

> > Who need pp?
> > Users who want to manage/control jitters in lru pages under memory
> > pressure. Way in parallel to scheduling with task's memory footprint
> > taken into account, pp makes task prio a part of page reclaiming.
> 
> How do you assign priority to generally shared pages?

It is solved by setting page prio only when pages are added to the lru;
prio will never change thereafter.
There is a helper, copy_page_prio(new_page, old_page), for scenarios like
migrating pages.
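
For a generally shared page, that boils down to the __lru_cache_add()
hunk in the patch, shown here as a sketch: whichever task first puts
the page on an lru stores its prio, and later callers are no-ops since
set_page_prio() writes only while page_prio_valid() is false.

	get_page(page);
	set_page_prio(page, current->prio);	/* first caller wins */
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);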

> > [Off topic: prio can also be defined and added in memory controller
> > and then plays a role in memcg reclaiming, for example check prio
> > when selecting victim memcg.]
> > 
> > First on the page side, page->prio that is used to mirror the prio
> > of page owner tasks is added, and a couple of helpers for setting,
> > copying and comparing page->prio to help to add pages to lru.
> > 
> > Then on the reclaimer side, pgdat->kswapd_prio is added to mirror
> > the prio of tasks that wake up the kthread, and it is updated
> > every time kswapd raises its reclaiming priority.
> 
> This sounds like a very bad design to me. You essentially hand over
> to a completely detached context while you want to handle priority
> inversion problems (or at least that is what I think you want).

What was added on the reclaimer side is as follows (a condensed sketch
appears after the list):

1, kswapd sets pgdat->kswapd_prio, the switch between page reclaimer
   and allocator in terms of prio, to the lowest value before taking
   a nap.

2, any allocator is able to wake up the reclaimer because of the
   lowest prio, and it starts reclaiming pages using the waker's prio.

3, if an allocator comes while kswapd is active, its prio is checked:
   it is a no-op if kswapd is higher on prio; otherwise the switch is
   updated with the higher prio.

4, every time kswapd raises sc.priority, which starts at DEF_PRIORITY,
   it checks whether there is a pending update of the switch; kswapd's
   prio steps up if there is one, so its prio never steps down, nor is
   there prio inversion.

5, goto 1 when kswapd finishes its work.
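
Condensed into a sketch (paraphrasing the mm/vmscan.c hunks in the
patch; a lower numerical value means a higher prio):

	/* kswapd side, before napping (kswapd_try_to_sleep()): */
	pgdat->kswapd_prio = MAX_PRIO + 1;	/* lowest possible prio */
	smp_wmb();
	schedule();

	/* allocator side (wakeup_kswapd()): */
	int prio = current->prio;

	if (pgdat->kswapd_prio < prio)
		return;			/* kswapd already runs at a higher prio */
	pgdat->kswapd_prio = prio;
	smp_wmb();

	/* kswapd side, whenever sc.priority is raised (balance_pgdat()): */
	sc.reclaimer_prio = pgdat->kswapd_prio;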

Thanks
Hillf




* Re: [RFC v1] mm: add page preemption
  2019-10-22 12:14 ` Hillf Danton
@ 2019-10-22 12:42   ` Michal Hocko
  2019-10-22 14:28   ` Hillf Danton
  1 sibling, 0 replies; 11+ messages in thread
From: Michal Hocko @ 2019-10-22 12:42 UTC (permalink / raw)
  To: Hillf Danton
  Cc: linux-mm, Andrew Morton, linux-kernel, Johannes Weiner,
	Shakeel Butt, Minchan Kim, Mel Gorman, Vladimir Davydov,
	Jan Kara

On Tue 22-10-19 20:14:39, Hillf Danton wrote:
> 
> On Mon, 21 Oct 2019 14:27:28 +0200 Michal Hocko wrote:
[...]
> > Why do we care and which workloads would benefit and how much.
> 
> Page preemption, disabled by default, should be turned on by those
> who wish that the performance of their workloads can survive memory
> pressure to certain extent.

I am sorry but this doesn't say anything to me. How come not all
workloads would fit that description?
 
> The number of pp users is supposed near the people who change the
> nice value of their apps either to -1 or higher at least once a week,
> less than vi users among UK's undergraduates.
> 
> > And last but not least why the existing infrastructure doesn't help
> > (e.g. if you have clearly defined workloads with different
> > memory consumption requirements then why don't you use memory cgroups to
> > reflect the priority).
> 
> Good question:)
> 
> Though pp is implemented by preventing any task from reclaiming as many
> pages as possible from other tasks that are higher on priority, it is
> trying to introduce prio into page reclaiming, to add a feature.
> 
> Page and memcg are different objects after all; pp is being added at
> the page granularity. It should be an option available in environments
> without memcg enabled.

So do you actually want to establish LRUs per priority? Why is using
memcgs not an option? This is the main facility to partition reclaimable
memory in the first place. You should really focus on explaining much
more thoroughly why a much more fine grained control is needed.

> What is way different from the protections offered by memory cgroup
> is that pages protected by memcg:min/low can't be reclaimed regardless
> of memory pressure. Such guarantee is not available under pp as it only
> suggests an extra factor to consider on deactivating lru pages.

Well, low limit can be breached if there is no eligible memcg to be
reclaimed. That means that you can shape some sort of priority by
setting the low limit already.

[...]

> What was added on the reclaimer side is
> 
> 1, kswapd sets pgdat->kswapd_prio, the switch between page reclaimer
>    and allocator in terms of prio, to the lowest value before taking
>    a nap.
> 
> 2, any allocator is able to wake up the reclaimer because of the
>    lowest prio, and it starts reclaiming pages using the waker's prio.
> 
> 3, allocator comes while kswapd is active, its prio is checked and
>    no-op if kswapd is higher on prio; otherwise switch is updated
>    with the higher prio.
> 
> 4, every time kswapd raises sc.priority that starts with DEF_PRIORITY,
>    it is checked if there is pending update of switch; and kswapd's
>    prio steps up if there is a pending one, thus its prio never steps
>    down. Nor prio inversion. 
> 
> 5, goto 1 when kswapd finishes its work.

What about the direct reclaim? What if pages of a lower priority are
hard to reclaim? Do you want a process of a higher priority to stall
more just because it has to wait for those lower priority pages?
-- 
Michal Hocko
SUSE Labs


* Re: [RFC v1] mm: add page preemption
  2019-10-22 12:14 ` Hillf Danton
  2019-10-22 12:42   ` Michal Hocko
@ 2019-10-22 14:28   ` Hillf Danton
  2019-10-23  8:17     ` Michal Hocko
  2019-10-23 11:53     ` Hillf Danton
  1 sibling, 2 replies; 11+ messages in thread
From: Hillf Danton @ 2019-10-22 14:28 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Hillf Danton, linux-mm, Andrew Morton, linux-kernel,
	Johannes Weiner, Shakeel Butt, Minchan Kim, Mel Gorman,
	Vladimir Davydov, Jan Kara


On Tue, 22 Oct 2019 14:42:41 +0200 Michal Hocko wrote:
> 
> On Tue 22-10-19 20:14:39, Hillf Danton wrote:
> > 
> > On Mon, 21 Oct 2019 14:27:28 +0200 Michal Hocko wrote:
> [...]
> > > Why do we care and which workloads would benefit and how much.
> > 
> > Page preemption, disabled by default, should be turned on by those
> > who wish that the performance of their workloads can survive memory
> > pressure to certain extent.
> 
> I am sorry but this doesn't say anything to me. How come not all
> workloads would fit that description?

That means pp plays a role when kswapd becomes active, and it may
prevent too much jitter in active lru pages.
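
Concretely, the page skipping itself comes down to the guard added to
isolate_lru_pages() in the patch:

	if (IS_ENABLED(CONFIG_PAGE_PREEMPTION) &&
	    is_active_lru(lru) &&	/* only the active lists */
	    global_reclaim(sc) &&	/* not memcg-targeted reclaim */
	    page_prio_higher(page, sc->reclaimer_prio))
		goto next_page;		/* skip, like an ineligible-zone page */

so pages of a higher-prio owner are only ever skipped and spliced back
onto their lru; the scanner otherwise works as before.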

> > The number of pp users is supposed near the people who change the
> > nice value of their apps either to -1 or higher at least once a week,
> > less than vi users among UK's undergraduates.
> > 
> > > And last but not least why the existing infrastructure doesn't help
> > > (e.g. if you have clearly defined workloads with different
> > > memory consumption requirements then why don't you use memory cgroups to
> > > reflect the priority).
> > 
> > Good question:)
> > 
> > Though pp is implemented by preventing any task from reclaiming as many
> > pages as possible from other tasks that are higher on priority, it is
> > trying to introduce prio into page reclaiming, to add a feature.
> > 
> > Page and memcg are different objects after all; pp is being added at
> > the page granularity. It should be an option available in environments
> > without memcg enabled.
> 
> So do you actually want to establish LRUs per priority?

No, nothing was changed other than adding the prio for every lru page. An
LRU per prio is too much to implement.

> Why using memcgs is not an option?

I have a plan to add prio in memcg. As you see, I sent an rfc before v0 with
nice added in memcg, and realised a couple of days ago that its dependence on
soft limit reclaim is not acceptable.

But we can't do that without determining how to define memcg's prio.
What I have in mind now is the highest (or lowest) prio of the tasks in a
memcg, with a knob offered to userspace.

If you like, I want to have a talk about it sometime later.

> This is the main facility to partition reclaimable
> memory in the first place. You should really focus on explaining on why
> a much more fine grained control is needed much more thoroughly.
> 
> > What is way different from the protections offered by memory cgroup
> > is that pages protected by memcg:min/low can't be reclaimed regardless
> > of memory pressure. Such guarantee is not available under pp as it only
> > suggests an extra factor to consider on deactivating lru pages.
> 
> Well, low limit can be breached if there is no eliglible memcg to be
> reclaimed. That means that you can shape some sort of priority by
> setting the low limit already.
> 
> [...]
> 
> > What was added on the reclaimer side is
> > 
> > 1, kswapd sets pgdat->kswapd_prio, the switch between page reclaimer
> >    and allocator in terms of prio, to the lowest value before taking
> >    a nap.
> > 
> > 2, any allocator is able to wake up the reclaimer because of the
> >    lowest prio, and it starts reclaiming pages using the waker's prio.
> > 
> > 3, allocator comes while kswapd is active, its prio is checked and
> >    no-op if kswapd is higher on prio; otherwise switch is updated
> >    with the higher prio.
> > 
> > 4, every time kswapd raises sc.priority that starts with DEF_PRIORITY,
> >    it is checked if there is pending update of switch; and kswapd's
> >    prio steps up if there is a pending one, thus its prio never steps
> >    down. Nor prio inversion. 
> > 
> > 5, goto 1 when kswapd finishes its work.
> 
> What about the direct reclaim?

Their prio will not change before reclaiming finishes, so leave it be.

> What if pages of a lower priority are
> hard to reclaim? Do you want a process of a higher priority stall more
> just because it has to wait for those lower priority pages?

The problems above are not introduced by pp; let Mr. Kswapd take care of
them.

(It is 22:23 local time; let's continue after a 7-hour sleep. Good night.)

Thanks
Hillf




* Re: [RFC v1] mm: add page preemption
  2019-10-22 14:28   ` Hillf Danton
@ 2019-10-23  8:17     ` Michal Hocko
  2019-10-23 11:53     ` Hillf Danton
  1 sibling, 0 replies; 11+ messages in thread
From: Michal Hocko @ 2019-10-23  8:17 UTC (permalink / raw)
  To: Hillf Danton
  Cc: linux-mm, Andrew Morton, linux-kernel, Johannes Weiner,
	Shakeel Butt, Minchan Kim, Mel Gorman, Vladimir Davydov,
	Jan Kara

On Tue 22-10-19 22:28:02, Hillf Danton wrote:
> 
> On Tue, 22 Oct 2019 14:42:41 +0200 Michal Hocko wrote:
> > 
> > On Tue 22-10-19 20:14:39, Hillf Danton wrote:
> > > 
> > > On Mon, 21 Oct 2019 14:27:28 +0200 Michal Hocko wrote:
> > [...]
> > > > Why do we care and which workloads would benefit and how much.
> > > 
> > > Page preemption, disabled by default, should be turned on by those
> > > who wish that the performance of their workloads can survive memory
> > > pressure to certain extent.
> > 
> > I am sorry but this doesn't say anything to me. How come not all
> > workloads would fit that description?
> 
> That means pp plays a role when kswapd becomes active, and it may
> prevent too much jitters in active lru pages.

This is still too vague to be useful in any way.

> > > The number of pp users is supposed near the people who change the
> > > nice value of their apps either to -1 or higher at least once a week,
> > > less than vi users among UK's undergraduates.
> > > 
> > > > And last but not least why the existing infrastructure doesn't help
> > > > (e.g. if you have clearly defined workloads with different
> > > > memory consumption requirements then why don't you use memory cgroups to
> > > > reflect the priority).
> > > 
> > > Good question:)
> > > 
> > > Though pp is implemented by preventing any task from reclaiming as many
> > > pages as possible from other tasks that are higher on priority, it is
> > > trying to introduce prio into page reclaiming, to add a feature.
> > > 
> > > Page and memcg are different objects after all; pp is being added at
> > > the page granularity. It should be an option available in environments
> > > without memcg enabled.
> > 
> > So do you actually want to establish LRUs per priority?
> 
> No, no change other than the prio for every lru page was added. LRU per prio
> is too much to implement.

Well, considering that per-page priority is a no-go, as already pointed
out by Willy, you do not have any other choice, right?

> > Why using memcgs is not an option?
> 
> I have plan to add prio in memcg. As you see, I sent a rfc before v0 with
> nice added in memcg, and realised a couple days ago that its dependence on
> soft limit reclaim is not acceptable.
> 
> But we can't do that without determining how to define memcg's prio.
> What is in mind now is the highest (or lowest) prio of tasks in a memcg
> with a knob offered to userspace.
> 
> If you like, I want to have a talk about it sometime later.

This doesn't really answer my question. Why can't you use memcgs as
they are now? Why exactly do you need a fixed priority?

> > This is the main facility to partition reclaimable
> > memory in the first place. You should really focus on explaining on why
> > a much more fine grained control is needed much more thoroughly.
> > 
> > > What is way different from the protections offered by memory cgroup
> > > is that pages protected by memcg:min/low can't be reclaimed regardless
> > > of memory pressure. Such guarantee is not available under pp as it only
> > > suggests an extra factor to consider on deactivating lru pages.
> > 
> > Well, low limit can be breached if there is no eliglible memcg to be
> > reclaimed. That means that you can shape some sort of priority by
> > setting the low limit already.
> > 
> > [...]
> > 
> > > What was added on the reclaimer side is
> > > 
> > > 1, kswapd sets pgdat->kswapd_prio, the switch between page reclaimer
> > >    and allocator in terms of prio, to the lowest value before taking
> > >    a nap.
> > > 
> > > 2, any allocator is able to wake up the reclaimer because of the
> > >    lowest prio, and it starts reclaiming pages using the waker's prio.
> > > 
> > > 3, allocator comes while kswapd is active, its prio is checked and
> > >    no-op if kswapd is higher on prio; otherwise switch is updated
> > >    with the higher prio.
> > > 
> > > 4, every time kswapd raises sc.priority that starts with DEF_PRIORITY,
> > >    it is checked if there is pending update of switch; and kswapd's
> > >    prio steps up if there is a pending one, thus its prio never steps
> > >    down. Nor prio inversion. 
> > > 
> > > 5, goto 1 when kswapd finishes its work.
> > 
> > What about the direct reclaim?
> 
> Their prio will not change before reclaiming finishes, so leave it be.

This doesn't answer my question.

> > What if pages of a lower priority are
> > hard to reclaim? Do you want a process of a higher priority stall more
> > just because it has to wait for those lower priority pages?
> 
> The problems above are not introduced by pp, let Mr. Kswapd take care of
> them.

No, this is not an answer.

-- 
Michal Hocko
SUSE Labs


* Re: [RFC v1] mm: add page preemption
  2019-10-22 14:28   ` Hillf Danton
  2019-10-23  8:17     ` Michal Hocko
@ 2019-10-23 11:53     ` Hillf Danton
  2019-10-23 12:04       ` Michal Hocko
  1 sibling, 1 reply; 11+ messages in thread
From: Hillf Danton @ 2019-10-23 11:53 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Hillf Danton, linux-mm, Andrew Morton, linux-kernel,
	Johannes Weiner, Shakeel Butt, Minchan Kim, Mel Gorman,
	Vladimir Davydov, Jan Kara


On Wed, 23 Oct 2019 10:17:29 +0200 Michal Hocko wrote:
> 
> On Tue 22-10-19 22:28:02, Hillf Danton wrote:
> > 
> > On Tue, 22 Oct 2019 14:42:41 +0200 Michal Hocko wrote:
> > > 
> > > On Tue 22-10-19 20:14:39, Hillf Danton wrote:
> > > > 
> > > > On Mon, 21 Oct 2019 14:27:28 +0200 Michal Hocko wrote:
> > > [...]
> > > > > Why do we care and which workloads would benefit and how much.
> > > > 
> > > > Page preemption, disabled by default, should be turned on by those
> > > > who wish that the performance of their workloads can survive memory
> > > > pressure to certain extent.
> > > 
> > > I am sorry but this doesn't say anything to me. How come not all
> > > workloads would fit that description?
> > 
> > That means pp plays a role when kswapd becomes active, and it may
> > prevent too much jitters in active lru pages.
> 
> This is still too vague to be useful in any way.

Page preemption is designed to function only under memory pressure, by
suggesting that kswapd skip deactivating some pages based on a prio
comparison. By design, no page will be skipped unless a difference in
prio is found.
That said, no workload can be picked out before prio is updated, so let
users who know that their workloads are sensitive to jitters in lru pages
change the nice.
We are simply adding the pp feature; users are responsible for turning pp
on and changing nice if they feel it necessary.

> > > > The number of pp users is supposed near the people who change the
> > > > nice value of their apps either to -1 or higher at least once a week,
> > > > less than vi users among UK's undergraduates.
> > > > 
> > > > > And last but not least why the existing infrastructure doesn't help
> > > > > (e.g. if you have clearly defined workloads with different
> > > > > memory consumption requirements then why don't you use memory cgroups to
> > > > > reflect the priority).
> > > > 
> > > > Good question:)
> > > > 
> > > > Though pp is implemented by preventing any task from reclaiming as many
> > > > pages as possible from other tasks that are higher on priority, it is
> > > > trying to introduce prio into page reclaiming, to add a feature.
> > > > 
> > > > Page and memcg are different objects after all; pp is being added at
> > > > the page granularity. It should be an option available in environments
> > > > without memcg enabled.
> > > 
> > > So do you actually want to establish LRUs per priority?
> > 
> > No, no change other than the prio for every lru page was added. LRU per prio
> > is too much to implement.
> 
> Well, considering that per page priority is a no go as already pointed
> out by Willy then you do not have other choice right?

No need to seek another choice because of the prio introduced to reclaiming,
as by design no one is hurt unless pp is enabled and prio is updated.

> > > Why using memcgs is not an option?
> > 
> > I have plan to add prio in memcg. As you see, I sent a rfc before v0 with
> > nice added in memcg, and realised a couple days ago that its dependence on
> > soft limit reclaim is not acceptable.
> > 
> > But we can't do that without determining how to define memcg's prio.
> > What is in mind now is the highest (or lowest) prio of tasks in a memcg
> > with a knob offered to userspace.
> > 
> > If you like, I want to have a talk about it sometime later.
> 
> This doesn't really answer my question.
> Why cannot you use memcgs as they are now.

No prio provided.

> Why exactly do you need a fixed priority?

Prio comparison in global reclaim is what was added. The fact that every
task has a prio makes that comparison possible.

> > > This is the main facility to partition reclaimable
> > > memory in the first place.

Is every task (pid != 1) contained in a memcg? And why?

> > > You should really focus on explaining on why
> > > a much more fine grained control is needed much more thoroughly.

Which do you prefer, cello or fiddle? And why?

> > > > What is way different from the protections offered by memory cgroup
> > > > is that pages protected by memcg:min/low can't be reclaimed regardless
> > > > of memory pressure. Such guarantee is not available under pp as it only
> > > > suggests an extra factor to consider on deactivating lru pages.
> > > 
> > > Well, low limit can be breached if there is no eliglible memcg to be
> > > reclaimed. That means that you can shape some sort of priority by
> > > setting the low limit already.
> > > 
> > > [...]
> > > 
> > > > What was added on the reclaimer side is
> > > > 
> > > > 1, kswapd sets pgdat->kswapd_prio, the switch between page reclaimer
> > > >    and allocator in terms of prio, to the lowest value before taking
> > > >    a nap.
> > > > 
> > > > 2, any allocator is able to wake up the reclaimer because of the
> > > >    lowest prio, and it starts reclaiming pages using the waker's prio.
> > > > 
> > > > 3, allocator comes while kswapd is active, its prio is checked and
> > > >    no-op if kswapd is higher on prio; otherwise switch is updated
> > > >    with the higher prio.
> > > > 
> > > > 4, every time kswapd raises sc.priority that starts with DEF_PRIORITY,
> > > >    it is checked if there is pending update of switch; and kswapd's
> > > >    prio steps up if there is a pending one, thus its prio never steps
> > > >    down. Nor prio inversion. 
> > > > 
> > > > 5, goto 1 when kswapd finishes its work.
> > > 
> > > What about the direct reclaim?
> > 
> > Their prio will not change before reclaiming finishes, so leave it be.
> 
> This doesn't answer my question.

No prio inversion in direct reclaim, if that is what you mean.

> > > What if pages of a lower priority are
> > > hard to reclaim? Do you want a process of a higher priority stall more
> > > just because it has to wait for those lower priority pages?
> > 
> > The problems above are not introduced by pp, let Mr. Kswapd take care of
> > them.
> 
> No, this is not an answer.

Is pp making them worse?

Thanks
Hillf




* Re: [RFC v1] mm: add page preemption
  2019-10-23 11:53     ` Hillf Danton
@ 2019-10-23 12:04       ` Michal Hocko
  0 siblings, 0 replies; 11+ messages in thread
From: Michal Hocko @ 2019-10-23 12:04 UTC (permalink / raw)
  To: Hillf Danton
  Cc: linux-mm, Andrew Morton, linux-kernel, Johannes Weiner,
	Shakeel Butt, Minchan Kim, Mel Gorman, Vladimir Davydov,
	Jan Kara

On Wed 23-10-19 19:53:50, Hillf Danton wrote:
> 
> On Wed, 23 Oct 2019 10:17:29 +0200 Michal Hocko wrote:
[...]
> > This doesn't really answer my question.
> > Why cannot you use memcgs as they are now.
> 
> No prio provided.
> 
> > Why exactly do you need a fixed priority?
> 
> Prio comparison in global reclaim is what was added. Because every task has
> prio makes that comparison possible.

That still doesn't answer the question because it doesn't explain why
the priority is really necessary. I am sorry, but I have more important
things to deal with than asking the same question again and again.
-- 
Michal Hocko
SUSE Labs


Thread overview: 11+ messages
2019-10-20 13:43 [RFC v1] mm: add page preemption Hillf Danton
2019-10-20 15:24 ` Matthew Wilcox
2019-10-21 12:27 ` Michal Hocko
2019-10-22  1:41 ` Hillf Danton
2019-10-22  5:40 ` kbuild test robot
2019-10-22 12:14 ` Hillf Danton
2019-10-22 12:42   ` Michal Hocko
2019-10-22 14:28   ` Hillf Danton
2019-10-23  8:17     ` Michal Hocko
2019-10-23 11:53     ` Hillf Danton
2019-10-23 12:04       ` Michal Hocko
