* [patch 5/5]thp: split huge page if head page is isolated
@ 2011-10-25  2:59 ` Shaohua Li
  0 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-10-25  2:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: aarcange, Hugh Dickins, Rik van Riel, mel, KAMEZAWA Hiroyuki,
	Minchan Kim, linux-mm, lkml

With the current logic, if page reclaim finds a huge page, it reclaims only
the head page and leaves the tail pages to be reclaimed later. Take an
example where the LRU list has pages A and B, and page A is a huge page:
1. page A is isolated
2. page B is isolated
3. shrink_page_list() adds page A to the swap cache, so page A is split.
Pages A+1, A+2, ... are added to the LRU list.
4. shrink_page_list() adds page B to the swap cache.
5. pages A and B are written out and reclaimed.
6. pages A+1, A+2, ... are isolated and reclaimed later.
So the reclaim order is A, B, ... (maybe other pages), A+1, A+2, ...

We expect the whole huge page A to be reclaimed at the same time, so that
the order is A, A+1, ..., A+HPAGE_PMD_NR-1, B, ....

With this patch, the huge page is split just after the head page is isolated
from the inactive LRU list, so the tail pages are reclaimed immediately.

In a test, a range of anonymous memory is written to trigger swap (a sketch
of such a test is shown below, after the vmstat numbers).
Without the patch:
#cat /proc/vmstat|grep thp
thp_fault_alloc 451
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 238

With the patch:
#cat /proc/vmstat|grep thp
thp_fault_alloc 450
thp_fault_fallback 1
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 103

So the thp_split number is reduced a lot, though there is one extra
thp_fault_fallback.
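
The test is essentially of the following form (a minimal sketch under stated
assumptions: the mapping size and the explicit MADV_HUGEPAGE hint are
illustrative, not the exact test program used above):

#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* map anonymous memory, sized to exceed free RAM so that writing
	 * it forces the kernel to swap (size is illustrative) */
	size_t len = 4UL << 30;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* hint that the range should be backed by transparent huge pages */
	madvise(p, len, MADV_HUGEPAGE);
	/* touch every byte; under memory pressure this triggers swap-out */
	memset(p, 1, len);
	return 0;
}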

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 include/linux/memcontrol.h |    3 +-
 mm/memcontrol.c            |   12 +++++++++--
 mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 50 insertions(+), 14 deletions(-)

Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c	2011-10-25 08:36:08.000000000 +0800
+++ linux/mm/vmscan.c	2011-10-25 09:51:44.000000000 +0800
@@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode, int file)
+		unsigned long *scanned, int order, int mode, int file,
+		struct page **split_page)
 {
 	unsigned long nr_taken = 0;
 	unsigned long nr_lumpy_taken = 0;
@@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
 		case 0:
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
-			nr_taken += hpage_nr_pages(page);
+			if (PageTransHuge(page) && split_page) {
+				nr_taken++;
+				*split_page = page;
+				goto out;
+			} else
+				nr_taken += hpage_nr_pages(page);
 			break;
 
 		case -EBUSY:
@@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
 			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
 				list_move(&cursor_page->lru, dst);
 				mem_cgroup_del_lru(cursor_page);
-				nr_taken += hpage_nr_pages(page);
 				nr_lumpy_taken++;
 				if (PageDirty(cursor_page))
 					nr_lumpy_dirty++;
 				scan++;
+				if (PageTransHuge(page) && split_page) {
+					nr_taken++;
+					*split_page = page;
+					goto out;
+				} else
+					nr_taken += hpage_nr_pages(page);
 			} else {
 				/*
 				 * Check if the page is freed already.
@@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
 			nr_lumpy_failed++;
 	}
 
+out:
 	*scanned = scan;
 
 	trace_mm_vmscan_lru_isolate(order,
@@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
 					struct list_head *dst,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
-					int active, int file)
+					int active, int file,
+					struct page **split_page)
 {
 	int lru = LRU_BASE;
 	if (active)
@@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
 	if (file)
 		lru += LRU_FILE;
 	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
-								mode, file);
+							mode, file, split_page);
 }
 
 /*
@@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
 {
 	LIST_HEAD(page_list);
 	unsigned long nr_scanned;
+	unsigned long total_scanned = 0;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_taken;
 	unsigned long nr_anon;
 	unsigned long nr_file;
+	struct page *split_page;
 
 	while (unlikely(too_many_isolated(zone, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
 	}
 
 	set_reclaim_mode(priority, sc, false);
+again:
 	lru_add_drain();
+	split_page = NULL;
 	spin_lock_irq(&zone->lru_lock);
 
 	if (scanning_global_lru(sc)) {
-		nr_taken = isolate_pages_global(nr_to_scan,
+		nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
 			&page_list, &nr_scanned, sc->order,
 			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
 					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, 0, file);
+			zone, 0, file, &split_page);
 		zone->pages_scanned += nr_scanned;
+		total_scanned += nr_scanned;
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
 					       nr_scanned);
@@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
 			__count_zone_vm_events(PGSCAN_DIRECT, zone,
 					       nr_scanned);
 	} else {
-		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+		nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
 			&page_list, &nr_scanned, sc->order,
 			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
 					ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, sc->mem_cgroup,
-			0, file);
+			0, file, &split_page);
+		total_scanned += nr_scanned;
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
@@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
 		spin_unlock_irq(&zone->lru_lock);
 		return 0;
 	}
+	if (split_page && total_scanned < nr_to_scan) {
+		spin_unlock_irq(&zone->lru_lock);
+		split_huge_page(split_page);
+		goto again;
+	}
 
 	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
 
 	spin_unlock_irq(&zone->lru_lock);
 
+	if (split_page)
+		split_huge_page(split_page);
+
 	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
 	/* Check if we should syncronously wait for writeback */
@@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
 		nr_taken = isolate_pages_global(nr_pages, &l_hold,
 						&pgscanned, sc->order,
 						ISOLATE_ACTIVE, zone,
-						1, file);
+						1, file, NULL);
 		zone->pages_scanned += pgscanned;
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
 						&pgscanned, sc->order,
 						ISOLATE_ACTIVE, zone,
-						sc->mem_cgroup, 1, file);
+						sc->mem_cgroup, 1, file, NULL);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
Index: linux/mm/memcontrol.c
===================================================================
--- linux.orig/mm/memcontrol.c	2011-10-25 08:36:08.000000000 +0800
+++ linux/mm/memcontrol.c	2011-10-25 09:33:51.000000000 +0800
@@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active, int file)
+					int active, int file,
+					struct page **split_page)
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
@@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
 		case 0:
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
-			nr_taken += hpage_nr_pages(page);
+			if (PageTransHuge(page) && split_page) {
+				nr_taken++;
+				*split_page = page;
+				goto out;
+			} else
+				nr_taken += hpage_nr_pages(page);
+
 			break;
 		case -EBUSY:
 			/* we don't affect global LRU but rotate in our LRU */
@@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
 		}
 	}
 
+out:
 	*scanned = scan;
 
 	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
Index: linux/include/linux/memcontrol.h
===================================================================
--- linux.orig/include/linux/memcontrol.h	2011-10-25 08:36:08.000000000 +0800
+++ linux/include/linux/memcontrol.h	2011-10-25 09:33:51.000000000 +0800
@@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active, int file);
+					int active, int file,
+					struct page **split_page);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /*



* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-25  2:59 ` Shaohua Li
@ 2011-10-27 23:34   ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-10-27 23:34 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> With current logic, if page reclaim finds a huge page, it will just reclaim
> the head page and leave tail pages reclaimed later. Let's take an example,
> lru list has page A and B, page A is huge page:
> 1. page A is isolated
> 2. page B is isolated
> 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> page A+1, page A+2, ... are added to lru list.
> 4. shrink_page_list() adds page B to swap page cache.
> 5. page A and B is written out and reclaimed.
> 6. page A+1, A+2 ... is isolated and reclaimed later.
> So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...

I haven't looked at your code yet, but I have a question.
You mitigate this problem with 4/5, which can add the subpages to the LRU
tail, so the subpages would be reclaimed in the next iteration of reclaim.

Why do we need 5/5?
Am I missing something?

> 
> We expected the whole huge page A is reclaimed in the meantime, so
> the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> 
> With this patch, we do huge page split just after the head page is isolated
> for inactive lru list, so the tail pages will be reclaimed immediately.
> 
> In a test, a range of anonymous memory is written and will trigger swap.
> Without the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 451
> thp_fault_fallback 0
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 238
> 
> With the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 450
> thp_fault_fallback 1
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 103
> 
> So the thp_split number is reduced a lot, though there is one extra
> thp_fault_fallback.

Wow, the result looks good.
Is this the result of 5/5 alone, or of both 4/5 and 5/5?

-- 
Kind regards,
Minchan Kim

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-27 23:34   ` Minchan Kim
@ 2011-10-28  5:11     ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-10-28  5:11 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Fri, 2011-10-28 at 07:34 +0800, Minchan Kim wrote:
> On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > With current logic, if page reclaim finds a huge page, it will just reclaim
> > the head page and leave tail pages reclaimed later. Let's take an example,
> > lru list has page A and B, page A is huge page:
> > 1. page A is isolated
> > 2. page B is isolated
> > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > page A+1, page A+2, ... are added to lru list.
> > 4. shrink_page_list() adds page B to swap page cache.
> > 5. page A and B is written out and reclaimed.
> > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> 
> I don't see your code yet but have a question.
> You mitigate this problem by 4/5 which could add subpages into lru tail
> so subpages would reclaim next interation of reclaim.
> 
> What do we need 5/5?
> Do I miss something?
Both patches are required. Without this patch, the current page reclaim will
only reclaim the first page of a huge page, because the huge page isn't
split yet. The huge page is only split when the first page is being written
to swap, which is too late: by then page reclaim may already have isolated a
lot of pages.
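
For reference, the late split being referred to is the one in add_to_swap();
the relevant snippet in mm/swap_state.c looks roughly like this (an excerpt,
not the full function):

	/* the THP is only split here, when its head page is about to
	 * enter the swap cache -- long after the page was isolated */
	if (unlikely(PageTransHuge(page)))
		if (unlikely(split_huge_page(page))) {
			swapcache_free(entry, NULL);
			return 0;
		}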
 
> > We expected the whole huge page A is reclaimed in the meantime, so
> > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > 
> > With this patch, we do huge page split just after the head page is isolated
> > for inactive lru list, so the tail pages will be reclaimed immediately.
> > 
> > In a test, a range of anonymous memory is written and will trigger swap.
> > Without the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 451
> > thp_fault_fallback 0
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 238
> > 
> > With the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 450
> > thp_fault_fallback 1
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 103
> > 
> > So the thp_split number is reduced a lot, though there is one extra
> > thp_fault_fallback.
> 
> Wow. The result seems to be good.
> Is it result of effect only 5/5? or both 4/5 and 5/5?
Both are required.


* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-28  5:11     ` Shaohua Li
@ 2011-10-28  7:30       ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-10-28  7:30 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Fri, Oct 28, 2011 at 01:11:55PM +0800, Shaohua Li wrote:
> On Fri, 2011-10-28 at 07:34 +0800, Minchan Kim wrote:
> > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > lru list has page A and B, page A is huge page:
> > > 1. page A is isolated
> > > 2. page B is isolated
> > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > page A+1, page A+2, ... are added to lru list.
> > > 4. shrink_page_list() adds page B to swap page cache.
> > > 5. page A and B is written out and reclaimed.
> > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > 
> > I don't see your code yet but have a question.
> > You mitigate this problem by 4/5 which could add subpages into lru tail
> > so subpages would reclaim next interation of reclaim.
> > 
> > What do we need 5/5?
> > Do I miss something?
> Both patches are required. without this patch, current page reclaim will
> only reclaim the first page of a huge page, because the hugepage isn't
> split yet. The hugepage is split when the first page is being written to
> swap, which is too later and page reclaim might already isolated a lot
> of pages.

When the split happens, the subpages would be placed at the tail of the LRU
by your 4/5 (assuming the tail of the LRU is the old end).
In addition, isolation happens in 32-page chunks, so the subpages would be
isolated and reclaimed in the next iteration. I think 32 pages are not too
many.

What do you think about it?
-- 
Kind regards,
Minchan Kim

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-28  7:30       ` Minchan Kim
@ 2011-10-28  8:25         ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-10-28  8:25 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Fri, 2011-10-28 at 15:30 +0800, Minchan Kim wrote:
> On Fri, Oct 28, 2011 at 01:11:55PM +0800, Shaohua Li wrote:
> > On Fri, 2011-10-28 at 07:34 +0800, Minchan Kim wrote:
> > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > lru list has page A and B, page A is huge page:
> > > > 1. page A is isolated
> > > > 2. page B is isolated
> > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > page A+1, page A+2, ... are added to lru list.
> > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > 5. page A and B is written out and reclaimed.
> > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > 
> > > I don't see your code yet but have a question.
> > > You mitigate this problem by 4/5 which could add subpages into lru tail
> > > so subpages would reclaim next interation of reclaim.
> > > 
> > > What do we need 5/5?
> > > Do I miss something?
> > Both patches are required. without this patch, current page reclaim will
> > only reclaim the first page of a huge page, because the hugepage isn't
> > split yet. The hugepage is split when the first page is being written to
> > swap, which is too later and page reclaim might already isolated a lot
> > of pages.
> 
> When split happens, subpages would be located in tail of LRU by your 4/5.
> (Assume tail of LRU is old age).
Yes, but a lot of other pages are already isolated, and we will reclaim
those pages first. For example, when reclaiming huge pages A and B, the
current reclaim order is A, B, A+1, ..., B+1, ..., because we isolate A and
B first while none of the tail pages are isolated yet. With my patch, the
order is A, A+1, ..., B, B+1, ..., so we can avoid unnecessary page splits
and page isolations. This is exactly why my patch reduces the thp_split
count.

> In addtion, isolation happens 32 page chunk so the subpages would be isolated
> and reclaimed in next iteration. I think 32 pages are not too many.
> 
> What do you think about it?
Since the head page and the tail pages are on different lists, the 32-page
chunk will not include the tail pages.


* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-28  8:25         ` Shaohua Li
@ 2011-10-28  9:50           ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-10-28  9:50 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Fri, Oct 28, 2011 at 04:25:56PM +0800, Shaohua Li wrote:
> On Fri, 2011-10-28 at 15:30 +0800, Minchan Kim wrote:
> > On Fri, Oct 28, 2011 at 01:11:55PM +0800, Shaohua Li wrote:
> > > On Fri, 2011-10-28 at 07:34 +0800, Minchan Kim wrote:
> > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > lru list has page A and B, page A is huge page:
> > > > > 1. page A is isolated
> > > > > 2. page B is isolated
> > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > page A+1, page A+2, ... are added to lru list.
> > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > 5. page A and B is written out and reclaimed.
> > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > 
> > > > I don't see your code yet but have a question.
> > > > You mitigate this problem by 4/5 which could add subpages into lru tail
> > > > so subpages would reclaim next interation of reclaim.
> > > > 
> > > > What do we need 5/5?
> > > > Do I miss something?
> > > Both patches are required. without this patch, current page reclaim will
> > > only reclaim the first page of a huge page, because the hugepage isn't
> > > split yet. The hugepage is split when the first page is being written to
> > > swap, which is too later and page reclaim might already isolated a lot
> > > of pages.
> > 
> > When split happens, subpages would be located in tail of LRU by your 4/5.
> > (Assume tail of LRU is old age).
> yes, but a lot of other pages already isolated. we will reclaim those
> pages first. for example, reclaim huge page A, B. current reclaim order
> is A, B, A+1, ... B+1, because we will isolated A and B first, all tail
> pages are not isolated yet. While with my patch, the order is A, A
> +1, ... B, B+1,.... with my patch, we can avoid unnecessary page split
> or page isolation. This is exactly why my patch reduces the thp_split
> count.

It's possible, but I doubt how effective it is, because add_to_swap() has an
unlikely hint as follows:

	if (unlikely(PageTransHuge(page)))

I don't mean the unlikely assumption is absolutely right, but at least you
have to convince us that it's wrong.
Personally, I don't want to add more logic and handle THP pages differently
from normal pages unless it's a real concern.

> 
> > In addtion, isolation happens 32 page chunk so the subpages would be isolated
> > and reclaimed in next iteration. I think 32 pages are not too many.
> > 
> > What do you think about it?
> since headpage and tailpages are in different list, the 32 chunk will
> not include tailpages.

Yes, but it would be handled in the next iteration.

-- 
Kind regards,
Minchan Kim

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-25  2:59 ` Shaohua Li
@ 2011-10-29  0:06   ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-10-29  0:06 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> With current logic, if page reclaim finds a huge page, it will just reclaim
> the head page and leave tail pages reclaimed later. Let's take an example,
> lru list has page A and B, page A is huge page:
> 1. page A is isolated
> 2. page B is isolated
> 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> page A+1, page A+2, ... are added to lru list.
> 4. shrink_page_list() adds page B to swap page cache.
> 5. page A and B is written out and reclaimed.
> 6. page A+1, A+2 ... is isolated and reclaimed later.
> So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> 
> We expected the whole huge page A is reclaimed in the meantime, so
> the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> 
> With this patch, we do huge page split just after the head page is isolated
> for inactive lru list, so the tail pages will be reclaimed immediately.
> 
> In a test, a range of anonymous memory is written and will trigger swap.
> Without the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 451
> thp_fault_fallback 0
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 238
> 
> With the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 450
> thp_fault_fallback 1
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 103
> 
> So the thp_split number is reduced a lot, though there is one extra
> thp_fault_fallback.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> ---
>  include/linux/memcontrol.h |    3 +-
>  mm/memcontrol.c            |   12 +++++++++--
>  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
>  3 files changed, 50 insertions(+), 14 deletions(-)
> 
> Index: linux/mm/vmscan.c
> ===================================================================
> --- linux.orig/mm/vmscan.c	2011-10-25 08:36:08.000000000 +0800
> +++ linux/mm/vmscan.c	2011-10-25 09:51:44.000000000 +0800
> @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
>   */
>  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>  		struct list_head *src, struct list_head *dst,
> -		unsigned long *scanned, int order, int mode, int file)
> +		unsigned long *scanned, int order, int mode, int file,
> +		struct page **split_page)
>  {
>  	unsigned long nr_taken = 0;
>  	unsigned long nr_lumpy_taken = 0;
> @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
>  		case 0:
>  			list_move(&page->lru, dst);
>  			mem_cgroup_del_lru(page);
> -			nr_taken += hpage_nr_pages(page);
> +			if (PageTransHuge(page) && split_page) {
> +				nr_taken++;
> +				*split_page = page;
> +				goto out;
> +			} else
> +				nr_taken += hpage_nr_pages(page);
>  			break;
>  
>  		case -EBUSY:
> @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
>  			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
>  				list_move(&cursor_page->lru, dst);
>  				mem_cgroup_del_lru(cursor_page);
> -				nr_taken += hpage_nr_pages(page);
>  				nr_lumpy_taken++;
>  				if (PageDirty(cursor_page))
>  					nr_lumpy_dirty++;
>  				scan++;
> +				if (PageTransHuge(page) && split_page) {
> +					nr_taken++;
> +					*split_page = page;
> +					goto out;
> +				} else
> +					nr_taken += hpage_nr_pages(page);
>  			} else {
>  				/*
>  				 * Check if the page is freed already.
> @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
>  			nr_lumpy_failed++;
>  	}
>  
> +out:
>  	*scanned = scan;
>  
>  	trace_mm_vmscan_lru_isolate(order,
> @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
>  					struct list_head *dst,
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
> -					int active, int file)
> +					int active, int file,
> +					struct page **split_page)
>  {
>  	int lru = LRU_BASE;
>  	if (active)
> @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
>  	if (file)
>  		lru += LRU_FILE;
>  	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> -								mode, file);
> +							mode, file, split_page);
>  }
>  
>  /*
> @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
>  {
>  	LIST_HEAD(page_list);
>  	unsigned long nr_scanned;
> +	unsigned long total_scanned = 0;
>  	unsigned long nr_reclaimed = 0;
>  	unsigned long nr_taken;
>  	unsigned long nr_anon;
>  	unsigned long nr_file;
> +	struct page *split_page;
>  
>  	while (unlikely(too_many_isolated(zone, file, sc))) {
>  		congestion_wait(BLK_RW_ASYNC, HZ/10);
> @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
>  	}
>  
>  	set_reclaim_mode(priority, sc, false);
> +again:
>  	lru_add_drain();
> +	split_page = NULL;
>  	spin_lock_irq(&zone->lru_lock);
>  
>  	if (scanning_global_lru(sc)) {
> -		nr_taken = isolate_pages_global(nr_to_scan,
> +		nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
>  			&page_list, &nr_scanned, sc->order,
>  			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
>  					ISOLATE_BOTH : ISOLATE_INACTIVE,
> -			zone, 0, file);
> +			zone, 0, file, &split_page);
>  		zone->pages_scanned += nr_scanned;
> +		total_scanned += nr_scanned;
>  		if (current_is_kswapd())
>  			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
>  					       nr_scanned);
> @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
>  			__count_zone_vm_events(PGSCAN_DIRECT, zone,
>  					       nr_scanned);
>  	} else {
> -		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> +		nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
>  			&page_list, &nr_scanned, sc->order,
>  			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
>  					ISOLATE_BOTH : ISOLATE_INACTIVE,
>  			zone, sc->mem_cgroup,
> -			0, file);
> +			0, file, &split_page);
> +		total_scanned += nr_scanned;
>  		/*
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
> @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
>  		spin_unlock_irq(&zone->lru_lock);
>  		return 0;
>  	}
> +	if (split_page && total_scanned < nr_to_scan) {
> +		spin_unlock_irq(&zone->lru_lock);
> +		split_huge_page(split_page);
> +		goto again;
> +	}
>  
>  	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
>  
>  	spin_unlock_irq(&zone->lru_lock);
>  
> +	if (split_page)
> +		split_huge_page(split_page);
> +
>  	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
>  
>  	/* Check if we should syncronously wait for writeback */
> @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
>  		nr_taken = isolate_pages_global(nr_pages, &l_hold,
>  						&pgscanned, sc->order,
>  						ISOLATE_ACTIVE, zone,
> -						1, file);
> +						1, file, NULL);
>  		zone->pages_scanned += pgscanned;
>  	} else {
>  		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
>  						&pgscanned, sc->order,
>  						ISOLATE_ACTIVE, zone,
> -						sc->mem_cgroup, 1, file);
> +						sc->mem_cgroup, 1, file, NULL);
>  		/*
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
> Index: linux/mm/memcontrol.c
> ===================================================================
> --- linux.orig/mm/memcontrol.c	2011-10-25 08:36:08.000000000 +0800
> +++ linux/mm/memcontrol.c	2011-10-25 09:33:51.000000000 +0800
> @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
>  					struct mem_cgroup *mem_cont,
> -					int active, int file)
> +					int active, int file,
> +					struct page **split_page)
>  {
>  	unsigned long nr_taken = 0;
>  	struct page *page;
> @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
>  		case 0:
>  			list_move(&page->lru, dst);
>  			mem_cgroup_del_lru(page);
> -			nr_taken += hpage_nr_pages(page);
> +			if (PageTransHuge(page) && split_page) {
> +				nr_taken++;
> +				*split_page = page;
> +				goto out;
> +			} else
> +				nr_taken += hpage_nr_pages(page);
> +
>  			break;
>  		case -EBUSY:
>  			/* we don't affect global LRU but rotate in our LRU */
> @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
>  		}
>  	}
>  
> +out:
>  	*scanned = scan;
>  
>  	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> Index: linux/include/linux/memcontrol.h
> ===================================================================
> --- linux.orig/include/linux/memcontrol.h	2011-10-25 08:36:08.000000000 +0800
> +++ linux/include/linux/memcontrol.h	2011-10-25 09:33:51.000000000 +0800
> @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
>  					struct mem_cgroup *mem_cont,
> -					int active, int file);
> +					int active, int file,
> +					struct page **split_page);
>  
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  /*
> 
> 

I looked at the code. My concern is that your patch could cause unnecessary
THP splits.

When we isolate a page, we can't yet know whether it is part of the working
set, so the split should happen only after we have judged whether it is a
working-set page.

If you really want to merge this patch, I suggest we handle it in the
shrink_page_list() step, not in the isolation step.

My totally untested code, which is just to show the concept, is as follows:

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 48c32eb..86c79ac 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -81,7 +81,13 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 extern int handle_pte_fault(struct mm_struct *mm,
 			    struct vm_area_struct *vma, unsigned long address,
 			    pte_t *pte, pmd_t *pmd, unsigned int flags);
-extern int split_huge_page(struct page *page);
+
+extern int split_huge_page_list(struct page *page, struct list_head *dst);
+static inline int split_huge_page(struct page *page)
+{
+	return split_huge_page_list(page, NULL);
+}
+
 extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
 #define split_huge_page_pmd(__mm, __pmd)				\
 	do {								\
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c71f84b..9eff62b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pages(void);
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
 extern void lru_add_page_tail(struct zone* zone,
-			      struct page *page, struct page *page_tail);
+			      struct page *page, struct page *page_tail,
+			      struct list_head *list);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2d1587..0f920ad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1150,7 +1150,7 @@ static int __split_huge_page_splitting(struct page *page,
 	return ret;
 }
 
-static void __split_huge_page_refcount(struct page *page)
+static void __split_huge_page_refcount(struct page *page, struct list_head *list)
 {
 	int i;
 	unsigned long head_index = page->index;
@@ -1221,7 +1221,7 @@ static void __split_huge_page_refcount(struct page *page)
 
 		mem_cgroup_split_huge_fixup(page, page_tail);
 
-		lru_add_page_tail(zone, page, page_tail);
+		lru_add_page_tail(zone, page, page_tail, list);
 	}
 
 	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
@@ -1335,7 +1335,8 @@ static int __split_huge_page_map(struct page *page,
 
 /* must be called with anon_vma->root->mutex hold */
 static void __split_huge_page(struct page *page,
-			      struct anon_vma *anon_vma)
+			      struct anon_vma *anon_vma,
+			      struct list_head *list)
 {
 	int mapcount, mapcount2;
 	struct anon_vma_chain *avc;
@@ -1367,7 +1368,7 @@ static void __split_huge_page(struct page *page,
 		       mapcount, page_mapcount(page));
 	BUG_ON(mapcount != page_mapcount(page));
 
-	__split_huge_page_refcount(page);
+	__split_huge_page_refcount(page, list);
 
 	mapcount2 = 0;
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
@@ -1384,7 +1385,7 @@ static void __split_huge_page(struct page *page,
 	BUG_ON(mapcount != mapcount2);
 }
 
-int split_huge_page(struct page *page)
+int split_huge_page_list(struct page *page, struct list_head *list)
 {
 	struct anon_vma *anon_vma;
 	int ret = 1;
@@ -1398,7 +1399,7 @@ int split_huge_page(struct page *page)
 		goto out_unlock;
 
 	BUG_ON(!PageSwapBacked(page));
-	__split_huge_page(page, anon_vma);
+	__split_huge_page(page, anon_vma, list);
 	count_vm_event(THP_SPLIT);
 
 	BUG_ON(PageCompound(page));
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f1..d76e332 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -632,9 +632,14 @@ void __pagevec_release(struct pagevec *pvec)
 
 EXPORT_SYMBOL(__pagevec_release);
 
-/* used by __split_huge_page_refcount() */
+/*
+ * used by __split_huge_page_refcount()
+ * If @list is NULL,  @page_tail is inserted into zone->lru.
+ * Otherwise, it is inserted into @list.
+ */
 void lru_add_page_tail(struct zone* zone,
-		       struct page *page, struct page *page_tail)
+		       struct page *page, struct page *page_tail,
+		       struct list_head *list)
 {
 	int active;
 	enum lru_list lru;
@@ -658,11 +663,15 @@ void lru_add_page_tail(struct zone* zone,
 			lru = LRU_INACTIVE_ANON;
 		}
 		update_page_reclaim_stat(zone, page_tail, file, active);
-		if (likely(PageLRU(page)))
-			head = page->lru.prev;
-		else
-			head = &zone->lru[lru].list;
-		__add_page_to_lru_list(zone, page_tail, lru, head);
+		if (unlikely(list))
+			list_add(&page_tail->lru, list);
+		else {
+			if (likely(PageLRU(page)))
+				head = page->lru.prev;
+			else
+				head = &zone->lru[lru].list;
+			__add_page_to_lru_list(zone, page_tail, lru, head);
+		}
 	} else {
 		SetPageUnevictable(page_tail);
 		add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4668046..25c7fdc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -153,13 +153,6 @@ int add_to_swap(struct page *page)
 	entry = get_swap_page();
 	if (!entry.val)
 		return 0;
-
-	if (unlikely(PageTransHuge(page)))
-		if (unlikely(split_huge_page(page))) {
-			swapcache_free(entry, NULL);
-			return 0;
-		}
-
 	/*
 	 * Radix-tree node allocations from PF_MEMALLOC contexts could
 	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b55699c..8e0aaf6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -838,6 +838,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!(sc->gfp_mask & __GFP_IO))
 				goto keep_locked;
+			if (unlikely(PageTransHuge(page)))
+				if (unlikely(split_huge_page_list(page, page_list)))
+					goto activate_locked;
 			if (!add_to_swap(page))
 				goto activate_locked;
 			may_enter_fs = 1;

-- 
Kind regards,
Minchan Kim

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-28  9:50           ` Minchan Kim
@ 2011-10-31  1:10             ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-10-31  1:10 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Fri, 2011-10-28 at 17:50 +0800, Minchan Kim wrote:
> On Fri, Oct 28, 2011 at 04:25:56PM +0800, Shaohua Li wrote:
> > On Fri, 2011-10-28 at 15:30 +0800, Minchan Kim wrote:
> > > On Fri, Oct 28, 2011 at 01:11:55PM +0800, Shaohua Li wrote:
> > > > On Fri, 2011-10-28 at 07:34 +0800, Minchan Kim wrote:
> > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > lru list has page A and B, page A is huge page:
> > > > > > 1. page A is isolated
> > > > > > 2. page B is isolated
> > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > 5. page A and B is written out and reclaimed.
> > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > 
> > > > > I haven't seen your code yet, but I have a question.
> > > > > You mitigate this problem with 4/5, which could add subpages to the lru tail
> > > > > so the subpages would be reclaimed in the next iteration of reclaim.
> > > > > 
> > > > > Why do we need 5/5?
> > > > > Am I missing something?
> > > > Both patches are required. Without this patch, current page reclaim will
> > > > only reclaim the first page of a huge page, because the huge page isn't
> > > > split yet. The huge page is split when its first page is being written to
> > > > swap, which is too late, and page reclaim might already have isolated a lot
> > > > of pages.
> > > 
> > > When the split happens, the subpages would be located at the tail of the LRU
> > > by your 4/5. (Assume the tail of the LRU is the old age.)
> > Yes, but a lot of other pages are already isolated, and we will reclaim those
> > pages first. For example, when reclaiming huge pages A and B, the current
> > reclaim order is A, B, A+1, ..., B+1, because we isolate A and B first while
> > the tail pages are not isolated yet. With my patch, the order is
> > A, A+1, ..., B, B+1, .... So with my patch we can avoid unnecessary page
> > splits and page isolations. This is exactly why my patch reduces the
> > thp_split count.
> 
> It's possible, but I doubt how effective it is, because add_to_swap has an
> unlikely hint, as follows:
> 
> 	if (unlikely(PageTransHuge(page)))
> 
> I don't mean the unlikely assumption is absolutely right.
> But at least you have to convince us that it's wrong.
> Personally, I don't want to add more logic and handle THP pages
> differently from normal pages unless it's a real concern.
If you actually use THP, you will find it is a real problem. The data I posted
already shows this clearly.
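
(For reference, a totally untested sketch of the kind of test the changelog
describes: write a large anonymous range until the machine starts swapping and
compare the thp_* counters in /proc/vmstat before and after. The 8GiB region
size, the MADV_HUGEPAGE hint and the page-sized write stride are illustrative
assumptions, not the actual test program.)

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* Assumed region size: pick something larger than free RAM on a 64-bit
 * test machine so reclaim and swap actually kick in. */
#define REGION_SIZE	(8UL << 30)

static void dump_thp_counters(const char *tag)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return;
	printf("--- %s ---\n", tag);
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "thp_", 4))
			fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	unsigned long off;
	char *p;

	dump_thp_counters("before");

	p = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
#ifdef MADV_HUGEPAGE
	madvise(p, REGION_SIZE, MADV_HUGEPAGE);
#endif

	/* Touch every page: once the range no longer fits in RAM the kernel
	 * starts swapping, and the thp_split counter shows how many huge
	 * pages were split on the way out. */
	for (off = 0; off < REGION_SIZE; off += 4096)
		p[off] = 1;

	dump_thp_counters("after");
	return 0;
}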


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-29  0:06   ` Minchan Kim
@ 2011-10-31  1:21     ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-10-31  1:21 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > With current logic, if page reclaim finds a huge page, it will just reclaim
> > the head page and leave tail pages reclaimed later. Let's take an example,
> > lru list has page A and B, page A is huge page:
> > 1. page A is isolated
> > 2. page B is isolated
> > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > page A+1, page A+2, ... are added to lru list.
> > 4. shrink_page_list() adds page B to swap page cache.
> > 5. page A and B is written out and reclaimed.
> > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> >
> > We expected the whole huge page A is reclaimed in the meantime, so
> > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> >
> > With this patch, we do huge page split just after the head page is isolated
> > for inactive lru list, so the tail pages will be reclaimed immediately.
> >
> > In a test, a range of anonymous memory is written and will trigger swap.
> > Without the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 451
> > thp_fault_fallback 0
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 238
> >
> > With the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 450
> > thp_fault_fallback 1
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 103
> >
> > So the thp_split number is reduced a lot, though there is one extra
> > thp_fault_fallback.
> >
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > ---
> >  include/linux/memcontrol.h |    3 +-
> >  mm/memcontrol.c            |   12 +++++++++--
> >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> >  3 files changed, 50 insertions(+), 14 deletions(-)
> >
> > Index: linux/mm/vmscan.c
> > ===================================================================
> > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> >   */
> >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> >               struct list_head *src, struct list_head *dst,
> > -             unsigned long *scanned, int order, int mode, int file)
> > +             unsigned long *scanned, int order, int mode, int file,
> > +             struct page **split_page)
> >  {
> >       unsigned long nr_taken = 0;
> >       unsigned long nr_lumpy_taken = 0;
> > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> >               case 0:
> >                       list_move(&page->lru, dst);
> >                       mem_cgroup_del_lru(page);
> > -                     nr_taken += hpage_nr_pages(page);
> > +                     if (PageTransHuge(page) && split_page) {
> > +                             nr_taken++;
> > +                             *split_page = page;
> > +                             goto out;
> > +                     } else
> > +                             nr_taken += hpage_nr_pages(page);
> >                       break;
> >
> >               case -EBUSY:
> > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> >                               list_move(&cursor_page->lru, dst);
> >                               mem_cgroup_del_lru(cursor_page);
> > -                             nr_taken += hpage_nr_pages(page);
> >                               nr_lumpy_taken++;
> >                               if (PageDirty(cursor_page))
> >                                       nr_lumpy_dirty++;
> >                               scan++;
> > +                             if (PageTransHuge(page) && split_page) {
> > +                                     nr_taken++;
> > +                                     *split_page = page;
> > +                                     goto out;
> > +                             } else
> > +                                     nr_taken += hpage_nr_pages(page);
> >                       } else {
> >                               /*
> >                                * Check if the page is freed already.
> > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> >                       nr_lumpy_failed++;
> >       }
> >
> > +out:
> >       *scanned = scan;
> >
> >       trace_mm_vmscan_lru_isolate(order,
> > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> >                                       struct list_head *dst,
> >                                       unsigned long *scanned, int order,
> >                                       int mode, struct zone *z,
> > -                                     int active, int file)
> > +                                     int active, int file,
> > +                                     struct page **split_page)
> >  {
> >       int lru = LRU_BASE;
> >       if (active)
> > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> >       if (file)
> >               lru += LRU_FILE;
> >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > -                                                             mode, file);
> > +                                                     mode, file, split_page);
> >  }
> >
> >  /*
> > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> >  {
> >       LIST_HEAD(page_list);
> >       unsigned long nr_scanned;
> > +     unsigned long total_scanned = 0;
> >       unsigned long nr_reclaimed = 0;
> >       unsigned long nr_taken;
> >       unsigned long nr_anon;
> >       unsigned long nr_file;
> > +     struct page *split_page;
> >
> >       while (unlikely(too_many_isolated(zone, file, sc))) {
> >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> >       }
> >
> >       set_reclaim_mode(priority, sc, false);
> > +again:
> >       lru_add_drain();
> > +     split_page = NULL;
> >       spin_lock_irq(&zone->lru_lock);
> >
> >       if (scanning_global_lru(sc)) {
> > -             nr_taken = isolate_pages_global(nr_to_scan,
> > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> >                       &page_list, &nr_scanned, sc->order,
> >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > -                     zone, 0, file);
> > +                     zone, 0, file, &split_page);
> >               zone->pages_scanned += nr_scanned;
> > +             total_scanned += nr_scanned;
> >               if (current_is_kswapd())
> >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> >                                              nr_scanned);
> > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> >                                              nr_scanned);
> >       } else {
> > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> >                       &page_list, &nr_scanned, sc->order,
> >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> >                       zone, sc->mem_cgroup,
> > -                     0, file);
> > +                     0, file, &split_page);
> > +             total_scanned += nr_scanned;
> >               /*
> >                * mem_cgroup_isolate_pages() keeps track of
> >                * scanned pages on its own.
> > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> >               spin_unlock_irq(&zone->lru_lock);
> >               return 0;
> >       }
> > +     if (split_page && total_scanned < nr_to_scan) {
> > +             spin_unlock_irq(&zone->lru_lock);
> > +             split_huge_page(split_page);
> > +             goto again;
> > +     }
> >
> >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> >
> >       spin_unlock_irq(&zone->lru_lock);
> >
> > +     if (split_page)
> > +             split_huge_page(split_page);
> > +
> >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> >
> >       /* Check if we should syncronously wait for writeback */
> > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> >                                               &pgscanned, sc->order,
> >                                               ISOLATE_ACTIVE, zone,
> > -                                             1, file);
> > +                                             1, file, NULL);
> >               zone->pages_scanned += pgscanned;
> >       } else {
> >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> >                                               &pgscanned, sc->order,
> >                                               ISOLATE_ACTIVE, zone,
> > -                                             sc->mem_cgroup, 1, file);
> > +                                             sc->mem_cgroup, 1, file, NULL);
> >               /*
> >                * mem_cgroup_isolate_pages() keeps track of
> >                * scanned pages on its own.
> > Index: linux/mm/memcontrol.c
> > ===================================================================
> > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> >                                       unsigned long *scanned, int order,
> >                                       int mode, struct zone *z,
> >                                       struct mem_cgroup *mem_cont,
> > -                                     int active, int file)
> > +                                     int active, int file,
> > +                                     struct page **split_page)
> >  {
> >       unsigned long nr_taken = 0;
> >       struct page *page;
> > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> >               case 0:
> >                       list_move(&page->lru, dst);
> >                       mem_cgroup_del_lru(page);
> > -                     nr_taken += hpage_nr_pages(page);
> > +                     if (PageTransHuge(page) && split_page) {
> > +                             nr_taken++;
> > +                             *split_page = page;
> > +                             goto out;
> > +                     } else
> > +                             nr_taken += hpage_nr_pages(page);
> > +
> >                       break;
> >               case -EBUSY:
> >                       /* we don't affect global LRU but rotate in our LRU */
> > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> >               }
> >       }
> >
> > +out:
> >       *scanned = scan;
> >
> >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > Index: linux/include/linux/memcontrol.h
> > ===================================================================
> > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> >                                       unsigned long *scanned, int order,
> >                                       int mode, struct zone *z,
> >                                       struct mem_cgroup *mem_cont,
> > -                                     int active, int file);
> > +                                     int active, int file,
> > +                                     struct page **split_page);
> >
> >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> >  /*
> >
> >
> 
> I saw the code. My concern is that your patch could cause unnecessary splits of THP.
> 
> When we isolate a page, we can't yet know whether it is part of the working set or not,
> so the split should happen after we have judged whether it is a working-set page.
Yes, but since memory is large these days, it's unlikely that the isolated page
gets accessed in that window. And I only do the split in
shrink_inactive_list, not for the active list.
THP also has a mechanism to collapse small pages back into a huge page later.

> If you really want to merge this patch, I suggest we handle it in the
> shrink_page_list step rather than at isolation time.
> 
> My totally untested code, which is only meant to show the concept, is as follows,
I did consider this option before. It has its own problem too: the isolation
step can isolate several huge pages at a time, and then shrink_page_list
can end up swapping out several huge pages in one pass, which is unfortunate.
I'm pretty sure this method can't reduce the thp_split count in my test.
It could help when pages are heavily rotated, but that would mean page
reclaim is already broken, which is a rare case.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-31  1:21     ` Shaohua Li
@ 2011-10-31  8:23       ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-10-31  8:23 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > lru list has page A and B, page A is huge page:
> > > 1. page A is isolated
> > > 2. page B is isolated
> > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > page A+1, page A+2, ... are added to lru list.
> > > 4. shrink_page_list() adds page B to swap page cache.
> > > 5. page A and B is written out and reclaimed.
> > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > >
> > > We expected the whole huge page A is reclaimed in the meantime, so
> > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > >
> > > With this patch, we do huge page split just after the head page is isolated
> > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > >
> > > In a test, a range of anonymous memory is written and will trigger swap.
> > > Without the patch:
> > > #cat /proc/vmstat|grep thp
> > > thp_fault_alloc 451
> > > thp_fault_fallback 0
> > > thp_collapse_alloc 0
> > > thp_collapse_alloc_failed 0
> > > thp_split 238
> > >
> > > With the patch:
> > > #cat /proc/vmstat|grep thp
> > > thp_fault_alloc 450
> > > thp_fault_fallback 1
> > > thp_collapse_alloc 0
> > > thp_collapse_alloc_failed 0
> > > thp_split 103
> > >
> > > So the thp_split number is reduced a lot, though there is one extra
> > > thp_fault_fallback.
> > >
> > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > ---
> > >  include/linux/memcontrol.h |    3 +-
> > >  mm/memcontrol.c            |   12 +++++++++--
> > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > >
> > > Index: linux/mm/vmscan.c
> > > ===================================================================
> > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > >   */
> > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > >               struct list_head *src, struct list_head *dst,
> > > -             unsigned long *scanned, int order, int mode, int file)
> > > +             unsigned long *scanned, int order, int mode, int file,
> > > +             struct page **split_page)
> > >  {
> > >       unsigned long nr_taken = 0;
> > >       unsigned long nr_lumpy_taken = 0;
> > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > >               case 0:
> > >                       list_move(&page->lru, dst);
> > >                       mem_cgroup_del_lru(page);
> > > -                     nr_taken += hpage_nr_pages(page);
> > > +                     if (PageTransHuge(page) && split_page) {
> > > +                             nr_taken++;
> > > +                             *split_page = page;
> > > +                             goto out;
> > > +                     } else
> > > +                             nr_taken += hpage_nr_pages(page);
> > >                       break;
> > >
> > >               case -EBUSY:
> > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > >                               list_move(&cursor_page->lru, dst);
> > >                               mem_cgroup_del_lru(cursor_page);
> > > -                             nr_taken += hpage_nr_pages(page);
> > >                               nr_lumpy_taken++;
> > >                               if (PageDirty(cursor_page))
> > >                                       nr_lumpy_dirty++;
> > >                               scan++;
> > > +                             if (PageTransHuge(page) && split_page) {
> > > +                                     nr_taken++;
> > > +                                     *split_page = page;
> > > +                                     goto out;
> > > +                             } else
> > > +                                     nr_taken += hpage_nr_pages(page);
> > >                       } else {
> > >                               /*
> > >                                * Check if the page is freed already.
> > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > >                       nr_lumpy_failed++;
> > >       }
> > >
> > > +out:
> > >       *scanned = scan;
> > >
> > >       trace_mm_vmscan_lru_isolate(order,
> > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > >                                       struct list_head *dst,
> > >                                       unsigned long *scanned, int order,
> > >                                       int mode, struct zone *z,
> > > -                                     int active, int file)
> > > +                                     int active, int file,
> > > +                                     struct page **split_page)
> > >  {
> > >       int lru = LRU_BASE;
> > >       if (active)
> > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > >       if (file)
> > >               lru += LRU_FILE;
> > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > -                                                             mode, file);
> > > +                                                     mode, file, split_page);
> > >  }
> > >
> > >  /*
> > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > >  {
> > >       LIST_HEAD(page_list);
> > >       unsigned long nr_scanned;
> > > +     unsigned long total_scanned = 0;
> > >       unsigned long nr_reclaimed = 0;
> > >       unsigned long nr_taken;
> > >       unsigned long nr_anon;
> > >       unsigned long nr_file;
> > > +     struct page *split_page;
> > >
> > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > >       }
> > >
> > >       set_reclaim_mode(priority, sc, false);
> > > +again:
> > >       lru_add_drain();
> > > +     split_page = NULL;
> > >       spin_lock_irq(&zone->lru_lock);
> > >
> > >       if (scanning_global_lru(sc)) {
> > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > >                       &page_list, &nr_scanned, sc->order,
> > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > -                     zone, 0, file);
> > > +                     zone, 0, file, &split_page);
> > >               zone->pages_scanned += nr_scanned;
> > > +             total_scanned += nr_scanned;
> > >               if (current_is_kswapd())
> > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > >                                              nr_scanned);
> > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > >                                              nr_scanned);
> > >       } else {
> > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > >                       &page_list, &nr_scanned, sc->order,
> > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > >                       zone, sc->mem_cgroup,
> > > -                     0, file);
> > > +                     0, file, &split_page);
> > > +             total_scanned += nr_scanned;
> > >               /*
> > >                * mem_cgroup_isolate_pages() keeps track of
> > >                * scanned pages on its own.
> > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > >               spin_unlock_irq(&zone->lru_lock);
> > >               return 0;
> > >       }
> > > +     if (split_page && total_scanned < nr_to_scan) {
> > > +             spin_unlock_irq(&zone->lru_lock);
> > > +             split_huge_page(split_page);
> > > +             goto again;
> > > +     }
> > >
> > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > >
> > >       spin_unlock_irq(&zone->lru_lock);
> > >
> > > +     if (split_page)
> > > +             split_huge_page(split_page);
> > > +
> > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > >
> > >       /* Check if we should syncronously wait for writeback */
> > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > >                                               &pgscanned, sc->order,
> > >                                               ISOLATE_ACTIVE, zone,
> > > -                                             1, file);
> > > +                                             1, file, NULL);
> > >               zone->pages_scanned += pgscanned;
> > >       } else {
> > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > >                                               &pgscanned, sc->order,
> > >                                               ISOLATE_ACTIVE, zone,
> > > -                                             sc->mem_cgroup, 1, file);
> > > +                                             sc->mem_cgroup, 1, file, NULL);
> > >               /*
> > >                * mem_cgroup_isolate_pages() keeps track of
> > >                * scanned pages on its own.
> > > Index: linux/mm/memcontrol.c
> > > ===================================================================
> > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > >                                       unsigned long *scanned, int order,
> > >                                       int mode, struct zone *z,
> > >                                       struct mem_cgroup *mem_cont,
> > > -                                     int active, int file)
> > > +                                     int active, int file,
> > > +                                     struct page **split_page)
> > >  {
> > >       unsigned long nr_taken = 0;
> > >       struct page *page;
> > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > >               case 0:
> > >                       list_move(&page->lru, dst);
> > >                       mem_cgroup_del_lru(page);
> > > -                     nr_taken += hpage_nr_pages(page);
> > > +                     if (PageTransHuge(page) && split_page) {
> > > +                             nr_taken++;
> > > +                             *split_page = page;
> > > +                             goto out;
> > > +                     } else
> > > +                             nr_taken += hpage_nr_pages(page);
> > > +
> > >                       break;
> > >               case -EBUSY:
> > >                       /* we don't affect global LRU but rotate in our LRU */
> > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > >               }
> > >       }
> > >
> > > +out:
> > >       *scanned = scan;
> > >
> > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > Index: linux/include/linux/memcontrol.h
> > > ===================================================================
> > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > >                                       unsigned long *scanned, int order,
> > >                                       int mode, struct zone *z,
> > >                                       struct mem_cgroup *mem_cont,
> > > -                                     int active, int file);
> > > +                                     int active, int file,
> > > +                                     struct page **split_page);
> > >
> > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > >  /*
> > >
> > >
> > 
> > I saw the code. My concern is that your patch could cause unnecessary splits of THP.
> > 
> > When we isolate a page, we can't know whether it is working set or not.
> > So the split should happen after we judge that it is a working-set page.
> yes, but since memory is big these days, it's unlikely the isolated page
> gets accessed in the window. And I only did the split in

We don't check page references when isolation happens.
Do you mean the window between isolation time and reclaim?
No. The window is from the inactive list's head to its tail, and that is
the basic concept of our LRU.

> shrink_inactive_list, not in the active list.

But the inactive list's size could still be big, and
the page-reference heuristic is very important for the reclaim algorithm.
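
To make that concrete: shrink_page_list() only moves on to swap-out after
the reference check, and a recently referenced anon page is activated
instead of being reclaimed. Roughly (simplified from mm/vmscan.c of this
era, details may differ):

	references = page_check_references(page, sc);
	switch (references) {
	case PAGEREF_ACTIVATE:
		goto activate_locked;	/* recently used: keep the page */
	case PAGEREF_KEEP:
		goto keep_locked;
	case PAGEREF_RECLAIM:
	case PAGEREF_RECLAIM_CLEAN:
		break;			/* only now is swap-out (and a THP split) justified */
	}

A split done at isolation time is decided before this check has run.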
 
> And THP has a mechanism to collapse small pages into a huge page later.

You mean "merge" instead of "collapse"?

> 
> > If you really want to merge this patch, I suggest that
> > we handle it in the shrink_page_list step, not the isolation step.
> > 
> > My totally untested code, which is just meant to show the concept, is as follows:
> I did consider this option before. It has its own problems too. The isolation
> step can isolate several huge pages at one time, and then shrink_page_list
> can later swap several huge pages at one time, which is unfortunate. I'm pretty
> sure this method can't reduce the thp_split count in my test. It could

I understand your point, but the approach doesn't look good to me.
Maybe we can check whether we should keep going before another THP split happens
in shrink_page_list. If we split one THP page successfully, maybe we can skip further THP splits.

Another idea is that we can avoid splitting THP unless high-order reclaim happens or
low-order, high-priority pressure happens.
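
Just to show what I mean by skipping further splits, something like this
inside the shrink_page_list() loop could work; nr_thp_split is a made-up
local (initialized to 0 at the top of the function), not existing code:

	/* before add_to_swap(), in the anon !PageSwapCache case */
	if (PageTransHuge(page)) {
		if (nr_thp_split)
			goto keep_locked;	/* one split per pass is enough */
		nr_thp_split++;			/* add_to_swap() will do the split */
	}
	if (!add_to_swap(page))
		goto activate_locked;

That way only the first THP we actually decide to swap in a pass gets split,
and the rest stay intact for a later pass.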

> be helpful when pages are heavily rotated, but that means page reclaim
> is already broken, which is a rare case.

Hmm, I don't think so. It's possible because the anon reclaim algorithm uses SEQ.

> 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-31  1:10             ` Shaohua Li
@ 2011-10-31  8:24               ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-10-31  8:24 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Mon, Oct 31, 2011 at 09:10:49AM +0800, Shaohua Li wrote:
> On Fri, 2011-10-28 at 17:50 +0800, Minchan Kim wrote:
> > On Fri, Oct 28, 2011 at 04:25:56PM +0800, Shaohua Li wrote:
> > > On Fri, 2011-10-28 at 15:30 +0800, Minchan Kim wrote:
> > > > On Fri, Oct 28, 2011 at 01:11:55PM +0800, Shaohua Li wrote:
> > > > > On Fri, 2011-10-28 at 07:34 +0800, Minchan Kim wrote:
> > > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > > lru list has page A and B, page A is huge page:
> > > > > > > 1. page A is isolated
> > > > > > > 2. page B is isolated
> > > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > > 5. page A and B is written out and reclaimed.
> > > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > > 
> > > > > > I haven't seen your code yet, but I have a question.
> > > > > > You mitigate this problem with 4/5, which could add subpages to the lru tail
> > > > > > so the subpages would be reclaimed in the next iteration of reclaim.
> > > > > > 
> > > > > > Why do we need 5/5?
> > > > > > Am I missing something?
> > > > > Both patches are required. Without this patch, the current page reclaim will
> > > > > only reclaim the first page of a huge page, because the hugepage isn't
> > > > > split yet. The hugepage is split when the first page is being written to
> > > > > swap, which is too late, and page reclaim might have already isolated a lot
> > > > > of pages.
> > > > 
> > > > When the split happens, the subpages would be placed at the tail of the LRU by your 4/5
> > > > (assuming the tail of the LRU is the old end).
> > > yes, but a lot of other pages are already isolated, and we will reclaim those
> > > pages first. For example, when reclaiming huge pages A and B, the current reclaim
> > > order is A, B, A+1, ... B+1, because we isolate A and B first while none of the tail
> > > pages are isolated yet. With my patch, the order is A, A+1, ... B, B+1, ....
> > > With my patch, we can avoid unnecessary page splits
> > > or page isolation. This is exactly why my patch reduces the thp_split
> > > count.
> > 
> > It's possible, but I doubt how effective it is, because add_to_swap has an unlikely() as follows:
> > 
> > 	if (unlikely(PageTransHuge(page)))
> > 
> > I don't mean the unlikely assumption is absolutely right.
> > But at least you have to convince us that it's wrong.
> > Personally, I don't want to add more logic and handle THP pages
> > differently from normal pages unless it's a real concern.
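
For reference, that check sits in add_to_swap() in mm/swap_state.c; the split
only happens there, after a swap slot has already been allocated. Roughly,
from memory, so details may differ:

	entry = get_swap_page();
	if (!entry.val)
		return 0;

	if (unlikely(PageTransHuge(page)))
		if (unlikely(split_huge_page(page))) {
			swapcache_free(entry, NULL);
			return 0;
		}

So the split is delayed until reclaim has really committed to swapping the
head page.
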
> If you actually use THP, you will find it's a problem. The data I posted
> already showed it clearly.
> 

If so, could you fix the above in the next iteration, if you don't mind?

Thanks.

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-31  8:23       ` Minchan Kim
@ 2011-10-31  9:03         ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-10-31  9:03 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > lru list has page A and B, page A is huge page:
> > > > 1. page A is isolated
> > > > 2. page B is isolated
> > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > page A+1, page A+2, ... are added to lru list.
> > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > 5. page A and B is written out and reclaimed.
> > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > >
> > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > >
> > > > With this patch, we do huge page split just after the head page is isolated
> > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > >
> > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > Without the patch:
> > > > #cat /proc/vmstat|grep thp
> > > > thp_fault_alloc 451
> > > > thp_fault_fallback 0
> > > > thp_collapse_alloc 0
> > > > thp_collapse_alloc_failed 0
> > > > thp_split 238
> > > >
> > > > With the patch:
> > > > #cat /proc/vmstat|grep thp
> > > > thp_fault_alloc 450
> > > > thp_fault_fallback 1
> > > > thp_collapse_alloc 0
> > > > thp_collapse_alloc_failed 0
> > > > thp_split 103
> > > >
> > > > So the thp_split number is reduced a lot, though there is one extra
> > > > thp_fault_fallback.
> > > >
> > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > ---
> > > >  include/linux/memcontrol.h |    3 +-
> > > >  mm/memcontrol.c            |   12 +++++++++--
> > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > >
> > > > Index: linux/mm/vmscan.c
> > > > ===================================================================
> > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > >   */
> > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > >               struct list_head *src, struct list_head *dst,
> > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > +             struct page **split_page)
> > > >  {
> > > >       unsigned long nr_taken = 0;
> > > >       unsigned long nr_lumpy_taken = 0;
> > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > >               case 0:
> > > >                       list_move(&page->lru, dst);
> > > >                       mem_cgroup_del_lru(page);
> > > > -                     nr_taken += hpage_nr_pages(page);
> > > > +                     if (PageTransHuge(page) && split_page) {
> > > > +                             nr_taken++;
> > > > +                             *split_page = page;
> > > > +                             goto out;
> > > > +                     } else
> > > > +                             nr_taken += hpage_nr_pages(page);
> > > >                       break;
> > > >
> > > >               case -EBUSY:
> > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > >                               list_move(&cursor_page->lru, dst);
> > > >                               mem_cgroup_del_lru(cursor_page);
> > > > -                             nr_taken += hpage_nr_pages(page);
> > > >                               nr_lumpy_taken++;
> > > >                               if (PageDirty(cursor_page))
> > > >                                       nr_lumpy_dirty++;
> > > >                               scan++;
> > > > +                             if (PageTransHuge(page) && split_page) {
> > > > +                                     nr_taken++;
> > > > +                                     *split_page = page;
> > > > +                                     goto out;
> > > > +                             } else
> > > > +                                     nr_taken += hpage_nr_pages(page);
> > > >                       } else {
> > > >                               /*
> > > >                                * Check if the page is freed already.
> > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > >                       nr_lumpy_failed++;
> > > >       }
> > > >
> > > > +out:
> > > >       *scanned = scan;
> > > >
> > > >       trace_mm_vmscan_lru_isolate(order,
> > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > >                                       struct list_head *dst,
> > > >                                       unsigned long *scanned, int order,
> > > >                                       int mode, struct zone *z,
> > > > -                                     int active, int file)
> > > > +                                     int active, int file,
> > > > +                                     struct page **split_page)
> > > >  {
> > > >       int lru = LRU_BASE;
> > > >       if (active)
> > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > >       if (file)
> > > >               lru += LRU_FILE;
> > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > -                                                             mode, file);
> > > > +                                                     mode, file, split_page);
> > > >  }
> > > >
> > > >  /*
> > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > >  {
> > > >       LIST_HEAD(page_list);
> > > >       unsigned long nr_scanned;
> > > > +     unsigned long total_scanned = 0;
> > > >       unsigned long nr_reclaimed = 0;
> > > >       unsigned long nr_taken;
> > > >       unsigned long nr_anon;
> > > >       unsigned long nr_file;
> > > > +     struct page *split_page;
> > > >
> > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > >       }
> > > >
> > > >       set_reclaim_mode(priority, sc, false);
> > > > +again:
> > > >       lru_add_drain();
> > > > +     split_page = NULL;
> > > >       spin_lock_irq(&zone->lru_lock);
> > > >
> > > >       if (scanning_global_lru(sc)) {
> > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > >                       &page_list, &nr_scanned, sc->order,
> > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > -                     zone, 0, file);
> > > > +                     zone, 0, file, &split_page);
> > > >               zone->pages_scanned += nr_scanned;
> > > > +             total_scanned += nr_scanned;
> > > >               if (current_is_kswapd())
> > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > >                                              nr_scanned);
> > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > >                                              nr_scanned);
> > > >       } else {
> > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > >                       &page_list, &nr_scanned, sc->order,
> > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > >                       zone, sc->mem_cgroup,
> > > > -                     0, file);
> > > > +                     0, file, &split_page);
> > > > +             total_scanned += nr_scanned;
> > > >               /*
> > > >                * mem_cgroup_isolate_pages() keeps track of
> > > >                * scanned pages on its own.
> > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > >               spin_unlock_irq(&zone->lru_lock);
> > > >               return 0;
> > > >       }
> > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > +             split_huge_page(split_page);
> > > > +             goto again;
> > > > +     }
> > > >
> > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > >
> > > >       spin_unlock_irq(&zone->lru_lock);
> > > >
> > > > +     if (split_page)
> > > > +             split_huge_page(split_page);
> > > > +
> > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > >
> > > >       /* Check if we should syncronously wait for writeback */
> > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > >                                               &pgscanned, sc->order,
> > > >                                               ISOLATE_ACTIVE, zone,
> > > > -                                             1, file);
> > > > +                                             1, file, NULL);
> > > >               zone->pages_scanned += pgscanned;
> > > >       } else {
> > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > >                                               &pgscanned, sc->order,
> > > >                                               ISOLATE_ACTIVE, zone,
> > > > -                                             sc->mem_cgroup, 1, file);
> > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > >               /*
> > > >                * mem_cgroup_isolate_pages() keeps track of
> > > >                * scanned pages on its own.
> > > > Index: linux/mm/memcontrol.c
> > > > ===================================================================
> > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > >                                       unsigned long *scanned, int order,
> > > >                                       int mode, struct zone *z,
> > > >                                       struct mem_cgroup *mem_cont,
> > > > -                                     int active, int file)
> > > > +                                     int active, int file,
> > > > +                                     struct page **split_page)
> > > >  {
> > > >       unsigned long nr_taken = 0;
> > > >       struct page *page;
> > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > >               case 0:
> > > >                       list_move(&page->lru, dst);
> > > >                       mem_cgroup_del_lru(page);
> > > > -                     nr_taken += hpage_nr_pages(page);
> > > > +                     if (PageTransHuge(page) && split_page) {
> > > > +                             nr_taken++;
> > > > +                             *split_page = page;
> > > > +                             goto out;
> > > > +                     } else
> > > > +                             nr_taken += hpage_nr_pages(page);
> > > > +
> > > >                       break;
> > > >               case -EBUSY:
> > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > >               }
> > > >       }
> > > >
> > > > +out:
> > > >       *scanned = scan;
> > > >
> > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > Index: linux/include/linux/memcontrol.h
> > > > ===================================================================
> > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > >                                       unsigned long *scanned, int order,
> > > >                                       int mode, struct zone *z,
> > > >                                       struct mem_cgroup *mem_cont,
> > > > -                                     int active, int file);
> > > > +                                     int active, int file,
> > > > +                                     struct page **split_page);
> > > >
> > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > >  /*
> > > >
> > > >
> > >
> > > I saw the code. My concern is that your patch could cause unnecessary splits of THP.
> > >
> > > When we isolate a page, we can't know whether it is working set or not.
> > > So the split should happen after we judge that it is a working-set page.
> > yes, but since memory is big these days, it's unlikely the isolated page
> > gets accessed in the window. And I only did the split in
> 
> We don't check page references when isolation happens.
> Do you mean the window between isolation time and reclaim?
> No. The window is from the inactive list's head to its tail, and that is
> the basic concept of our LRU.
> 
> > shrink_inactive_list, not in the active list.
> 
> But the inactive list's size could still be big, and
> the page-reference heuristic is very important for the reclaim algorithm.
I mean the pages aren't referenced. But OK, I can't make such an assumption.

> > And THP has a mechanism to collapse small pages into a huge page later.
> 
> You mean "merge" instead of "collapse"?
> 
> >
> > > If you really want to merge this patch, I suggest that
> > > we handle it in the shrink_page_list step, not the isolation step.
> > >
> > > My totally untested code, which is just meant to show the concept, is as follows:
> > I did consider this option before. It has its own problems too. The isolation
> > step can isolate several huge pages at one time, and then shrink_page_list
> > can later swap several huge pages at one time, which is unfortunate. I'm pretty
> > sure this method can't reduce the thp_split count in my test. It could
> 
> I understand your point, but the approach doesn't look good to me.
> Maybe we can check whether we should keep going before another THP split happens
> in shrink_page_list. If we split one THP page successfully, maybe we can skip further THP splits.
> Another idea is that we can avoid splitting THP unless high-order reclaim happens or
> low-order, high-priority pressure happens.
I agree the split had better be done in shrink_page_list, but we must avoid
isolating too many pages. I'll check whether I can come up with a better solution
for the next post.
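
The rough direction I'm thinking of (untested, and the list-splice helper is
hypothetical at this point) is to delay the split until shrink_page_list()
has decided to swap the head page, and then keep the tail pages in this pass
instead of pushing them back to the LRU:

	/* anon THP, not yet in swap cache, references already checked */
	if (PageTransHuge(page)) {
		if (split_huge_page(page))
			goto activate_locked;	/* split failed, try again later */
		/*
		 * the tail pages now sit at the LRU tail (patch 4/5);
		 * splicing them straight into this pass's page_list
		 * would need a new split-to-list style helper.
		 */
	}
	if (!add_to_swap(page))
		goto activate_locked;

That would keep the reclaim order A, A+1, ... without isolating more pages
up front.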

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
@ 2011-10-31  9:03         ` Shaohua Li
  0 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-10-31  9:03 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > lru list has page A and B, page A is huge page:
> > > > 1. page A is isolated
> > > > 2. page B is isolated
> > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > page A+1, page A+2, ... are added to lru list.
> > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > 5. page A and B is written out and reclaimed.
> > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > >
> > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > >
> > > > With this patch, we do huge page split just after the head page is isolated
> > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > >
> > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > Without the patch:
> > > > #cat /proc/vmstat|grep thp
> > > > thp_fault_alloc 451
> > > > thp_fault_fallback 0
> > > > thp_collapse_alloc 0
> > > > thp_collapse_alloc_failed 0
> > > > thp_split 238
> > > >
> > > > With the patch:
> > > > #cat /proc/vmstat|grep thp
> > > > thp_fault_alloc 450
> > > > thp_fault_fallback 1
> > > > thp_collapse_alloc 0
> > > > thp_collapse_alloc_failed 0
> > > > thp_split 103
> > > >
> > > > So the thp_split number is reduced a lot, though there is one extra
> > > > thp_fault_fallback.
> > > >
> > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > ---
> > > >  include/linux/memcontrol.h |    3 +-
> > > >  mm/memcontrol.c            |   12 +++++++++--
> > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > >
> > > > Index: linux/mm/vmscan.c
> > > > ===================================================================
> > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > >   */
> > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > >               struct list_head *src, struct list_head *dst,
> > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > +             struct page **split_page)
> > > >  {
> > > >       unsigned long nr_taken = 0;
> > > >       unsigned long nr_lumpy_taken = 0;
> > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > >               case 0:
> > > >                       list_move(&page->lru, dst);
> > > >                       mem_cgroup_del_lru(page);
> > > > -                     nr_taken += hpage_nr_pages(page);
> > > > +                     if (PageTransHuge(page) && split_page) {
> > > > +                             nr_taken++;
> > > > +                             *split_page = page;
> > > > +                             goto out;
> > > > +                     } else
> > > > +                             nr_taken += hpage_nr_pages(page);
> > > >                       break;
> > > >
> > > >               case -EBUSY:
> > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > >                               list_move(&cursor_page->lru, dst);
> > > >                               mem_cgroup_del_lru(cursor_page);
> > > > -                             nr_taken += hpage_nr_pages(page);
> > > >                               nr_lumpy_taken++;
> > > >                               if (PageDirty(cursor_page))
> > > >                                       nr_lumpy_dirty++;
> > > >                               scan++;
> > > > +                             if (PageTransHuge(page) && split_page) {
> > > > +                                     nr_taken++;
> > > > +                                     *split_page = page;
> > > > +                                     goto out;
> > > > +                             } else
> > > > +                                     nr_taken += hpage_nr_pages(page);
> > > >                       } else {
> > > >                               /*
> > > >                                * Check if the page is freed already.
> > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > >                       nr_lumpy_failed++;
> > > >       }
> > > >
> > > > +out:
> > > >       *scanned = scan;
> > > >
> > > >       trace_mm_vmscan_lru_isolate(order,
> > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > >                                       struct list_head *dst,
> > > >                                       unsigned long *scanned, int order,
> > > >                                       int mode, struct zone *z,
> > > > -                                     int active, int file)
> > > > +                                     int active, int file,
> > > > +                                     struct page **split_page)
> > > >  {
> > > >       int lru = LRU_BASE;
> > > >       if (active)
> > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > >       if (file)
> > > >               lru += LRU_FILE;
> > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > -                                                             mode, file);
> > > > +                                                     mode, file, split_page);
> > > >  }
> > > >
> > > >  /*
> > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > >  {
> > > >       LIST_HEAD(page_list);
> > > >       unsigned long nr_scanned;
> > > > +     unsigned long total_scanned = 0;
> > > >       unsigned long nr_reclaimed = 0;
> > > >       unsigned long nr_taken;
> > > >       unsigned long nr_anon;
> > > >       unsigned long nr_file;
> > > > +     struct page *split_page;
> > > >
> > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > >       }
> > > >
> > > >       set_reclaim_mode(priority, sc, false);
> > > > +again:
> > > >       lru_add_drain();
> > > > +     split_page = NULL;
> > > >       spin_lock_irq(&zone->lru_lock);
> > > >
> > > >       if (scanning_global_lru(sc)) {
> > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > >                       &page_list, &nr_scanned, sc->order,
> > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > -                     zone, 0, file);
> > > > +                     zone, 0, file, &split_page);
> > > >               zone->pages_scanned += nr_scanned;
> > > > +             total_scanned += nr_scanned;
> > > >               if (current_is_kswapd())
> > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > >                                              nr_scanned);
> > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > >                                              nr_scanned);
> > > >       } else {
> > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > >                       &page_list, &nr_scanned, sc->order,
> > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > >                       zone, sc->mem_cgroup,
> > > > -                     0, file);
> > > > +                     0, file, &split_page);
> > > > +             total_scanned += nr_scanned;
> > > >               /*
> > > >                * mem_cgroup_isolate_pages() keeps track of
> > > >                * scanned pages on its own.
> > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > >               spin_unlock_irq(&zone->lru_lock);
> > > >               return 0;
> > > >       }
> > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > +             split_huge_page(split_page);
> > > > +             goto again;
> > > > +     }
> > > >
> > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > >
> > > >       spin_unlock_irq(&zone->lru_lock);
> > > >
> > > > +     if (split_page)
> > > > +             split_huge_page(split_page);
> > > > +
> > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > >
> > > >       /* Check if we should syncronously wait for writeback */
> > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > >                                               &pgscanned, sc->order,
> > > >                                               ISOLATE_ACTIVE, zone,
> > > > -                                             1, file);
> > > > +                                             1, file, NULL);
> > > >               zone->pages_scanned += pgscanned;
> > > >       } else {
> > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > >                                               &pgscanned, sc->order,
> > > >                                               ISOLATE_ACTIVE, zone,
> > > > -                                             sc->mem_cgroup, 1, file);
> > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > >               /*
> > > >                * mem_cgroup_isolate_pages() keeps track of
> > > >                * scanned pages on its own.
> > > > Index: linux/mm/memcontrol.c
> > > > ===================================================================
> > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > >                                       unsigned long *scanned, int order,
> > > >                                       int mode, struct zone *z,
> > > >                                       struct mem_cgroup *mem_cont,
> > > > -                                     int active, int file)
> > > > +                                     int active, int file,
> > > > +                                     struct page **split_page)
> > > >  {
> > > >       unsigned long nr_taken = 0;
> > > >       struct page *page;
> > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > >               case 0:
> > > >                       list_move(&page->lru, dst);
> > > >                       mem_cgroup_del_lru(page);
> > > > -                     nr_taken += hpage_nr_pages(page);
> > > > +                     if (PageTransHuge(page) && split_page) {
> > > > +                             nr_taken++;
> > > > +                             *split_page = page;
> > > > +                             goto out;
> > > > +                     } else
> > > > +                             nr_taken += hpage_nr_pages(page);
> > > > +
> > > >                       break;
> > > >               case -EBUSY:
> > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > >               }
> > > >       }
> > > >
> > > > +out:
> > > >       *scanned = scan;
> > > >
> > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > Index: linux/include/linux/memcontrol.h
> > > > ===================================================================
> > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > >                                       unsigned long *scanned, int order,
> > > >                                       int mode, struct zone *z,
> > > >                                       struct mem_cgroup *mem_cont,
> > > > -                                     int active, int file);
> > > > +                                     int active, int file,
> > > > +                                     struct page **split_page);
> > > >
> > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > >  /*
> > > >
> > > >
> > >
> > > I saw the code. My concern is that your patch could cause unnecessary splits of THP.
> > >
> > > When we isolate a page, we can't know whether it's part of the working set or not.
> > > So the split should happen after we judge it's a working-set page.
> > Yes, but since memory is big these days, it's unlikely the isolated page
> > gets accessed in that window. And I only did the split in
> 
> We don't check page references when isolation happens.
> The window between isolation time and reclaim?
> No. The window is from the inactive list's head to its tail, and that's the
> basic concept of our LRU.
> 
> > shrink_inactive_list(), not in the active list.
> 
> But the inactive list's size could still be big, and
> the page-reference heuristic is very important for the reclaim algorithm.
I mean the pages aren't referenced. But OK, I can't make such an assumption.

> > And THP has a mechanism to collapse small pages into a huge page later.
> 
> You mean "merge" instead of "collapse"?
> 
> >
> > > If you really want to merge this patch, I suggest that
> > > we handle it in the shrink_page_list step, not the isolation step.
> > >
> > > My totally untested code, which is just to show the concept, is as follows:
> > I did consider this option before. It has its own problem too: the isolation
> > can isolate several huge pages at one time, and then later shrink_page_list
> > can swap out several huge pages at one time, which is unfortunate. I'm pretty
> > sure this method can't reduce the thp_split count in my test. It could
> 
> I understand your point, but the approach isn't good to me.
> Maybe we can check whether we should keep going before another THP split happens
> in shrink_page_list. If we split a THP page successfully, maybe we can skip another
> THP split. Another idea is to avoid splitting THP unless high-order reclaim happens,
> or low-order but high-priority pressure happens.
I agree the split is better done in shrink_page_list(), but we must avoid
isolating too many pages. I'll check whether I can come up with a better solution
for the next post.
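
A hypothetical sketch of the "avoid splitting unless it pays off" heuristic
suggested above (names and thresholds are illustrative only and not part of any
posted patch):

	/* decide whether reclaim should split another THP at all */
	static bool may_split_thp(int order, int priority, bool split_one_already)
	{
		/* if one THP was already split in this pass, skip further splits */
		if (split_one_already)
			return false;
		/* otherwise only split for high-order reclaim, or for low-order
		 * reclaim under high priority (i.e. strong memory pressure) */
		return order > 0 || priority < DEF_PRIORITY / 2;
	}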

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-10-31  9:03         ` Shaohua Li
@ 2011-11-02  3:17           ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-11-02  3:17 UTC (permalink / raw)
  To: Minchan Kim, Andrew Morton
  Cc: aarcange, Hugh Dickins, Rik van Riel, mel, KAMEZAWA Hiroyuki,
	linux-mm, lkml

On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > lru list has page A and B, page A is huge page:
> > > > > 1. page A is isolated
> > > > > 2. page B is isolated
> > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > page A+1, page A+2, ... are added to lru list.
> > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > 5. page A and B is written out and reclaimed.
> > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > >
> > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > >
> > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > >
> > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > Without the patch:
> > > > > #cat /proc/vmstat|grep thp
> > > > > thp_fault_alloc 451
> > > > > thp_fault_fallback 0
> > > > > thp_collapse_alloc 0
> > > > > thp_collapse_alloc_failed 0
> > > > > thp_split 238
> > > > >
> > > > > With the patch:
> > > > > #cat /proc/vmstat|grep thp
> > > > > thp_fault_alloc 450
> > > > > thp_fault_fallback 1
> > > > > thp_collapse_alloc 0
> > > > > thp_collapse_alloc_failed 0
> > > > > thp_split 103
> > > > >
> > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > thp_fault_fallback.
> > > > >
> > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > ---
> > > > >  include/linux/memcontrol.h |    3 +-
> > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > >
> > > > > Index: linux/mm/vmscan.c
> > > > > ===================================================================
> > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > >   */
> > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > >               struct list_head *src, struct list_head *dst,
> > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > +             struct page **split_page)
> > > > >  {
> > > > >       unsigned long nr_taken = 0;
> > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > >               case 0:
> > > > >                       list_move(&page->lru, dst);
> > > > >                       mem_cgroup_del_lru(page);
> > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > +                             nr_taken++;
> > > > > +                             *split_page = page;
> > > > > +                             goto out;
> > > > > +                     } else
> > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > >                       break;
> > > > >
> > > > >               case -EBUSY:
> > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > >                               list_move(&cursor_page->lru, dst);
> > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > >                               nr_lumpy_taken++;
> > > > >                               if (PageDirty(cursor_page))
> > > > >                                       nr_lumpy_dirty++;
> > > > >                               scan++;
> > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > +                                     nr_taken++;
> > > > > +                                     *split_page = page;
> > > > > +                                     goto out;
> > > > > +                             } else
> > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > >                       } else {
> > > > >                               /*
> > > > >                                * Check if the page is freed already.
> > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > >                       nr_lumpy_failed++;
> > > > >       }
> > > > >
> > > > > +out:
> > > > >       *scanned = scan;
> > > > >
> > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > >                                       struct list_head *dst,
> > > > >                                       unsigned long *scanned, int order,
> > > > >                                       int mode, struct zone *z,
> > > > > -                                     int active, int file)
> > > > > +                                     int active, int file,
> > > > > +                                     struct page **split_page)
> > > > >  {
> > > > >       int lru = LRU_BASE;
> > > > >       if (active)
> > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > >       if (file)
> > > > >               lru += LRU_FILE;
> > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > -                                                             mode, file);
> > > > > +                                                     mode, file, split_page);
> > > > >  }
> > > > >
> > > > >  /*
> > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > >  {
> > > > >       LIST_HEAD(page_list);
> > > > >       unsigned long nr_scanned;
> > > > > +     unsigned long total_scanned = 0;
> > > > >       unsigned long nr_reclaimed = 0;
> > > > >       unsigned long nr_taken;
> > > > >       unsigned long nr_anon;
> > > > >       unsigned long nr_file;
> > > > > +     struct page *split_page;
> > > > >
> > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > >       }
> > > > >
> > > > >       set_reclaim_mode(priority, sc, false);
> > > > > +again:
> > > > >       lru_add_drain();
> > > > > +     split_page = NULL;
> > > > >       spin_lock_irq(&zone->lru_lock);
> > > > >
> > > > >       if (scanning_global_lru(sc)) {
> > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > >                       &page_list, &nr_scanned, sc->order,
> > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > -                     zone, 0, file);
> > > > > +                     zone, 0, file, &split_page);
> > > > >               zone->pages_scanned += nr_scanned;
> > > > > +             total_scanned += nr_scanned;
> > > > >               if (current_is_kswapd())
> > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > >                                              nr_scanned);
> > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > >                                              nr_scanned);
> > > > >       } else {
> > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > >                       &page_list, &nr_scanned, sc->order,
> > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > >                       zone, sc->mem_cgroup,
> > > > > -                     0, file);
> > > > > +                     0, file, &split_page);
> > > > > +             total_scanned += nr_scanned;
> > > > >               /*
> > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > >                * scanned pages on its own.
> > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > >               return 0;
> > > > >       }
> > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > +             split_huge_page(split_page);
> > > > > +             goto again;
> > > > > +     }
> > > > >
> > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > >
> > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > >
> > > > > +     if (split_page)
> > > > > +             split_huge_page(split_page);
> > > > > +
> > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > >
> > > > >       /* Check if we should syncronously wait for writeback */
> > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > >                                               &pgscanned, sc->order,
> > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > -                                             1, file);
> > > > > +                                             1, file, NULL);
> > > > >               zone->pages_scanned += pgscanned;
> > > > >       } else {
> > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > >                                               &pgscanned, sc->order,
> > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > >               /*
> > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > >                * scanned pages on its own.
> > > > > Index: linux/mm/memcontrol.c
> > > > > ===================================================================
> > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > >                                       unsigned long *scanned, int order,
> > > > >                                       int mode, struct zone *z,
> > > > >                                       struct mem_cgroup *mem_cont,
> > > > > -                                     int active, int file)
> > > > > +                                     int active, int file,
> > > > > +                                     struct page **split_page)
> > > > >  {
> > > > >       unsigned long nr_taken = 0;
> > > > >       struct page *page;
> > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > >               case 0:
> > > > >                       list_move(&page->lru, dst);
> > > > >                       mem_cgroup_del_lru(page);
> > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > +                             nr_taken++;
> > > > > +                             *split_page = page;
> > > > > +                             goto out;
> > > > > +                     } else
> > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > +
> > > > >                       break;
> > > > >               case -EBUSY:
> > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > >               }
> > > > >       }
> > > > >
> > > > > +out:
> > > > >       *scanned = scan;
> > > > >
> > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > Index: linux/include/linux/memcontrol.h
> > > > > ===================================================================
> > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > >                                       unsigned long *scanned, int order,
> > > > >                                       int mode, struct zone *z,
> > > > >                                       struct mem_cgroup *mem_cont,
> > > > > -                                     int active, int file);
> > > > > +                                     int active, int file,
> > > > > +                                     struct page **split_page);
> > > > >
> > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > >  /*
> > > > >
> > > > >
> > > >
> > > > I saw the code. my concern is your patch could make unnecessary split of THP.
> > > >
> > > > When we isolates page, we can't know whether it's working set or not.
> > > > So split should happen after we judge it's working set page.
> > > yes, but since memory is big currently, it's unlikely the isolated page
> > > get accessed in the window. And I only did the split in
> > 
> > We don't check page_reference when isolate happens.
> > Window which between isolation time and reclaim?
> > No. Window is from inactive's head to tail and it's the basic concept of
> > our LRU.
> > 
> > > shrink_inactive_list, not in active list.
> > 
> > But inactive list's size could be still big and
> > page reference heuristic is very important for reclaim algorithm.
> I mean pages aren't referenced. but ok, I can't take such assumption.
> 
> > > And THP has mechanism to collapse small pages to huge page later.
> > 
> > You mean "merge" instead of "collapse"?
> > 
> > >
> > > > If you really want to merge this patch, I suggest that
> > > > we can handle it in shrink_page_list step, not isolation step.
> > > >
> > > > My totally untested code which is just to show the concept is as follows,
> > > I did consider this option before. It has its problem too. The isolation
> > > can isolate several huge page one time. And then later shrink_page_list
> > > can swap several huge page one time, which is unfortunate. I'm pretty
> > > sure this method can't reduce the thp_split count in my test. It could
> > 
> > I understand your point but approach isn't good to me.
> > Maybe we can check whether we are going on or not before other THP page split happens
> > in shrink_page_list. If we split THP page successfully, maybe we can skip another THP split.
> > Another idea is we can avoid split of THP unless high order reclaim happens or low order
> > high priority pressure happens.
> I agreed the split better be done at shrink_page_list, but we must avoid
> isolate too many pages. I'll check if I can have a better solution for
> next post.
Let me try again.

Subject: thp: improve huge page reclaim -v2

With transparent huge pages enabled, a huge page must be split before it can
be reclaimed. With current logic, if page reclaim finds a huge page,
it will just reclaim the head page and leave tail pages reclaimed later.
Let's take an example, lru list has page A and B, page A is huge page:
1. page A is isolated
2. page B is isolated
3. shrink_page_list() adds page A to swap page cache. so page A is split.
page A+1, page A+2, ... are added to lru list.
4. shrink_page_list() adds page B to swap page cache.
5. page A and B is written out and reclaimed.
6. page A+1, A+2 ... is isolated and reclaimed later.
So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
The worst case could be that we isolate and split 32 huge pages while trying to
reclaim, but only the 32 head pages are actually reclaimed.
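
For example, assuming 2MB huge pages (HPAGE_PMD_NR = 512) and SWAP_CLUSTER_MAX = 32,
such a pass can split 32 huge pages and push 32 * 511 = 16352 tail pages back onto
the LRU while reclaiming only the 32 head pages, i.e. 128KB.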

We expect the whole huge page A to be reclaimed together, so
the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could avoid a lot
of unnecessary huge page splits and improve reclaim.

With this patch, if a huge page is found during isolation, we don't continue
isolating, since reclaiming that one huge page already frees more pages
than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split and
all tail pages are added to the isolation list, so the tail pages can
be reclaimed immediately.
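
To make the flow concrete, here is a condensed sketch of the three touch points,
pulled from the diff below (kernel helpers and surrounding code are omitted, so
this is not a standalone compile unit):

	/* 1. isolate_lru_pages(): a THP head alone already fills the batch,
	 *    so stop isolating as soon as one is taken */
	nr_taken += hpage_nr_pages(page);
	if (unlikely(PageTransHuge(page)) && break_on_thp)
		goto out;

	/* 2. shrink_page_list(): split the THP onto the private page_list,
	 *    so its tail pages are reclaimed in this same pass */
	if (PageAnon(page) && !PageSwapCache(page)) {
		if (unlikely(PageTransHuge(page)) &&
		    unlikely(split_huge_page_list(page, page_list)))
			goto activate_locked;
		if (!add_to_swap(page))
			goto activate_locked;
	}

	/* 3. lru_add_page_tail(): with a destination list, the tails bypass
	 *    the LRU and go straight onto the isolated list */
	if (dst) {
		get_page(page_tail);
		list_add_tail(&page_tail->lru, dst);
		return;
	}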

The drawback is that we might isolate fewer pages when a huge page is found, but
I think the benefit far outweighs the drawback.

All new code paths are guarded by PageTransHuge(), so this should have no impact
on normal cases.

In a test, a range of anonymous memory is written, which triggers swap (a minimal
example of such a test program is sketched after the numbers below).
Without the patch:
#cat /proc/vmstat|grep thp
thp_fault_alloc 451
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 238

With the patch:
#cat /proc/vmstat|grep thp
thp_fault_alloc 451
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 76

So the thp_split number is reduced a lot.
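
For reference, a minimal user-space program of the kind used for this test might
look like the sketch below; the actual test program is not included in this
posting, and the 2GB size is only an example (it should exceed available memory,
or the program should run inside a small memcg):

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	#define SIZE	(2UL << 30)	/* 2GB of anonymous memory */

	int main(void)
	{
		char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* touch every page so THPs are faulted in and later swapped */
		memset(p, 1, SIZE);
		/* compare /proc/vmstat thp_* counters before and after */
		return 0;
	}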

v1->v2: Do the huge page split in shrink_page_list(). Some code is adapted from
Minchan's.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>

---
 include/linux/huge_mm.h    |    7 ++++++-
 include/linux/memcontrol.h |    3 ++-
 include/linux/swap.h       |    3 ++-
 mm/huge_memory.c           |   14 ++++++++------
 mm/memcontrol.c            |    6 +++++-
 mm/swap.c                  |   10 +++++++++-
 mm/swap_state.c            |    6 ------
 mm/vmscan.c                |   27 ++++++++++++++++++++-------
 8 files changed, 52 insertions(+), 24 deletions(-)

Index: linux/include/linux/huge_mm.h
===================================================================
--- linux.orig/include/linux/huge_mm.h	2011-11-02 09:48:16.000000000 +0800
+++ linux/include/linux/huge_mm.h	2011-11-02 10:06:33.000000000 +0800
@@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
 extern int handle_pte_fault(struct mm_struct *mm,
 			    struct vm_area_struct *vma, unsigned long address,
 			    pte_t *pte, pmd_t *pmd, unsigned int flags);
-extern int split_huge_page(struct page *page);
+extern int split_huge_page_list(struct page *page, struct list_head *dst);
+static inline int split_huge_page(struct page *page)
+{
+	return split_huge_page_list(page, NULL);
+}
+
 extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
 #define split_huge_page_pmd(__mm, __pmd)				\
 	do {								\
Index: linux/include/linux/swap.h
===================================================================
--- linux.orig/include/linux/swap.h	2011-11-02 09:48:16.000000000 +0800
+++ linux/include/linux/swap.h	2011-11-02 10:06:33.000000000 +0800
@@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
 extern void lru_add_page_tail(struct zone* zone,
-			      struct page *page, struct page *page_tail);
+			      struct page *page, struct page *page_tail,
+			      struct list_head *dst);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
Index: linux/mm/huge_memory.c
===================================================================
--- linux.orig/mm/huge_memory.c	2011-11-02 09:48:16.000000000 +0800
+++ linux/mm/huge_memory.c	2011-11-02 10:58:21.000000000 +0800
@@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
 	return ret;
 }
 
-static void __split_huge_page_refcount(struct page *page)
+static void __split_huge_page_refcount(struct page *page,
+				       struct list_head *list)
 {
 	int i;
 	struct zone *zone = page_zone(page);
@@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
 
 		mem_cgroup_split_huge_fixup(page, page_tail);
 
-		lru_add_page_tail(zone, page, page_tail);
+		lru_add_page_tail(zone, page, page_tail, list);
 	}
 
 	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
@@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
 
 /* must be called with anon_vma->root->mutex hold */
 static void __split_huge_page(struct page *page,
-			      struct anon_vma *anon_vma)
+			      struct anon_vma *anon_vma,
+			      struct list_head *list)
 {
 	int mapcount, mapcount2;
 	struct anon_vma_chain *avc;
@@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
 		       mapcount, page_mapcount(page));
 	BUG_ON(mapcount != page_mapcount(page));
 
-	__split_huge_page_refcount(page);
+	__split_huge_page_refcount(page, list);
 
 	mapcount2 = 0;
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
@@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
 	BUG_ON(mapcount != mapcount2);
 }
 
-int split_huge_page(struct page *page)
+int split_huge_page_list(struct page *page, struct list_head *list)
 {
 	struct anon_vma *anon_vma;
 	int ret = 1;
@@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
 		goto out_unlock;
 
 	BUG_ON(!PageSwapBacked(page));
-	__split_huge_page(page, anon_vma);
+	__split_huge_page(page, anon_vma, list);
 	count_vm_event(THP_SPLIT);
 
 	BUG_ON(PageCompound(page));
Index: linux/mm/swap.c
===================================================================
--- linux.orig/mm/swap.c	2011-11-02 09:48:16.000000000 +0800
+++ linux/mm/swap.c	2011-11-02 10:06:33.000000000 +0800
@@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
 
 /* used by __split_huge_page_refcount() */
 void lru_add_page_tail(struct zone* zone,
-		       struct page *page, struct page *page_tail)
+		       struct page *page, struct page *page_tail,
+		       struct list_head *dst)
 {
 	int active;
 	enum lru_list lru;
@@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
 	VM_BUG_ON(PageLRU(page_tail));
 	VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
 
+	/* The huge page is isolated */
+	if (dst) {
+		get_page(page_tail);
+		list_add_tail(&page_tail->lru, dst);
+		return;
+	}
+
 	SetPageLRU(page_tail);
 
 	if (page_evictable(page_tail, NULL)) {
Index: linux/mm/swap_state.c
===================================================================
--- linux.orig/mm/swap_state.c	2011-11-02 09:48:16.000000000 +0800
+++ linux/mm/swap_state.c	2011-11-02 10:06:33.000000000 +0800
@@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
 	if (!entry.val)
 		return 0;
 
-	if (unlikely(PageTransHuge(page)))
-		if (unlikely(split_huge_page(page))) {
-			swapcache_free(entry, NULL);
-			return 0;
-		}
-
 	/*
 	 * Radix-tree node allocations from PF_MEMALLOC contexts could
 	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c	2011-11-02 09:48:16.000000000 +0800
+++ linux/mm/vmscan.c	2011-11-02 10:58:21.000000000 +0800
@@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!(sc->gfp_mask & __GFP_IO))
 				goto keep_locked;
+			if (unlikely(PageTransHuge(page)))
+				if (unlikely(split_huge_page_list(page,
+					page_list)))
+				    goto activate_locked;
 			if (!add_to_swap(page))
 				goto activate_locked;
 			may_enter_fs = 1;
@@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode, int file)
+		unsigned long *scanned, int order, int mode, int file,
+		bool break_on_thp)
 {
 	unsigned long nr_taken = 0;
 	unsigned long nr_lumpy_taken = 0;
@@ -1101,6 +1106,10 @@ static unsigned long isolate_lru_pages(u
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
 			nr_taken += hpage_nr_pages(page);
+			if (unlikely(PageTransHuge(page)) && break_on_thp) {
+				scan++;
+				goto out;
+			}
 			break;
 
 		case -EBUSY:
@@ -1163,6 +1172,8 @@ static unsigned long isolate_lru_pages(u
 				if (PageDirty(cursor_page))
 					nr_lumpy_dirty++;
 				scan++;
+				if (unlikely(PageTransHuge(page)) && break_on_thp)
+					goto out;
 			} else {
 				/*
 				 * Check if the page is freed already.
@@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
 			nr_lumpy_failed++;
 	}
 
+out:
 	*scanned = scan;
 
 	trace_mm_vmscan_lru_isolate(order,
@@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
 					struct list_head *dst,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
-					int active, int file)
+					int active, int file,
+					bool break_on_thp)
 {
 	int lru = LRU_BASE;
 	if (active)
@@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
 	if (file)
 		lru += LRU_FILE;
 	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
-								mode, file);
+							mode, file, break_on_thp);
 }
 
 /*
@@ -1466,7 +1479,7 @@ shrink_inactive_list(unsigned long nr_to
 			&page_list, &nr_scanned, sc->order,
 			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
 					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, 0, file);
+			zone, 0, file, true);
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1480,7 +1493,7 @@ shrink_inactive_list(unsigned long nr_to
 			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
 					ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, sc->mem_cgroup,
-			0, file);
+			0, file, true);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
@@ -1589,13 +1602,13 @@ static void shrink_active_list(unsigned
 		nr_taken = isolate_pages_global(nr_pages, &l_hold,
 						&pgscanned, sc->order,
 						ISOLATE_ACTIVE, zone,
-						1, file);
+						1, file, false);
 		zone->pages_scanned += pgscanned;
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
 						&pgscanned, sc->order,
 						ISOLATE_ACTIVE, zone,
-						sc->mem_cgroup, 1, file);
+						sc->mem_cgroup, 1, file, false);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
Index: linux/include/linux/memcontrol.h
===================================================================
--- linux.orig/include/linux/memcontrol.h	2011-11-02 09:48:16.000000000 +0800
+++ linux/include/linux/memcontrol.h	2011-11-02 10:06:33.000000000 +0800
@@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active, int file);
+					int active, int file,
+					bool break_on_thp);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /*
Index: linux/mm/memcontrol.c
===================================================================
--- linux.orig/mm/memcontrol.c	2011-11-02 09:48:16.000000000 +0800
+++ linux/mm/memcontrol.c	2011-11-02 10:06:33.000000000 +0800
@@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active, int file)
+					int active, int file,
+					bool break_on_thp)
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
@@ -1225,6 +1226,8 @@ unsigned long mem_cgroup_isolate_pages(u
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
 			nr_taken += hpage_nr_pages(page);
+			if (unlikely(PageTransHuge(page)) && break_on_thp)
+				goto out;
 			break;
 		case -EBUSY:
 			/* we don't affect global LRU but rotate in our LRU */
@@ -1235,6 +1238,7 @@ unsigned long mem_cgroup_isolate_pages(u
 		}
 	}
 
+out:
 	*scanned = scan;
 
 	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
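
As a usage sketch of the interface change (based on the huge_mm.h hunk above;
"ret" and "page_list" stand for the caller's own variables): existing callers of
split_huge_page() keep their behaviour, while vmscan passes its private list so
the tail pages skip the LRU:

	/* unchanged callers: tail pages go back onto the LRU as before */
	ret = split_huge_page(page);	/* == split_huge_page_list(page, NULL) */

	/* vmscan: tail pages are appended to the caller's isolated list */
	ret = split_huge_page_list(page, page_list);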



^ permalink raw reply	[flat|nested] 42+ messages in thread

-		if (unlikely(split_huge_page(page))) {
-			swapcache_free(entry, NULL);
-			return 0;
-		}
-
 	/*
 	 * Radix-tree node allocations from PF_MEMALLOC contexts could
 	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c	2011-11-02 09:48:16.000000000 +0800
+++ linux/mm/vmscan.c	2011-11-02 10:58:21.000000000 +0800
@@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!(sc->gfp_mask & __GFP_IO))
 				goto keep_locked;
+			if (unlikely(PageTransHuge(page)))
+				if (unlikely(split_huge_page_list(page,
+					page_list)))
+				    goto activate_locked;
 			if (!add_to_swap(page))
 				goto activate_locked;
 			may_enter_fs = 1;
@@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode, int file)
+		unsigned long *scanned, int order, int mode, int file,
+		bool break_on_thp)
 {
 	unsigned long nr_taken = 0;
 	unsigned long nr_lumpy_taken = 0;
@@ -1101,6 +1106,10 @@ static unsigned long isolate_lru_pages(u
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
 			nr_taken += hpage_nr_pages(page);
+			if (unlikely(PageTransHuge(page)) && break_on_thp) {
+				scan++;
+				goto out;
+			}
 			break;
 
 		case -EBUSY:
@@ -1163,6 +1172,8 @@ static unsigned long isolate_lru_pages(u
 				if (PageDirty(cursor_page))
 					nr_lumpy_dirty++;
 				scan++;
+				if (unlikely(PageTransHuge(page)) && break_on_thp)
+					goto out;
 			} else {
 				/*
 				 * Check if the page is freed already.
@@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
 			nr_lumpy_failed++;
 	}
 
+out:
 	*scanned = scan;
 
 	trace_mm_vmscan_lru_isolate(order,
@@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
 					struct list_head *dst,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
-					int active, int file)
+					int active, int file,
+					bool break_on_thp)
 {
 	int lru = LRU_BASE;
 	if (active)
@@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
 	if (file)
 		lru += LRU_FILE;
 	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
-								mode, file);
+							mode, file, break_on_thp);
 }
 
 /*
@@ -1466,7 +1479,7 @@ shrink_inactive_list(unsigned long nr_to
 			&page_list, &nr_scanned, sc->order,
 			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
 					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, 0, file);
+			zone, 0, file, true);
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1480,7 +1493,7 @@ shrink_inactive_list(unsigned long nr_to
 			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
 					ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, sc->mem_cgroup,
-			0, file);
+			0, file, true);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
@@ -1589,13 +1602,13 @@ static void shrink_active_list(unsigned
 		nr_taken = isolate_pages_global(nr_pages, &l_hold,
 						&pgscanned, sc->order,
 						ISOLATE_ACTIVE, zone,
-						1, file);
+						1, file, false);
 		zone->pages_scanned += pgscanned;
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
 						&pgscanned, sc->order,
 						ISOLATE_ACTIVE, zone,
-						sc->mem_cgroup, 1, file);
+						sc->mem_cgroup, 1, file, false);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
Index: linux/include/linux/memcontrol.h
===================================================================
--- linux.orig/include/linux/memcontrol.h	2011-11-02 09:48:16.000000000 +0800
+++ linux/include/linux/memcontrol.h	2011-11-02 10:06:33.000000000 +0800
@@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active, int file);
+					int active, int file,
+					bool break_on_thp);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /*
Index: linux/mm/memcontrol.c
===================================================================
--- linux.orig/mm/memcontrol.c	2011-11-02 09:48:16.000000000 +0800
+++ linux/mm/memcontrol.c	2011-11-02 10:06:33.000000000 +0800
@@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active, int file)
+					int active, int file,
+					bool break_on_thp)
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
@@ -1225,6 +1226,8 @@ unsigned long mem_cgroup_isolate_pages(u
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
 			nr_taken += hpage_nr_pages(page);
+			if (unlikely(PageTransHuge(page)) && break_on_thp)
+				goto out;
 			break;
 		case -EBUSY:
 			/* we don't affect global LRU but rotate in our LRU */
@@ -1235,6 +1238,7 @@ unsigned long mem_cgroup_isolate_pages(u
 		}
 	}
 
+out:
 	*scanned = scan;
 
 	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-02  3:17           ` Shaohua Li
@ 2011-11-08  8:59             ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-11-08  8:59 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > lru list has page A and B, page A is huge page:
> > > > > > 1. page A is isolated
> > > > > > 2. page B is isolated
> > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > 5. page A and B is written out and reclaimed.
> > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > >
> > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > >
> > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > >
> > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > Without the patch:
> > > > > > #cat /proc/vmstat|grep thp
> > > > > > thp_fault_alloc 451
> > > > > > thp_fault_fallback 0
> > > > > > thp_collapse_alloc 0
> > > > > > thp_collapse_alloc_failed 0
> > > > > > thp_split 238
> > > > > >
> > > > > > With the patch:
> > > > > > #cat /proc/vmstat|grep thp
> > > > > > thp_fault_alloc 450
> > > > > > thp_fault_fallback 1
> > > > > > thp_collapse_alloc 0
> > > > > > thp_collapse_alloc_failed 0
> > > > > > thp_split 103
> > > > > >
> > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > thp_fault_fallback.
> > > > > >
> > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > ---
> > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > >
> > > > > > Index: linux/mm/vmscan.c
> > > > > > ===================================================================
> > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > >   */
> > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > +             struct page **split_page)
> > > > > >  {
> > > > > >       unsigned long nr_taken = 0;
> > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > >               case 0:
> > > > > >                       list_move(&page->lru, dst);
> > > > > >                       mem_cgroup_del_lru(page);
> > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > +                             nr_taken++;
> > > > > > +                             *split_page = page;
> > > > > > +                             goto out;
> > > > > > +                     } else
> > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > >                       break;
> > > > > >
> > > > > >               case -EBUSY:
> > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > >                               nr_lumpy_taken++;
> > > > > >                               if (PageDirty(cursor_page))
> > > > > >                                       nr_lumpy_dirty++;
> > > > > >                               scan++;
> > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > +                                     nr_taken++;
> > > > > > +                                     *split_page = page;
> > > > > > +                                     goto out;
> > > > > > +                             } else
> > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > >                       } else {
> > > > > >                               /*
> > > > > >                                * Check if the page is freed already.
> > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > >                       nr_lumpy_failed++;
> > > > > >       }
> > > > > >
> > > > > > +out:
> > > > > >       *scanned = scan;
> > > > > >
> > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > >                                       struct list_head *dst,
> > > > > >                                       unsigned long *scanned, int order,
> > > > > >                                       int mode, struct zone *z,
> > > > > > -                                     int active, int file)
> > > > > > +                                     int active, int file,
> > > > > > +                                     struct page **split_page)
> > > > > >  {
> > > > > >       int lru = LRU_BASE;
> > > > > >       if (active)
> > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > >       if (file)
> > > > > >               lru += LRU_FILE;
> > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > -                                                             mode, file);
> > > > > > +                                                     mode, file, split_page);
> > > > > >  }
> > > > > >
> > > > > >  /*
> > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >  {
> > > > > >       LIST_HEAD(page_list);
> > > > > >       unsigned long nr_scanned;
> > > > > > +     unsigned long total_scanned = 0;
> > > > > >       unsigned long nr_reclaimed = 0;
> > > > > >       unsigned long nr_taken;
> > > > > >       unsigned long nr_anon;
> > > > > >       unsigned long nr_file;
> > > > > > +     struct page *split_page;
> > > > > >
> > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >       }
> > > > > >
> > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > +again:
> > > > > >       lru_add_drain();
> > > > > > +     split_page = NULL;
> > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > >
> > > > > >       if (scanning_global_lru(sc)) {
> > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > -                     zone, 0, file);
> > > > > > +                     zone, 0, file, &split_page);
> > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > +             total_scanned += nr_scanned;
> > > > > >               if (current_is_kswapd())
> > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > >                                              nr_scanned);
> > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > >                                              nr_scanned);
> > > > > >       } else {
> > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > >                       zone, sc->mem_cgroup,
> > > > > > -                     0, file);
> > > > > > +                     0, file, &split_page);
> > > > > > +             total_scanned += nr_scanned;
> > > > > >               /*
> > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > >                * scanned pages on its own.
> > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > >               return 0;
> > > > > >       }
> > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > +             split_huge_page(split_page);
> > > > > > +             goto again;
> > > > > > +     }
> > > > > >
> > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > >
> > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > >
> > > > > > +     if (split_page)
> > > > > > +             split_huge_page(split_page);
> > > > > > +
> > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > >
> > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > >                                               &pgscanned, sc->order,
> > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > -                                             1, file);
> > > > > > +                                             1, file, NULL);
> > > > > >               zone->pages_scanned += pgscanned;
> > > > > >       } else {
> > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > >                                               &pgscanned, sc->order,
> > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > >               /*
> > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > >                * scanned pages on its own.
> > > > > > Index: linux/mm/memcontrol.c
> > > > > > ===================================================================
> > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > >                                       unsigned long *scanned, int order,
> > > > > >                                       int mode, struct zone *z,
> > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > -                                     int active, int file)
> > > > > > +                                     int active, int file,
> > > > > > +                                     struct page **split_page)
> > > > > >  {
> > > > > >       unsigned long nr_taken = 0;
> > > > > >       struct page *page;
> > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > >               case 0:
> > > > > >                       list_move(&page->lru, dst);
> > > > > >                       mem_cgroup_del_lru(page);
> > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > +                             nr_taken++;
> > > > > > +                             *split_page = page;
> > > > > > +                             goto out;
> > > > > > +                     } else
> > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > +
> > > > > >                       break;
> > > > > >               case -EBUSY:
> > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > >               }
> > > > > >       }
> > > > > >
> > > > > > +out:
> > > > > >       *scanned = scan;
> > > > > >
> > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > ===================================================================
> > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > >                                       unsigned long *scanned, int order,
> > > > > >                                       int mode, struct zone *z,
> > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > -                                     int active, int file);
> > > > > > +                                     int active, int file,
> > > > > > +                                     struct page **split_page);
> > > > > >
> > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > >  /*
> > > > > >
> > > > > >
> > > > >
> > > > > I saw the code. my concern is your patch could make unnecessary split of THP.
> > > > >
> > > > > When we isolates page, we can't know whether it's working set or not.
> > > > > So split should happen after we judge it's working set page.
> > > > yes, but since memory is big currently, it's unlikely the isolated page
> > > > get accessed in the window. And I only did the split in
> > > 
> > > We don't check page_reference when isolate happens.
> > > Window which between isolation time and reclaim?
> > > No. Window is from inactive's head to tail and it's the basic concept of
> > > our LRU.
> > > 
> > > > shrink_inactive_list, not in active list.
> > > 
> > > But inactive list's size could be still big and
> > > page reference heuristic is very important for reclaim algorithm.
> > I mean pages aren't referenced. but ok, I can't take such assumption.
> > 
> > > > And THP has mechanism to collapse small pages to huge page later.
> > > 
> > > You mean "merge" instead of "collapse"?
> > > 
> > > >
> > > > > If you really want to merge this patch, I suggest that
> > > > > we can handle it in shrink_page_list step, not isolation step.
> > > > >
> > > > > My totally untested code which is just to show the concept is as follows,
> > > > I did consider this option before. It has its problem too. The isolation
> > > > can isolate several huge page one time. And then later shrink_page_list
> > > > can swap several huge page one time, which is unfortunate. I'm pretty
> > > > sure this method can't reduce the thp_split count in my test. It could
> > > 
> > > I understand your point but approach isn't good to me.
> > > Maybe we can check whether we are going on or not before other THP page split happens
> > > in shrink_page_list. If we split THP page successfully, maybe we can skip another THP split.
> > > Another idea is we can avoid split of THP unless high order reclaim happens or low order
> > > high priority pressure happens.
> > I agreed the split better be done at shrink_page_list, but we must avoid
> > isolate too many pages. I'll check if I can have a better solution for
> > next post.
> Let me try again.
> 
> Subject: thp: improve huge page reclaim -v2
> 
> With transparent huge page enabled, huge page will be split if it will
> be reclaimed. With current logic, if page reclaim finds a huge page,
> it will just reclaim the head page and leave tail pages reclaimed later.
> Let's take an example, lru list has page A and B, page A is huge page:
> 1. page A is isolated
> 2. page B is isolated
> 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> page A+1, page A+2, ... are added to lru list.
> 4. shrink_page_list() adds page B to swap page cache.
> 5. page A and B is written out and reclaimed.
> 6. page A+1, A+2 ... is isolated and reclaimed later.
> So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> The worst case could be we isolate/split 32 huge pages to try to reclaim
> a huge page, but we only the 32 head pages are reclaimed.
> 
> We expected the whole huge page A is reclaimed in the meantime, so
> the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could reduce a lot
> of unnecessary huge page split and improve the reclaim.
> 
> With this patch, if a huge page is found in isolation, don't continue
> isolation. Since if the huge page is reclaimed, we can reclaim more pages
> than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split and
> all tail pages will be added to the isolation list, so the tail pages can
> be reclaimed immediately.
> 
> The drawback is we might isolate less pages if a huge page is found. But
> I thought the benefit is far more than the drawback.
> 
> All code path are with PageTransHuge(), so should have no impact to normal
> cases.
> 
> In a test, a range of anonymous memory is written and will trigger swap.
> Without the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 451
> thp_fault_fallback 0
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 238
> 
> With the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 451
> thp_fault_fallback 0
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 76
> 
> So the thp_split number is reduced a lot.
> 
> v1->v2: Do the huge page split in shrink_page_list(). Some code are adopted from
> Minchan's.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> 
> ---
>  include/linux/huge_mm.h    |    7 ++++++-
>  include/linux/memcontrol.h |    3 ++-
>  include/linux/swap.h       |    3 ++-
>  mm/huge_memory.c           |   14 ++++++++------
>  mm/memcontrol.c            |    6 +++++-
>  mm/swap.c                  |   10 +++++++++-
>  mm/swap_state.c            |    6 ------
>  mm/vmscan.c                |   27 ++++++++++++++++++++-------
>  8 files changed, 52 insertions(+), 24 deletions(-)
> 
> Index: linux/include/linux/huge_mm.h
> ===================================================================
> --- linux.orig/include/linux/huge_mm.h	2011-11-02 09:48:16.000000000 +0800
> +++ linux/include/linux/huge_mm.h	2011-11-02 10:06:33.000000000 +0800
> @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
>  extern int handle_pte_fault(struct mm_struct *mm,
>  			    struct vm_area_struct *vma, unsigned long address,
>  			    pte_t *pte, pmd_t *pmd, unsigned int flags);
> -extern int split_huge_page(struct page *page);
> +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> +static inline int split_huge_page(struct page *page)
> +{
> +	return split_huge_page_list(page, NULL);
> +}
> +
>  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
>  #define split_huge_page_pmd(__mm, __pmd)				\
>  	do {								\
> Index: linux/include/linux/swap.h
> ===================================================================
> --- linux.orig/include/linux/swap.h	2011-11-02 09:48:16.000000000 +0800
> +++ linux/include/linux/swap.h	2011-11-02 10:06:33.000000000 +0800
> @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
>  extern void __lru_cache_add(struct page *, enum lru_list lru);
>  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
>  extern void lru_add_page_tail(struct zone* zone,
> -			      struct page *page, struct page *page_tail);
> +			      struct page *page, struct page *page_tail,
> +			      struct list_head *dst);
>  extern void activate_page(struct page *);
>  extern void mark_page_accessed(struct page *);
>  extern void lru_add_drain(void);
> Index: linux/mm/huge_memory.c
> ===================================================================
> --- linux.orig/mm/huge_memory.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/huge_memory.c	2011-11-02 10:58:21.000000000 +0800
> @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
>  	return ret;
>  }
>  
> -static void __split_huge_page_refcount(struct page *page)
> +static void __split_huge_page_refcount(struct page *page,
> +				       struct list_head *list)
>  {
>  	int i;
>  	struct zone *zone = page_zone(page);
> @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
>  
>  		mem_cgroup_split_huge_fixup(page, page_tail);
>  
> -		lru_add_page_tail(zone, page, page_tail);
> +		lru_add_page_tail(zone, page, page_tail, list);
>  	}
>  
>  	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
>  
>  /* must be called with anon_vma->root->mutex hold */
>  static void __split_huge_page(struct page *page,
> -			      struct anon_vma *anon_vma)
> +			      struct anon_vma *anon_vma,
> +			      struct list_head *list)
>  {
>  	int mapcount, mapcount2;
>  	struct anon_vma_chain *avc;
> @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
>  		       mapcount, page_mapcount(page));
>  	BUG_ON(mapcount != page_mapcount(page));
>  
> -	__split_huge_page_refcount(page);
> +	__split_huge_page_refcount(page, list);
>  
>  	mapcount2 = 0;
>  	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
>  	BUG_ON(mapcount != mapcount2);
>  }
>  
> -int split_huge_page(struct page *page)
> +int split_huge_page_list(struct page *page, struct list_head *list)
>  {
>  	struct anon_vma *anon_vma;
>  	int ret = 1;
> @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
>  		goto out_unlock;
>  
>  	BUG_ON(!PageSwapBacked(page));
> -	__split_huge_page(page, anon_vma);
> +	__split_huge_page(page, anon_vma, list);
>  	count_vm_event(THP_SPLIT);
>  
>  	BUG_ON(PageCompound(page));
> Index: linux/mm/swap.c
> ===================================================================
> --- linux.orig/mm/swap.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/swap.c	2011-11-02 10:06:33.000000000 +0800
> @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
>  
>  /* used by __split_huge_page_refcount() */
>  void lru_add_page_tail(struct zone* zone,
> -		       struct page *page, struct page *page_tail)
> +		       struct page *page, struct page *page_tail,
> +		       struct list_head *dst)
>  {
>  	int active;
>  	enum lru_list lru;
> @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
>  	VM_BUG_ON(PageLRU(page_tail));
>  	VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
>  
> +	/* The huge page is isolated */
> +	if (dst) {
> +		get_page(page_tail);
> +		list_add_tail(&page_tail->lru, dst);
> +		return;
> +	}
> +
>  	SetPageLRU(page_tail);
>  
>  	if (page_evictable(page_tail, NULL)) {
> Index: linux/mm/swap_state.c
> ===================================================================
> --- linux.orig/mm/swap_state.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/swap_state.c	2011-11-02 10:06:33.000000000 +0800
> @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
>  	if (!entry.val)
>  		return 0;
>  
> -	if (unlikely(PageTransHuge(page)))
> -		if (unlikely(split_huge_page(page))) {
> -			swapcache_free(entry, NULL);
> -			return 0;
> -		}
> -
>  	/*
>  	 * Radix-tree node allocations from PF_MEMALLOC contexts could
>  	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
> Index: linux/mm/vmscan.c
> ===================================================================
> --- linux.orig/mm/vmscan.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/vmscan.c	2011-11-02 10:58:21.000000000 +0800
> @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
>  		if (PageAnon(page) && !PageSwapCache(page)) {
>  			if (!(sc->gfp_mask & __GFP_IO))
>  				goto keep_locked;
> +			if (unlikely(PageTransHuge(page)))
> +				if (unlikely(split_huge_page_list(page,
> +					page_list)))
> +				    goto activate_locked;
>  			if (!add_to_swap(page))
>  				goto activate_locked;
>  			may_enter_fs = 1;
> @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
>   */
>  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>  		struct list_head *src, struct list_head *dst,
> -		unsigned long *scanned, int order, int mode, int file)
> +		unsigned long *scanned, int order, int mode, int file,
> +		bool break_on_thp)
>  {

Sorry for the late response.
These days, I am very busy with my new job.

Still, I don't like the surgery of the isolation part.
What if we isolate a THP page but it is a working set page?
Let's assume the following:

1. Isolate 32 pages.
2. Unfortunately, the 1st page is THP, so isolate_lru_pages() isolates just
   that one page (of course, it is really 512 pages).
3. shrink_page_list() sees that it is a working set page, but page_list
   holds just that page, so we have to isolate pages once more with higher priority.

How about this?

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9fdfce7..8121415 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -960,7 +960,15 @@ free_it:
                 * appear not as the counts should be low
                 */
                list_add(&page->lru, &free_pages);
-               continue;
+
+               /*
+                * If we have reclaimed enough pages, let's cut it off.
+                * It could prevent unnecessary THP split.
+                */
+               if (nr_reclaimed >= sc->nr_to_reclaim)
+                       break;
+               else
+                       continue;
 
 cull_mlocked:
                if (PageSwapCache(page))
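
To spell out the effect of the cut-off, here is a toy stand-alone sketch (not
the kernel loop; the page list, nr_to_reclaim and the size of 4 standing in
for 512 are all invented for the illustration). Once enough base pages have
been reclaimed we stop walking the list, so the remaining isolated THPs are
never split:

#include <stdio.h>

struct tpage { char id; int huge; };

int main(void)
{
	/* three isolated pages, two of them huge */
	struct tpage list[] = { {'A', 1}, {'B', 0}, {'C', 1} };
	int nr_to_reclaim = 4, nr_reclaimed = 0, splits = 0, i;

	for (i = 0; i < 3; i++) {
		if (list[i].huge)
			splits++;			/* split before adding to swap */
		nr_reclaimed += list[i].huge ? 4 : 1;	/* 4 stands in for 512 */
		printf("reclaimed %c (total %d)\n", list[i].id, nr_reclaimed);
		if (nr_reclaimed >= nr_to_reclaim)
			break;				/* the proposed cut-off */
	}
	printf("splits: %d\n", splits);		/* 1, not 2: page C stays intact */
	return 0;
}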





>  	unsigned long nr_taken = 0;
>  	unsigned long nr_lumpy_taken = 0;
> @@ -1101,6 +1106,10 @@ static unsigned long isolate_lru_pages(u
>  			list_move(&page->lru, dst);
>  			mem_cgroup_del_lru(page);
>  			nr_taken += hpage_nr_pages(page);
> +			if (unlikely(PageTransHuge(page)) && break_on_thp) {
> +				scan++;
> +				goto out;
> +			}
>  			break;
>  
>  		case -EBUSY:
> @@ -1163,6 +1172,8 @@ static unsigned long isolate_lru_pages(u
>  				if (PageDirty(cursor_page))
>  					nr_lumpy_dirty++;
>  				scan++;
> +				if (unlikely(PageTransHuge(page)) && break_on_thp)
> +					goto out;
>  			} else {
>  				/*
>  				 * Check if the page is freed already.
> @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
>  			nr_lumpy_failed++;
>  	}
>  
> +out:
>  	*scanned = scan;
>  
>  	trace_mm_vmscan_lru_isolate(order,
> @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
>  					struct list_head *dst,
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
> -					int active, int file)
> +					int active, int file,
> +					bool break_on_thp)
>  {
>  	int lru = LRU_BASE;
>  	if (active)
> @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
>  	if (file)
>  		lru += LRU_FILE;
>  	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> -								mode, file);
> +							mode, file, break_on_thp);
>  }
>  
>  /*
> @@ -1466,7 +1479,7 @@ shrink_inactive_list(unsigned long nr_to
>  			&page_list, &nr_scanned, sc->order,
>  			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
>  					ISOLATE_BOTH : ISOLATE_INACTIVE,
> -			zone, 0, file);
> +			zone, 0, file, true);
>  		zone->pages_scanned += nr_scanned;
>  		if (current_is_kswapd())
>  			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
> @@ -1480,7 +1493,7 @@ shrink_inactive_list(unsigned long nr_to
>  			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
>  					ISOLATE_BOTH : ISOLATE_INACTIVE,
>  			zone, sc->mem_cgroup,
> -			0, file);
> +			0, file, true);
>  		/*
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
> @@ -1589,13 +1602,13 @@ static void shrink_active_list(unsigned
>  		nr_taken = isolate_pages_global(nr_pages, &l_hold,
>  						&pgscanned, sc->order,
>  						ISOLATE_ACTIVE, zone,
> -						1, file);
> +						1, file, false);
>  		zone->pages_scanned += pgscanned;
>  	} else {
>  		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
>  						&pgscanned, sc->order,
>  						ISOLATE_ACTIVE, zone,
> -						sc->mem_cgroup, 1, file);
> +						sc->mem_cgroup, 1, file, false);
>  		/*
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
> Index: linux/include/linux/memcontrol.h
> ===================================================================
> --- linux.orig/include/linux/memcontrol.h	2011-11-02 09:48:16.000000000 +0800
> +++ linux/include/linux/memcontrol.h	2011-11-02 10:06:33.000000000 +0800
> @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
>  					struct mem_cgroup *mem_cont,
> -					int active, int file);
> +					int active, int file,
> +					bool break_on_thp);
>  
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  /*
> Index: linux/mm/memcontrol.c
> ===================================================================
> --- linux.orig/mm/memcontrol.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/memcontrol.c	2011-11-02 10:06:33.000000000 +0800
> @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
>  					struct mem_cgroup *mem_cont,
> -					int active, int file)
> +					int active, int file,
> +					bool break_on_thp)
>  {
>  	unsigned long nr_taken = 0;
>  	struct page *page;
> @@ -1225,6 +1226,8 @@ unsigned long mem_cgroup_isolate_pages(u
>  			list_move(&page->lru, dst);
>  			mem_cgroup_del_lru(page);
>  			nr_taken += hpage_nr_pages(page);
> +			if (unlikely(PageTransHuge(page)) && break_on_thp)
> +				goto out;
>  			break;
>  		case -EBUSY:
>  			/* we don't affect global LRU but rotate in our LRU */
> @@ -1235,6 +1238,7 @@ unsigned long mem_cgroup_isolate_pages(u
>  		}
>  	}
>  
> +out:
>  	*scanned = scan;
>  
>  	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> 
> 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
@ 2011-11-08  8:59             ` Minchan Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-11-08  8:59 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > lru list has page A and B, page A is huge page:
> > > > > > 1. page A is isolated
> > > > > > 2. page B is isolated
> > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > 5. page A and B is written out and reclaimed.
> > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > >
> > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > >
> > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > >
> > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > Without the patch:
> > > > > > #cat /proc/vmstat|grep thp
> > > > > > thp_fault_alloc 451
> > > > > > thp_fault_fallback 0
> > > > > > thp_collapse_alloc 0
> > > > > > thp_collapse_alloc_failed 0
> > > > > > thp_split 238
> > > > > >
> > > > > > With the patch:
> > > > > > #cat /proc/vmstat|grep thp
> > > > > > thp_fault_alloc 450
> > > > > > thp_fault_fallback 1
> > > > > > thp_collapse_alloc 0
> > > > > > thp_collapse_alloc_failed 0
> > > > > > thp_split 103
> > > > > >
> > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > thp_fault_fallback.
> > > > > >
> > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > ---
> > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > >
> > > > > > Index: linux/mm/vmscan.c
> > > > > > ===================================================================
> > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > >   */
> > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > +             struct page **split_page)
> > > > > >  {
> > > > > >       unsigned long nr_taken = 0;
> > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > >               case 0:
> > > > > >                       list_move(&page->lru, dst);
> > > > > >                       mem_cgroup_del_lru(page);
> > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > +                             nr_taken++;
> > > > > > +                             *split_page = page;
> > > > > > +                             goto out;
> > > > > > +                     } else
> > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > >                       break;
> > > > > >
> > > > > >               case -EBUSY:
> > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > >                               nr_lumpy_taken++;
> > > > > >                               if (PageDirty(cursor_page))
> > > > > >                                       nr_lumpy_dirty++;
> > > > > >                               scan++;
> > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > +                                     nr_taken++;
> > > > > > +                                     *split_page = page;
> > > > > > +                                     goto out;
> > > > > > +                             } else
> > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > >                       } else {
> > > > > >                               /*
> > > > > >                                * Check if the page is freed already.
> > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > >                       nr_lumpy_failed++;
> > > > > >       }
> > > > > >
> > > > > > +out:
> > > > > >       *scanned = scan;
> > > > > >
> > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > >                                       struct list_head *dst,
> > > > > >                                       unsigned long *scanned, int order,
> > > > > >                                       int mode, struct zone *z,
> > > > > > -                                     int active, int file)
> > > > > > +                                     int active, int file,
> > > > > > +                                     struct page **split_page)
> > > > > >  {
> > > > > >       int lru = LRU_BASE;
> > > > > >       if (active)
> > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > >       if (file)
> > > > > >               lru += LRU_FILE;
> > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > -                                                             mode, file);
> > > > > > +                                                     mode, file, split_page);
> > > > > >  }
> > > > > >
> > > > > >  /*
> > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >  {
> > > > > >       LIST_HEAD(page_list);
> > > > > >       unsigned long nr_scanned;
> > > > > > +     unsigned long total_scanned = 0;
> > > > > >       unsigned long nr_reclaimed = 0;
> > > > > >       unsigned long nr_taken;
> > > > > >       unsigned long nr_anon;
> > > > > >       unsigned long nr_file;
> > > > > > +     struct page *split_page;
> > > > > >
> > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >       }
> > > > > >
> > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > +again:
> > > > > >       lru_add_drain();
> > > > > > +     split_page = NULL;
> > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > >
> > > > > >       if (scanning_global_lru(sc)) {
> > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > -                     zone, 0, file);
> > > > > > +                     zone, 0, file, &split_page);
> > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > +             total_scanned += nr_scanned;
> > > > > >               if (current_is_kswapd())
> > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > >                                              nr_scanned);
> > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > >                                              nr_scanned);
> > > > > >       } else {
> > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > >                       zone, sc->mem_cgroup,
> > > > > > -                     0, file);
> > > > > > +                     0, file, &split_page);
> > > > > > +             total_scanned += nr_scanned;
> > > > > >               /*
> > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > >                * scanned pages on its own.
> > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > >               return 0;
> > > > > >       }
> > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > +             split_huge_page(split_page);
> > > > > > +             goto again;
> > > > > > +     }
> > > > > >
> > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > >
> > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > >
> > > > > > +     if (split_page)
> > > > > > +             split_huge_page(split_page);
> > > > > > +
> > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > >
> > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > >                                               &pgscanned, sc->order,
> > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > -                                             1, file);
> > > > > > +                                             1, file, NULL);
> > > > > >               zone->pages_scanned += pgscanned;
> > > > > >       } else {
> > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > >                                               &pgscanned, sc->order,
> > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > >               /*
> > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > >                * scanned pages on its own.
> > > > > > Index: linux/mm/memcontrol.c
> > > > > > ===================================================================
> > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > >                                       unsigned long *scanned, int order,
> > > > > >                                       int mode, struct zone *z,
> > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > -                                     int active, int file)
> > > > > > +                                     int active, int file,
> > > > > > +                                     struct page **split_page)
> > > > > >  {
> > > > > >       unsigned long nr_taken = 0;
> > > > > >       struct page *page;
> > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > >               case 0:
> > > > > >                       list_move(&page->lru, dst);
> > > > > >                       mem_cgroup_del_lru(page);
> > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > +                             nr_taken++;
> > > > > > +                             *split_page = page;
> > > > > > +                             goto out;
> > > > > > +                     } else
> > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > +
> > > > > >                       break;
> > > > > >               case -EBUSY:
> > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > >               }
> > > > > >       }
> > > > > >
> > > > > > +out:
> > > > > >       *scanned = scan;
> > > > > >
> > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > ===================================================================
> > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > >                                       unsigned long *scanned, int order,
> > > > > >                                       int mode, struct zone *z,
> > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > -                                     int active, int file);
> > > > > > +                                     int active, int file,
> > > > > > +                                     struct page **split_page);
> > > > > >
> > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > >  /*
> > > > > >
> > > > > >
> > > > >
> > > > > I saw the code. my concern is your patch could make unnecessary split of THP.
> > > > >
> > > > > When we isolates page, we can't know whether it's working set or not.
> > > > > So split should happen after we judge it's working set page.
> > > > yes, but since memory is big currently, it's unlikely the isolated page
> > > > get accessed in the window. And I only did the split in
> > > 
> > > We don't check page_reference when isolate happens.
> > > Window which between isolation time and reclaim?
> > > No. Window is from inactive's head to tail and it's the basic concept of
> > > our LRU.
> > > 
> > > > shrink_inactive_list, not in active list.
> > > 
> > > But inactive list's size could be still big and
> > > page reference heuristic is very important for reclaim algorithm.
> > I mean pages aren't referenced. but ok, I can't take such assumption.
> > 
> > > > And THP has mechanism to collapse small pages to huge page later.
> > > 
> > > You mean "merge" instead of "collapse"?
> > > 
> > > >
> > > > > If you really want to merge this patch, I suggest that
> > > > > we can handle it in shrink_page_list step, not isolation step.
> > > > >
> > > > > My totally untested code which is just to show the concept is as follows,
> > > > I did consider this option before. It has its problem too. The isolation
> > > > can isolate several huge page one time. And then later shrink_page_list
> > > > can swap several huge page one time, which is unfortunate. I'm pretty
> > > > sure this method can't reduce the thp_split count in my test. It could
> > > 
> > > I understand your point but approach isn't good to me.
> > > Maybe we can check whether we are going on or not before other THP page split happens
> > > in shrink_page_list. If we split THP page successfully, maybe we can skip another THP split.
> > > Another idea is we can avoid split of THP unless high order reclaim happens or low order
> > > high priority pressure happens.
> > I agreed the split better be done at shrink_page_list, but we must avoid
> > isolate too many pages. I'll check if I can have a better solution for
> > next post.
> Let me try again.
> 
> Subject: thp: improve huge page reclaim -v2
> 
> With transparent huge page enabled, a huge page will be split if it is to
> be reclaimed. With current logic, if page reclaim finds a huge page,
> it will just reclaim the head page and leave tail pages reclaimed later.
> Let's take an example, lru list has page A and B, page A is huge page:
> 1. page A is isolated
> 2. page B is isolated
> 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> page A+1, page A+2, ... are added to lru list.
> 4. shrink_page_list() adds page B to swap page cache.
> 5. page A and B is written out and reclaimed.
> 6. page A+1, A+2 ... is isolated and reclaimed later.
> So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> The worst case could be that we isolate/split 32 huge pages to try to reclaim
> a huge page, but only the 32 head pages are reclaimed.
> 
> We expected the whole huge page A is reclaimed in the meantime, so
> the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could avoid a lot
> of unnecessary huge page splits and improve reclaim.
> 
> With this patch, if a huge page is found during isolation, we don't continue
> isolating, since reclaiming the huge page alone already frees more pages
> than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split and
> all tail pages are added to the isolated list, so the tail pages can
> be reclaimed immediately.
> 
> The drawback is that we might isolate fewer pages if a huge page is found,
> but I think the benefit far outweighs the drawback.
> 
> All the changed code paths are guarded by PageTransHuge(), so there should be
> no impact on normal cases.
> 
> In a test, a range of anonymous memory is written and will trigger swap.
> Without the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 451
> thp_fault_fallback 0
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 238
> 
> With the patch:
> #cat /proc/vmstat|grep thp
> thp_fault_alloc 451
> thp_fault_fallback 0
> thp_collapse_alloc 0
> thp_collapse_alloc_failed 0
> thp_split 76
> 
> So the thp_split number is reduced a lot.
> 
> v1->v2: Do the huge page split in shrink_page_list(). Some code is adopted from
> Minchan's.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> 
> ---
>  include/linux/huge_mm.h    |    7 ++++++-
>  include/linux/memcontrol.h |    3 ++-
>  include/linux/swap.h       |    3 ++-
>  mm/huge_memory.c           |   14 ++++++++------
>  mm/memcontrol.c            |    6 +++++-
>  mm/swap.c                  |   10 +++++++++-
>  mm/swap_state.c            |    6 ------
>  mm/vmscan.c                |   27 ++++++++++++++++++++-------
>  8 files changed, 52 insertions(+), 24 deletions(-)
> 
> Index: linux/include/linux/huge_mm.h
> ===================================================================
> --- linux.orig/include/linux/huge_mm.h	2011-11-02 09:48:16.000000000 +0800
> +++ linux/include/linux/huge_mm.h	2011-11-02 10:06:33.000000000 +0800
> @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
>  extern int handle_pte_fault(struct mm_struct *mm,
>  			    struct vm_area_struct *vma, unsigned long address,
>  			    pte_t *pte, pmd_t *pmd, unsigned int flags);
> -extern int split_huge_page(struct page *page);
> +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> +static inline int split_huge_page(struct page *page)
> +{
> +	return split_huge_page_list(page, NULL);
> +}
> +
>  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
>  #define split_huge_page_pmd(__mm, __pmd)				\
>  	do {								\
> Index: linux/include/linux/swap.h
> ===================================================================
> --- linux.orig/include/linux/swap.h	2011-11-02 09:48:16.000000000 +0800
> +++ linux/include/linux/swap.h	2011-11-02 10:06:33.000000000 +0800
> @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
>  extern void __lru_cache_add(struct page *, enum lru_list lru);
>  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
>  extern void lru_add_page_tail(struct zone* zone,
> -			      struct page *page, struct page *page_tail);
> +			      struct page *page, struct page *page_tail,
> +			      struct list_head *dst);
>  extern void activate_page(struct page *);
>  extern void mark_page_accessed(struct page *);
>  extern void lru_add_drain(void);
> Index: linux/mm/huge_memory.c
> ===================================================================
> --- linux.orig/mm/huge_memory.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/huge_memory.c	2011-11-02 10:58:21.000000000 +0800
> @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
>  	return ret;
>  }
>  
> -static void __split_huge_page_refcount(struct page *page)
> +static void __split_huge_page_refcount(struct page *page,
> +				       struct list_head *list)
>  {
>  	int i;
>  	struct zone *zone = page_zone(page);
> @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
>  
>  		mem_cgroup_split_huge_fixup(page, page_tail);
>  
> -		lru_add_page_tail(zone, page, page_tail);
> +		lru_add_page_tail(zone, page, page_tail, list);
>  	}
>  
>  	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
>  
>  /* must be called with anon_vma->root->mutex hold */
>  static void __split_huge_page(struct page *page,
> -			      struct anon_vma *anon_vma)
> +			      struct anon_vma *anon_vma,
> +			      struct list_head *list)
>  {
>  	int mapcount, mapcount2;
>  	struct anon_vma_chain *avc;
> @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
>  		       mapcount, page_mapcount(page));
>  	BUG_ON(mapcount != page_mapcount(page));
>  
> -	__split_huge_page_refcount(page);
> +	__split_huge_page_refcount(page, list);
>  
>  	mapcount2 = 0;
>  	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
>  	BUG_ON(mapcount != mapcount2);
>  }
>  
> -int split_huge_page(struct page *page)
> +int split_huge_page_list(struct page *page, struct list_head *list)
>  {
>  	struct anon_vma *anon_vma;
>  	int ret = 1;
> @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
>  		goto out_unlock;
>  
>  	BUG_ON(!PageSwapBacked(page));
> -	__split_huge_page(page, anon_vma);
> +	__split_huge_page(page, anon_vma, list);
>  	count_vm_event(THP_SPLIT);
>  
>  	BUG_ON(PageCompound(page));
> Index: linux/mm/swap.c
> ===================================================================
> --- linux.orig/mm/swap.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/swap.c	2011-11-02 10:06:33.000000000 +0800
> @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
>  
>  /* used by __split_huge_page_refcount() */
>  void lru_add_page_tail(struct zone* zone,
> -		       struct page *page, struct page *page_tail)
> +		       struct page *page, struct page *page_tail,
> +		       struct list_head *dst)
>  {
>  	int active;
>  	enum lru_list lru;
> @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
>  	VM_BUG_ON(PageLRU(page_tail));
>  	VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
>  
> +	/* The huge page is isolated */
> +	if (dst) {
> +		get_page(page_tail);
> +		list_add_tail(&page_tail->lru, dst);
> +		return;
> +	}
> +
>  	SetPageLRU(page_tail);
>  
>  	if (page_evictable(page_tail, NULL)) {
> Index: linux/mm/swap_state.c
> ===================================================================
> --- linux.orig/mm/swap_state.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/swap_state.c	2011-11-02 10:06:33.000000000 +0800
> @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
>  	if (!entry.val)
>  		return 0;
>  
> -	if (unlikely(PageTransHuge(page)))
> -		if (unlikely(split_huge_page(page))) {
> -			swapcache_free(entry, NULL);
> -			return 0;
> -		}
> -
>  	/*
>  	 * Radix-tree node allocations from PF_MEMALLOC contexts could
>  	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
> Index: linux/mm/vmscan.c
> ===================================================================
> --- linux.orig/mm/vmscan.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/vmscan.c	2011-11-02 10:58:21.000000000 +0800
> @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
>  		if (PageAnon(page) && !PageSwapCache(page)) {
>  			if (!(sc->gfp_mask & __GFP_IO))
>  				goto keep_locked;
> +			if (unlikely(PageTransHuge(page)))
> +				if (unlikely(split_huge_page_list(page,
> +					page_list)))
> +				    goto activate_locked;
>  			if (!add_to_swap(page))
>  				goto activate_locked;
>  			may_enter_fs = 1;
> @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
>   */
>  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>  		struct list_head *src, struct list_head *dst,
> -		unsigned long *scanned, int order, int mode, int file)
> +		unsigned long *scanned, int order, int mode, int file,
> +		bool break_on_thp)
>  {

Sorry for the late response.
These days I am very busy with my new job.

Still, I don't like the surgery of the isolation part.
What if we isolate a THP page but it is a working-set page?
Let's assume the following:

1. Isolate 32 pages.
2. Unfortunately, the 1st page is a THP, so isolate_lru_pages() isolates just
   that one page (which is of course 512 pages).
3. shrink_page_list() sees that it is a working-set page, but page_list
   holds just that one page, so it has to isolate pages once more at a higher priority.

How about this?

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9fdfce7..8121415 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -960,7 +960,15 @@ free_it:
                 * appear not as the counts should be low
                 */
                list_add(&page->lru, &free_pages);
-               continue;
+
+               /*
+                * If we have reclaimed enough pages, let's cut it off.
+                * It could prevent unnecessary THP split.
+                */
+               if (nr_reclaimed >= sc->nr_to_reclaim)
+                       break;
+               else
+                       continue;
 
 cull_mlocked:
                if (PageSwapCache(page))





>  	unsigned long nr_taken = 0;
>  	unsigned long nr_lumpy_taken = 0;
> @@ -1101,6 +1106,10 @@ static unsigned long isolate_lru_pages(u
>  			list_move(&page->lru, dst);
>  			mem_cgroup_del_lru(page);
>  			nr_taken += hpage_nr_pages(page);
> +			if (unlikely(PageTransHuge(page)) && break_on_thp) {
> +				scan++;
> +				goto out;
> +			}
>  			break;
>  
>  		case -EBUSY:
> @@ -1163,6 +1172,8 @@ static unsigned long isolate_lru_pages(u
>  				if (PageDirty(cursor_page))
>  					nr_lumpy_dirty++;
>  				scan++;
> +				if (unlikely(PageTransHuge(page)) && break_on_thp)
> +					goto out;
>  			} else {
>  				/*
>  				 * Check if the page is freed already.
> @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
>  			nr_lumpy_failed++;
>  	}
>  
> +out:
>  	*scanned = scan;
>  
>  	trace_mm_vmscan_lru_isolate(order,
> @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
>  					struct list_head *dst,
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
> -					int active, int file)
> +					int active, int file,
> +					bool break_on_thp)
>  {
>  	int lru = LRU_BASE;
>  	if (active)
> @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
>  	if (file)
>  		lru += LRU_FILE;
>  	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> -								mode, file);
> +							mode, file, break_on_thp);
>  }
>  
>  /*
> @@ -1466,7 +1479,7 @@ shrink_inactive_list(unsigned long nr_to
>  			&page_list, &nr_scanned, sc->order,
>  			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
>  					ISOLATE_BOTH : ISOLATE_INACTIVE,
> -			zone, 0, file);
> +			zone, 0, file, true);
>  		zone->pages_scanned += nr_scanned;
>  		if (current_is_kswapd())
>  			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
> @@ -1480,7 +1493,7 @@ shrink_inactive_list(unsigned long nr_to
>  			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
>  					ISOLATE_BOTH : ISOLATE_INACTIVE,
>  			zone, sc->mem_cgroup,
> -			0, file);
> +			0, file, true);
>  		/*
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
> @@ -1589,13 +1602,13 @@ static void shrink_active_list(unsigned
>  		nr_taken = isolate_pages_global(nr_pages, &l_hold,
>  						&pgscanned, sc->order,
>  						ISOLATE_ACTIVE, zone,
> -						1, file);
> +						1, file, false);
>  		zone->pages_scanned += pgscanned;
>  	} else {
>  		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
>  						&pgscanned, sc->order,
>  						ISOLATE_ACTIVE, zone,
> -						sc->mem_cgroup, 1, file);
> +						sc->mem_cgroup, 1, file, false);
>  		/*
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
> Index: linux/include/linux/memcontrol.h
> ===================================================================
> --- linux.orig/include/linux/memcontrol.h	2011-11-02 09:48:16.000000000 +0800
> +++ linux/include/linux/memcontrol.h	2011-11-02 10:06:33.000000000 +0800
> @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
>  					struct mem_cgroup *mem_cont,
> -					int active, int file);
> +					int active, int file,
> +					bool break_on_thp);
>  
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  /*
> Index: linux/mm/memcontrol.c
> ===================================================================
> --- linux.orig/mm/memcontrol.c	2011-11-02 09:48:16.000000000 +0800
> +++ linux/mm/memcontrol.c	2011-11-02 10:06:33.000000000 +0800
> @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
>  					unsigned long *scanned, int order,
>  					int mode, struct zone *z,
>  					struct mem_cgroup *mem_cont,
> -					int active, int file)
> +					int active, int file,
> +					bool break_on_thp)
>  {
>  	unsigned long nr_taken = 0;
>  	struct page *page;
> @@ -1225,6 +1226,8 @@ unsigned long mem_cgroup_isolate_pages(u
>  			list_move(&page->lru, dst);
>  			mem_cgroup_del_lru(page);
>  			nr_taken += hpage_nr_pages(page);
> +			if (unlikely(PageTransHuge(page)) && break_on_thp)
> +				goto out;
>  			break;
>  		case -EBUSY:
>  			/* we don't affect global LRU but rotate in our LRU */
> @@ -1235,6 +1238,7 @@ unsigned long mem_cgroup_isolate_pages(u
>  		}
>  	}
>  
> +out:
>  	*scanned = scan;
>  
>  	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> 
> 

-- 
Kind regards,
Minchan Kim


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-08  8:59             ` Minchan Kim
@ 2011-11-09  5:27               ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-11-09  5:27 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Tue, 2011-11-08 at 16:59 +0800, Minchan Kim wrote:
> On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> > On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > > lru list has page A and B, page A is huge page:
> > > > > > > 1. page A is isolated
> > > > > > > 2. page B is isolated
> > > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > > 5. page A and B is written out and reclaimed.
> > > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > > >
> > > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > > >
> > > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > > >
> > > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > > Without the patch:
> > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > thp_fault_alloc 451
> > > > > > > thp_fault_fallback 0
> > > > > > > thp_collapse_alloc 0
> > > > > > > thp_collapse_alloc_failed 0
> > > > > > > thp_split 238
> > > > > > >
> > > > > > > With the patch:
> > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > thp_fault_alloc 450
> > > > > > > thp_fault_fallback 1
> > > > > > > thp_collapse_alloc 0
> > > > > > > thp_collapse_alloc_failed 0
> > > > > > > thp_split 103
> > > > > > >
> > > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > > thp_fault_fallback.
> > > > > > >
> > > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > > ---
> > > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > >
> > > > > > > Index: linux/mm/vmscan.c
> > > > > > > ===================================================================
> > > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > > >   */
> > > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > > +             struct page **split_page)
> > > > > > >  {
> > > > > > >       unsigned long nr_taken = 0;
> > > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > > >               case 0:
> > > > > > >                       list_move(&page->lru, dst);
> > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > +                             nr_taken++;
> > > > > > > +                             *split_page = page;
> > > > > > > +                             goto out;
> > > > > > > +                     } else
> > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > >                       break;
> > > > > > >
> > > > > > >               case -EBUSY:
> > > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > > >                               nr_lumpy_taken++;
> > > > > > >                               if (PageDirty(cursor_page))
> > > > > > >                                       nr_lumpy_dirty++;
> > > > > > >                               scan++;
> > > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > > +                                     nr_taken++;
> > > > > > > +                                     *split_page = page;
> > > > > > > +                                     goto out;
> > > > > > > +                             } else
> > > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > > >                       } else {
> > > > > > >                               /*
> > > > > > >                                * Check if the page is freed already.
> > > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > > >                       nr_lumpy_failed++;
> > > > > > >       }
> > > > > > >
> > > > > > > +out:
> > > > > > >       *scanned = scan;
> > > > > > >
> > > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > > >                                       struct list_head *dst,
> > > > > > >                                       unsigned long *scanned, int order,
> > > > > > >                                       int mode, struct zone *z,
> > > > > > > -                                     int active, int file)
> > > > > > > +                                     int active, int file,
> > > > > > > +                                     struct page **split_page)
> > > > > > >  {
> > > > > > >       int lru = LRU_BASE;
> > > > > > >       if (active)
> > > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > > >       if (file)
> > > > > > >               lru += LRU_FILE;
> > > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > > -                                                             mode, file);
> > > > > > > +                                                     mode, file, split_page);
> > > > > > >  }
> > > > > > >
> > > > > > >  /*
> > > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >  {
> > > > > > >       LIST_HEAD(page_list);
> > > > > > >       unsigned long nr_scanned;
> > > > > > > +     unsigned long total_scanned = 0;
> > > > > > >       unsigned long nr_reclaimed = 0;
> > > > > > >       unsigned long nr_taken;
> > > > > > >       unsigned long nr_anon;
> > > > > > >       unsigned long nr_file;
> > > > > > > +     struct page *split_page;
> > > > > > >
> > > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >       }
> > > > > > >
> > > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > > +again:
> > > > > > >       lru_add_drain();
> > > > > > > +     split_page = NULL;
> > > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > > >
> > > > > > >       if (scanning_global_lru(sc)) {
> > > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > -                     zone, 0, file);
> > > > > > > +                     zone, 0, file, &split_page);
> > > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > > +             total_scanned += nr_scanned;
> > > > > > >               if (current_is_kswapd())
> > > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > > >                                              nr_scanned);
> > > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > > >                                              nr_scanned);
> > > > > > >       } else {
> > > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > >                       zone, sc->mem_cgroup,
> > > > > > > -                     0, file);
> > > > > > > +                     0, file, &split_page);
> > > > > > > +             total_scanned += nr_scanned;
> > > > > > >               /*
> > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > >                * scanned pages on its own.
> > > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > > >               return 0;
> > > > > > >       }
> > > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > > +             split_huge_page(split_page);
> > > > > > > +             goto again;
> > > > > > > +     }
> > > > > > >
> > > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > > >
> > > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > > >
> > > > > > > +     if (split_page)
> > > > > > > +             split_huge_page(split_page);
> > > > > > > +
> > > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > > >
> > > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > > >                                               &pgscanned, sc->order,
> > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > -                                             1, file);
> > > > > > > +                                             1, file, NULL);
> > > > > > >               zone->pages_scanned += pgscanned;
> > > > > > >       } else {
> > > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > > >                                               &pgscanned, sc->order,
> > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > > >               /*
> > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > >                * scanned pages on its own.
> > > > > > > Index: linux/mm/memcontrol.c
> > > > > > > ===================================================================
> > > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > >                                       unsigned long *scanned, int order,
> > > > > > >                                       int mode, struct zone *z,
> > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > -                                     int active, int file)
> > > > > > > +                                     int active, int file,
> > > > > > > +                                     struct page **split_page)
> > > > > > >  {
> > > > > > >       unsigned long nr_taken = 0;
> > > > > > >       struct page *page;
> > > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > >               case 0:
> > > > > > >                       list_move(&page->lru, dst);
> > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > +                             nr_taken++;
> > > > > > > +                             *split_page = page;
> > > > > > > +                             goto out;
> > > > > > > +                     } else
> > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > +
> > > > > > >                       break;
> > > > > > >               case -EBUSY:
> > > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > >               }
> > > > > > >       }
> > > > > > >
> > > > > > > +out:
> > > > > > >       *scanned = scan;
> > > > > > >
> > > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > > ===================================================================
> > > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > > >                                       unsigned long *scanned, int order,
> > > > > > >                                       int mode, struct zone *z,
> > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > -                                     int active, int file);
> > > > > > > +                                     int active, int file,
> > > > > > > +                                     struct page **split_page);
> > > > > > >
> > > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > > >  /*
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > I saw the code. my concern is your patch could make unnecessary split of THP.
> > > > > >
> > > > > > When we isolates page, we can't know whether it's working set or not.
> > > > > > So split should happen after we judge it's working set page.
> > > > > yes, but since memory is big currently, it's unlikely the isolated page
> > > > > get accessed in the window. And I only did the split in
> > > >
> > > > We don't check page_reference when isolate happens.
> > > > Window which between isolation time and reclaim?
> > > > No. Window is from inactive's head to tail and it's the basic concept of
> > > > our LRU.
> > > >
> > > > > shrink_inactive_list, not in active list.
> > > >
> > > > But inactive list's size could be still big and
> > > > page reference heuristic is very important for reclaim algorithm.
> > > I mean pages aren't referenced. but ok, I can't take such assumption.
> > >
> > > > > And THP has mechanism to collapse small pages to huge page later.
> > > >
> > > > You mean "merge" instead of "collapse"?
> > > >
> > > > >
> > > > > > If you really want to merge this patch, I suggest that
> > > > > > we can handle it in shrink_page_list step, not isolation step.
> > > > > >
> > > > > > My totally untested code which is just to show the concept is as follows,
> > > > > I did consider this option before. It has its problem too. The isolation
> > > > > can isolate several huge page one time. And then later shrink_page_list
> > > > > can swap several huge page one time, which is unfortunate. I'm pretty
> > > > > sure this method can't reduce the thp_split count in my test. It could
> > > >
> > > > I understand your point but approach isn't good to me.
> > > > Maybe we can check whether we are going on or not before other THP page split happens
> > > > in shrink_page_list. If we split THP page successfully, maybe we can skip another THP split.
> > > > Another idea is we can avoid split of THP unless high order reclaim happens or low order
> > > > high priority pressure happens.
> > > I agreed the split better be done at shrink_page_list, but we must avoid
> > > isolate too many pages. I'll check if I can have a better solution for
> > > next post.
> > Let me try again.
> >
> > Subject: thp: improve huge page reclaim -v2
> >
> > With transparent huge page enabled, huge page will be split if it will
> > be reclaimed. With current logic, if page reclaim finds a huge page,
> > it will just reclaim the head page and leave tail pages reclaimed later.
> > Let's take an example, lru list has page A and B, page A is huge page:
> > 1. page A is isolated
> > 2. page B is isolated
> > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > page A+1, page A+2, ... are added to lru list.
> > 4. shrink_page_list() adds page B to swap page cache.
> > 5. page A and B is written out and reclaimed.
> > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > The worst case could be we isolate/split 32 huge pages to try to reclaim
> > a huge page, but we only the 32 head pages are reclaimed.
> >
> > We expected the whole huge page A is reclaimed in the meantime, so
> > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could reduce a lot
> > of unnecessary huge page split and improve the reclaim.
> >
> > With this patch, if a huge page is found in isolation, don't continue
> > isolation. Since if the huge page is reclaimed, we can reclaim more pages
> > than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split and
> > all tail pages will be added to the isolation list, so the tail pages can
> > be reclaimed immediately.
> >
> > The drawback is we might isolate less pages if a huge page is found. But
> > I thought the benefit is far more than the drawback.
> >
> > All code path are with PageTransHuge(), so should have no impact to normal
> > cases.
> >
> > In a test, a range of anonymous memory is written and will trigger swap.
> > Without the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 451
> > thp_fault_fallback 0
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 238
> >
> > With the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 451
> > thp_fault_fallback 0
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 76
> >
> > So the thp_split number is reduced a lot.
> >
> > v1->v2: Do the huge page split in shrink_page_list(). Some code are adopted from
> > Minchan's.
> >
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> >
> > ---
> >  include/linux/huge_mm.h    |    7 ++++++-
> >  include/linux/memcontrol.h |    3 ++-
> >  include/linux/swap.h       |    3 ++-
> >  mm/huge_memory.c           |   14 ++++++++------
> >  mm/memcontrol.c            |    6 +++++-
> >  mm/swap.c                  |   10 +++++++++-
> >  mm/swap_state.c            |    6 ------
> >  mm/vmscan.c                |   27 ++++++++++++++++++++-------
> >  8 files changed, 52 insertions(+), 24 deletions(-)
> >
> > Index: linux/include/linux/huge_mm.h
> > ===================================================================
> > --- linux.orig/include/linux/huge_mm.h        2011-11-02 09:48:16.000000000 +0800
> > +++ linux/include/linux/huge_mm.h     2011-11-02 10:06:33.000000000 +0800
> > @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
> >  extern int handle_pte_fault(struct mm_struct *mm,
> >                           struct vm_area_struct *vma, unsigned long address,
> >                           pte_t *pte, pmd_t *pmd, unsigned int flags);
> > -extern int split_huge_page(struct page *page);
> > +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> > +static inline int split_huge_page(struct page *page)
> > +{
> > +     return split_huge_page_list(page, NULL);
> > +}
> > +
> >  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
> >  #define split_huge_page_pmd(__mm, __pmd)                             \
> >       do {                                                            \
> > Index: linux/include/linux/swap.h
> > ===================================================================
> > --- linux.orig/include/linux/swap.h   2011-11-02 09:48:16.000000000 +0800
> > +++ linux/include/linux/swap.h        2011-11-02 10:06:33.000000000 +0800
> > @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
> >  extern void __lru_cache_add(struct page *, enum lru_list lru);
> >  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
> >  extern void lru_add_page_tail(struct zone* zone,
> > -                           struct page *page, struct page *page_tail);
> > +                           struct page *page, struct page *page_tail,
> > +                           struct list_head *dst);
> >  extern void activate_page(struct page *);
> >  extern void mark_page_accessed(struct page *);
> >  extern void lru_add_drain(void);
> > Index: linux/mm/huge_memory.c
> > ===================================================================
> > --- linux.orig/mm/huge_memory.c       2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/huge_memory.c    2011-11-02 10:58:21.000000000 +0800
> > @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
> >       return ret;
> >  }
> >
> > -static void __split_huge_page_refcount(struct page *page)
> > +static void __split_huge_page_refcount(struct page *page,
> > +                                    struct list_head *list)
> >  {
> >       int i;
> >       struct zone *zone = page_zone(page);
> > @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
> >
> >               mem_cgroup_split_huge_fixup(page, page_tail);
> >
> > -             lru_add_page_tail(zone, page, page_tail);
> > +             lru_add_page_tail(zone, page, page_tail, list);
> >       }
> >
> >       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> > @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
> >
> >  /* must be called with anon_vma->root->mutex hold */
> >  static void __split_huge_page(struct page *page,
> > -                           struct anon_vma *anon_vma)
> > +                           struct anon_vma *anon_vma,
> > +                           struct list_head *list)
> >  {
> >       int mapcount, mapcount2;
> >       struct anon_vma_chain *avc;
> > @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
> >                      mapcount, page_mapcount(page));
> >       BUG_ON(mapcount != page_mapcount(page));
> >
> > -     __split_huge_page_refcount(page);
> > +     __split_huge_page_refcount(page, list);
> >
> >       mapcount2 = 0;
> >       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> > @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
> >       BUG_ON(mapcount != mapcount2);
> >  }
> >
> > -int split_huge_page(struct page *page)
> > +int split_huge_page_list(struct page *page, struct list_head *list)
> >  {
> >       struct anon_vma *anon_vma;
> >       int ret = 1;
> > @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
> >               goto out_unlock;
> >
> >       BUG_ON(!PageSwapBacked(page));
> > -     __split_huge_page(page, anon_vma);
> > +     __split_huge_page(page, anon_vma, list);
> >       count_vm_event(THP_SPLIT);
> >
> >       BUG_ON(PageCompound(page));
> > Index: linux/mm/swap.c
> > ===================================================================
> > --- linux.orig/mm/swap.c      2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/swap.c   2011-11-02 10:06:33.000000000 +0800
> > @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
> >
> >  /* used by __split_huge_page_refcount() */
> >  void lru_add_page_tail(struct zone* zone,
> > -                    struct page *page, struct page *page_tail)
> > +                    struct page *page, struct page *page_tail,
> > +                    struct list_head *dst)
> >  {
> >       int active;
> >       enum lru_list lru;
> > @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
> >       VM_BUG_ON(PageLRU(page_tail));
> >       VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
> >
> > +     /* The huge page is isolated */
> > +     if (dst) {
> > +             get_page(page_tail);
> > +             list_add_tail(&page_tail->lru, dst);
> > +             return;
> > +     }
> > +
> >       SetPageLRU(page_tail);
> >
> >       if (page_evictable(page_tail, NULL)) {
> > Index: linux/mm/swap_state.c
> > ===================================================================
> > --- linux.orig/mm/swap_state.c        2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/swap_state.c     2011-11-02 10:06:33.000000000 +0800
> > @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
> >       if (!entry.val)
> >               return 0;
> >
> > -     if (unlikely(PageTransHuge(page)))
> > -             if (unlikely(split_huge_page(page))) {
> > -                     swapcache_free(entry, NULL);
> > -                     return 0;
> > -             }
> > -
> >       /*
> >        * Radix-tree node allocations from PF_MEMALLOC contexts could
> >        * completely exhaust the page allocator. __GFP_NOMEMALLOC
> > Index: linux/mm/vmscan.c
> > ===================================================================
> > --- linux.orig/mm/vmscan.c    2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/vmscan.c 2011-11-02 10:58:21.000000000 +0800
> > @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
> >               if (PageAnon(page) && !PageSwapCache(page)) {
> >                       if (!(sc->gfp_mask & __GFP_IO))
> >                               goto keep_locked;
> > +                     if (unlikely(PageTransHuge(page)))
> > +                             if (unlikely(split_huge_page_list(page,
> > +                                     page_list)))
> > +                                 goto activate_locked;
> >                       if (!add_to_swap(page))
> >                               goto activate_locked;
> >                       may_enter_fs = 1;
> > @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
> >   */
> >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> >               struct list_head *src, struct list_head *dst,
> > -             unsigned long *scanned, int order, int mode, int file)
> > +             unsigned long *scanned, int order, int mode, int file,
> > +             bool break_on_thp)
> >  {
> 
> Sorry for the late response.
> These days I am very busy with my new job.
Thanks for your time.

> Still, I don't like the surgery of the isolation part.
> What if we isolate a THP page but it is a working-set page?
> Let's assume the following:
> 
> 1. Isolate 32 pages.
> 2. Unfortunately, the 1st page is a THP, so isolate_lru_pages() isolates just
>    that one page (which is of course 512 pages).
> 3. shrink_page_list() sees that it is a working-set page, but page_list
>    holds just that one page, so it has to isolate pages once more at a higher priority.
That's possible. We might scan more pages, but it should not introduce more
THP splits, since isolation stops at the huge page. On the other hand, if
isolation doesn't break at the huge page, we can't split it and reclaim it
as a whole immediately. I haven't found a way to make both sides good; I
still think the benefit outweighs the drawback.
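
As a rough illustration of that tradeoff, here is a toy, hypothetical
calculation (plain userspace C, not kernel code; it just assumes the usual
x86-64 values SWAP_CLUSTER_MAX = 32 and HPAGE_PMD_NR = 512 and the worst
case described in the changelog):

    #include <stdio.h>

    int main(void)
    {
            /* pages isolated per reclaim pass and base pages per 2MB THP */
            const unsigned long swap_cluster_max = 32;
            const unsigned long hpage_pmd_nr = 512;

            /* old behaviour, worst case: every isolated page is a THP head,
             * each THP is split, but only the head pages are reclaimed in
             * this pass */
            printf("THPs split per pass:        %lu\n", swap_cluster_max);
            printf("base pages split out:       %lu\n",
                   swap_cluster_max * hpage_pmd_nr);
            printf("base pages freed this pass: %lu\n", swap_cluster_max);

            /* patched behaviour: isolation stops at the first THP, which is
             * split in shrink_page_list() and reclaimed as a whole */
            printf("THPs split per pass (patched): 1\n");
            printf("base pages freed (patched):    %lu\n", hpage_pmd_nr);
            return 0;
    }

Even in this crude model the patched path splits far fewer THPs for the same
amount of memory freed, which matches the thp_split drop in the test above.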

> How about this?
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9fdfce7..8121415 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -960,7 +960,15 @@ free_it:
>                  * appear not as the counts should be low
>                  */
>                 list_add(&page->lru, &free_pages);
> -               continue;
> +
> +               /*
> +                * If we have reclaimed enough pages, let's cut it off.
> +                * It could prevent unnecessary THP split.
> +                */
> +               if (nr_reclaimed >= sc->nr_to_reclaim)
> +                       break;
> +               else
> +                       continue;
> 
>  cull_mlocked:
>                 if (PageSwapCache(page))
This doesn't work: the huge page is dirty, so it can't be reclaimed
immediately, and nr_reclaimed barely moves in that pass.
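
For reference, a condensed, annotated sketch of why that is (only a sketch,
following the shrink_page_list() flow in mm/vmscan.c of this era; the real
code has more cases and error paths):

    /*
     * For an anonymous page that was just split off a THP:
     *
     *   add_to_swap(page)        - allocates a swap slot and dirties the page
     *   try_to_unmap(page, ...)  - installs swap ptes
     *   if (PageDirty(page))
     *       pageout(page, mapping, sc)
     *                            - submits swap writeback; on PAGE_SUCCESS the
     *                              page is typically still under writeback, so
     *                              we "goto keep" instead of reaching free_it
     *   free_it:
     *       nr_reclaimed++       - reached only once the page is clean and no
     *                              longer under writeback
     *
     * So in the pass that splits the THP, its subpages only get their I/O
     * started; nr_reclaimed stays low, and a cutoff based on
     * "nr_reclaimed >= sc->nr_to_reclaim" would not fire yet.
     */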

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
@ 2011-11-09  5:27               ` Shaohua Li
  0 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-11-09  5:27 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Tue, 2011-11-08 at 16:59 +0800, Minchan Kim wrote:
> On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> > On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > > lru list has page A and B, page A is huge page:
> > > > > > > 1. page A is isolated
> > > > > > > 2. page B is isolated
> > > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > > 5. page A and B is written out and reclaimed.
> > > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > > >
> > > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > > >
> > > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > > >
> > > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > > Without the patch:
> > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > thp_fault_alloc 451
> > > > > > > thp_fault_fallback 0
> > > > > > > thp_collapse_alloc 0
> > > > > > > thp_collapse_alloc_failed 0
> > > > > > > thp_split 238
> > > > > > >
> > > > > > > With the patch:
> > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > thp_fault_alloc 450
> > > > > > > thp_fault_fallback 1
> > > > > > > thp_collapse_alloc 0
> > > > > > > thp_collapse_alloc_failed 0
> > > > > > > thp_split 103
> > > > > > >
> > > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > > thp_fault_fallback.
> > > > > > >
> > > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > > ---
> > > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > >
> > > > > > > Index: linux/mm/vmscan.c
> > > > > > > ===================================================================
> > > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > > >   */
> > > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > > +             struct page **split_page)
> > > > > > >  {
> > > > > > >       unsigned long nr_taken = 0;
> > > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > > >               case 0:
> > > > > > >                       list_move(&page->lru, dst);
> > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > +                             nr_taken++;
> > > > > > > +                             *split_page = page;
> > > > > > > +                             goto out;
> > > > > > > +                     } else
> > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > >                       break;
> > > > > > >
> > > > > > >               case -EBUSY:
> > > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > > >                               nr_lumpy_taken++;
> > > > > > >                               if (PageDirty(cursor_page))
> > > > > > >                                       nr_lumpy_dirty++;
> > > > > > >                               scan++;
> > > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > > +                                     nr_taken++;
> > > > > > > +                                     *split_page = page;
> > > > > > > +                                     goto out;
> > > > > > > +                             } else
> > > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > > >                       } else {
> > > > > > >                               /*
> > > > > > >                                * Check if the page is freed already.
> > > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > > >                       nr_lumpy_failed++;
> > > > > > >       }
> > > > > > >
> > > > > > > +out:
> > > > > > >       *scanned = scan;
> > > > > > >
> > > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > > >                                       struct list_head *dst,
> > > > > > >                                       unsigned long *scanned, int order,
> > > > > > >                                       int mode, struct zone *z,
> > > > > > > -                                     int active, int file)
> > > > > > > +                                     int active, int file,
> > > > > > > +                                     struct page **split_page)
> > > > > > >  {
> > > > > > >       int lru = LRU_BASE;
> > > > > > >       if (active)
> > > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > > >       if (file)
> > > > > > >               lru += LRU_FILE;
> > > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > > -                                                             mode, file);
> > > > > > > +                                                     mode, file, split_page);
> > > > > > >  }
> > > > > > >
> > > > > > >  /*
> > > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >  {
> > > > > > >       LIST_HEAD(page_list);
> > > > > > >       unsigned long nr_scanned;
> > > > > > > +     unsigned long total_scanned = 0;
> > > > > > >       unsigned long nr_reclaimed = 0;
> > > > > > >       unsigned long nr_taken;
> > > > > > >       unsigned long nr_anon;
> > > > > > >       unsigned long nr_file;
> > > > > > > +     struct page *split_page;
> > > > > > >
> > > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >       }
> > > > > > >
> > > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > > +again:
> > > > > > >       lru_add_drain();
> > > > > > > +     split_page = NULL;
> > > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > > >
> > > > > > >       if (scanning_global_lru(sc)) {
> > > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > -                     zone, 0, file);
> > > > > > > +                     zone, 0, file, &split_page);
> > > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > > +             total_scanned += nr_scanned;
> > > > > > >               if (current_is_kswapd())
> > > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > > >                                              nr_scanned);
> > > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > > >                                              nr_scanned);
> > > > > > >       } else {
> > > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > >                       zone, sc->mem_cgroup,
> > > > > > > -                     0, file);
> > > > > > > +                     0, file, &split_page);
> > > > > > > +             total_scanned += nr_scanned;
> > > > > > >               /*
> > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > >                * scanned pages on its own.
> > > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > > >               return 0;
> > > > > > >       }
> > > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > > +             split_huge_page(split_page);
> > > > > > > +             goto again;
> > > > > > > +     }
> > > > > > >
> > > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > > >
> > > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > > >
> > > > > > > +     if (split_page)
> > > > > > > +             split_huge_page(split_page);
> > > > > > > +
> > > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > > >
> > > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > > >                                               &pgscanned, sc->order,
> > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > -                                             1, file);
> > > > > > > +                                             1, file, NULL);
> > > > > > >               zone->pages_scanned += pgscanned;
> > > > > > >       } else {
> > > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > > >                                               &pgscanned, sc->order,
> > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > > >               /*
> > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > >                * scanned pages on its own.
> > > > > > > Index: linux/mm/memcontrol.c
> > > > > > > ===================================================================
> > > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > >                                       unsigned long *scanned, int order,
> > > > > > >                                       int mode, struct zone *z,
> > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > -                                     int active, int file)
> > > > > > > +                                     int active, int file,
> > > > > > > +                                     struct page **split_page)
> > > > > > >  {
> > > > > > >       unsigned long nr_taken = 0;
> > > > > > >       struct page *page;
> > > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > >               case 0:
> > > > > > >                       list_move(&page->lru, dst);
> > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > +                             nr_taken++;
> > > > > > > +                             *split_page = page;
> > > > > > > +                             goto out;
> > > > > > > +                     } else
> > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > +
> > > > > > >                       break;
> > > > > > >               case -EBUSY:
> > > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > >               }
> > > > > > >       }
> > > > > > >
> > > > > > > +out:
> > > > > > >       *scanned = scan;
> > > > > > >
> > > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > > ===================================================================
> > > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > > >                                       unsigned long *scanned, int order,
> > > > > > >                                       int mode, struct zone *z,
> > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > -                                     int active, int file);
> > > > > > > +                                     int active, int file,
> > > > > > > +                                     struct page **split_page);
> > > > > > >
> > > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > > >  /*
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > I saw the code. my concern is your patch could make unnecessary split of THP.
> > > > > >
> > > > > > When we isolates page, we can't know whether it's working set or not.
> > > > > > So split should happen after we judge it's working set page.
> > > > > yes, but since memory is big currently, it's unlikely the isolated page
> > > > > get accessed in the window. And I only did the split in
> > > >
> > > > We don't check page_reference when isolate happens.
> > > > Window which between isolation time and reclaim?
> > > > No. Window is from inactive's head to tail and it's the basic concept of
> > > > our LRU.
> > > >
> > > > > shrink_inactive_list, not in active list.
> > > >
> > > > But inactive list's size could be still big and
> > > > page reference heuristic is very important for reclaim algorithm.
> > > I mean pages aren't referenced. but ok, I can't take such assumption.
> > >
> > > > > And THP has mechanism to collapse small pages to huge page later.
> > > >
> > > > You mean "merge" instead of "collapse"?
> > > >
> > > > >
> > > > > > If you really want to merge this patch, I suggest that
> > > > > > we can handle it in shrink_page_list step, not isolation step.
> > > > > >
> > > > > > My totally untested code which is just to show the concept is as follows,
> > > > > I did consider this option before. It has its problem too. The isolation
> > > > > can isolate several huge page one time. And then later shrink_page_list
> > > > > can swap several huge page one time, which is unfortunate. I'm pretty
> > > > > sure this method can't reduce the thp_split count in my test. It could
> > > >
> > > > I understand your point but approach isn't good to me.
> > > > Maybe we can check whether we are going on or not before other THP page split happens
> > > > in shrink_page_list. If we split THP page successfully, maybe we can skip another THP split.
> > > > Another idea is we can avoid split of THP unless high order reclaim happens or low order
> > > > high priority pressure happens.
> > > I agreed the split better be done at shrink_page_list, but we must avoid
> > > isolate too many pages. I'll check if I can have a better solution for
> > > next post.
> > Let me try again.
> >
> > Subject: thp: improve huge page reclaim -v2
> >
> > With transparent huge page enabled, huge page will be split if it will
> > be reclaimed. With current logic, if page reclaim finds a huge page,
> > it will just reclaim the head page and leave tail pages reclaimed later.
> > Let's take an example, lru list has page A and B, page A is huge page:
> > 1. page A is isolated
> > 2. page B is isolated
> > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > page A+1, page A+2, ... are added to lru list.
> > 4. shrink_page_list() adds page B to swap page cache.
> > 5. page A and B is written out and reclaimed.
> > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > The worst case could be that we isolate/split 32 huge pages to try to reclaim
> > a huge page, but only the 32 head pages are reclaimed.
> >
> > We expected the whole huge page A is reclaimed in the meantime, so
> > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could avoid a lot
> > of unnecessary huge page splits and improve reclaim.
> >
> > With this patch, if a huge page is found during isolation, we stop isolating
> > further pages, since reclaiming the huge page alone can already reclaim more
> > pages than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split
> > and all tail pages are added to the isolation list, so the tail pages can
> > be reclaimed immediately.
> >
> > The drawback is that we might isolate fewer pages if a huge page is found,
> > but I think the benefit far outweighs the drawback.
> >
> > All the new code paths are behind PageTransHuge() checks, so this should
> > have no impact on normal cases.
> >
> > In a test, a range of anonymous memory is written and will trigger swap.
> > Without the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 451
> > thp_fault_fallback 0
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 238
> >
> > With the patch:
> > #cat /proc/vmstat|grep thp
> > thp_fault_alloc 451
> > thp_fault_fallback 0
> > thp_collapse_alloc 0
> > thp_collapse_alloc_failed 0
> > thp_split 76
> >
> > So the thp_split number is reduced a lot.
> >
> > v1->v2: Do the huge page split in shrink_page_list(). Some code is adopted
> > from Minchan's patch.
> >
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> >
> > ---
> >  include/linux/huge_mm.h    |    7 ++++++-
> >  include/linux/memcontrol.h |    3 ++-
> >  include/linux/swap.h       |    3 ++-
> >  mm/huge_memory.c           |   14 ++++++++------
> >  mm/memcontrol.c            |    6 +++++-
> >  mm/swap.c                  |   10 +++++++++-
> >  mm/swap_state.c            |    6 ------
> >  mm/vmscan.c                |   27 ++++++++++++++++++++-------
> >  8 files changed, 52 insertions(+), 24 deletions(-)
> >
> > Index: linux/include/linux/huge_mm.h
> > ===================================================================
> > --- linux.orig/include/linux/huge_mm.h        2011-11-02 09:48:16.000000000 +0800
> > +++ linux/include/linux/huge_mm.h     2011-11-02 10:06:33.000000000 +0800
> > @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
> >  extern int handle_pte_fault(struct mm_struct *mm,
> >                           struct vm_area_struct *vma, unsigned long address,
> >                           pte_t *pte, pmd_t *pmd, unsigned int flags);
> > -extern int split_huge_page(struct page *page);
> > +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> > +static inline int split_huge_page(struct page *page)
> > +{
> > +     return split_huge_page_list(page, NULL);
> > +}
> > +
> >  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
> >  #define split_huge_page_pmd(__mm, __pmd)                             \
> >       do {                                                            \
> > Index: linux/include/linux/swap.h
> > ===================================================================
> > --- linux.orig/include/linux/swap.h   2011-11-02 09:48:16.000000000 +0800
> > +++ linux/include/linux/swap.h        2011-11-02 10:06:33.000000000 +0800
> > @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
> >  extern void __lru_cache_add(struct page *, enum lru_list lru);
> >  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
> >  extern void lru_add_page_tail(struct zone* zone,
> > -                           struct page *page, struct page *page_tail);
> > +                           struct page *page, struct page *page_tail,
> > +                           struct list_head *dst);
> >  extern void activate_page(struct page *);
> >  extern void mark_page_accessed(struct page *);
> >  extern void lru_add_drain(void);
> > Index: linux/mm/huge_memory.c
> > ===================================================================
> > --- linux.orig/mm/huge_memory.c       2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/huge_memory.c    2011-11-02 10:58:21.000000000 +0800
> > @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
> >       return ret;
> >  }
> >
> > -static void __split_huge_page_refcount(struct page *page)
> > +static void __split_huge_page_refcount(struct page *page,
> > +                                    struct list_head *list)
> >  {
> >       int i;
> >       struct zone *zone = page_zone(page);
> > @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
> >
> >               mem_cgroup_split_huge_fixup(page, page_tail);
> >
> > -             lru_add_page_tail(zone, page, page_tail);
> > +             lru_add_page_tail(zone, page, page_tail, list);
> >       }
> >
> >       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> > @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
> >
> >  /* must be called with anon_vma->root->mutex hold */
> >  static void __split_huge_page(struct page *page,
> > -                           struct anon_vma *anon_vma)
> > +                           struct anon_vma *anon_vma,
> > +                           struct list_head *list)
> >  {
> >       int mapcount, mapcount2;
> >       struct anon_vma_chain *avc;
> > @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
> >                      mapcount, page_mapcount(page));
> >       BUG_ON(mapcount != page_mapcount(page));
> >
> > -     __split_huge_page_refcount(page);
> > +     __split_huge_page_refcount(page, list);
> >
> >       mapcount2 = 0;
> >       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> > @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
> >       BUG_ON(mapcount != mapcount2);
> >  }
> >
> > -int split_huge_page(struct page *page)
> > +int split_huge_page_list(struct page *page, struct list_head *list)
> >  {
> >       struct anon_vma *anon_vma;
> >       int ret = 1;
> > @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
> >               goto out_unlock;
> >
> >       BUG_ON(!PageSwapBacked(page));
> > -     __split_huge_page(page, anon_vma);
> > +     __split_huge_page(page, anon_vma, list);
> >       count_vm_event(THP_SPLIT);
> >
> >       BUG_ON(PageCompound(page));
> > Index: linux/mm/swap.c
> > ===================================================================
> > --- linux.orig/mm/swap.c      2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/swap.c   2011-11-02 10:06:33.000000000 +0800
> > @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
> >
> >  /* used by __split_huge_page_refcount() */
> >  void lru_add_page_tail(struct zone* zone,
> > -                    struct page *page, struct page *page_tail)
> > +                    struct page *page, struct page *page_tail,
> > +                    struct list_head *dst)
> >  {
> >       int active;
> >       enum lru_list lru;
> > @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
> >       VM_BUG_ON(PageLRU(page_tail));
> >       VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
> >
> > +     /* The huge page is isolated */
> > +     if (dst) {
> > +             get_page(page_tail);
> > +             list_add_tail(&page_tail->lru, dst);
> > +             return;
> > +     }
> > +
> >       SetPageLRU(page_tail);
> >
> >       if (page_evictable(page_tail, NULL)) {
> > Index: linux/mm/swap_state.c
> > ===================================================================
> > --- linux.orig/mm/swap_state.c        2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/swap_state.c     2011-11-02 10:06:33.000000000 +0800
> > @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
> >       if (!entry.val)
> >               return 0;
> >
> > -     if (unlikely(PageTransHuge(page)))
> > -             if (unlikely(split_huge_page(page))) {
> > -                     swapcache_free(entry, NULL);
> > -                     return 0;
> > -             }
> > -
> >       /*
> >        * Radix-tree node allocations from PF_MEMALLOC contexts could
> >        * completely exhaust the page allocator. __GFP_NOMEMALLOC
> > Index: linux/mm/vmscan.c
> > ===================================================================
> > --- linux.orig/mm/vmscan.c    2011-11-02 09:48:16.000000000 +0800
> > +++ linux/mm/vmscan.c 2011-11-02 10:58:21.000000000 +0800
> > @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
> >               if (PageAnon(page) && !PageSwapCache(page)) {
> >                       if (!(sc->gfp_mask & __GFP_IO))
> >                               goto keep_locked;
> > +                     if (unlikely(PageTransHuge(page)))
> > +                             if (unlikely(split_huge_page_list(page,
> > +                                     page_list)))
> > +                                 goto activate_locked;
> >                       if (!add_to_swap(page))
> >                               goto activate_locked;
> >                       may_enter_fs = 1;
> > @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
> >   */
> >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> >               struct list_head *src, struct list_head *dst,
> > -             unsigned long *scanned, int order, int mode, int file)
> > +             unsigned long *scanned, int order, int mode, int file,
> > +             bool break_on_thp)
> >  {
> 
> Sorry for the late response.
> These days, I am very busy with my new job.
Thanks for your time.

> Still, I don't like the surgery of the isolation part.
> What if we isolate a THP page but it is a working set page?
> Let's assume the following:
>
> 1. Isolate 32 pages
> 2. Unfortunately, the 1st page is a THP, so isolate_lru_page isolates just a
>    page (of course, it's 512 pages)
> 3. shrink_page_list sees that it's a working set page, but page_list
>    has just one page, so it has to isolate pages once more with higher priority.
That's possible. We might scan more pages, but this should not introduce more
THP splits, since isolation stops at the huge page. On the other hand, if
isolation doesn't break at the huge page, we can't split it and reclaim it
as a whole immediately. I didn't find a way to make both sides good. I
still think the benefit is bigger than the drawback.
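
To be concrete: with break_on_thp the isolation loop simply stops once a huge
page has been taken. The isolate_lru_pages() hunk isn't quoted above, so the
snippet below is only a sketch of the idea (break_on_thp and the out label are
assumed from the new prototype), not the exact patch:

	case 0:
		list_move(&page->lru, dst);
		mem_cgroup_del_lru(page);
		nr_taken += hpage_nr_pages(page);
		/* stop isolating here; the split happens later in shrink_page_list() */
		if (break_on_thp && PageTransHuge(page))
			goto out;
		break;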

> How about this?
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9fdfce7..8121415 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -960,7 +960,15 @@ free_it:
>                  * appear not as the counts should be low
>                  */
>                 list_add(&page->lru, &free_pages);
> -               continue;
> +
> +               /*
> +                * If we have reclaimed enough pages, let's cut it off.
> +                * It could prevent unnecessary THP split.
> +                */
> +               if (nr_reclaimed >= sc->nr_to_reclaim)
> +                       break;
> +               else
> +                       continue;
> 
>  cull_mlocked:
>                 if (PageSwapCache(page))
This doesn't work. The huge page is dirty, so it can't be reclaimed
immediately: pageout() only starts writeback on the split pages, they are not
freed in this pass, so nr_reclaimed never reaches the cut-off and the next
huge page would still be split.
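
For reference, a page that was just sent to swap takes roughly this path in
the current shrink_page_list(); this only paraphrases the existing code, it is
not part of any patch here:

	switch (pageout(page, mapping, sc)) {
	case PAGE_SUCCESS:
		/* the freshly written pages are still under writeback, so
		 * they are kept on the list rather than freed, and
		 * nr_reclaimed is not incremented in this pass */
		if (PageWriteback(page))
			goto keep_lumpy;
		if (PageDirty(page))
			goto keep;
		...
	}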

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-09  5:27               ` Shaohua Li
@ 2011-11-09  6:28                 ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-11-09  6:28 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Wed, Nov 09, 2011 at 01:27:55PM +0800, Shaohua Li wrote:
> On Tue, 2011-11-08 at 16:59 +0800, Minchan Kim wrote:
> > On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> > > On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > > > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > > > lru list has page A and B, page A is huge page:
> > > > > > > > 1. page A is isolated
> > > > > > > > 2. page B is isolated
> > > > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > > > 5. page A and B is written out and reclaimed.
> > > > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > > > >
> > > > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > > > >
> > > > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > > > >
> > > > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > > > Without the patch:
> > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > thp_fault_alloc 451
> > > > > > > > thp_fault_fallback 0
> > > > > > > > thp_collapse_alloc 0
> > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > thp_split 238
> > > > > > > >
> > > > > > > > With the patch:
> > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > thp_fault_alloc 450
> > > > > > > > thp_fault_fallback 1
> > > > > > > > thp_collapse_alloc 0
> > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > thp_split 103
> > > > > > > >
> > > > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > > > thp_fault_fallback.
> > > > > > > >
> > > > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > > > ---
> > > > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > > >
> > > > > > > > Index: linux/mm/vmscan.c
> > > > > > > > ===================================================================
> > > > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > > > >   */
> > > > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > > > +             struct page **split_page)
> > > > > > > >  {
> > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > > > >               case 0:
> > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > +                             nr_taken++;
> > > > > > > > +                             *split_page = page;
> > > > > > > > +                             goto out;
> > > > > > > > +                     } else
> > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > >                       break;
> > > > > > > >
> > > > > > > >               case -EBUSY:
> > > > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > > > >                               nr_lumpy_taken++;
> > > > > > > >                               if (PageDirty(cursor_page))
> > > > > > > >                                       nr_lumpy_dirty++;
> > > > > > > >                               scan++;
> > > > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > > > +                                     nr_taken++;
> > > > > > > > +                                     *split_page = page;
> > > > > > > > +                                     goto out;
> > > > > > > > +                             } else
> > > > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > > > >                       } else {
> > > > > > > >                               /*
> > > > > > > >                                * Check if the page is freed already.
> > > > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > > > >                       nr_lumpy_failed++;
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +out:
> > > > > > > >       *scanned = scan;
> > > > > > > >
> > > > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > > > >                                       struct list_head *dst,
> > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > -                                     int active, int file)
> > > > > > > > +                                     int active, int file,
> > > > > > > > +                                     struct page **split_page)
> > > > > > > >  {
> > > > > > > >       int lru = LRU_BASE;
> > > > > > > >       if (active)
> > > > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > > > >       if (file)
> > > > > > > >               lru += LRU_FILE;
> > > > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > > > -                                                             mode, file);
> > > > > > > > +                                                     mode, file, split_page);
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  /*
> > > > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >  {
> > > > > > > >       LIST_HEAD(page_list);
> > > > > > > >       unsigned long nr_scanned;
> > > > > > > > +     unsigned long total_scanned = 0;
> > > > > > > >       unsigned long nr_reclaimed = 0;
> > > > > > > >       unsigned long nr_taken;
> > > > > > > >       unsigned long nr_anon;
> > > > > > > >       unsigned long nr_file;
> > > > > > > > +     struct page *split_page;
> > > > > > > >
> > > > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > > > +again:
> > > > > > > >       lru_add_drain();
> > > > > > > > +     split_page = NULL;
> > > > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > > > >
> > > > > > > >       if (scanning_global_lru(sc)) {
> > > > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > > -                     zone, 0, file);
> > > > > > > > +                     zone, 0, file, &split_page);
> > > > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > >               if (current_is_kswapd())
> > > > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > > > >                                              nr_scanned);
> > > > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > > > >                                              nr_scanned);
> > > > > > > >       } else {
> > > > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > >                       zone, sc->mem_cgroup,
> > > > > > > > -                     0, file);
> > > > > > > > +                     0, file, &split_page);
> > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > >               /*
> > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > >                * scanned pages on its own.
> > > > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > > > >               return 0;
> > > > > > > >       }
> > > > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > > > +             split_huge_page(split_page);
> > > > > > > > +             goto again;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > > > >
> > > > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > > > >
> > > > > > > > +     if (split_page)
> > > > > > > > +             split_huge_page(split_page);
> > > > > > > > +
> > > > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > > > >
> > > > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > -                                             1, file);
> > > > > > > > +                                             1, file, NULL);
> > > > > > > >               zone->pages_scanned += pgscanned;
> > > > > > > >       } else {
> > > > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > > > >               /*
> > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > >                * scanned pages on its own.
> > > > > > > > Index: linux/mm/memcontrol.c
> > > > > > > > ===================================================================
> > > > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > >                                       int mode, struct zone *z,
> > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > -                                     int active, int file)
> > > > > > > > +                                     int active, int file,
> > > > > > > > +                                     struct page **split_page)
> > > > > > > >  {
> > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > >       struct page *page;
> > > > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > >               case 0:
> > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > +                             nr_taken++;
> > > > > > > > +                             *split_page = page;
> > > > > > > > +                             goto out;
> > > > > > > > +                     } else
> > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > > +
> > > > > > > >                       break;
> > > > > > > >               case -EBUSY:
> > > > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +out:
> > > > > > > >       *scanned = scan;
> > > > > > > >
> > > > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > > > ===================================================================
> > > > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > >                                       int mode, struct zone *z,
> > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > -                                     int active, int file);
> > > > > > > > +                                     int active, int file,
> > > > > > > > +                                     struct page **split_page);
> > > > > > > >
> > > > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > > > >  /*
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > > I saw the code. my concern is your patch could make unnecessary split of THP.
> > > > > > >
> > > > > > > When we isolates page, we can't know whether it's working set or not.
> > > > > > > So split should happen after we judge it's working set page.
> > > > > > yes, but since memory is big currently, it's unlikely the isolated page
> > > > > > get accessed in the window. And I only did the split in
> > > > >
> > > > > We don't check page_reference when isolate happens.
> > > > > Window which between isolation time and reclaim?
> > > > > No. Window is from inactive's head to tail and it's the basic concept of
> > > > > our LRU.
> > > > >
> > > > > > shrink_inactive_list, not in active list.
> > > > >
> > > > > But inactive list's size could be still big and
> > > > > page reference heuristic is very important for reclaim algorithm.
> > > > I mean pages aren't referenced. but ok, I can't take such assumption.
> > > >
> > > > > > And THP has mechanism to collapse small pages to huge page later.
> > > > >
> > > > > You mean "merge" instead of "collapse"?
> > > > >
> > > > > >
> > > > > > > If you really want to merge this patch, I suggest that
> > > > > > > we can handle it in shrink_page_list step, not isolation step.
> > > > > > >
> > > > > > > My totally untested code which is just to show the concept is as follows,
> > > > > > I did consider this option before. It has its problem too. The isolation
> > > > > > can isolate several huge page one time. And then later shrink_page_list
> > > > > > can swap several huge page one time, which is unfortunate. I'm pretty
> > > > > > sure this method can't reduce the thp_split count in my test. It could
> > > > >
> > > > > I understand your point but approach isn't good to me.
> > > > > Maybe we can check whether we are going on or not before other THP page split happens
> > > > > in shrink_page_list. If we split THP page successfully, maybe we can skip another THP split.
> > > > > Another idea is we can avoid split of THP unless high order reclaim happens or low order
> > > > > high priority pressure happens.
> > > > I agreed the split better be done at shrink_page_list, but we must avoid
> > > > isolate too many pages. I'll check if I can have a better solution for
> > > > next post.
> > > Let me try again.
> > >
> > > Subject: thp: improve huge page reclaim -v2
> > >
> > > With transparent huge page enabled, huge page will be split if it will
> > > be reclaimed. With current logic, if page reclaim finds a huge page,
> > > it will just reclaim the head page and leave tail pages reclaimed later.
> > > Let's take an example, lru list has page A and B, page A is huge page:
> > > 1. page A is isolated
> > > 2. page B is isolated
> > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > page A+1, page A+2, ... are added to lru list.
> > > 4. shrink_page_list() adds page B to swap page cache.
> > > 5. page A and B is written out and reclaimed.
> > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > The worst case could be that we isolate/split 32 huge pages to try to reclaim
> > > a huge page, but only the 32 head pages are reclaimed.
> > >
> > > We expected the whole huge page A is reclaimed in the meantime, so
> > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could avoid a lot
> > > of unnecessary huge page splits and improve reclaim.
> > >
> > > With this patch, if a huge page is found during isolation, we stop isolating
> > > further pages, since reclaiming the huge page alone can already reclaim more
> > > pages than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split
> > > and all tail pages are added to the isolation list, so the tail pages can
> > > be reclaimed immediately.
> > >
> > > The drawback is that we might isolate fewer pages if a huge page is found,
> > > but I think the benefit far outweighs the drawback.
> > >
> > > All the new code paths are behind PageTransHuge() checks, so this should
> > > have no impact on normal cases.
> > >
> > > In a test, a range of anonymous memory is written and will trigger swap.
> > > Without the patch:
> > > #cat /proc/vmstat|grep thp
> > > thp_fault_alloc 451
> > > thp_fault_fallback 0
> > > thp_collapse_alloc 0
> > > thp_collapse_alloc_failed 0
> > > thp_split 238
> > >
> > > With the patch:
> > > #cat /proc/vmstat|grep thp
> > > thp_fault_alloc 451
> > > thp_fault_fallback 0
> > > thp_collapse_alloc 0
> > > thp_collapse_alloc_failed 0
> > > thp_split 76
> > >
> > > So the thp_split number is reduced a lot.
> > >
> > > v1->v2: Do the huge page split in shrink_page_list(). Some code is adopted
> > > from Minchan's patch.
> > >
> > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > >
> > > ---
> > >  include/linux/huge_mm.h    |    7 ++++++-
> > >  include/linux/memcontrol.h |    3 ++-
> > >  include/linux/swap.h       |    3 ++-
> > >  mm/huge_memory.c           |   14 ++++++++------
> > >  mm/memcontrol.c            |    6 +++++-
> > >  mm/swap.c                  |   10 +++++++++-
> > >  mm/swap_state.c            |    6 ------
> > >  mm/vmscan.c                |   27 ++++++++++++++++++++-------
> > >  8 files changed, 52 insertions(+), 24 deletions(-)
> > >
> > > Index: linux/include/linux/huge_mm.h
> > > ===================================================================
> > > --- linux.orig/include/linux/huge_mm.h        2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/include/linux/huge_mm.h     2011-11-02 10:06:33.000000000 +0800
> > > @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
> > >  extern int handle_pte_fault(struct mm_struct *mm,
> > >                           struct vm_area_struct *vma, unsigned long address,
> > >                           pte_t *pte, pmd_t *pmd, unsigned int flags);
> > > -extern int split_huge_page(struct page *page);
> > > +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> > > +static inline int split_huge_page(struct page *page)
> > > +{
> > > +     return split_huge_page_list(page, NULL);
> > > +}
> > > +
> > >  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
> > >  #define split_huge_page_pmd(__mm, __pmd)                             \
> > >       do {                                                            \
> > > Index: linux/include/linux/swap.h
> > > ===================================================================
> > > --- linux.orig/include/linux/swap.h   2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/include/linux/swap.h        2011-11-02 10:06:33.000000000 +0800
> > > @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
> > >  extern void __lru_cache_add(struct page *, enum lru_list lru);
> > >  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
> > >  extern void lru_add_page_tail(struct zone* zone,
> > > -                           struct page *page, struct page *page_tail);
> > > +                           struct page *page, struct page *page_tail,
> > > +                           struct list_head *dst);
> > >  extern void activate_page(struct page *);
> > >  extern void mark_page_accessed(struct page *);
> > >  extern void lru_add_drain(void);
> > > Index: linux/mm/huge_memory.c
> > > ===================================================================
> > > --- linux.orig/mm/huge_memory.c       2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/huge_memory.c    2011-11-02 10:58:21.000000000 +0800
> > > @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
> > >       return ret;
> > >  }
> > >
> > > -static void __split_huge_page_refcount(struct page *page)
> > > +static void __split_huge_page_refcount(struct page *page,
> > > +                                    struct list_head *list)
> > >  {
> > >       int i;
> > >       struct zone *zone = page_zone(page);
> > > @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
> > >
> > >               mem_cgroup_split_huge_fixup(page, page_tail);
> > >
> > > -             lru_add_page_tail(zone, page, page_tail);
> > > +             lru_add_page_tail(zone, page, page_tail, list);
> > >       }
> > >
> > >       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> > > @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
> > >
> > >  /* must be called with anon_vma->root->mutex hold */
> > >  static void __split_huge_page(struct page *page,
> > > -                           struct anon_vma *anon_vma)
> > > +                           struct anon_vma *anon_vma,
> > > +                           struct list_head *list)
> > >  {
> > >       int mapcount, mapcount2;
> > >       struct anon_vma_chain *avc;
> > > @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
> > >                      mapcount, page_mapcount(page));
> > >       BUG_ON(mapcount != page_mapcount(page));
> > >
> > > -     __split_huge_page_refcount(page);
> > > +     __split_huge_page_refcount(page, list);
> > >
> > >       mapcount2 = 0;
> > >       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> > > @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
> > >       BUG_ON(mapcount != mapcount2);
> > >  }
> > >
> > > -int split_huge_page(struct page *page)
> > > +int split_huge_page_list(struct page *page, struct list_head *list)
> > >  {
> > >       struct anon_vma *anon_vma;
> > >       int ret = 1;
> > > @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
> > >               goto out_unlock;
> > >
> > >       BUG_ON(!PageSwapBacked(page));
> > > -     __split_huge_page(page, anon_vma);
> > > +     __split_huge_page(page, anon_vma, list);
> > >       count_vm_event(THP_SPLIT);
> > >
> > >       BUG_ON(PageCompound(page));
> > > Index: linux/mm/swap.c
> > > ===================================================================
> > > --- linux.orig/mm/swap.c      2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/swap.c   2011-11-02 10:06:33.000000000 +0800
> > > @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
> > >
> > >  /* used by __split_huge_page_refcount() */
> > >  void lru_add_page_tail(struct zone* zone,
> > > -                    struct page *page, struct page *page_tail)
> > > +                    struct page *page, struct page *page_tail,
> > > +                    struct list_head *dst)
> > >  {
> > >       int active;
> > >       enum lru_list lru;
> > > @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
> > >       VM_BUG_ON(PageLRU(page_tail));
> > >       VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
> > >
> > > +     /* The huge page is isolated */
> > > +     if (dst) {
> > > +             get_page(page_tail);
> > > +             list_add_tail(&page_tail->lru, dst);
> > > +             return;
> > > +     }
> > > +
> > >       SetPageLRU(page_tail);
> > >
> > >       if (page_evictable(page_tail, NULL)) {
> > > Index: linux/mm/swap_state.c
> > > ===================================================================
> > > --- linux.orig/mm/swap_state.c        2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/swap_state.c     2011-11-02 10:06:33.000000000 +0800
> > > @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
> > >       if (!entry.val)
> > >               return 0;
> > >
> > > -     if (unlikely(PageTransHuge(page)))
> > > -             if (unlikely(split_huge_page(page))) {
> > > -                     swapcache_free(entry, NULL);
> > > -                     return 0;
> > > -             }
> > > -
> > >       /*
> > >        * Radix-tree node allocations from PF_MEMALLOC contexts could
> > >        * completely exhaust the page allocator. __GFP_NOMEMALLOC
> > > Index: linux/mm/vmscan.c
> > > ===================================================================
> > > --- linux.orig/mm/vmscan.c    2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/vmscan.c 2011-11-02 10:58:21.000000000 +0800
> > > @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
> > >               if (PageAnon(page) && !PageSwapCache(page)) {
> > >                       if (!(sc->gfp_mask & __GFP_IO))
> > >                               goto keep_locked;
> > > +                     if (unlikely(PageTransHuge(page)))
> > > +                             if (unlikely(split_huge_page_list(page,
> > > +                                     page_list)))
> > > +                                 goto activate_locked;
> > >                       if (!add_to_swap(page))
> > >                               goto activate_locked;
> > >                       may_enter_fs = 1;
> > > @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
> > >   */
> > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > >               struct list_head *src, struct list_head *dst,
> > > -             unsigned long *scanned, int order, int mode, int file)
> > > +             unsigned long *scanned, int order, int mode, int file,
> > > +             bool break_on_thp)
> > >  {
> > 
> > Sorry for the late response.
> > These days, I am very busy with my new job.
> Thanks for your time.

NP.

> 
> > Still, I don't like the surgery of the isolation part.
> > What if we isolate a THP page but it is a working set page?
> > Let's assume the following:
> >
> > 1. Isolate 32 pages
> > 2. Unfortunately, the 1st page is a THP, so isolate_lru_page isolates just a
> >    page (of course, it's 512 pages)
> > 3. shrink_page_list sees that it's a working set page, but page_list
> >    has just one page, so it has to isolate pages once more with higher priority.
> That's possible. We might scan more pages, but this should not introduce more
> THP splits, since isolation stops at the huge page. On the other hand, if
> isolation doesn't break at the huge page, we can't split it and reclaim it
> as a whole immediately. I didn't find a way to make both sides good. I
> still think the benefit is bigger than the drawback.

I really would like to fix the problem, too.

> 
> > How about this?
> > 
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 9fdfce7..8121415 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -960,7 +960,15 @@ free_it:
> >                  * appear not as the counts should be low
> >                  */
> >                 list_add(&page->lru, &free_pages);
> > -               continue;
> > +
> > +               /*
> > +                * If we have reclaimed enough pages, let's cut it off.
> > +                * It could prevent unnecessary THP split.
> > +                */
> > +               if (nr_reclaimed >= sc->nr_to_reclaim)
> > +                       break;
> > +               else
> > +                       continue;
> > 
> >  cull_mlocked:
> >                 if (PageSwapCache(page))
> This doesn't work. The huge page is dirty, so it can't be reclaimed
> immediately.


Couldn't we make both sides good?

Here is my quick patch.
How about this?
It doesn't split THPs in page_list but still reclaims non-THPs, so
I think it doesn't change the old behavior a lot.

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 23256e8..54790ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -769,6 +769,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_writeback = 0;
+	bool split_thp = false;
+	bool swapout_thp = false;
 
 	cond_resched();
 
@@ -786,6 +788,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (!trylock_page(page))
 			goto keep;
 
+		/*
+		 * If we already swapped out a THP, we don't want to
+		 * split any more THPs. Let's wait until the dirty THP
+		 * pages have been written to the swap device.
+		 */
+		if (unlikely(swapout_thp && PageTransHuge(page)))
+			goto cull_mlocked;
+
 		VM_BUG_ON(PageActive(page));
 		VM_BUG_ON(page_zone(page) != zone);
 
@@ -839,6 +849,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!(sc->gfp_mask & __GFP_IO))
 				goto keep_locked;
+			if (unlikely(PageTransHuge(page)))
+				if (unlikely(split_huge_page_list(page,
+					page_list)))
+				    goto activate_locked;
+				else
+					split_thp = true;
 			if (!add_to_swap(page))
 				goto activate_locked;
 			may_enter_fs = 1;
@@ -900,6 +916,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			case PAGE_ACTIVATE:
 				goto activate_locked;
 			case PAGE_SUCCESS:
+				if (split_thp)
+					swapout_thp = true;
 				if (PageWriteback(page))
 					goto keep_lumpy;
 				if (PageDirty(page))


> 
> Thanks,
> Shaohua
> 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
@ 2011-11-09  6:28                 ` Minchan Kim
  0 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-11-09  6:28 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Wed, Nov 09, 2011 at 01:27:55PM +0800, Shaohua Li wrote:
> On Tue, 2011-11-08 at 16:59 +0800, Minchan Kim wrote:
> > On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> > > On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > > > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > > > lru list has page A and B, page A is huge page:
> > > > > > > > 1. page A is isolated
> > > > > > > > 2. page B is isolated
> > > > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > > > 5. page A and B is written out and reclaimed.
> > > > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > > > >
> > > > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > > > >
> > > > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > > > >
> > > > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > > > Without the patch:
> > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > thp_fault_alloc 451
> > > > > > > > thp_fault_fallback 0
> > > > > > > > thp_collapse_alloc 0
> > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > thp_split 238
> > > > > > > >
> > > > > > > > With the patch:
> > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > thp_fault_alloc 450
> > > > > > > > thp_fault_fallback 1
> > > > > > > > thp_collapse_alloc 0
> > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > thp_split 103
> > > > > > > >
> > > > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > > > thp_fault_fallback.
> > > > > > > >
> > > > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > > > ---
> > > > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > > >
> > > > > > > > Index: linux/mm/vmscan.c
> > > > > > > > ===================================================================
> > > > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > > > >   */
> > > > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > > > +             struct page **split_page)
> > > > > > > >  {
> > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > > > >               case 0:
> > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > +                             nr_taken++;
> > > > > > > > +                             *split_page = page;
> > > > > > > > +                             goto out;
> > > > > > > > +                     } else
> > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > >                       break;
> > > > > > > >
> > > > > > > >               case -EBUSY:
> > > > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > > > >                               nr_lumpy_taken++;
> > > > > > > >                               if (PageDirty(cursor_page))
> > > > > > > >                                       nr_lumpy_dirty++;
> > > > > > > >                               scan++;
> > > > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > > > +                                     nr_taken++;
> > > > > > > > +                                     *split_page = page;
> > > > > > > > +                                     goto out;
> > > > > > > > +                             } else
> > > > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > > > >                       } else {
> > > > > > > >                               /*
> > > > > > > >                                * Check if the page is freed already.
> > > > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > > > >                       nr_lumpy_failed++;
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +out:
> > > > > > > >       *scanned = scan;
> > > > > > > >
> > > > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > > > >                                       struct list_head *dst,
> > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > -                                     int active, int file)
> > > > > > > > +                                     int active, int file,
> > > > > > > > +                                     struct page **split_page)
> > > > > > > >  {
> > > > > > > >       int lru = LRU_BASE;
> > > > > > > >       if (active)
> > > > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > > > >       if (file)
> > > > > > > >               lru += LRU_FILE;
> > > > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > > > -                                                             mode, file);
> > > > > > > > +                                                     mode, file, split_page);
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  /*
> > > > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >  {
> > > > > > > >       LIST_HEAD(page_list);
> > > > > > > >       unsigned long nr_scanned;
> > > > > > > > +     unsigned long total_scanned = 0;
> > > > > > > >       unsigned long nr_reclaimed = 0;
> > > > > > > >       unsigned long nr_taken;
> > > > > > > >       unsigned long nr_anon;
> > > > > > > >       unsigned long nr_file;
> > > > > > > > +     struct page *split_page;
> > > > > > > >
> > > > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > > > +again:
> > > > > > > >       lru_add_drain();
> > > > > > > > +     split_page = NULL;
> > > > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > > > >
> > > > > > > >       if (scanning_global_lru(sc)) {
> > > > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > > -                     zone, 0, file);
> > > > > > > > +                     zone, 0, file, &split_page);
> > > > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > >               if (current_is_kswapd())
> > > > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > > > >                                              nr_scanned);
> > > > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > > > >                                              nr_scanned);
> > > > > > > >       } else {
> > > > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > >                       zone, sc->mem_cgroup,
> > > > > > > > -                     0, file);
> > > > > > > > +                     0, file, &split_page);
> > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > >               /*
> > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > >                * scanned pages on its own.
> > > > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > > > >               return 0;
> > > > > > > >       }
> > > > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > > > +             split_huge_page(split_page);
> > > > > > > > +             goto again;
> > > > > > > > +     }
> > > > > > > >
> > > > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > > > >
> > > > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > > > >
> > > > > > > > +     if (split_page)
> > > > > > > > +             split_huge_page(split_page);
> > > > > > > > +
> > > > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > > > >
> > > > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > -                                             1, file);
> > > > > > > > +                                             1, file, NULL);
> > > > > > > >               zone->pages_scanned += pgscanned;
> > > > > > > >       } else {
> > > > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > > > >               /*
> > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > >                * scanned pages on its own.
> > > > > > > > Index: linux/mm/memcontrol.c
> > > > > > > > ===================================================================
> > > > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > >                                       int mode, struct zone *z,
> > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > -                                     int active, int file)
> > > > > > > > +                                     int active, int file,
> > > > > > > > +                                     struct page **split_page)
> > > > > > > >  {
> > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > >       struct page *page;
> > > > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > >               case 0:
> > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > +                             nr_taken++;
> > > > > > > > +                             *split_page = page;
> > > > > > > > +                             goto out;
> > > > > > > > +                     } else
> > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > > +
> > > > > > > >                       break;
> > > > > > > >               case -EBUSY:
> > > > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +out:
> > > > > > > >       *scanned = scan;
> > > > > > > >
> > > > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > > > ===================================================================
> > > > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > >                                       int mode, struct zone *z,
> > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > -                                     int active, int file);
> > > > > > > > +                                     int active, int file,
> > > > > > > > +                                     struct page **split_page);
> > > > > > > >
> > > > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > > > >  /*
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > > I saw the code. My concern is that your patch could cause unnecessary THP splits.
> > > > > > >
> > > > > > > When we isolate a page, we can't know whether it's a working-set page or not.
> > > > > > > So the split should happen only after we have judged that it is not working set.
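For reference, here is a rough sketch (from memory of the vmscan code of this
era, so treat it as illustrative rather than authoritative) of where that
working-set judgement sits in shrink_page_list(); only after
page_check_references() do we know the page will really be reclaimed, which is
the point where a THP split cannot be wasted on a hot page:

    /* simplified fragment of the shrink_page_list() loop */
    references = page_check_references(page, sc);
    switch (references) {
    case PAGEREF_ACTIVATE:
            /* working set: goes back to the active list, no split wanted */
            goto activate_locked;
    case PAGEREF_KEEP:
            goto keep_locked;
    case PAGEREF_RECLAIM:
    case PAGEREF_RECLAIM_CLEAN:
            /* only now do we commit to reclaiming (and possibly splitting) */
            break;
    }
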
> > > > > > Yes, but since memory is large these days, it's unlikely the isolated page
> > > > > > gets accessed in that window. And I only did the split in
> > > > >
> > > > > We don't check page references when isolation happens.
> > > > > The window between isolation time and reclaim?
> > > > > No. The window is from the inactive list's head to its tail, and that's the
> > > > > basic concept of our LRU.
> > > > >
> > > > > > shrink_inactive_list, not in active list.
> > > > >
> > > > > But the inactive list could still be big, and
> > > > > the page-reference heuristic is very important for the reclaim algorithm.
> > > > I mean the pages aren't referenced. But OK, I can't make that assumption.
> > > >
> > > > > > And THP has a mechanism to collapse small pages into a huge page later.
> > > > >
> > > > > You mean "merge" instead of "collapse"?
> > > > >
> > > > > >
> > > > > > > If you really want to merge this patch, I suggest that
> > > > > > > we can handle it in shrink_page_list step, not isolation step.
> > > > > > >
> > > > > > > My totally untested code which is just to show the concept is as follows,
> > > > > > I did consider this option before. It has its own problem, too. The isolation
> > > > > > can isolate several huge pages at one time. And then later shrink_page_list
> > > > > > can swap out several huge pages at one time, which is unfortunate. I'm pretty
> > > > > > sure this method can't reduce the thp_split count in my test. It could
> > > > >
> > > > > I understand your point, but the approach doesn't look good to me.
> > > > > Maybe we can check whether we should keep going before another THP split happens
> > > > > in shrink_page_list. If we have already split a THP successfully, maybe we can skip further THP splits.
> > > > > Another idea is to avoid splitting THPs unless high-order reclaim happens, or low-order
> > > > > reclaim happens under high pressure.
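A minimal sketch of that second idea (only an illustration of the heuristic;
the helper name and the priority threshold are made up here, and this is not a
tested patch):

    /*
     * Only allow a THP split when the reclaim actually asks for
     * high-order pages, or when pressure is real (priority counts
     * down from DEF_PRIORITY as pressure grows).
     */
    static bool may_split_thp(struct scan_control *sc, int priority)
    {
            if (sc->order >= HPAGE_PMD_ORDER)
                    return true;
            return priority < DEF_PRIORITY / 2;
    }
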
> > > > I agree the split is better done in shrink_page_list, but we must avoid
> > > > isolating too many pages. I'll check whether I can find a better solution for
> > > > the next post.
> > > Let me try again.
> > >
> > > Subject: thp: improve huge page reclaim -v2
> > >
> > > With transparent huge page enabled, huge page will be split if it will
> > > be reclaimed. With current logic, if page reclaim finds a huge page,
> > > it will just reclaim the head page and leave tail pages reclaimed later.
> > > Let's take an example, lru list has page A and B, page A is huge page:
> > > 1. page A is isolated
> > > 2. page B is isolated
> > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > page A+1, page A+2, ... are added to lru list.
> > > 4. shrink_page_list() adds page B to swap page cache.
> > > 5. page A and B is written out and reclaimed.
> > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > In the worst case we isolate/split 32 huge pages while trying to reclaim
> > > a huge page, but only the 32 head pages are actually reclaimed.
> > >
> > > We expected the whole huge page A is reclaimed in the meantime, so
> > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could reduce a lot
> > > of unnecessary huge page split and improve the reclaim.
> > >
> > > With this patch, if a huge page is found during isolation, we stop isolating
> > > further pages, since reclaiming that huge page alone already reclaims more pages
> > > than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split and
> > > all tail pages will be added to the isolation list, so the tail pages can
> > > be reclaimed immediately.
> > >
> > > The drawback is that we might isolate fewer pages if a huge page is found, but
> > > I think the benefit far outweighs the drawback.
> > >
> > > All the new code paths are guarded by PageTransHuge(), so there should be no impact
> > > on normal cases.
> > >
> > > In a test, a range of anonymous memory is written and will trigger swap.
> > > Without the patch:
> > > #cat /proc/vmstat|grep thp
> > > thp_fault_alloc 451
> > > thp_fault_fallback 0
> > > thp_collapse_alloc 0
> > > thp_collapse_alloc_failed 0
> > > thp_split 238
> > >
> > > With the patch:
> > > #cat /proc/vmstat|grep thp
> > > thp_fault_alloc 451
> > > thp_fault_fallback 0
> > > thp_collapse_alloc 0
> > > thp_collapse_alloc_failed 0
> > > thp_split 76
> > >
> > > So the thp_split number is reduced a lot.
> > >
> > > v1->v2: Do the huge page split in shrink_page_list(). Some code is adapted from
> > > Minchan's patch.
> > >
> > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > >
> > > ---
> > >  include/linux/huge_mm.h    |    7 ++++++-
> > >  include/linux/memcontrol.h |    3 ++-
> > >  include/linux/swap.h       |    3 ++-
> > >  mm/huge_memory.c           |   14 ++++++++------
> > >  mm/memcontrol.c            |    6 +++++-
> > >  mm/swap.c                  |   10 +++++++++-
> > >  mm/swap_state.c            |    6 ------
> > >  mm/vmscan.c                |   27 ++++++++++++++++++++-------
> > >  8 files changed, 52 insertions(+), 24 deletions(-)
> > >
> > > Index: linux/include/linux/huge_mm.h
> > > ===================================================================
> > > --- linux.orig/include/linux/huge_mm.h        2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/include/linux/huge_mm.h     2011-11-02 10:06:33.000000000 +0800
> > > @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
> > >  extern int handle_pte_fault(struct mm_struct *mm,
> > >                           struct vm_area_struct *vma, unsigned long address,
> > >                           pte_t *pte, pmd_t *pmd, unsigned int flags);
> > > -extern int split_huge_page(struct page *page);
> > > +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> > > +static inline int split_huge_page(struct page *page)
> > > +{
> > > +     return split_huge_page_list(page, NULL);
> > > +}
> > > +
> > >  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
> > >  #define split_huge_page_pmd(__mm, __pmd)                             \
> > >       do {                                                            \
> > > Index: linux/include/linux/swap.h
> > > ===================================================================
> > > --- linux.orig/include/linux/swap.h   2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/include/linux/swap.h        2011-11-02 10:06:33.000000000 +0800
> > > @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
> > >  extern void __lru_cache_add(struct page *, enum lru_list lru);
> > >  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
> > >  extern void lru_add_page_tail(struct zone* zone,
> > > -                           struct page *page, struct page *page_tail);
> > > +                           struct page *page, struct page *page_tail,
> > > +                           struct list_head *dst);
> > >  extern void activate_page(struct page *);
> > >  extern void mark_page_accessed(struct page *);
> > >  extern void lru_add_drain(void);
> > > Index: linux/mm/huge_memory.c
> > > ===================================================================
> > > --- linux.orig/mm/huge_memory.c       2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/huge_memory.c    2011-11-02 10:58:21.000000000 +0800
> > > @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
> > >       return ret;
> > >  }
> > >
> > > -static void __split_huge_page_refcount(struct page *page)
> > > +static void __split_huge_page_refcount(struct page *page,
> > > +                                    struct list_head *list)
> > >  {
> > >       int i;
> > >       struct zone *zone = page_zone(page);
> > > @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
> > >
> > >               mem_cgroup_split_huge_fixup(page, page_tail);
> > >
> > > -             lru_add_page_tail(zone, page, page_tail);
> > > +             lru_add_page_tail(zone, page, page_tail, list);
> > >       }
> > >
> > >       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> > > @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
> > >
> > >  /* must be called with anon_vma->root->mutex hold */
> > >  static void __split_huge_page(struct page *page,
> > > -                           struct anon_vma *anon_vma)
> > > +                           struct anon_vma *anon_vma,
> > > +                           struct list_head *list)
> > >  {
> > >       int mapcount, mapcount2;
> > >       struct anon_vma_chain *avc;
> > > @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
> > >                      mapcount, page_mapcount(page));
> > >       BUG_ON(mapcount != page_mapcount(page));
> > >
> > > -     __split_huge_page_refcount(page);
> > > +     __split_huge_page_refcount(page, list);
> > >
> > >       mapcount2 = 0;
> > >       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> > > @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
> > >       BUG_ON(mapcount != mapcount2);
> > >  }
> > >
> > > -int split_huge_page(struct page *page)
> > > +int split_huge_page_list(struct page *page, struct list_head *list)
> > >  {
> > >       struct anon_vma *anon_vma;
> > >       int ret = 1;
> > > @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
> > >               goto out_unlock;
> > >
> > >       BUG_ON(!PageSwapBacked(page));
> > > -     __split_huge_page(page, anon_vma);
> > > +     __split_huge_page(page, anon_vma, list);
> > >       count_vm_event(THP_SPLIT);
> > >
> > >       BUG_ON(PageCompound(page));
> > > Index: linux/mm/swap.c
> > > ===================================================================
> > > --- linux.orig/mm/swap.c      2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/swap.c   2011-11-02 10:06:33.000000000 +0800
> > > @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
> > >
> > >  /* used by __split_huge_page_refcount() */
> > >  void lru_add_page_tail(struct zone* zone,
> > > -                    struct page *page, struct page *page_tail)
> > > +                    struct page *page, struct page *page_tail,
> > > +                    struct list_head *dst)
> > >  {
> > >       int active;
> > >       enum lru_list lru;
> > > @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
> > >       VM_BUG_ON(PageLRU(page_tail));
> > >       VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
> > >
> > > +     /* The huge page is isolated */
> > > +     if (dst) {
> > > +             get_page(page_tail);
> > > +             list_add_tail(&page_tail->lru, dst);
> > > +             return;
> > > +     }
> > > +
> > >       SetPageLRU(page_tail);
> > >
> > >       if (page_evictable(page_tail, NULL)) {
> > > Index: linux/mm/swap_state.c
> > > ===================================================================
> > > --- linux.orig/mm/swap_state.c        2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/swap_state.c     2011-11-02 10:06:33.000000000 +0800
> > > @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
> > >       if (!entry.val)
> > >               return 0;
> > >
> > > -     if (unlikely(PageTransHuge(page)))
> > > -             if (unlikely(split_huge_page(page))) {
> > > -                     swapcache_free(entry, NULL);
> > > -                     return 0;
> > > -             }
> > > -
> > >       /*
> > >        * Radix-tree node allocations from PF_MEMALLOC contexts could
> > >        * completely exhaust the page allocator. __GFP_NOMEMALLOC
> > > Index: linux/mm/vmscan.c
> > > ===================================================================
> > > --- linux.orig/mm/vmscan.c    2011-11-02 09:48:16.000000000 +0800
> > > +++ linux/mm/vmscan.c 2011-11-02 10:58:21.000000000 +0800
> > > @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
> > >               if (PageAnon(page) && !PageSwapCache(page)) {
> > >                       if (!(sc->gfp_mask & __GFP_IO))
> > >                               goto keep_locked;
> > > +                     if (unlikely(PageTransHuge(page)))
> > > +                             if (unlikely(split_huge_page_list(page,
> > > +                                     page_list)))
> > > +                                 goto activate_locked;
> > >                       if (!add_to_swap(page))
> > >                               goto activate_locked;
> > >                       may_enter_fs = 1;
> > > @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
> > >   */
> > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > >               struct list_head *src, struct list_head *dst,
> > > -             unsigned long *scanned, int order, int mode, int file)
> > > +             unsigned long *scanned, int order, int mode, int file,
> > > +             bool break_on_thp)
> > >  {
> > 
> > Sorry for the late response.
> > These days, I am very busy with my new job.
> Thanks for your time.

NP.

> 
> > Still, I don't like surgery on the isolation part.
> > What if we isolate a THP page but it is a working-set page?
> > Let's assume the following:
> >
> > 1. Isolate 32 pages.
> > 2. Unfortunately, the 1st page is a THP, so isolate_lru_pages isolates just that
> >    one page (of course, it backs 512 pages).
> > 3. shrink_page_list sees that it's a working-set page, but page_list
> >    has just that one page, so we have to isolate pages once more with higher priority.
> That's possible. We might scan more pages, but it should not introduce more
> THP splits, since isolation stops at the huge page. On the other hand, if
> isolation doesn't break at the huge page, we can't split it and reclaim it
> as a whole immediately. I haven't found a way to make both sides good. I
> still think the benefit is bigger than the drawback.

I really would like to fix the problem, too.

> 
> > How about this?
> > 
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 9fdfce7..8121415 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -960,7 +960,15 @@ free_it:
> >                  * appear not as the counts should be low
> >                  */
> >                 list_add(&page->lru, &free_pages);
> > -               continue;
> > +
> > +               /*
> > +                * If we have reclaimed enough pages, let's cut it off.
> > +                * It could prevent unnecessary THP splits.
> > +                */
> > +               if (nr_reclaimed >= sc->nr_to_reclaim)
> > +                       break;
> > +               else
> > +                       continue;
> > 
> >  cull_mlocked:
> >                 if (PageSwapCache(page))
> This doesn't work. The huge page is dirty, so it can't be reclaimed
> immediately: it still has to go through pageout() and writeback first, so
> nr_reclaimed doesn't grow in the same pass and the cut-off never triggers.


Couldn't we make both sides good?

Here is my quick patch.
How about this?
It doesn't split any more THPs in page_list once one THP has been swapped out,
but it still reclaims non-THP pages, so I think it doesn't change the old behavior a lot.

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 23256e8..54790ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -769,6 +769,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_writeback = 0;
+	bool split_thp = false;
+	bool swapout_thp = false;
 
 	cond_resched();
 
@@ -786,6 +788,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (!trylock_page(page))
 			goto keep;
 
+		/*
+		 * If we have already swapped out a THP, we don't want to
+		 * split any more THPs. Let's wait until the dirty THP
+		 * pages have been written to the swap device.
+		 */
+		if (unlikely(swapout_thp && PageTransHuge(page)))
+			goto cull_mlocked;
+
 		VM_BUG_ON(PageActive(page));
 		VM_BUG_ON(page_zone(page) != zone);
 
@@ -839,6 +849,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!(sc->gfp_mask & __GFP_IO))
 				goto keep_locked;
+			if (unlikely(PageTransHuge(page)))
+				if (unlikely(split_huge_page_list(page,
+					page_list)))
+				    goto activate_locked;
+				else
+					split_thp = true;
 			if (!add_to_swap(page))
 				goto activate_locked;
 			may_enter_fs = 1;
@@ -900,6 +916,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			case PAGE_ACTIVATE:
 				goto activate_locked;
 			case PAGE_SUCCESS:
+				if (split_thp)
+					swapout_thp = true;
 				if (PageWriteback(page))
 					goto keep_lumpy;
 				if (PageDirty(page))


> 
> Thanks,
> Shaohua
> 

-- 
Kind regards,
Minchan Kim


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-09  6:28                 ` Minchan Kim
@ 2011-11-09  7:08                   ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-11-09  7:08 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Wed, 2011-11-09 at 14:28 +0800, Minchan Kim wrote:
> On Wed, Nov 09, 2011 at 01:27:55PM +0800, Shaohua Li wrote:
> > On Tue, 2011-11-08 at 16:59 +0800, Minchan Kim wrote:
> > > On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> > > > On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > > > > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > > > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > > > > lru list has page A and B, page A is huge page:
> > > > > > > > > 1. page A is isolated
> > > > > > > > > 2. page B is isolated
> > > > > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > > > > 5. page A and B is written out and reclaimed.
> > > > > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > > > > >
> > > > > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > > > > >
> > > > > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > > > > >
> > > > > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > > > > Without the patch:
> > > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > > thp_fault_alloc 451
> > > > > > > > > thp_fault_fallback 0
> > > > > > > > > thp_collapse_alloc 0
> > > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > > thp_split 238
> > > > > > > > >
> > > > > > > > > With the patch:
> > > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > > thp_fault_alloc 450
> > > > > > > > > thp_fault_fallback 1
> > > > > > > > > thp_collapse_alloc 0
> > > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > > thp_split 103
> > > > > > > > >
> > > > > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > > > > thp_fault_fallback.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > > > > ---
> > > > > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > > > >
> > > > > > > > > Index: linux/mm/vmscan.c
> > > > > > > > > ===================================================================
> > > > > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > > > > >   */
> > > > > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > > > > +             struct page **split_page)
> > > > > > > > >  {
> > > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > > > > >               case 0:
> > > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > > +                             nr_taken++;
> > > > > > > > > +                             *split_page = page;
> > > > > > > > > +                             goto out;
> > > > > > > > > +                     } else
> > > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > > >                       break;
> > > > > > > > >
> > > > > > > > >               case -EBUSY:
> > > > > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > > > > >                               nr_lumpy_taken++;
> > > > > > > > >                               if (PageDirty(cursor_page))
> > > > > > > > >                                       nr_lumpy_dirty++;
> > > > > > > > >                               scan++;
> > > > > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > > > > +                                     nr_taken++;
> > > > > > > > > +                                     *split_page = page;
> > > > > > > > > +                                     goto out;
> > > > > > > > > +                             } else
> > > > > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > > > > >                       } else {
> > > > > > > > >                               /*
> > > > > > > > >                                * Check if the page is freed already.
> > > > > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > > > > >                       nr_lumpy_failed++;
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > > +out:
> > > > > > > > >       *scanned = scan;
> > > > > > > > >
> > > > > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > > > > >                                       struct list_head *dst,
> > > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > > -                                     int active, int file)
> > > > > > > > > +                                     int active, int file,
> > > > > > > > > +                                     struct page **split_page)
> > > > > > > > >  {
> > > > > > > > >       int lru = LRU_BASE;
> > > > > > > > >       if (active)
> > > > > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > > > > >       if (file)
> > > > > > > > >               lru += LRU_FILE;
> > > > > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > > > > -                                                             mode, file);
> > > > > > > > > +                                                     mode, file, split_page);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  /*
> > > > > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > >  {
> > > > > > > > >       LIST_HEAD(page_list);
> > > > > > > > >       unsigned long nr_scanned;
> > > > > > > > > +     unsigned long total_scanned = 0;
> > > > > > > > >       unsigned long nr_reclaimed = 0;
> > > > > > > > >       unsigned long nr_taken;
> > > > > > > > >       unsigned long nr_anon;
> > > > > > > > >       unsigned long nr_file;
> > > > > > > > > +     struct page *split_page;
> > > > > > > > >
> > > > > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > > > > +again:
> > > > > > > > >       lru_add_drain();
> > > > > > > > > +     split_page = NULL;
> > > > > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > > > > >
> > > > > > > > >       if (scanning_global_lru(sc)) {
> > > > > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > > > -                     zone, 0, file);
> > > > > > > > > +                     zone, 0, file, &split_page);
> > > > > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > > >               if (current_is_kswapd())
> > > > > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > > > > >                                              nr_scanned);
> > > > > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > > > > >                                              nr_scanned);
> > > > > > > > >       } else {
> > > > > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > > >                       zone, sc->mem_cgroup,
> > > > > > > > > -                     0, file);
> > > > > > > > > +                     0, file, &split_page);
> > > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > > >               /*
> > > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > > >                * scanned pages on its own.
> > > > > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > > > > >               return 0;
> > > > > > > > >       }
> > > > > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > > > > +             split_huge_page(split_page);
> > > > > > > > > +             goto again;
> > > > > > > > > +     }
> > > > > > > > >
> > > > > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > > > > >
> > > > > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > > > > >
> > > > > > > > > +     if (split_page)
> > > > > > > > > +             split_huge_page(split_page);
> > > > > > > > > +
> > > > > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > > > > >
> > > > > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > > -                                             1, file);
> > > > > > > > > +                                             1, file, NULL);
> > > > > > > > >               zone->pages_scanned += pgscanned;
> > > > > > > > >       } else {
> > > > > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > > > > >               /*
> > > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > > >                * scanned pages on its own.
> > > > > > > > > Index: linux/mm/memcontrol.c
> > > > > > > > > ===================================================================
> > > > > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > > -                                     int active, int file)
> > > > > > > > > +                                     int active, int file,
> > > > > > > > > +                                     struct page **split_page)
> > > > > > > > >  {
> > > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > > >       struct page *page;
> > > > > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > > >               case 0:
> > > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > > +                             nr_taken++;
> > > > > > > > > +                             *split_page = page;
> > > > > > > > > +                             goto out;
> > > > > > > > > +                     } else
> > > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > > > +
> > > > > > > > >                       break;
> > > > > > > > >               case -EBUSY:
> > > > > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > > +out:
> > > > > > > > >       *scanned = scan;
> > > > > > > > >
> > > > > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > > > > ===================================================================
> > > > > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > > -                                     int active, int file);
> > > > > > > > > +                                     int active, int file,
> > > > > > > > > +                                     struct page **split_page);
> > > > > > > > >
> > > > > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > > > > >  /*
> > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > > > I saw the code. my concern is your patch could make unnecessary split of THP.
> > > > > > > >
> > > > > > > > When we isolates page, we can't know whether it's working set or not.
> > > > > > > > So split should happen after we judge it's working set page.
> > > > > > > yes, but since memory is big currently, it's unlikely the isolated page
> > > > > > > get accessed in the window. And I only did the split in
> > > > > >
> > > > > > We don't check page_reference when isolate happens.
> > > > > > Window which between isolation time and reclaim?
> > > > > > No. Window is from inactive's head to tail and it's the basic concept of
> > > > > > our LRU.
> > > > > >
> > > > > > > shrink_inactive_list, not in active list.
> > > > > >
> > > > > > But inactive list's size could be still big and
> > > > > > page reference heuristic is very important for reclaim algorithm.
> > > > > I mean pages aren't referenced. but ok, I can't take such assumption.
> > > > >
> > > > > > > And THP has mechanism to collapse small pages to huge page later.
> > > > > >
> > > > > > You mean "merge" instead of "collapse"?
> > > > > >
> > > > > > >
> > > > > > > > If you really want to merge this patch, I suggest that
> > > > > > > > we can handle it in shrink_page_list step, not isolation step.
> > > > > > > >
> > > > > > > > My totally untested code which is just to show the concept is as follows,
> > > > > > > I did consider this option before. It has its problem too. The isolation
> > > > > > > can isolate several huge page one time. And then later shrink_page_list
> > > > > > > can swap several huge page one time, which is unfortunate. I'm pretty
> > > > > > > sure this method can't reduce the thp_split count in my test. It could
> > > > > >
> > > > > > I understand your point but approach isn't good to me.
> > > > > > Maybe we can check whether we are going on or not before other THP page split happens
> > > > > > in shrink_page_list. If we split THP page successfully, maybe we can skip another THP split.
> > > > > > Another idea is we can avoid split of THP unless high order reclaim happens or low order
> > > > > > high priority pressure happens.
> > > > > I agreed the split better be done at shrink_page_list, but we must avoid
> > > > > isolate too many pages. I'll check if I can have a better solution for
> > > > > next post.
> > > > Let me try again.
> > > >
> > > > Subject: thp: improve huge page reclaim -v2
> > > >
> > > > With transparent huge page enabled, huge page will be split if it will
> > > > be reclaimed. With current logic, if page reclaim finds a huge page,
> > > > it will just reclaim the head page and leave tail pages reclaimed later.
> > > > Let's take an example, lru list has page A and B, page A is huge page:
> > > > 1. page A is isolated
> > > > 2. page B is isolated
> > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > page A+1, page A+2, ... are added to lru list.
> > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > 5. page A and B is written out and reclaimed.
> > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > In the worst case we isolate/split 32 huge pages while trying to reclaim
> > > > a huge page, but only the 32 head pages are actually reclaimed.
> > > >
> > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could reduce a lot
> > > > of unnecessary huge page split and improve the reclaim.
> > > >
> > > > With this patch, if a huge page is found during isolation, we stop isolating
> > > > further pages, since reclaiming that huge page alone already reclaims more pages
> > > > than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split and
> > > > all tail pages will be added to the isolation list, so the tail pages can
> > > > be reclaimed immediately.
> > > >
> > > > The drawback is that we might isolate fewer pages if a huge page is found, but
> > > > I think the benefit far outweighs the drawback.
> > > >
> > > > All the new code paths are guarded by PageTransHuge(), so there should be no impact
> > > > on normal cases.
> > > >
> > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > Without the patch:
> > > > #cat /proc/vmstat|grep thp
> > > > thp_fault_alloc 451
> > > > thp_fault_fallback 0
> > > > thp_collapse_alloc 0
> > > > thp_collapse_alloc_failed 0
> > > > thp_split 238
> > > >
> > > > With the patch:
> > > > #cat /proc/vmstat|grep thp
> > > > thp_fault_alloc 451
> > > > thp_fault_fallback 0
> > > > thp_collapse_alloc 0
> > > > thp_collapse_alloc_failed 0
> > > > thp_split 76
> > > >
> > > > So the thp_split number is reduced a lot.
> > > >
> > > > v1->v2: Do the huge page split in shrink_page_list(). Some code is adapted from
> > > > Minchan's patch.
> > > >
> > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > >
> > > > ---
> > > >  include/linux/huge_mm.h    |    7 ++++++-
> > > >  include/linux/memcontrol.h |    3 ++-
> > > >  include/linux/swap.h       |    3 ++-
> > > >  mm/huge_memory.c           |   14 ++++++++------
> > > >  mm/memcontrol.c            |    6 +++++-
> > > >  mm/swap.c                  |   10 +++++++++-
> > > >  mm/swap_state.c            |    6 ------
> > > >  mm/vmscan.c                |   27 ++++++++++++++++++++-------
> > > >  8 files changed, 52 insertions(+), 24 deletions(-)
> > > >
> > > > Index: linux/include/linux/huge_mm.h
> > > > ===================================================================
> > > > --- linux.orig/include/linux/huge_mm.h        2011-11-02 09:48:16.000000000 +0800
> > > > +++ linux/include/linux/huge_mm.h     2011-11-02 10:06:33.000000000 +0800
> > > > @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
> > > >  extern int handle_pte_fault(struct mm_struct *mm,
> > > >                           struct vm_area_struct *vma, unsigned long address,
> > > >                           pte_t *pte, pmd_t *pmd, unsigned int flags);
> > > > -extern int split_huge_page(struct page *page);
> > > > +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> > > > +static inline int split_huge_page(struct page *page)
> > > > +{
> > > > +     return split_huge_page_list(page, NULL);
> > > > +}
> > > > +
> > > >  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
> > > >  #define split_huge_page_pmd(__mm, __pmd)                             \
> > > >       do {                                                            \
> > > > Index: linux/include/linux/swap.h
> > > > ===================================================================
> > > > --- linux.orig/include/linux/swap.h   2011-11-02 09:48:16.000000000 +0800
> > > > +++ linux/include/linux/swap.h        2011-11-02 10:06:33.000000000 +0800
> > > > @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
> > > >  extern void __lru_cache_add(struct page *, enum lru_list lru);
> > > >  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
> > > >  extern void lru_add_page_tail(struct zone* zone,
> > > > -                           struct page *page, struct page *page_tail);
> > > > +                           struct page *page, struct page *page_tail,
> > > > +                           struct list_head *dst);
> > > >  extern void activate_page(struct page *);
> > > >  extern void mark_page_accessed(struct page *);
> > > >  extern void lru_add_drain(void);
> > > > Index: linux/mm/huge_memory.c
> > > > ===================================================================
> > > > --- linux.orig/mm/huge_memory.c       2011-11-02 09:48:16.000000000 +0800
> > > > +++ linux/mm/huge_memory.c    2011-11-02 10:58:21.000000000 +0800
> > > > @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
> > > >       return ret;
> > > >  }
> > > >
> > > > -static void __split_huge_page_refcount(struct page *page)
> > > > +static void __split_huge_page_refcount(struct page *page,
> > > > +                                    struct list_head *list)
> > > >  {
> > > >       int i;
> > > >       struct zone *zone = page_zone(page);
> > > > @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
> > > >
> > > >               mem_cgroup_split_huge_fixup(page, page_tail);
> > > >
> > > > -             lru_add_page_tail(zone, page, page_tail);
> > > > +             lru_add_page_tail(zone, page, page_tail, list);
> > > >       }
> > > >
> > > >       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> > > > @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
> > > >
> > > >  /* must be called with anon_vma->root->mutex hold */
> > > >  static void __split_huge_page(struct page *page,
> > > > -                           struct anon_vma *anon_vma)
> > > > +                           struct anon_vma *anon_vma,
> > > > +                           struct list_head *list)
> > > >  {
> > > >       int mapcount, mapcount2;
> > > >       struct anon_vma_chain *avc;
> > > > @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
> > > >                      mapcount, page_mapcount(page));
> > > >       BUG_ON(mapcount != page_mapcount(page));
> > > >
> > > > -     __split_huge_page_refcount(page);
> > > > +     __split_huge_page_refcount(page, list);
> > > >
> > > >       mapcount2 = 0;
> > > >       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> > > > @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
> > > >       BUG_ON(mapcount != mapcount2);
> > > >  }
> > > >
> > > > -int split_huge_page(struct page *page)
> > > > +int split_huge_page_list(struct page *page, struct list_head *list)
> > > >  {
> > > >       struct anon_vma *anon_vma;
> > > >       int ret = 1;
> > > > @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
> > > >               goto out_unlock;
> > > >
> > > >       BUG_ON(!PageSwapBacked(page));
> > > > -     __split_huge_page(page, anon_vma);
> > > > +     __split_huge_page(page, anon_vma, list);
> > > >       count_vm_event(THP_SPLIT);
> > > >
> > > >       BUG_ON(PageCompound(page));
> > > > Index: linux/mm/swap.c
> > > > ===================================================================
> > > > --- linux.orig/mm/swap.c      2011-11-02 09:48:16.000000000 +0800
> > > > +++ linux/mm/swap.c   2011-11-02 10:06:33.000000000 +0800
> > > > @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
> > > >
> > > >  /* used by __split_huge_page_refcount() */
> > > >  void lru_add_page_tail(struct zone* zone,
> > > > -                    struct page *page, struct page *page_tail)
> > > > +                    struct page *page, struct page *page_tail,
> > > > +                    struct list_head *dst)
> > > >  {
> > > >       int active;
> > > >       enum lru_list lru;
> > > > @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
> > > >       VM_BUG_ON(PageLRU(page_tail));
> > > >       VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
> > > >
> > > > +     /* The huge page is isolated */
> > > > +     if (dst) {
> > > > +             get_page(page_tail);
> > > > +             list_add_tail(&page_tail->lru, dst);
> > > > +             return;
> > > > +     }
> > > > +
> > > >       SetPageLRU(page_tail);
> > > >
> > > >       if (page_evictable(page_tail, NULL)) {
> > > > Index: linux/mm/swap_state.c
> > > > ===================================================================
> > > > --- linux.orig/mm/swap_state.c        2011-11-02 09:48:16.000000000 +0800
> > > > +++ linux/mm/swap_state.c     2011-11-02 10:06:33.000000000 +0800
> > > > @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
> > > >       if (!entry.val)
> > > >               return 0;
> > > >
> > > > -     if (unlikely(PageTransHuge(page)))
> > > > -             if (unlikely(split_huge_page(page))) {
> > > > -                     swapcache_free(entry, NULL);
> > > > -                     return 0;
> > > > -             }
> > > > -
> > > >       /*
> > > >        * Radix-tree node allocations from PF_MEMALLOC contexts could
> > > >        * completely exhaust the page allocator. __GFP_NOMEMALLOC
> > > > Index: linux/mm/vmscan.c
> > > > ===================================================================
> > > > --- linux.orig/mm/vmscan.c    2011-11-02 09:48:16.000000000 +0800
> > > > +++ linux/mm/vmscan.c 2011-11-02 10:58:21.000000000 +0800
> > > > @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
> > > >               if (PageAnon(page) && !PageSwapCache(page)) {
> > > >                       if (!(sc->gfp_mask & __GFP_IO))
> > > >                               goto keep_locked;
> > > > +                     if (unlikely(PageTransHuge(page)))
> > > > +                             if (unlikely(split_huge_page_list(page,
> > > > +                                     page_list)))
> > > > +                                 goto activate_locked;
> > > >                       if (!add_to_swap(page))
> > > >                               goto activate_locked;
> > > >                       may_enter_fs = 1;
> > > > @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
> > > >   */
> > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > >               struct list_head *src, struct list_head *dst,
> > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > +             bool break_on_thp)
> > > >  {
> > >
> > > Sorry for late response.
> > > These day, I am very busy for new job.
> > Thanks for your time.
> 
> NP.
> 
> >
> > > Still, I don't like surgery of isolation part.
> > > What if we isolate a THP page but it is working set page?
> > > Let's assume as follows
> > >
> > > 1. Ioslate 32 page
> > > 2. Unfortunately, 1st page is THP so isolate_lru_page isolates just a
> > >    page(of course, it's 512 pages)
> > > 3. shrink_page_list see that it's working set page but page_list
> > >    have just a page so it have to isolate pages once more with higher priority.
> > That's possible. We might scan more pages, but we should not introduce more
> > THP splits, since isolation stops at a huge page. On the other hand, if
> > isolation doesn't break at a huge page, we can't split it and reclaim it
> > as a whole immediately. I haven't found a way to make both sides good. I
> > still think the benefit is bigger than the drawback.
> 
> I really would like to fix the problem, too.
> 
> >
> > > How about this?
> > >
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 9fdfce7..8121415 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -960,7 +960,15 @@ free_it:
> > >                  * appear not as the counts should be low
> > >                  */
> > >                 list_add(&page->lru, &free_pages);
> > > -               continue;
> > > +
> > > +               /*
> > > +                * If we have reclaimed enough pages, let's cut it off.
> > > +                * It could prevent unnecessary THP split.
> > > +                */
> > > +               if (nr_reclaimed >= sc->nr_to_reclaim)
> > > +                       break;
> > > +               else
> > > +                       continue;
> > >
> > >  cull_mlocked:
> > >                 if (PageSwapCache(page))
> > This doesn't work. The huge page is dirty, so it can't be reclaimed
> > immediately; in that pass it is only split and queued for writeback, so the
> > nr_reclaimed cut-off is reached too late to prevent the split.
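A toy illustration of that objection (invented names and numbers, not kernel code): when the isolated huge pages are dirty, each one is split in add_to_swap() and only queued for writeback, so nr_reclaimed never reaches the proposed cut-off and every THP on the list is still split.

#include <stdio.h>

#define NR_TO_RECLAIM 32

int main(void)
{
	int dirty_thps = 8;	/* isolated dirty huge pages on the list */
	int nr_reclaimed = 0, thp_split = 0, queued_for_writeback = 0;

	for (int i = 0; i < dirty_thps; i++) {
		/*
		 * The proposed cut-off only fires once pages have actually
		 * been freed; dirty pages are not freed in this pass, so it
		 * never fires here.
		 */
		if (nr_reclaimed >= NR_TO_RECLAIM)
			break;
		thp_split++;		/* the split happens in add_to_swap() */
		queued_for_writeback++;	/* dirty: written back, not freed now */
	}

	printf("thp_split=%d nr_reclaimed=%d queued_for_writeback=%d\n",
	       thp_split, nr_reclaimed, queued_for_writeback);
	return 0;
}

It prints thp_split=8 and nr_reclaimed=0: all the splits still happen before the cut-off could ever trigger.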
> 
> 
> Couldn't we make both sides good?
> 
> Here is my quick patch.
> How about this?
> It doesn't split the THPs in page_list but still reclaims the non-THPs, so
> I think it doesn't change the old behavior much.
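Minchan's quick patch is not quoted here, so the following is only a guess at the shape of the idea, with invented names rather than his actual code: walk the isolated list, reclaim the base pages as usual, and defer the THPs unsplit to a keep list that would be put back on the LRU afterwards.

#include <stdio.h>
#include <stdbool.h>

#define NPAGES 6

struct toy_page {
	int id;
	bool is_huge;
};

int main(void)
{
	/* the isolated page_list: two THP heads mixed with base pages */
	struct toy_page page_list[NPAGES] = {
		{ 1, true }, { 2, false }, { 3, false },
		{ 4, true }, { 5, false }, { 6, false },
	};
	struct toy_page keep[NPAGES];
	int nkeep = 0, reclaimed = 0;

	for (int i = 0; i < NPAGES; i++) {
		if (page_list[i].is_huge) {
			keep[nkeep++] = page_list[i];	/* defer, do not split */
			continue;
		}
		reclaimed++;				/* reclaim the base page */
	}

	printf("reclaimed %d base pages, put back %d huge pages unsplit\n",
	       reclaimed, nkeep);
	return 0;
}

Under this reading, the THPs are neither split nor reclaimed in the pass; only the surrounding base pages are, so the old behavior for base pages is preserved.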
I like this idea; I will run some tests soon.

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-09  7:08                   ` Shaohua Li
@ 2011-11-10  2:07                     ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-11-10  2:07 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Wed, 2011-11-09 at 15:08 +0800, Shaohua Li wrote:
> On Wed, 2011-11-09 at 14:28 +0800, Minchan Kim wrote:
> > On Wed, Nov 09, 2011 at 01:27:55PM +0800, Shaohua Li wrote:
> > > On Tue, 2011-11-08 at 16:59 +0800, Minchan Kim wrote:
> > > > On Wed, Nov 02, 2011 at 11:17:55AM +0800, Shaohua Li wrote:
> > > > > On Mon, 2011-10-31 at 17:03 +0800, Shaohua Li wrote:
> > > > > > On Mon, 2011-10-31 at 16:23 +0800, Minchan Kim wrote:
> > > > > > > On Mon, Oct 31, 2011 at 09:21:28AM +0800, Shaohua Li wrote:
> > > > > > > > On Sat, 2011-10-29 at 08:06 +0800, Minchan Kim wrote:
> > > > > > > > > On Tue, Oct 25, 2011 at 10:59:40AM +0800, Shaohua Li wrote:
> > > > > > > > > > With current logic, if page reclaim finds a huge page, it will just reclaim
> > > > > > > > > > the head page and leave tail pages reclaimed later. Let's take an example,
> > > > > > > > > > lru list has page A and B, page A is huge page:
> > > > > > > > > > 1. page A is isolated
> > > > > > > > > > 2. page B is isolated
> > > > > > > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > > > > > > page A+1, page A+2, ... are added to lru list.
> > > > > > > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > > > > > > 5. page A and B is written out and reclaimed.
> > > > > > > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > > > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > > > > > >
> > > > > > > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > > > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, ....
> > > > > > > > > >
> > > > > > > > > > With this patch, we do huge page split just after the head page is isolated
> > > > > > > > > > for inactive lru list, so the tail pages will be reclaimed immediately.
> > > > > > > > > >
> > > > > > > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > > > > > > Without the patch:
> > > > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > > > thp_fault_alloc 451
> > > > > > > > > > thp_fault_fallback 0
> > > > > > > > > > thp_collapse_alloc 0
> > > > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > > > thp_split 238
> > > > > > > > > >
> > > > > > > > > > With the patch:
> > > > > > > > > > #cat /proc/vmstat|grep thp
> > > > > > > > > > thp_fault_alloc 450
> > > > > > > > > > thp_fault_fallback 1
> > > > > > > > > > thp_collapse_alloc 0
> > > > > > > > > > thp_collapse_alloc_failed 0
> > > > > > > > > > thp_split 103
> > > > > > > > > >
> > > > > > > > > > So the thp_split number is reduced a lot, though there is one extra
> > > > > > > > > > thp_fault_fallback.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > > > > > > > ---
> > > > > > > > > >  include/linux/memcontrol.h |    3 +-
> > > > > > > > > >  mm/memcontrol.c            |   12 +++++++++--
> > > > > > > > > >  mm/vmscan.c                |   49 ++++++++++++++++++++++++++++++++++-----------
> > > > > > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > > > > >
> > > > > > > > > > Index: linux/mm/vmscan.c
> > > > > > > > > > ===================================================================
> > > > > > > > > > --- linux.orig/mm/vmscan.c    2011-10-25 08:36:08.000000000 +0800
> > > > > > > > > > +++ linux/mm/vmscan.c 2011-10-25 09:51:44.000000000 +0800
> > > > > > > > > > @@ -1076,7 +1076,8 @@ int __isolate_lru_page(struct page *page
> > > > > > > > > >   */
> > > > > > > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > > > > > > >               struct list_head *src, struct list_head *dst,
> > > > > > > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > > > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > > > > > > +             struct page **split_page)
> > > > > > > > > >  {
> > > > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > > > >       unsigned long nr_lumpy_taken = 0;
> > > > > > > > > > @@ -1100,7 +1101,12 @@ static unsigned long isolate_lru_pages(u
> > > > > > > > > >               case 0:
> > > > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > > > +                             nr_taken++;
> > > > > > > > > > +                             *split_page = page;
> > > > > > > > > > +                             goto out;
> > > > > > > > > > +                     } else
> > > > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > > > >                       break;
> > > > > > > > > >
> > > > > > > > > >               case -EBUSY:
> > > > > > > > > > @@ -1158,11 +1164,16 @@ static unsigned long isolate_lru_pages(u
> > > > > > > > > >                       if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> > > > > > > > > >                               list_move(&cursor_page->lru, dst);
> > > > > > > > > >                               mem_cgroup_del_lru(cursor_page);
> > > > > > > > > > -                             nr_taken += hpage_nr_pages(page);
> > > > > > > > > >                               nr_lumpy_taken++;
> > > > > > > > > >                               if (PageDirty(cursor_page))
> > > > > > > > > >                                       nr_lumpy_dirty++;
> > > > > > > > > >                               scan++;
> > > > > > > > > > +                             if (PageTransHuge(page) && split_page) {
> > > > > > > > > > +                                     nr_taken++;
> > > > > > > > > > +                                     *split_page = page;
> > > > > > > > > > +                                     goto out;
> > > > > > > > > > +                             } else
> > > > > > > > > > +                                     nr_taken += hpage_nr_pages(page);
> > > > > > > > > >                       } else {
> > > > > > > > > >                               /*
> > > > > > > > > >                                * Check if the page is freed already.
> > > > > > > > > > @@ -1188,6 +1199,7 @@ static unsigned long isolate_lru_pages(u
> > > > > > > > > >                       nr_lumpy_failed++;
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > > +out:
> > > > > > > > > >       *scanned = scan;
> > > > > > > > > >
> > > > > > > > > >       trace_mm_vmscan_lru_isolate(order,
> > > > > > > > > > @@ -1202,7 +1214,8 @@ static unsigned long isolate_pages_globa
> > > > > > > > > >                                       struct list_head *dst,
> > > > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > > > -                                     int active, int file)
> > > > > > > > > > +                                     int active, int file,
> > > > > > > > > > +                                     struct page **split_page)
> > > > > > > > > >  {
> > > > > > > > > >       int lru = LRU_BASE;
> > > > > > > > > >       if (active)
> > > > > > > > > > @@ -1210,7 +1223,7 @@ static unsigned long isolate_pages_globa
> > > > > > > > > >       if (file)
> > > > > > > > > >               lru += LRU_FILE;
> > > > > > > > > >       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
> > > > > > > > > > -                                                             mode, file);
> > > > > > > > > > +                                                     mode, file, split_page);
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > >  /*
> > > > > > > > > > @@ -1444,10 +1457,12 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > > >  {
> > > > > > > > > >       LIST_HEAD(page_list);
> > > > > > > > > >       unsigned long nr_scanned;
> > > > > > > > > > +     unsigned long total_scanned = 0;
> > > > > > > > > >       unsigned long nr_reclaimed = 0;
> > > > > > > > > >       unsigned long nr_taken;
> > > > > > > > > >       unsigned long nr_anon;
> > > > > > > > > >       unsigned long nr_file;
> > > > > > > > > > +     struct page *split_page;
> > > > > > > > > >
> > > > > > > > > >       while (unlikely(too_many_isolated(zone, file, sc))) {
> > > > > > > > > >               congestion_wait(BLK_RW_ASYNC, HZ/10);
> > > > > > > > > > @@ -1458,16 +1473,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > >       set_reclaim_mode(priority, sc, false);
> > > > > > > > > > +again:
> > > > > > > > > >       lru_add_drain();
> > > > > > > > > > +     split_page = NULL;
> > > > > > > > > >       spin_lock_irq(&zone->lru_lock);
> > > > > > > > > >
> > > > > > > > > >       if (scanning_global_lru(sc)) {
> > > > > > > > > > -             nr_taken = isolate_pages_global(nr_to_scan,
> > > > > > > > > > +             nr_taken = isolate_pages_global(nr_to_scan - total_scanned,
> > > > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > > > > -                     zone, 0, file);
> > > > > > > > > > +                     zone, 0, file, &split_page);
> > > > > > > > > >               zone->pages_scanned += nr_scanned;
> > > > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > > > >               if (current_is_kswapd())
> > > > > > > > > >                       __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> > > > > > > > > >                                              nr_scanned);
> > > > > > > > > > @@ -1475,12 +1493,13 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > > >                       __count_zone_vm_events(PGSCAN_DIRECT, zone,
> > > > > > > > > >                                              nr_scanned);
> > > > > > > > > >       } else {
> > > > > > > > > > -             nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
> > > > > > > > > > +             nr_taken = mem_cgroup_isolate_pages(nr_to_scan - total_scanned,
> > > > > > > > > >                       &page_list, &nr_scanned, sc->order,
> > > > > > > > > >                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
> > > > > > > > > >                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
> > > > > > > > > >                       zone, sc->mem_cgroup,
> > > > > > > > > > -                     0, file);
> > > > > > > > > > +                     0, file, &split_page);
> > > > > > > > > > +             total_scanned += nr_scanned;
> > > > > > > > > >               /*
> > > > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > > > >                * scanned pages on its own.
> > > > > > > > > > @@ -1491,11 +1510,19 @@ shrink_inactive_list(unsigned long nr_to
> > > > > > > > > >               spin_unlock_irq(&zone->lru_lock);
> > > > > > > > > >               return 0;
> > > > > > > > > >       }
> > > > > > > > > > +     if (split_page && total_scanned < nr_to_scan) {
> > > > > > > > > > +             spin_unlock_irq(&zone->lru_lock);
> > > > > > > > > > +             split_huge_page(split_page);
> > > > > > > > > > +             goto again;
> > > > > > > > > > +     }
> > > > > > > > > >
> > > > > > > > > >       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
> > > > > > > > > >
> > > > > > > > > >       spin_unlock_irq(&zone->lru_lock);
> > > > > > > > > >
> > > > > > > > > > +     if (split_page)
> > > > > > > > > > +             split_huge_page(split_page);
> > > > > > > > > > +
> > > > > > > > > >       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
> > > > > > > > > >
> > > > > > > > > >       /* Check if we should syncronously wait for writeback */
> > > > > > > > > > @@ -1589,13 +1616,13 @@ static void shrink_active_list(unsigned
> > > > > > > > > >               nr_taken = isolate_pages_global(nr_pages, &l_hold,
> > > > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > > > -                                             1, file);
> > > > > > > > > > +                                             1, file, NULL);
> > > > > > > > > >               zone->pages_scanned += pgscanned;
> > > > > > > > > >       } else {
> > > > > > > > > >               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
> > > > > > > > > >                                               &pgscanned, sc->order,
> > > > > > > > > >                                               ISOLATE_ACTIVE, zone,
> > > > > > > > > > -                                             sc->mem_cgroup, 1, file);
> > > > > > > > > > +                                             sc->mem_cgroup, 1, file, NULL);
> > > > > > > > > >               /*
> > > > > > > > > >                * mem_cgroup_isolate_pages() keeps track of
> > > > > > > > > >                * scanned pages on its own.
> > > > > > > > > > Index: linux/mm/memcontrol.c
> > > > > > > > > > ===================================================================
> > > > > > > > > > --- linux.orig/mm/memcontrol.c        2011-10-25 08:36:08.000000000 +0800
> > > > > > > > > > +++ linux/mm/memcontrol.c     2011-10-25 09:33:51.000000000 +0800
> > > > > > > > > > @@ -1187,7 +1187,8 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > > > -                                     int active, int file)
> > > > > > > > > > +                                     int active, int file,
> > > > > > > > > > +                                     struct page **split_page)
> > > > > > > > > >  {
> > > > > > > > > >       unsigned long nr_taken = 0;
> > > > > > > > > >       struct page *page;
> > > > > > > > > > @@ -1224,7 +1225,13 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > > > >               case 0:
> > > > > > > > > >                       list_move(&page->lru, dst);
> > > > > > > > > >                       mem_cgroup_del_lru(page);
> > > > > > > > > > -                     nr_taken += hpage_nr_pages(page);
> > > > > > > > > > +                     if (PageTransHuge(page) && split_page) {
> > > > > > > > > > +                             nr_taken++;
> > > > > > > > > > +                             *split_page = page;
> > > > > > > > > > +                             goto out;
> > > > > > > > > > +                     } else
> > > > > > > > > > +                             nr_taken += hpage_nr_pages(page);
> > > > > > > > > > +
> > > > > > > > > >                       break;
> > > > > > > > > >               case -EBUSY:
> > > > > > > > > >                       /* we don't affect global LRU but rotate in our LRU */
> > > > > > > > > > @@ -1235,6 +1242,7 @@ unsigned long mem_cgroup_isolate_pages(u
> > > > > > > > > >               }
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > > +out:
> > > > > > > > > >       *scanned = scan;
> > > > > > > > > >
> > > > > > > > > >       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
> > > > > > > > > > Index: linux/include/linux/memcontrol.h
> > > > > > > > > > ===================================================================
> > > > > > > > > > --- linux.orig/include/linux/memcontrol.h     2011-10-25 08:36:08.000000000 +0800
> > > > > > > > > > +++ linux/include/linux/memcontrol.h  2011-10-25 09:33:51.000000000 +0800
> > > > > > > > > > @@ -37,7 +37,8 @@ extern unsigned long mem_cgroup_isolate_
> > > > > > > > > >                                       unsigned long *scanned, int order,
> > > > > > > > > >                                       int mode, struct zone *z,
> > > > > > > > > >                                       struct mem_cgroup *mem_cont,
> > > > > > > > > > -                                     int active, int file);
> > > > > > > > > > +                                     int active, int file,
> > > > > > > > > > +                                     struct page **split_page);
> > > > > > > > > >
> > > > > > > > > >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > > > > > > > > >  /*
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I saw the code. My concern is that your patch could cause unnecessary THP splits.
> > > > > > > > >
> > > > > > > > > When we isolate a page, we can't know whether it is a working set page or not.
> > > > > > > > > So the split should happen after we judge whether it is a working set page.
> > > > > > > > Yes, but since memory is big these days, it's unlikely the isolated page
> > > > > > > > gets accessed in the window. And I only did the split in
> > > > > > >
> > > > > > > We don't check page_referenced() when isolation happens.
> > > > > > > The window between isolation time and reclaim?
> > > > > > > No. The window is from the inactive list's head to its tail, and that is the
> > > > > > > basic concept of our LRU.
> > > > > > >
> > > > > > > > shrink_inactive_list, not in active list.
> > > > > > >
> > > > > > > But the inactive list's size could still be big, and
> > > > > > > the page reference heuristic is very important for the reclaim algorithm.
> > > > > > I mean the pages aren't referenced. But OK, I can't make such an assumption.
> > > > > >
> > > > > > > > And THP has a mechanism to collapse small pages into a huge page later.
> > > > > > >
> > > > > > > You mean "merge" instead of "collapse"?
> > > > > > >
> > > > > > > >
> > > > > > > > > If you really want to merge this patch, I suggest that
> > > > > > > > > we handle it in the shrink_page_list step, not the isolation step.
> > > > > > > > >
> > > > > > > > > My totally untested code, which is just to show the concept, is as follows:
> > > > > > > > I did consider this option before. It has its own problem: the isolation
> > > > > > > > can isolate several huge pages at one time, and then shrink_page_list
> > > > > > > > can swap several huge pages at one time, which is unfortunate. I'm pretty
> > > > > > > > sure this method can't reduce the thp_split count in my test. It could
> > > > > > >
> > > > > > > I understand your point, but the approach isn't good to me.
> > > > > > > Maybe we can check whether we should keep going before another THP split happens
> > > > > > > in shrink_page_list. If we split a THP successfully, maybe we can skip splitting another one.
> > > > > > > Another idea is to avoid splitting THPs unless high-order reclaim happens or low-order,
> > > > > > > high-priority pressure happens.
> > > > > > I agree the split is better done in shrink_page_list, but we must avoid
> > > > > > isolating too many pages. I'll check whether I can come up with a better
> > > > > > solution for the next post.
> > > > > Let me try again.
> > > > >
> > > > > Subject: thp: improve huge page reclaim -v2
> > > > >
> > > > > With transparent huge pages enabled, a huge page will be split if it is to
> > > > > be reclaimed. With the current logic, if page reclaim finds a huge page,
> > > > > it will just reclaim the head page and leave tail pages reclaimed later.
> > > > > Let's take an example, lru list has page A and B, page A is huge page:
> > > > > 1. page A is isolated
> > > > > 2. page B is isolated
> > > > > 3. shrink_page_list() adds page A to swap page cache. so page A is split.
> > > > > page A+1, page A+2, ... are added to lru list.
> > > > > 4. shrink_page_list() adds page B to swap page cache.
> > > > > 5. page A and B is written out and reclaimed.
> > > > > 6. page A+1, A+2 ... is isolated and reclaimed later.
> > > > > So the reclaim order is A, B, ...(maybe other pages), A+1, A+2 ...
> > > > > The worst case could be that we isolate/split 32 huge pages to try to reclaim
> > > > > a huge page, but only the 32 head pages are reclaimed.
> > > > >
> > > > > We expected the whole huge page A is reclaimed in the meantime, so
> > > > > the order is A, A+1, ... A+HPAGE_PMD_NR-1, B, .... This could reduce a lot
> > > > > of unnecessary huge page split and improve the reclaim.
> > > > >
> > > > > With this patch, if a huge page is found during isolation, we don't continue
> > > > > the isolation, since if the huge page is reclaimed we already reclaim more pages
> > > > > than SWAP_CLUSTER_MAX. In shrink_page_list(), the huge page is split and
> > > > > all the tail pages are added to the isolated list, so the tail pages can
> > > > > be reclaimed immediately.
> > > > >
> > > > > The drawback is that we might isolate fewer pages if a huge page is found. But
> > > > > I think the benefit far outweighs the drawback.
> > > > >
> > > > > All code paths are guarded by PageTransHuge(), so this should have no impact on
> > > > > normal cases.
> > > > >
> > > > > In a test, a range of anonymous memory is written and will trigger swap.
> > > > > Without the patch:
> > > > > #cat /proc/vmstat|grep thp
> > > > > thp_fault_alloc 451
> > > > > thp_fault_fallback 0
> > > > > thp_collapse_alloc 0
> > > > > thp_collapse_alloc_failed 0
> > > > > thp_split 238
> > > > >
> > > > > With the patch:
> > > > > #cat /proc/vmstat|grep thp
> > > > > thp_fault_alloc 451
> > > > > thp_fault_fallback 0
> > > > > thp_collapse_alloc 0
> > > > > thp_collapse_alloc_failed 0
> > > > > thp_split 76
> > > > >
> > > > > So the thp_split number is reduced a lot.
> > > > >
> > > > > v1->v2: Do the huge page split in shrink_page_list(). Some code is adopted from
> > > > > Minchan's patch.
> > > > >
> > > > > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > > > >
> > > > > ---
> > > > >  include/linux/huge_mm.h    |    7 ++++++-
> > > > >  include/linux/memcontrol.h |    3 ++-
> > > > >  include/linux/swap.h       |    3 ++-
> > > > >  mm/huge_memory.c           |   14 ++++++++------
> > > > >  mm/memcontrol.c            |    6 +++++-
> > > > >  mm/swap.c                  |   10 +++++++++-
> > > > >  mm/swap_state.c            |    6 ------
> > > > >  mm/vmscan.c                |   27 ++++++++++++++++++++-------
> > > > >  8 files changed, 52 insertions(+), 24 deletions(-)
> > > > >
> > > > > Index: linux/include/linux/huge_mm.h
> > > > > ===================================================================
> > > > > --- linux.orig/include/linux/huge_mm.h        2011-11-02 09:48:16.000000000 +0800
> > > > > +++ linux/include/linux/huge_mm.h     2011-11-02 10:06:33.000000000 +0800
> > > > > @@ -81,7 +81,12 @@ extern int copy_pte_range(struct mm_stru
> > > > >  extern int handle_pte_fault(struct mm_struct *mm,
> > > > >                           struct vm_area_struct *vma, unsigned long address,
> > > > >                           pte_t *pte, pmd_t *pmd, unsigned int flags);
> > > > > -extern int split_huge_page(struct page *page);
> > > > > +extern int split_huge_page_list(struct page *page, struct list_head *dst);
> > > > > +static inline int split_huge_page(struct page *page)
> > > > > +{
> > > > > +     return split_huge_page_list(page, NULL);
> > > > > +}
> > > > > +
> > > > >  extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
> > > > >  #define split_huge_page_pmd(__mm, __pmd)                             \
> > > > >       do {                                                            \
> > > > > Index: linux/include/linux/swap.h
> > > > > ===================================================================
> > > > > --- linux.orig/include/linux/swap.h   2011-11-02 09:48:16.000000000 +0800
> > > > > +++ linux/include/linux/swap.h        2011-11-02 10:06:33.000000000 +0800
> > > > > @@ -218,7 +218,8 @@ extern unsigned int nr_free_pagecache_pa
> > > > >  extern void __lru_cache_add(struct page *, enum lru_list lru);
> > > > >  extern void lru_cache_add_lru(struct page *, enum lru_list lru);
> > > > >  extern void lru_add_page_tail(struct zone* zone,
> > > > > -                           struct page *page, struct page *page_tail);
> > > > > +                           struct page *page, struct page *page_tail,
> > > > > +                           struct list_head *dst);
> > > > >  extern void activate_page(struct page *);
> > > > >  extern void mark_page_accessed(struct page *);
> > > > >  extern void lru_add_drain(void);
> > > > > Index: linux/mm/huge_memory.c
> > > > > ===================================================================
> > > > > --- linux.orig/mm/huge_memory.c       2011-11-02 09:48:16.000000000 +0800
> > > > > +++ linux/mm/huge_memory.c    2011-11-02 10:58:21.000000000 +0800
> > > > > @@ -1159,7 +1159,8 @@ static int __split_huge_page_splitting(s
> > > > >       return ret;
> > > > >  }
> > > > >
> > > > > -static void __split_huge_page_refcount(struct page *page)
> > > > > +static void __split_huge_page_refcount(struct page *page,
> > > > > +                                    struct list_head *list)
> > > > >  {
> > > > >       int i;
> > > > >       struct zone *zone = page_zone(page);
> > > > > @@ -1229,7 +1230,7 @@ static void __split_huge_page_refcount(s
> > > > >
> > > > >               mem_cgroup_split_huge_fixup(page, page_tail);
> > > > >
> > > > > -             lru_add_page_tail(zone, page, page_tail);
> > > > > +             lru_add_page_tail(zone, page, page_tail, list);
> > > > >       }
> > > > >
> > > > >       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> > > > > @@ -1343,7 +1344,8 @@ static int __split_huge_page_map(struct
> > > > >
> > > > >  /* must be called with anon_vma->root->mutex hold */
> > > > >  static void __split_huge_page(struct page *page,
> > > > > -                           struct anon_vma *anon_vma)
> > > > > +                           struct anon_vma *anon_vma,
> > > > > +                           struct list_head *list)
> > > > >  {
> > > > >       int mapcount, mapcount2;
> > > > >       struct anon_vma_chain *avc;
> > > > > @@ -1375,7 +1377,7 @@ static void __split_huge_page(struct pag
> > > > >                      mapcount, page_mapcount(page));
> > > > >       BUG_ON(mapcount != page_mapcount(page));
> > > > >
> > > > > -     __split_huge_page_refcount(page);
> > > > > +     __split_huge_page_refcount(page, list);
> > > > >
> > > > >       mapcount2 = 0;
> > > > >       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
> > > > > @@ -1392,7 +1394,7 @@ static void __split_huge_page(struct pag
> > > > >       BUG_ON(mapcount != mapcount2);
> > > > >  }
> > > > >
> > > > > -int split_huge_page(struct page *page)
> > > > > +int split_huge_page_list(struct page *page, struct list_head *list)
> > > > >  {
> > > > >       struct anon_vma *anon_vma;
> > > > >       int ret = 1;
> > > > > @@ -1406,7 +1408,7 @@ int split_huge_page(struct page *page)
> > > > >               goto out_unlock;
> > > > >
> > > > >       BUG_ON(!PageSwapBacked(page));
> > > > > -     __split_huge_page(page, anon_vma);
> > > > > +     __split_huge_page(page, anon_vma, list);
> > > > >       count_vm_event(THP_SPLIT);
> > > > >
> > > > >       BUG_ON(PageCompound(page));
> > > > > Index: linux/mm/swap.c
> > > > > ===================================================================
> > > > > --- linux.orig/mm/swap.c      2011-11-02 09:48:16.000000000 +0800
> > > > > +++ linux/mm/swap.c   2011-11-02 10:06:33.000000000 +0800
> > > > > @@ -634,7 +634,8 @@ EXPORT_SYMBOL(__pagevec_release);
> > > > >
> > > > >  /* used by __split_huge_page_refcount() */
> > > > >  void lru_add_page_tail(struct zone* zone,
> > > > > -                    struct page *page, struct page *page_tail)
> > > > > +                    struct page *page, struct page *page_tail,
> > > > > +                    struct list_head *dst)
> > > > >  {
> > > > >       int active;
> > > > >       enum lru_list lru;
> > > > > @@ -646,6 +647,13 @@ void lru_add_page_tail(struct zone* zone
> > > > >       VM_BUG_ON(PageLRU(page_tail));
> > > > >       VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
> > > > >
> > > > > +     /* The huge page is isolated */
> > > > > +     if (dst) {
> > > > > +             get_page(page_tail);
> > > > > +             list_add_tail(&page_tail->lru, dst);
> > > > > +             return;
> > > > > +     }
> > > > > +
> > > > >       SetPageLRU(page_tail);
> > > > >
> > > > >       if (page_evictable(page_tail, NULL)) {
> > > > > Index: linux/mm/swap_state.c
> > > > > ===================================================================
> > > > > --- linux.orig/mm/swap_state.c        2011-11-02 09:48:16.000000000 +0800
> > > > > +++ linux/mm/swap_state.c     2011-11-02 10:06:33.000000000 +0800
> > > > > @@ -154,12 +154,6 @@ int add_to_swap(struct page *page)
> > > > >       if (!entry.val)
> > > > >               return 0;
> > > > >
> > > > > -     if (unlikely(PageTransHuge(page)))
> > > > > -             if (unlikely(split_huge_page(page))) {
> > > > > -                     swapcache_free(entry, NULL);
> > > > > -                     return 0;
> > > > > -             }
> > > > > -
> > > > >       /*
> > > > >        * Radix-tree node allocations from PF_MEMALLOC contexts could
> > > > >        * completely exhaust the page allocator. __GFP_NOMEMALLOC
> > > > > Index: linux/mm/vmscan.c
> > > > > ===================================================================
> > > > > --- linux.orig/mm/vmscan.c    2011-11-02 09:48:16.000000000 +0800
> > > > > +++ linux/mm/vmscan.c 2011-11-02 10:58:21.000000000 +0800
> > > > > @@ -838,6 +838,10 @@ static unsigned long shrink_page_list(st
> > > > >               if (PageAnon(page) && !PageSwapCache(page)) {
> > > > >                       if (!(sc->gfp_mask & __GFP_IO))
> > > > >                               goto keep_locked;
> > > > > +                     if (unlikely(PageTransHuge(page)))
> > > > > +                             if (unlikely(split_huge_page_list(page,
> > > > > +                                     page_list)))
> > > > > +                                 goto activate_locked;
> > > > >                       if (!add_to_swap(page))
> > > > >                               goto activate_locked;
> > > > >                       may_enter_fs = 1;
> > > > > @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page
> > > > >   */
> > > > >  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> > > > >               struct list_head *src, struct list_head *dst,
> > > > > -             unsigned long *scanned, int order, int mode, int file)
> > > > > +             unsigned long *scanned, int order, int mode, int file,
> > > > > +             bool break_on_thp)
> > > > >  {
> > > >
> > > > Sorry for the late response.
> > > > These days, I am very busy with my new job.
> > > Thanks for your time.
> > 
> > NP.
> > 
> > >
> > > > Still, I don't like the surgery of the isolation part.
> > > > What if we isolate a THP page but it is a working-set page?
> > > > Let's assume the following:
> > > >
> > > > 1. Isolate 32 pages.
> > > > 2. Unfortunately, the 1st page is a THP, so isolate_lru_page isolates just that
> > > >    one page (which is, of course, 512 pages).
> > > > 3. shrink_page_list() sees that it's a working-set page, but page_list holds just
> > > >    that one page, so it has to isolate pages once more with higher priority.
> > > That's possible. We might scan more pages, but it should not introduce more
> > > THP splits, since isolation stops at a huge page. On the other hand, if
> > > isolation doesn't break at a huge page, we can't split it and reclaim it
> > > as a whole immediately. I didn't find a way to make both sides good. I
> > > still think the benefit is bigger than the drawback.
> > 
> > I really would like to fix the problem, too.
> > 
> > >
> > > > How about this?
> > > >
> > > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > > index 9fdfce7..8121415 100644
> > > > --- a/mm/vmscan.c
> > > > +++ b/mm/vmscan.c
> > > > @@ -960,7 +960,15 @@ free_it:
> > > >                  * appear not as the counts should be low
> > > >                  */
> > > >                 list_add(&page->lru, &free_pages);
> > > > -               continue;
> > > > +
> > > > +               /*
> > > > +                * If we have reclaimed enough pages, let's cut it off.
> > > > +                * It could prevent unnecessary THP split.
> > > > +                */
> > > > +               if (nr_reclaimed >= sc->nr_to_reclaim)
> > > > +                       break;
> > > > +               else
> > > > +                       continue;
> > > >
> > > >  cull_mlocked:
> > > >                 if (PageSwapCache(page))
> > > This doesn't work. The huge page is dirty, so it can't be reclaimed
> > > immediately.
> > 
> > 
> > Couldn't we make both sides good?
> > 
> > Here is my quick patch.
> > How about this?
> > It doesn't split THPs in page_list but still reclaims non-THPs, so
> > I think it doesn't change the old behavior a lot.
> I like this idea, will do some test soon.
Hmm, this doesn't work as expected. putback_lru_page() messes up the lru
order. This isn't a problem if the page will be written, since
rotate_reclaimable_page() will fix the order. I got worse data than with my
v2 patch, e.g. more thp_fallbacks, a messed-up lru order, and more pages
scanned. We could add something like putback_lru_page_tail, but I'm not
convinced it's worth it (even with it, we would still mess up the lru a little). So
I'm back to the v2 patch if there is no better solution; it's still much
better than the current code.
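For illustration, a minimal sketch of what such a putback_lru_page_tail() could
look like in mm/vmscan.c. This is not from any posted patch: the name is made up
here, and memcg accounting plus the unevictable case are ignored; it only shows
the "add to the tail instead of the head" idea.

/*
 * Sketch only: put an isolated page back on the *tail* of its LRU list so
 * the next isolation pass sees it first, instead of on the head as
 * putback_lru_page() does.  Assumes the zone->lru_lock / zone->lru[] layout
 * of this kernel; memcg accounting and unevictable pages are ignored.
 */
static void putback_lru_page_tail(struct page *page)
{
	struct zone *zone = page_zone(page);
	enum lru_list lru;

	spin_lock_irq(&zone->lru_lock);
	VM_BUG_ON(PageLRU(page));

	SetPageLRU(page);
	lru = page_lru(page);
	/* tail, not head: keep the leftover pages next in line for reclaim */
	list_add_tail(&page->lru, &zone->lru[lru].list);
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page));
	spin_unlock_irq(&zone->lru_lock);

	put_page(page);		/* drop the reference taken at isolation time */
}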

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-10  2:07                     ` Shaohua Li
@ 2011-11-10  2:23                       ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-11-10  2:23 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

The quoted content is getting very long.
Let's trim it.

On Thu, Nov 10, 2011 at 10:07:10AM +0800, Shaohua Li wrote:

<snip>

> > > Couldn't we make both sides good?
> > > 
> > > Here is my quick patch.
> > > How about this?
> > > It doesn't split THPs in page_list but still reclaims non-THPs, so
> > > I think it doesn't change the old behavior a lot.
> > I like this idea, will do some test soon.
> Hmm, this doesn't work as expected. putback_lru_page() messes up the lru
> order. This isn't a problem if the page will be written, since
> rotate_reclaimable_page() will fix the order. I got worse data than with my
> v2 patch, e.g. more thp_fallbacks, a messed-up lru order, and more pages
> scanned. We could add something like putback_lru_page_tail, but I'm not

Hmm, I don't think it's an LRU mess problem, but that's just a guess, and you might
be right because you have a workload and can test it.

My guess is that cull_mlocked resets synchronous page reclaim.
Could you test this patch again?

And if the problem is caused by the LRU mess, I think it is worth adding putback_lru_page_tail,
since THP added lru_add_page_tail, too.

Thanks!

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b55699c..e2c84c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -767,6 +767,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
+	bool split_thp = false;
+	bool swapout_thp = false;
 
 	cond_resched();
 
@@ -784,6 +786,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (!trylock_page(page))
 			goto keep;
 
+		/*
+		 * If we already swap out a THP, we don't want to
+		 * split THPs any more. Let's wait until dirty a thp page
+		 * to be written into swap device
+		 */
+		if (unlikely(swapout_thp && PageTransHuge(page)))
+			goto pass_thp;
+
 		VM_BUG_ON(PageActive(page));
 		VM_BUG_ON(page_zone(page) != zone);
 
@@ -838,6 +848,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!(sc->gfp_mask & __GFP_IO))
 				goto keep_locked;
+			if (unlikely(PageTransHuge(page)))
+				if (unlikely(split_huge_page_list(page,
+					page_list)))
+				    goto activate_locked;
+				else
+					split_thp = true;
 			if (!add_to_swap(page))
 				goto activate_locked;
 			may_enter_fs = 1;
@@ -880,6 +896,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			case PAGE_ACTIVATE:
 				goto activate_locked;
 			case PAGE_SUCCESS:
+				if (split_thp)
+					swapout_thp = true;
 				if (PageWriteback(page))
 					goto keep_lumpy;
 				if (PageDirty(page))
@@ -962,6 +980,10 @@ free_it:
 		list_add(&page->lru, &free_pages);
 		continue;
 
+pass_thp:
+		unlock_page(page);
+		putback_lru_page(page);
+		continue;
 cull_mlocked:
 		if (PageSwapCache(page))
 			try_to_free_swap(page);

> convinced it's worth it (even with it, we would still mess up the lru a little). So
> I'm back to the v2 patch if there is no better solution; it's still much
> better than the current code.
> 
> Thanks,
> Shaohua
> 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-10  2:23                       ` Minchan Kim
@ 2011-11-10  2:46                         ` Shaohua Li
  -1 siblings, 0 replies; 42+ messages in thread
From: Shaohua Li @ 2011-11-10  2:46 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Thu, 2011-11-10 at 10:23 +0800, Minchan Kim wrote:
> The quoted content is getting very long.
> Let's trim it.
> 
> On Thu, Nov 10, 2011 at 10:07:10AM +0800, Shaohua Li wrote:
> 
> <snip>
> 
> > > > Couldn't we make both sides good?
> > > > 
> > > > Here is my quick patch.
> > > > How about this?
> > > > It doesn't split THPs in page_list but still reclaims non-THPs, so
> > > > I think it doesn't change the old behavior a lot.
> > > I like this idea, will do some test soon.
> > Hmm, this doesn't work as expected. putback_lru_page() messes up the lru
> > order. This isn't a problem if the page will be written, since
> > rotate_reclaimable_page() will fix the order. I got worse data than with my
> > v2 patch, e.g. more thp_fallbacks, a messed-up lru order, and more pages
> > scanned. We could add something like putback_lru_page_tail, but I'm not
> 
> Hmm, I don't think it's an LRU mess problem, but that's just a guess, and you might
> be right because you have a workload and can test it.
> 
> My guess is that cull_mlocked resets synchronous page reclaim.
> Could you test this patch again?
No, I traced it, and it is the lru mess. putback_lru_page() adds the page to the lru
head instead of the tail.

> And if the problem is caused by the LRU mess, I think it is worth adding putback_lru_page_tail,
> since THP added lru_add_page_tail, too.
I want to put all remaining pages back to the lru tail if a huge page is
split, because enough pages are already reclaimed. So this needs adding
something like putback_lru_pages_tail(); it's not complicated, but it is a lot
of code. And if there are parallel reclaimers, we still get lru mess; my
test already shows it. Is it still worth it?
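For reference, a rough sketch (again not from any posted patch) of what a bulk
putback_lru_pages_tail() in mm/vmscan.c might look like, assuming all pages on
the list belong to the given zone and ignoring memcg accounting and unevictable
pages; the isolation references are dropped in batches with a pagevec, the same
way putback_lru_pages() already does:

static void putback_lru_pages_tail(struct zone *zone, struct list_head *pages)
{
	struct pagevec pvec;

	pagevec_init(&pvec, 1);
	spin_lock_irq(&zone->lru_lock);
	while (!list_empty(pages)) {
		struct page *page = lru_to_page(pages);
		enum lru_list lru;

		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		SetPageLRU(page);
		lru = page_lru(page);
		/* tail, not head, so these pages are rescanned first */
		list_add_tail(&page->lru, &zone->lru[lru].list);
		__mod_zone_page_state(zone, NR_LRU_BASE + lru,
				      hpage_nr_pages(page));
		/* drop the isolation references in batches */
		if (!pagevec_add(&pvec, page)) {
			spin_unlock_irq(&zone->lru_lock);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	spin_unlock_irq(&zone->lru_lock);
	pagevec_release(&pvec);
}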

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [patch 5/5]thp: split huge page if head page is isolated
  2011-11-10  2:46                         ` Shaohua Li
@ 2011-11-10  3:18                           ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2011-11-10  3:18 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Andrew Morton, aarcange, Hugh Dickins, Rik van Riel, mel,
	KAMEZAWA Hiroyuki, linux-mm, lkml

On Thu, Nov 10, 2011 at 10:46:07AM +0800, Shaohua Li wrote:
> On Thu, 2011-11-10 at 10:23 +0800, Minchan Kim wrote:
> > The quoted content is getting long.
> > Let's trim it.
> > 
> > On Thu, Nov 10, 2011 at 10:07:10AM +0800, Shaohua Li wrote:
> > 
> > <snip>
> > 
> > > > > Couldn't we make both sides good?
> > > > > 
> > > > > Here is my quick patch.
> > > > > How about this?
> > > > > It doesn't split THPs in page_list but still reclaims non-THPs, so
> > > > > I think it doesn't change the old behavior much.
> > > > I like this idea; I will do some tests soon.
> > > Hmm, this doesn't work as expected: putback_lru_page() messes up the LRU
> > > order. That isn't a problem if the page will be written, since
> > > rotate_reclaimable_page() will fix the order, but I got worse data than with
> > > my v2 patch, e.g. more thp_fault_fallback events, a messed-up LRU order, and
> > > more pages scanned. We could add something like putback_lru_page_tail, but I'm not
> > 
> > Hmm, it may not be an LRU mess problem, but that is just a guess, and you might be right
> > because you have the workload and can test it.
> > 
> > My guess is that cull_mlocked resets synchronous page reclaim.
> > Could you test this patch again?
> No, I traced it, and it is an LRU mess: putback_lru_page() adds the page to the
> LRU head instead of the tail.

I know the LRU mess happens, but I mean I am not sure it is the culprit.

> 
> > And if the problem is caused by the LRU mess, I think it is worth adding putback_lru_page_tail,
> > because THP added lru_add_page_tail, too.
> I want to put all the remaining pages back to the LRU tail once a huge page is
> split, because enough pages have been reclaimed by then. That needs adding
> something like putback_lru_pages_tail(), which is not complicated but is a fair
> amount of code. And if there are parallel reclaimers, we still get an LRU mess;
> my test already shows it. Is it still worthwhile?

If parallel reclaim happens, it can spoil everything, which is really bad.
I prefer adding putback_lru_page_tail with the isolation trick, because we can
probably reuse it later for pages that temporarily could not be reclaimed for
some reason.
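
Something along these lines is what I mean; it is just a sketch with assumed
names, modelled on putback_lru_page() but inserting at the tail, so a page we
had to skip this round is retried soon instead of cycling through the whole
list:

/*
 * Sketch: single-page tail variant of putback_lru_page().  The helper
 * name and the open-coded list insertion are assumptions for
 * illustration; statistics updates are omitted.
 */
static void putback_lru_page_tail(struct page *page)
{
        struct zone *zone = page_zone(page);
        enum lru_list lru = page_lru(page);

        spin_lock_irq(&zone->lru_lock);
        VM_BUG_ON(PageLRU(page));
        SetPageLRU(page);
        list_add_tail(&page->lru, &zone->lru[lru].list);
        spin_unlock_irq(&zone->lru_lock);

        put_page(page); /* drop the reference taken when the page was isolated */
}

A caller that has to give up on a page temporarily could then use this instead
of the plain put-back, so the page does not migrate all the way back to the
head of the list.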

> 
> Thanks,
> Shaohua
> 

-- 
Kind regards,
Minchan Kim


end of thread

Thread overview: 42+ messages
2011-10-25  2:59 [patch 5/5]thp: split huge page if head page is isolated Shaohua Li
2011-10-25  2:59 ` Shaohua Li
2011-10-27 23:34 ` Minchan Kim
2011-10-27 23:34   ` Minchan Kim
2011-10-28  5:11   ` Shaohua Li
2011-10-28  5:11     ` Shaohua Li
2011-10-28  7:30     ` Minchan Kim
2011-10-28  7:30       ` Minchan Kim
2011-10-28  8:25       ` Shaohua Li
2011-10-28  8:25         ` Shaohua Li
2011-10-28  9:50         ` Minchan Kim
2011-10-28  9:50           ` Minchan Kim
2011-10-31  1:10           ` Shaohua Li
2011-10-31  1:10             ` Shaohua Li
2011-10-31  8:24             ` Minchan Kim
2011-10-31  8:24               ` Minchan Kim
2011-10-29  0:06 ` Minchan Kim
2011-10-29  0:06   ` Minchan Kim
2011-10-31  1:21   ` Shaohua Li
2011-10-31  1:21     ` Shaohua Li
2011-10-31  8:23     ` Minchan Kim
2011-10-31  8:23       ` Minchan Kim
2011-10-31  9:03       ` Shaohua Li
2011-10-31  9:03         ` Shaohua Li
2011-11-02  3:17         ` Shaohua Li
2011-11-02  3:17           ` Shaohua Li
2011-11-08  8:59           ` Minchan Kim
2011-11-08  8:59             ` Minchan Kim
2011-11-09  5:27             ` Shaohua Li
2011-11-09  5:27               ` Shaohua Li
2011-11-09  6:28               ` Minchan Kim
2011-11-09  6:28                 ` Minchan Kim
2011-11-09  7:08                 ` Shaohua Li
2011-11-09  7:08                   ` Shaohua Li
2011-11-10  2:07                   ` Shaohua Li
2011-11-10  2:07                     ` Shaohua Li
2011-11-10  2:23                     ` Minchan Kim
2011-11-10  2:23                       ` Minchan Kim
2011-11-10  2:46                       ` Shaohua Li
2011-11-10  2:46                         ` Shaohua Li
2011-11-10  3:18                         ` Minchan Kim
2011-11-10  3:18                           ` Minchan Kim
