* [patch 1/2] mm, migration: add destination page freeing callback
@ 2014-05-01  0:45 ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however,
the destination page is simply freed back to the system.

This patch adds a memory migration callback, defined by the caller, that
determines how to free destination pages.  If a caller, such as memory
compaction, builds its own freelist of migration targets, this allows already
freed memory to be reused instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it
is called when page migration fails.  If the caller passes NULL instead,
freeing back to the system is handled as usual.  This patch introduces no
functional change.
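
For illustration only (not part of this patch), a caller that maintains its
own pool of target pages could wire the two callbacks up roughly as follows;
my_alloc(), my_free(), struct my_pool and the pool_{get,put}_page() helpers
are hypothetical, and the MR_* reason is arbitrary:

	static struct page *my_alloc(struct page *page, unsigned long private,
				     int **reason)
	{
		struct my_pool *pool = (struct my_pool *)private;

		/* hand out a target page from the caller-maintained pool */
		return pool_get_page(pool);
	}

	static void my_free(struct page *page, unsigned long private)
	{
		struct my_pool *pool = (struct my_pool *)private;

		/* migration failed: keep the unused target for later reuse */
		pool_put_page(pool, page);
	}

	...
	err = migrate_pages(&pagelist, my_alloc, my_free, (unsigned long)pool,
			    MIGRATE_SYNC, MR_SYSCALL);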

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 54 ++++++++++++++++++++++++++++++++-----------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,12 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
-	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
-	 */
-	putback_lru_page(newpage);
+
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1018,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1099,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1114,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1137,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1290,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1746,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);

* [patch 2/2] mm, compaction: return failed migration target pages back to freelist
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-01  0:45   ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory compaction works by having a "freeing scanner" scan from one end of a
zone, isolating pages to use as migration targets, while a "migrating scanner"
scans from the other end of the same zone, isolating pages to be migrated.

When page migration fails for an isolated page, the target page is currently
returned to the system rather than to the freelist built by the freeing
scanner.  This can needlessly force the freeing scanner to keep scanning even
though suitable migration targets have already been isolated and then given
back to the system.

This patch returns destination pages to the freeing scanner's freelist when
page migration fails.  This avoids unnecessary work by the freeing scanner and
also keeps memory as compacted as possible at the end of the zone.
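
For reference, with the failure path that patch 1 adds to unmap_and_move(),
roughly

	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
		put_new_page(newpage, private);
	else
		putback_lru_page(newpage);

passing a freeing callback from compaction is all that is needed for a failed
target to land back on cc->freepages instead of going back to the page
allocator.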

Reported-by: Greg Thelen <gthelen@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -797,6 +797,19 @@ static struct page *compaction_alloc(struct page *migratepage,
 }
 
 /*
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist.  All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/*
  * We cannot control nr_migratepages and nr_freepages fully when migration is
  * running as migrate_pages() has no knowledge of compact_control. When
  * migration is complete, we count the number of pages on the lists by hand.
@@ -1023,8 +1036,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
-				(unsigned long)cc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);

* Re: [patch 1/2] mm, migration: add destination page freeing callback
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-01  5:08 ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-01  5:08 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

Hi David,

On Wed, Apr 30, 2014 at 05:45:24PM -0700, David Rientjes wrote:
> Memory migration uses a callback defined by the caller to determine how to
> allocate destination pages.  When migration fails for a source page, however,
> the destination page is simply freed back to the system.
> 
> This patch adds a memory migration callback, defined by the caller, that
> determines how to free destination pages.  If a caller, such as memory
> compaction, builds its own freelist of migration targets, this allows already
> freed memory to be reused instead of scanning additional memory.
> 
> If the caller provides a function to handle freeing of destination pages, it
> is called when page migration fails.  If the caller passes NULL instead,
> freeing back to the system is handled as usual.  This patch introduces no
> functional change.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Looks good to me.
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

I have one comment below ...

[snip]

> @@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
>  	if (!page_mapped(hpage))
>  		rc = move_to_new_page(new_hpage, hpage, 1, mode);
>  
> -	if (rc)
> +	if (rc != MIGRATEPAGE_SUCCESS)
>  		remove_migration_ptes(hpage, hpage);
>  
>  	if (anon_vma)
>  		put_anon_vma(anon_vma);
>  
> -	if (!rc)
> +	if (rc == MIGRATEPAGE_SUCCESS)
>  		hugetlb_cgroup_migrate(hpage, new_hpage);
>  
>  	unlock_page(hpage);
>  out:
>  	if (rc != -EAGAIN)
>  		putback_active_hugepage(hpage);
> -	put_page(new_hpage);
> +
> +	/*
> +	 * If migration was not successful and there's a freeing callback, use
> +	 * it.  Otherwise, put_page() will drop the reference grabbed during
> +	 * isolation.
> +	 */

This comment is true for both normal pages and huge pages, and people are more
likely to look at unmap_and_move() first, so it would be better to have this
(also) in unmap_and_move().

Thanks,
Naoya Horiguchi

* Re: [patch 2/2] mm, compaction: return failed migration target pages back to freelist
  2014-05-01  0:45   ` David Rientjes
@ 2014-05-01  5:10   ` Naoya Horiguchi
  2014-05-01 21:02       ` David Rientjes
  -1 siblings, 1 reply; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-01  5:10 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, Apr 30, 2014 at 05:45:27PM -0700, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a
> zone, isolating pages to use as migration targets, while a "migrating scanner"
> scans from the other end of the same zone, isolating pages to be migrated.
> 
> When page migration fails for an isolated page, the target page is currently
> returned to the system rather than to the freelist built by the freeing
> scanner.  This can needlessly force the freeing scanner to keep scanning even
> though suitable migration targets have already been isolated and then given
> back to the system.
> 
> This patch returns destination pages to the freeing scanner's freelist when
> page migration fails.  This avoids unnecessary work by the freeing scanner and
> also keeps memory as compacted as possible at the end of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  mm/compaction.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -797,6 +797,19 @@ static struct page *compaction_alloc(struct page *migratepage,
>  }
>  
>  /*
> + * This is a migrate-callback that "frees" freepages back to the isolated
> + * freelist.  All pages on the freelist are from the same zone, so there is no
> + * special handling needed for NUMA.
> + */
> +static void compaction_free(struct page *page, unsigned long data)
> +{
> +	struct compact_control *cc = (struct compact_control *)data;
> +
> +	list_add(&page->lru, &cc->freepages);
> +	cc->nr_freepages++;

With this change, migrate_pages() handles cc->nr_freepages consistently, so
we don't have to walk the freelist to update this count in
update_nr_listpages()?

I'm not sure if that's also true for cc->nr_migratepages, but if it is,
we can completely remove update_nr_listpages().
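
If that holds, update_nr_listpages() could be reduced to counting only the
migratepages list, something like the untested sketch below (assuming
compaction_alloc() already decrements cc->nr_freepages when it hands out a
page):

	static void update_nr_listpages(struct compact_control *cc)
	{
		int nr_migratepages = 0;
		struct page *page;

		/*
		 * cc->nr_freepages is kept up to date by compaction_alloc()
		 * and compaction_free(), so only the migratepages list still
		 * needs to be recounted here.
		 */
		list_for_each_entry(page, &cc->migratepages, lru)
			nr_migratepages++;

		cc->nr_migratepages = nr_migratepages;
	}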

Thanks,
Naoya Horiguchi

* Re: [patch 2/2] mm, compaction: return failed migration target pages back to freelist
  2014-05-01  5:10   ` Naoya Horiguchi
@ 2014-05-01 21:02       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:02 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, 1 May 2014, Naoya Horiguchi wrote:

> > diff --git a/mm/compaction.c b/mm/compaction.c
> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -797,6 +797,19 @@ static struct page *compaction_alloc(struct page *migratepage,
> >  }
> >  
> >  /*
> > + * This is a migrate-callback that "frees" freepages back to the isolated
> > + * freelist.  All pages on the freelist are from the same zone, so there is no
> > + * special handling needed for NUMA.
> > + */
> > +static void compaction_free(struct page *page, unsigned long data)
> > +{
> > +	struct compact_control *cc = (struct compact_control *)data;
> > +
> > +	list_add(&page->lru, &cc->freepages);
> > +	cc->nr_freepages++;
> 
> With this change, migrate_pages() handles cc->nr_freepages consistently, so
> we don't have to walk the freelist to update this count in
> update_nr_listpages()?
> 

Good optimization, I'll add it!

* Re: [patch 1/2] mm, migration: add destination page freeing callback
       [not found] ` <5361d71e.236ec20a.1b3d.ffffc8aeSMTPIN_ADDED_BROKEN@mx.google.com>
@ 2014-05-01 21:02     ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:02 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, 1 May 2014, Naoya Horiguchi wrote:

> Looks good to me.
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> 

Thanks!

> I have one comment below ...
> 
> [snip]
> 
> > @@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
> >  	if (!page_mapped(hpage))
> >  		rc = move_to_new_page(new_hpage, hpage, 1, mode);
> >  
> > -	if (rc)
> > +	if (rc != MIGRATEPAGE_SUCCESS)
> >  		remove_migration_ptes(hpage, hpage);
> >  
> >  	if (anon_vma)
> >  		put_anon_vma(anon_vma);
> >  
> > -	if (!rc)
> > +	if (rc == MIGRATEPAGE_SUCCESS)
> >  		hugetlb_cgroup_migrate(hpage, new_hpage);
> >  
> >  	unlock_page(hpage);
> >  out:
> >  	if (rc != -EAGAIN)
> >  		putback_active_hugepage(hpage);
> > -	put_page(new_hpage);
> > +
> > +	/*
> > +	 * If migration was not successful and there's a freeing callback, use
> > +	 * it.  Otherwise, put_page() will drop the reference grabbed during
> > +	 * isolation.
> > +	 */
> 
> This comment is true for both normal pages and huge pages, and people are more
> likely to look at unmap_and_move() first, so it would be better to have this
> (also) in unmap_and_move().
> 

Ok!

* [patch v2 1/4] mm, migration: add destination page freeing callback
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-01 21:35   ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however,
the destination page is simply freed back to the system.

This patch adds a memory migration callback, defined by the caller, that
determines how to free destination pages.  If a caller, such as memory
compaction, builds its own freelist of migration targets, this allows already
freed memory to be reused instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it
is called when page migration fails.  If the caller passes NULL instead,
freeing back to the system is handled as usual.  This patch introduces no
functional change.

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 55 +++++++++++++++++++++++++++++++++++--------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,17 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
 	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
 	 */
-	putback_lru_page(newpage);
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1023,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1104,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1119,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 1/4] mm, migration: add destination page freeing callback
@ 2014-05-01 21:35   ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however, it 
frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to determine 
how to free destination pages.  If a caller, such as memory compaction, builds 
its own freelist for migration targets, this can reuse already freed memory 
instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it is 
called when page migration fails.  Otherwise, it may pass NULL and freeing back 
to the system will be handled as usual.  This patch introduces no functional 
change.

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 55 +++++++++++++++++++++++++++++++++++--------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,17 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
 	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
 	 */
-	putback_lru_page(newpage);
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1023,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1104,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1119,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 2/4] mm, compaction: return failed migration target pages back to freelist
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-01 21:35     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory compaction works by having a "freeing scanner" scan from one end of a
zone, isolating free pages to use as migration targets, while a "migrating
scanner" scans from the other end of the same zone, isolating pages to be
migrated.

When page migration fails for an isolated page, the target page is currently
returned to the system rather than to the freelist built by the freeing
scanner.  This can needlessly force the freeing scanner to keep scanning
memory even though suitable migration targets have already been isolated and
handed back to the system.

This patch returns destination pages to the freeing scanner's freelist when
page migration fails.  This not only avoids unnecessary work by the freeing
scanner but also encourages memory to be as compacted as possible at the end
of the zone.
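
To illustrate the alloc/free callback pair this relies on, here is a minimal,
purely illustrative sketch of a caller that keeps its own pool of migration
targets (the structure and function names are made up; compaction's real
callbacks are in the diff below):

/* Sketch only: a migrate_pages() caller recycling failed targets. */
struct my_free_pool {
	struct list_head pages;		/* pre-isolated target pages */
	unsigned long nr;
};

static struct page *my_alloc(struct page *migratepage, unsigned long data,
			     int **result)
{
	struct my_free_pool *pool = (struct my_free_pool *)data;
	struct page *page;

	if (list_empty(&pool->pages))
		return NULL;	/* migrate_pages() treats this as -ENOMEM */

	page = list_entry(pool->pages.next, struct page, lru);
	list_del(&page->lru);
	pool->nr--;
	return page;
}

static void my_free(struct page *page, unsigned long data)
{
	struct my_free_pool *pool = (struct my_free_pool *)data;

	/* A failed target goes back on the private pool, not to the buddy. */
	list_add(&page->lru, &pool->pages);
	pool->nr++;
}

/*
 * err = migrate_pages(&migratelist, my_alloc, my_free,
 *		       (unsigned long)&pool, MIGRATE_ASYNC, MR_COMPACTION);
 */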

Reported-by: Greg Thelen <gthelen@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -797,23 +797,32 @@ static struct page *compaction_alloc(struct page *migratepage,
 }
 
 /*
- * We cannot control nr_migratepages and nr_freepages fully when migration is
- * running as migrate_pages() has no knowledge of compact_control. When
- * migration is complete, we count the number of pages on the lists by hand.
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist.  All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/*
+ * We cannot control nr_migratepages fully when migration is running as
+ * migrate_pages() has no knowledge of compact_control.  When migration is
+ * complete, we count the number of pages on the list by hand.
  */
 static void update_nr_listpages(struct compact_control *cc)
 {
 	int nr_migratepages = 0;
-	int nr_freepages = 0;
 	struct page *page;
 
 	list_for_each_entry(page, &cc->migratepages, lru)
 		nr_migratepages++;
-	list_for_each_entry(page, &cc->freepages, lru)
-		nr_freepages++;
 
 	cc->nr_migratepages = nr_migratepages;
-	cc->nr_freepages = nr_freepages;
 }
 
 /* possible outcome of isolate_migratepages */
@@ -1023,8 +1032,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
-				(unsigned long)cc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-01 21:35     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Each zone has a cached migration scanner pfn for memory compaction so that 
subsequent calls to memory compaction can start where the previous call left 
off.

Currently, the compaction migration scanner only updates the per-zone cached
pfn when pageblocks were not skipped for async compaction.  This creates a
dependency on sync compaction being called to keep subsequent calls to async
compaction from rescanning an enormous number of non-MOVABLE pageblocks each
time.  On large machines, this can be very expensive.

This patch adds a per-zone cached migration scanner pfn used only for async
compaction.  It is updated every time a pageblock has been scanned in its
entirety and no pages from it were successfully isolated.  The cached
migration scanner pfn for sync compaction is updated only when compaction is
called in sync mode.
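
Concretely, the convention used by the hunks below is (comment mine, purely
as a reading aid):

	/*
	 * compact_cached_migrate_pfn[0]: restart hint for async compaction
	 * compact_cached_migrate_pfn[1]: restart hint for sync compaction
	 *
	 * cc->sync is a bool, so indexing with it selects the hint that
	 * matches the current mode:
	 */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];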

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mmzone.h |  5 +++--
 mm/compaction.c        | 55 ++++++++++++++++++++++++++------------------------
 2 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -360,9 +360,10 @@ struct zone {
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
 
-	/* pfns where compaction scanners should start */
+	/* pfn where compaction free scanner should start */
 	unsigned long		compact_cached_free_pfn;
-	unsigned long		compact_cached_migrate_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long		compact_cached_migrate_pfn[2];
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
-	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_migrate_pfn[0] = start_pfn;
+	zone->compact_cached_migrate_pfn[1] = start_pfn;
 	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_flush = false;
 
@@ -134,6 +135,7 @@ static void update_pageblock_skip(struct compact_control *cc,
 			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+	unsigned long pfn;
 
 	if (cc->ignore_skip_hint)
 		return;
@@ -141,20 +143,25 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (!page)
 		return;
 
-	if (!nr_isolated) {
-		unsigned long pfn = page_to_pfn(page);
-		set_pageblock_skip(page);
-
-		/* Update where compaction should restart */
-		if (migrate_scanner) {
-			if (!cc->finished_update_migrate &&
-			    pfn > zone->compact_cached_migrate_pfn)
-				zone->compact_cached_migrate_pfn = pfn;
-		} else {
-			if (!cc->finished_update_free &&
-			    pfn < zone->compact_cached_free_pfn)
-				zone->compact_cached_free_pfn = pfn;
-		}
+	if (nr_isolated)
+		return;
+
+	set_pageblock_skip(page);
+	pfn = page_to_pfn(page);
+
+	/* Update where async and sync compaction should restart */
+	if (migrate_scanner) {
+		if (cc->finished_update_migrate)
+			return;
+		if (pfn > zone->compact_cached_migrate_pfn[0])
+			zone->compact_cached_migrate_pfn[0] = pfn;
+		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+			zone->compact_cached_migrate_pfn[1] = pfn;
+	} else {
+		if (cc->finished_update_free)
+			return;
+		if (pfn < zone->compact_cached_free_pfn)
+			zone->compact_cached_free_pfn = pfn;
 	}
 }
 #else
@@ -464,7 +471,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
-	bool skipped_async_unsuitable = false;
 	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
@@ -540,11 +546,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 * the minimum amount of work satisfies the allocation
 			 */
 			mt = get_pageblock_migratetype(page);
-			if (!cc->sync && !migrate_async_suitable(mt)) {
-				cc->finished_update_migrate = true;
-				skipped_async_unsuitable = true;
+			if (!cc->sync && !migrate_async_suitable(mt))
 				goto next_pageblock;
-			}
 		}
 
 		/*
@@ -646,10 +649,8 @@ next_pageblock:
 	/*
 	 * Update the pageblock-skip information and cached scanner pfn,
 	 * if the whole pageblock was scanned without isolating any page.
-	 * This is not done when pageblock was skipped due to being unsuitable
-	 * for async compaction, so that eventual sync compaction can try.
 	 */
-	if (low_pfn == end_pfn && !skipped_async_unsuitable)
+	if (low_pfn == end_pfn)
 		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -875,7 +876,8 @@ static int compact_finished(struct zone *zone,
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
 		/* Let the next compaction start anew. */
-		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 		zone->compact_cached_free_pfn = zone_end_pfn(zone);
 
 		/*
@@ -1000,7 +1002,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1008,7 +1010,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-01 21:35     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Synchronous memory compaction can be very expensive: it can iterate an enormous 
amount of memory without aborting, constantly rescheduling, waiting on page
locks and lru_lock, etc, if a pageblock cannot be defragmented.

Unfortunately, it's too expensive to do at pagefault time for transparent
hugepages, and it's much better to simply fall back to regular pages.  On
128GB machines, we find that synchronous memory compaction can take
O(seconds) for a single thp fault.

Now that async compaction remembers where it left off without strictly relying
on sync compaction, this makes thp allocations best-effort without causing
egregious latency during pagefault.
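
For context (my reading of the allocator slowpath, not part of the diff
below): thp faults allocate with GFP_TRANSHUGE, which carries
__GFP_NO_KSWAPD, so the new check singles out exactly those allocations:

	/* sketch of what the one-liner below amounts to */
	if (gfp_mask & __GFP_NO_KSWAPD) {
		/* thp pagefault: fall back to regular pages rather than
		 * retrying with sync(-light) compaction */
	} else {
		/* other high-order allocations keep the current behaviour */
	}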

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2656,7 +2656,7 @@ rebalance:
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
-	} else {
+	} else if (!(gfp_mask & __GFP_NO_KSWAPD)) {
 		/*
 		 * High-order allocations do not necessarily loop after
 		 * direct reclaim and reclaim/compaction depends on compaction

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 1/4] mm, migration: add destination page freeing callback
  2014-05-01 21:35   ` David Rientjes
                     ` (3 preceding siblings ...)
  (?)
@ 2014-05-02 10:10   ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 10:10 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, May 01, 2014 at 02:35:37PM -0700, David Rientjes wrote:
> Memory migration uses a callback defined by the caller to determine how to
> allocate destination pages.  When migration fails for a source page, however, it 
> frees the destination page back to the system.
> 
> This patch adds a memory migration callback defined by the caller to determine 
> how to free destination pages.  If a caller, such as memory compaction, builds 
> its own freelist for migration targets, this can reuse already freed memory 
> instead of scanning additional memory.
> 
> If the caller provides a function to handle freeing of destination pages, it is 
> called when page migration fails.  Otherwise, it may pass NULL and freeing back 
> to the system will be handled as usual.  This patch introduces no functional 
> change.
> 
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 2/4] mm, compaction: return failed migration target pages back to freelist
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-02 10:11       ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 10:11 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, May 01, 2014 at 02:35:42PM -0700, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a 
> zone which isolates pages as migration targets while another "migrating scanner" 
> scans from the other end of the same zone which isolates pages for migration.
> 
> When page migration fails for an isolated page, the target page is returned to 
> the system rather than the freelist built by the freeing scanner.  This may 
> require the freeing scanner to continue scanning memory after suitable migration 
> targets have already been returned to the system needlessly.
> 
> This patch returns destination pages to the freeing scanner freelist when page 
> migration fails.  This prevents unnecessary work done by the freeing scanner but 
> also encourages memory to be as compacted as possible at the end of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-02 10:22       ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 10:22 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, May 01, 2014 at 02:35:48PM -0700, David Rientjes wrote:
> Synchronous memory compaction can be very expensive: it can iterate an enormous 
> amount of memory without aborting, constantly rescheduling, waiting on page
> locks and lru_lock, etc, if a pageblock cannot be defragmented.
> 
> Unfortunately, it's too expensive for pagefault for transparent hugepages and 
> it's much better to simply fallback to pages.  On 128GB machines, we find that 
> synchronous memory compaction can take O(seconds) for a single thp fault.
> 
> Now that async compaction remembers where it left off without strictly relying
> on sync compaction, this makes thp allocations best-effort without causing
> egregious latency during pagefault.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Compaction uses MIGRATE_SYNC_LIGHT, which in the current implementation
avoids calling ->writepage to clean dirty pages in the fallback migrate
case:

static int fallback_migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page, enum migrate_mode mode)
{
        if (PageDirty(page)) {
                /* Only writeback pages in full synchronous migration */
                if (mode != MIGRATE_SYNC)
                        return -EBUSY;
                return writeout(mapping, page);
        }

or waiting on page writeback in other cases

                /*
                 * Only in the case of a full synchronous migration is it
                 * necessary to wait for PageWriteback. In the async case,
                 * the retry loop is too short and in the sync-light case,
                 * the overhead of stalling is too much
                 */
                if (mode != MIGRATE_SYNC) {
                        rc = -EBUSY;
                        goto uncharge;
                }

or on acquiring the page lock for unforced migrations.

However, buffers still get locked in the SYNC_LIGHT case, causing stalls
in buffer_migrate_lock_buffers(), which may be undesirable, or maybe you
are hitting some other case. It would be preferable to identify what is
getting stalled in SYNC_LIGHT compaction and fix that rather than
disabling it entirely. You may also want to distinguish between direct
compaction by a process and the collapsing of huge pages done by khugepaged.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 10:22       ` Mel Gorman
@ 2014-05-02 11:22         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-02 11:22 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, 2 May 2014, Mel Gorman wrote:

> > Synchronous memory compaction can be very expensive: it can iterate an enormous 
> > amount of memory without aborting, constantly rescheduling, waiting on page
> > locks and lru_lock, etc, if a pageblock cannot be defragmented.
> > 
> > Unfortunately, it's too expensive for pagefault for transparent hugepages and 
> > it's much better to simply fallback to pages.  On 128GB machines, we find that 
> > synchronous memory compaction can take O(seconds) for a single thp fault.
> > 
> > Now that async compaction remembers where it left off without strictly relying
> > on sync compaction, this makes thp allocations best-effort without causing
> > egregious latency during pagefault.
> > 
> > Signed-off-by: David Rientjes <rientjes@google.com>
> 
> Compaction uses MIGRATE_SYNC_LIGHT which in the current implementation
> avoids calling ->writepage to clean dirty page in the fallback migrate
> case
> 
> static int fallback_migrate_page(struct address_space *mapping,
>         struct page *newpage, struct page *page, enum migrate_mode mode)
> {
>         if (PageDirty(page)) {
>                 /* Only writeback pages in full synchronous migration */
>                 if (mode != MIGRATE_SYNC)
>                         return -EBUSY;
>                 return writeout(mapping, page);
>         }
> 
> or waiting on page writeback in other cases
> 
>                 /*
>                  * Only in the case of a full synchronous migration is it
>                  * necessary to wait for PageWriteback. In the async case,
>                  * the retry loop is too short and in the sync-light case,
>                  * the overhead of stalling is too much
>                  */
>                 if (mode != MIGRATE_SYNC) {
>                         rc = -EBUSY;
>                         goto uncharge;
>                 }
> 
> or on acquiring the page lock for unforced migrations.
> 

The page lock I'm referring to is the lock_page() in __unmap_and_move()
that gets called for sync compaction after the migrate_pages() iteration
has made a few passes and failed to grab it with the trylock.  This becomes
a forced migration since __unmap_and_move() returns -EAGAIN when the
trylock fails.
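
Roughly, the path I mean (simplified from the current __unmap_and_move(),
quoting from memory):

	if (!trylock_page(page)) {
		if (!force || mode == MIGRATE_ASYNC)
			goto out;	/* caller sees -EAGAIN and may retry */
		/*
		 * migrate_pages() sets force once a page has failed with
		 * -EAGAIN for a few passes (pass > 2), so sync compaction
		 * ends up sleeping here on the page lock.
		 */
		lock_page(page);
	}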

> However, buffers still get locked in the SYNC_LIGHT case causing stalls
> in buffer_migrate_lock_buffers which may be undesirable or maybe you are
> hitting some other case.

We have perf profiles from one workload in particular that show
contention on i_mmap_mutex (anon isn't interesting since the vast majority
of memory on this workload [120GB on a 128GB machine] has a gup pin and
doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid
isolating pinned pages")) between cpus all doing memory compaction while
trying to fault thp memory.

That's one example that we've seen, but the fact remains that at times 
sync compaction will iterate the entire 128GB machine and not allow an 
order-9 page to be allocated and there's nothing to preempt it like the 
need_resched() or lock contention checks that async compaction has.  (I 
say "at times" because deferred compaction does help if it's triggered and 
the cached pfn values help, but the worst case scenario is when the entire 
machine is iterated.)  Would it make sense when HPAGE_PMD_ORDER == 
pageblock_order, which is usually the case, that we don't even try to 
migrate pages unless the entire pageblock is successfully isolated and 
gfp_mask & __GFP_NO_KSWAPD?  That's the only other thing that I can think 
that will avoid such a problem other than this patch.

We're struggling to find the logic that ensures sync compaction does not 
become too expensive at pagefault and unless that can be guaranteed, it 
seemed best to avoid it entirely.

> It would be preferable to identify what is
> getting stalled in SYNC_LIGHT compaction and fix that rather than
> disabling it entirely. You may also want to distinguish between a direct
> compaction by a process and collapsing huge pages as done by khugepaged.
> 

I'm referring to latencies of 8-9 seconds to fault 64MB of transparent 
hugepages.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 11:22         ` David Rientjes
  (?)
@ 2014-05-02 11:58         ` Mel Gorman
  2014-05-02 20:29             ` David Rientjes
  -1 siblings, 1 reply; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 11:58 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, May 02, 2014 at 04:22:03AM -0700, David Rientjes wrote:
> On Fri, 2 May 2014, Mel Gorman wrote:
> 
> > > Synchronous memory compaction can be very expensive: it can iterate an enormous 
> > > amount of memory without aborting, constantly rescheduling, waiting on page
> > > locks and lru_lock, etc, if a pageblock cannot be defragmented.
> > > 
> > > Unfortunately, it's too expensive for pagefault for transparent hugepages and 
> > > it's much better to simply fallback to pages.  On 128GB machines, we find that 
> > > synchronous memory compaction can take O(seconds) for a single thp fault.
> > > 
> > > Now that async compaction remembers where it left off without strictly relying
> > > on sync compaction, this makes thp allocations best-effort without causing
> > > egregious latency during pagefault.
> > > 
> > > Signed-off-by: David Rientjes <rientjes@google.com>
> > 
> > Compaction uses MIGRATE_SYNC_LIGHT which in the current implementation
> > avoids calling ->writepage to clean dirty page in the fallback migrate
> > case
> > 
> > static int fallback_migrate_page(struct address_space *mapping,
> >         struct page *newpage, struct page *page, enum migrate_mode mode)
> > {
> >         if (PageDirty(page)) {
> >                 /* Only writeback pages in full synchronous migration */
> >                 if (mode != MIGRATE_SYNC)
> >                         return -EBUSY;
> >                 return writeout(mapping, page);
> >         }
> > 
> > or waiting on page writeback in other cases
> > 
> >                 /*
> >                  * Only in the case of a full synchronous migration is it
> >                  * necessary to wait for PageWriteback. In the async case,
> >                  * the retry loop is too short and in the sync-light case,
> >                  * the overhead of stalling is too much
> >                  */
> >                 if (mode != MIGRATE_SYNC) {
> >                         rc = -EBUSY;
> >                         goto uncharge;
> >                 }
> > 
> > or on acquiring the page lock for unforced migrations.
> > 
> 
> The page locks I'm referring to is the lock_page() in __unmap_and_move() 
> that gets called for sync compaction after the migrate_pages() iteration 
> makes a few passes and unsuccessfully grabs it.  This becomes a forced 
> migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
> 

Can that be fixed then instead of disabling it entirely?

> > However, buffers still get locked in the SYNC_LIGHT case causing stalls
> > in buffer_migrate_lock_buffers which may be undesirable or maybe you are
> > hitting some other case.
> 
> We have perf profiles from one workload in particular that shows 
> contention on i_mmap_mutex (anon isn't interesting since the vast majority 
> of memory on this workload [120GB on a 128GB machine] is has a gup pin and 
> doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid 
> isolating pinned pages")) between cpus all doing memory compaction trying 
> to fault thp memory.
> 

Abort SYNC_LIGHT compaction if the mutex is contended.

> That's one example that we've seen, but the fact remains that at times 
> sync compaction will iterate the entire 128GB machine and not allow an 
> order-9 page to be allocated and there's nothing to preempt it like the 
> need_resched() or lock contention checks that async compaction has. 

Make compact_control->sync an enum migrate_mode field and check for
contention in the async/sync_light cases, but leave it alone for sync if
compacting via the proc interface?
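
Something along these lines, purely as an untested sketch (the field and
function names are invented):

static bool compact_should_abort(struct compact_control *cc)
{
	/*
	 * cc->mode would replace the current bool cc->sync and hold the
	 * migrate_mode directly.  Explicit compaction via the proc
	 * trigger (MIGRATE_SYNC) is never aborted.
	 */
	if (cc->mode == MIGRATE_SYNC)
		return false;
	return cc->contended || need_resched();
}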

> (I 
> say "at times" because deferred compaction does help if it's triggered and 
> the cached pfn values help, but the worst case scenario is when the entire 
> machine is iterated.)  Would it make sense when HPAGE_PMD_ORDER == 
> pageblock_order, which is usually the case, that we don't even try to 
> migrate pages unless the entire pageblock is successfully isolated and 
> gfp_mask & __GFP_NO_KSWAPD? 

That would be ok for async and sync_light compaction but again leave it
for sync compaction if compacting via the proc interface to give us a
basis for comparison.

> That's the only other thing that I can think 
> that will avoid such a problem other than this patch.
> 
> We're struggling to find the logic that ensures sync compaction does not 
> become too expensive at pagefault and unless that can be guaranteed, it 
> seemed best to avoid it entirely.
> 

The intent of sync_light was to avoid compaction being too expensive at
page fault time. At the time it would have been evaluated against
machines with 4G of memory. You have found that it did not go far enough.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch 1/2] mm, migration: add destination page freeing callback
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-02 13:16   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 13:16 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/01/2014 02:45 AM, David Rientjes wrote:
> Memory migration uses a callback defined by the caller to determine how to
> allocate destination pages.  When migration fails for a source page, however, it
> frees the destination page back to the system.
>
> This patch adds a memory migration callback defined by the caller to determine
> how to free destination pages.  If a caller, such as memory compaction, builds
> its own freelist for migration targets, this can reuse already freed memory
> instead of scanning additional memory.
>
> If the caller provides a function to handle freeing of destination pages, it is
> called when page migration fails.  Otherwise, it may pass NULL and freeing back
> to the system will be handled as usual.  This patch introduces no functional
> change.
>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>   include/linux/migrate.h | 11 ++++++----
>   mm/compaction.c         |  2 +-
>   mm/memory-failure.c     |  4 ++--
>   mm/memory_hotplug.c     |  2 +-
>   mm/mempolicy.c          |  4 ++--
>   mm/migrate.c            | 54 ++++++++++++++++++++++++++++++++-----------------
>   mm/page_alloc.c         |  2 +-
>   7 files changed, 50 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -5,7 +5,9 @@
>   #include <linux/mempolicy.h>
>   #include <linux/migrate_mode.h>
>
> -typedef struct page *new_page_t(struct page *, unsigned long private, int **);
> +typedef struct page *new_page_t(struct page *page, unsigned long private,
> +				int **reason);
> +typedef void free_page_t(struct page *page, unsigned long private);
>
>   /*
>    * Return values from addresss_space_operations.migratepage():
> @@ -38,7 +40,7 @@ enum migrate_reason {
>   extern void putback_movable_pages(struct list_head *l);
>   extern int migrate_page(struct address_space *,
>   			struct page *, struct page *, enum migrate_mode);
> -extern int migrate_pages(struct list_head *l, new_page_t x,
> +extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
>   		unsigned long private, enum migrate_mode mode, int reason);
>
>   extern int migrate_prep(void);
> @@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
>   #else
>
>   static inline void putback_movable_pages(struct list_head *l) {}
> -static inline int migrate_pages(struct list_head *l, new_page_t x,
> -		unsigned long private, enum migrate_mode mode, int reason)
> +static inline int migrate_pages(struct list_head *l, new_page_t new,
> +		free_page_t free, unsigned long private, enum migrate_mode mode,
> +		int reason)
>   	{ return -ENOSYS; }
>
>   static inline int migrate_prep(void) { return -ENOSYS; }
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   		}
>
>   		nr_migrate = cc->nr_migratepages;
> -		err = migrate_pages(&cc->migratepages, compaction_alloc,
> +		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
>   				(unsigned long)cc,
>   				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>   				MR_COMPACTION);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
>
>   	/* Keep page count to indicate a given hugepage is isolated. */
>   	list_move(&hpage->lru, &pagelist);
> -	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
> +	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
>   				MIGRATE_SYNC, MR_MEMORY_FAILURE);
>   	if (ret) {
>   		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
> @@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
>   		inc_zone_page_state(page, NR_ISOLATED_ANON +
>   					page_is_file_cache(page));
>   		list_add(&page->lru, &pagelist);
> -		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
> +		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
>   					MIGRATE_SYNC, MR_MEMORY_FAILURE);
>   		if (ret) {
>   			if (!list_empty(&pagelist)) {
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
>   		 * alloc_migrate_target should be improooooved!!
>   		 * migrate_pages returns # of failed pages.
>   		 */
> -		ret = migrate_pages(&source, alloc_migrate_target, 0,
> +		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
>   					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
>   		if (ret)
>   			putback_movable_pages(&source);
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
>   			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
>
>   	if (!list_empty(&pagelist)) {
> -		err = migrate_pages(&pagelist, new_node_page, dest,
> +		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
>   					MIGRATE_SYNC, MR_SYSCALL);
>   		if (err)
>   			putback_movable_pages(&pagelist);
> @@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
>   		if (!list_empty(&pagelist)) {
>   			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
>   			nr_failed = migrate_pages(&pagelist, new_vma_page,
> -					(unsigned long)vma,
> +					NULL, (unsigned long)vma,
>   					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
>   			if (nr_failed)
>   				putback_movable_pages(&pagelist);
> diff --git a/mm/migrate.c b/mm/migrate.c
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -938,8 +938,9 @@ out:
>    * Obtain the lock on page, remove all ptes and migrate the page
>    * to the newly allocated page in newpage.
>    */
> -static int unmap_and_move(new_page_t get_new_page, unsigned long private,
> -			struct page *page, int force, enum migrate_mode mode)
> +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
> +			unsigned long private, struct page *page, int force,
> +			enum migrate_mode mode)
>   {
>   	int rc = 0;
>   	int *result = NULL;
> @@ -983,11 +984,12 @@ out:
>   				page_is_file_cache(page));
>   		putback_lru_page(page);
>   	}
> -	/*
> -	 * Move the new page to the LRU. If migration was not successful
> -	 * then this will free the page.
> -	 */
> -	putback_lru_page(newpage);
> +
> +	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
> +		put_new_page(newpage, private);
> +	else
> +		putback_lru_page(newpage);
> +
>   	if (result) {
>   		if (rc)
>   			*result = rc;
> @@ -1016,8 +1018,9 @@ out:
>    * will wait in the page fault for migration to complete.
>    */
>   static int unmap_and_move_huge_page(new_page_t get_new_page,
> -				unsigned long private, struct page *hpage,
> -				int force, enum migrate_mode mode)
> +				free_page_t put_new_page, unsigned long private,
> +				struct page *hpage, int force,
> +				enum migrate_mode mode)
>   {
>   	int rc = 0;
>   	int *result = NULL;
> @@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
>   	if (!page_mapped(hpage))
>   		rc = move_to_new_page(new_hpage, hpage, 1, mode);
>
> -	if (rc)
> +	if (rc != MIGRATEPAGE_SUCCESS)
>   		remove_migration_ptes(hpage, hpage);
>
>   	if (anon_vma)
>   		put_anon_vma(anon_vma);
>
> -	if (!rc)
> +	if (rc == MIGRATEPAGE_SUCCESS)
>   		hugetlb_cgroup_migrate(hpage, new_hpage);
>
>   	unlock_page(hpage);
>   out:
>   	if (rc != -EAGAIN)
>   		putback_active_hugepage(hpage);
> -	put_page(new_hpage);
> +
> +	/*
> +	 * If migration was not successful and there's a freeing callback, use
> +	 * it.  Otherwise, put_page() will drop the reference grabbed during
> +	 * isolation.
> +	 */
> +	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
> +		put_new_page(new_hpage, private);
> +	else
> +		put_page(new_hpage);
> +
>   	if (result) {
>   		if (rc)
>   			*result = rc;
> @@ -1086,6 +1099,8 @@ out:
>    * @from:		The list of pages to be migrated.
>    * @get_new_page:	The function used to allocate free pages to be used
>    *			as the target of the page migration.
> + * @put_new_page:	The function used to free target pages if migration
> + *			fails, or NULL if no special handling is necessary.
>    * @private:		Private data to be passed on to get_new_page()
>    * @mode:		The migration mode that specifies the constraints for
>    *			page migration, if any.
> @@ -1099,7 +1114,8 @@ out:
>    * Returns the number of pages that were not migrated, or an error code.
>    */
>   int migrate_pages(struct list_head *from, new_page_t get_new_page,
> -		unsigned long private, enum migrate_mode mode, int reason)
> +		free_page_t put_new_page, unsigned long private,
> +		enum migrate_mode mode, int reason)
>   {
>   	int retry = 1;
>   	int nr_failed = 0;
> @@ -1121,10 +1137,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>
>   			if (PageHuge(page))
>   				rc = unmap_and_move_huge_page(get_new_page,
> -						private, page, pass > 2, mode);
> +						put_new_page, private, page,
> +						pass > 2, mode);
>   			else
> -				rc = unmap_and_move(get_new_page, private,
> -						page, pass > 2, mode);
> +				rc = unmap_and_move(get_new_page, put_new_page,
> +						private, page, pass > 2, mode);
>
>   			switch(rc) {
>   			case -ENOMEM:
> @@ -1273,7 +1290,7 @@ set_status:
>
>   	err = 0;
>   	if (!list_empty(&pagelist)) {
> -		err = migrate_pages(&pagelist, new_page_node,
> +		err = migrate_pages(&pagelist, new_page_node, NULL,
>   				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
>   		if (err)
>   			putback_movable_pages(&pagelist);
> @@ -1729,7 +1746,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
>
>   	list_add(&page->lru, &migratepages);
>   	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
> -				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
> +				     NULL, node, MIGRATE_ASYNC,
> +				     MR_NUMA_MISPLACED);
>   	if (nr_remaining) {
>   		if (!list_empty(&migratepages)) {
>   			list_del(&page->lru);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>   		cc->nr_migratepages -= nr_reclaimed;
>
>   		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
> -				    0, MIGRATE_SYNC, MR_CMA);
> +				    NULL, 0, MIGRATE_SYNC, MR_CMA);
>   	}
>   	if (ret < 0) {
>   		putback_movable_pages(&cc->migratepages);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 2/4] mm, compaction: return failed migration target pages back to freelist
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-02 15:23       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:23 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/01/2014 11:35 PM, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a
> zone which isolates pages as migration targets while another "migrating scanner"
> scans from the other end of the same zone which isolates pages for migration.
>
> When page migration fails for an isolated page, the target page is returned to
> the system rather than the freelist built by the freeing scanner.  This may
> require the freeing scanner to continue scanning memory after suitable migration
> targets have already been returned to the system needlessly.
>
> This patch returns destination pages to the freeing scanner freelist when page
> migration fails.  This prevents unnecessary work done by the freeing scanner but
> also encourages memory to be as compacted as possible at the end of the zone.

Note that the free scanner's work can be further reduced, as it no longer 
needs to restart at the highest pageblock where it isolated something. It 
currently does that exactly because something might have been returned back 
to the allocator and we don't want to miss it.
I'll reply with a patch for that shortly (it applies on top of the 
isolate_freepages() changes in -mm plus Joonsoo's not-yet-included patch 
from http://marc.info/?l=linux-mm&m=139841452326021&w=2 )

> Reported-by: Greg Thelen <gthelen@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>   mm/compaction.c | 27 ++++++++++++++++++---------
>   1 file changed, 18 insertions(+), 9 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -797,23 +797,32 @@ static struct page *compaction_alloc(struct page *migratepage,
>   }
>
>   /*
> - * We cannot control nr_migratepages and nr_freepages fully when migration is
> - * running as migrate_pages() has no knowledge of compact_control. When
> - * migration is complete, we count the number of pages on the lists by hand.
> + * This is a migrate-callback that "frees" freepages back to the isolated
> + * freelist.  All pages on the freelist are from the same zone, so there is no
> + * special handling needed for NUMA.
> + */
> +static void compaction_free(struct page *page, unsigned long data)
> +{
> +	struct compact_control *cc = (struct compact_control *)data;
> +
> +	list_add(&page->lru, &cc->freepages);
> +	cc->nr_freepages++;
> +}
> +
> +/*
> + * We cannot control nr_migratepages fully when migration is running as
> + * migrate_pages() has no knowledge of of compact_control.  When migration is
> + * complete, we count the number of pages on the list by hand.

Actually, migrate_pages() returns the number of failed pages except when it 
returns an error code, and we only use that value for the tracepoint. 
I'll send a followup patch for this.

>    */
>   static void update_nr_listpages(struct compact_control *cc)
>   {
>   	int nr_migratepages = 0;
> -	int nr_freepages = 0;
>   	struct page *page;
>
>   	list_for_each_entry(page, &cc->migratepages, lru)
>   		nr_migratepages++;
> -	list_for_each_entry(page, &cc->freepages, lru)
> -		nr_freepages++;
>
>   	cc->nr_migratepages = nr_migratepages;
> -	cc->nr_freepages = nr_freepages;
>   }
>
>   /* possible outcome of isolate_migratepages */
> @@ -1023,8 +1032,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   		}
>
>   		nr_migrate = cc->nr_migratepages;
> -		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
> -				(unsigned long)cc,
> +		err = migrate_pages(&cc->migratepages, compaction_alloc,
> +				compaction_free, (unsigned long)cc,
>   				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>   				MR_COMPACTION);
>   		update_nr_listpages(cc);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [PATCH] mm/compaction: do not count migratepages when unnecessary
  2014-05-02 15:23       ` Vlastimil Babka
  (?)
@ 2014-05-02 15:26       ` Vlastimil Babka
  2014-05-06 21:18         ` Naoya Horiguchi
       [not found]         ` <1399411134-k43fsr0p@n-horiguchi@ah.jp.nec.com>
  -1 siblings, 2 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:26 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes, Hugh Dickins
  Cc: Greg Thelen, linux-kernel, linux-mm, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

During compaction, update_nr_listpages() has been used to count remaining
non-migrated and free pages after a call to migrate_pages(). The freepages
counting has become unnecessary, and it turns out that migratepages counting
is also unnecessary in most cases.

The only situation where cc->migratepages still needs to be counted is when
migrate_pages() returns with a negative error code. Otherwise, the non-negative
return value is the number of pages that were not migrated, which is exactly
the count of pages remaining on the cc->migratepages list.

Furthermore, any non-zero count is only interesting for the tracepoint of
mm_compaction_migratepages events, because after that all remaining unmigrated
pages are put back and their count is set to 0.

This patch therefore removes update_nr_listpages() completely, and changes the
tracepoint definition so that the manual counting is done only when the
tracepoint is enabled, and only when migrate_pages() returns a negative error
code.

Furthermore, migrate_pages() and the tracepoints won't be called when there's
nothing to migrate. This potentially avoids some wasted cycles and reduces the
volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
The mm_compaction_isolate_migratepages event is better for determining that
nothing was isolated for migration, and this one was just duplicating the info.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
 mm/compaction.c                   | 30 ++++++++----------------------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544e..bbd5e1f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,7 +5,9 @@
 #define _TRACE_COMPACTION_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <linux/tracepoint.h>
+#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
 
 TRACE_EVENT(mm_compaction_migratepages,
 
-	TP_PROTO(unsigned long nr_migrated,
-		unsigned long nr_failed),
+	TP_PROTO(unsigned long nr_all,
+		int migrate_rc,
+		struct list_head * migratepages),
 
-	TP_ARGS(nr_migrated, nr_failed),
+	TP_ARGS(nr_all, migrate_rc, migratepages),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, nr_migrated)
@@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
 	),
 
 	TP_fast_assign(
-		__entry->nr_migrated = nr_migrated;
+		unsigned long nr_failed = 0;
+		struct page *page;
+
+		/*
+		 * migrate_pages() returns either a non-negative number
+		 * with the number of pages that failed migration, or an
+		 * error code, in which case we need to count the remaining
+		 * pages manually
+		 */
+		if (migrate_rc >= 0)
+			nr_failed = migrate_rc;
+		else
+		        list_for_each_entry(page, migratepages, lru)
+				nr_failed++;
+
+		__entry->nr_migrated = nr_all - nr_failed;
 		__entry->nr_failed = nr_failed;
 	),
 
diff --git a/mm/compaction.c b/mm/compaction.c
index ae1d0ae..873d7de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
 	cc->nr_freepages++;
 }
 
-/*
- * We cannot control nr_migratepages fully when migration is running as
- * migrate_pages() has no knowledge of of compact_control.  When migration is
- * complete, we count the number of pages on the list by hand.
- */
-static void update_nr_listpages(struct compact_control *cc)
-{
-	int nr_migratepages = 0;
-	struct page *page;
-
-	list_for_each_entry(page, &cc->migratepages, lru)
-		nr_migratepages++;
-
-	cc->nr_migratepages = nr_migratepages;
-}
-
 /* possible outcome of isolate_migratepages */
 typedef enum {
 	ISOLATE_ABORT,		/* Abort compaction now */
@@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
-		unsigned long nr_migrate, nr_remaining;
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		nr_migrate = cc->nr_migratepages;
+		if (!cc->nr_migratepages)
+			continue;
+
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
-		update_nr_listpages(cc);
-		nr_remaining = cc->nr_migratepages;
 
-		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
-						nr_remaining);
+		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+							&cc->migratepages);
 
 		/* Release isolated pages not migrated */
 		if (err) {
@@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				ret = COMPACT_PARTIAL;
 				goto out;
 			}
+		} else {
+			/* All pages were successfully migrated */
+			cc->nr_migratepages = 0;
 		}
 	}
 
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [PATCH 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-02 15:23       ` Vlastimil Babka
@ 2014-05-02 15:27         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:27 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel

The compaction free scanner in isolate_freepages() currently remembers the PFN
of the highest pageblock where it successfully isolated pages, to be used as
the starting pageblock for the next invocation. The rationale behind this is
that page migration might return free pages to the allocator when migration
fails, and we don't want to skip them if compaction continues.

Since migration now returns free pages back to the compaction code where they
can be reused, this is no longer a concern. This patch changes
isolate_freepages() so that the PFN for restarting is updated with each
pageblock where isolation is attempted. Using stress-highalloc from mmtests,
this resulted in a 10% reduction in the number of pages scanned by the free
scanner.

Note that the somewhat similar functionality that records the highest
successful pageblock in zone->compact_cached_free_pfn remains unchanged. That
cache is used when the whole compaction is restarted, not for multiple
invocations of the free scanner during a single compaction.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 mm/compaction.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 873d7de..1967850 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -686,12 +686,6 @@ static void isolate_freepages(struct zone *zone,
 	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
 
 	/*
-	 * If no pages are isolated, the block_start_pfn < low_pfn check
-	 * will kick in.
-	 */
-	next_free_pfn = 0;
-
-	/*
 	 * Isolate free pages until enough are available to migrate the
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
@@ -731,19 +725,19 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Found a block suitable for isolating free pages from */
+		next_free_pfn = block_start_pfn;
 		isolated = isolate_freepages_block(cc, block_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
 
 		/*
-		 * Record the highest PFN we isolated pages from. When next
-		 * looking for free pages, the search will restart here as
-		 * page migration may have returned some pages to the allocator
+		 * Set a flag that we successfully isolated in this pageblock.
+		 * In the next loop iteration, zone->compact_cached_free_pfn
+		 * will not be updated and thus it will effectively contain the
+		 * highest pageblock we isolated pages from.
 		 */
-		if (isolated && next_free_pfn == 0) {
+		if (isolated)
 			cc->finished_update_free = true;
-			next_free_pfn = block_start_pfn;
-		}
 	}
 
 	/* split_free_page does not map the pages */
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [PATCH 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-02 15:23       ` Vlastimil Babka
@ 2014-05-02 15:29         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:29 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel

During compaction, update_nr_listpages() has been used to count remaining
non-migrated and free pages after a call to migrate_pages(). The freepages
counting has become unnecessary, and it turns out that migratepages counting
is also unnecessary in most cases.

The only situation where cc->migratepages still needs to be counted is when
migrate_pages() returns with a negative error code. Otherwise, the non-negative
return value is the number of pages that were not migrated, which is exactly
the count of pages remaining on the cc->migratepages list.

Furthermore, any non-zero count is only interesting for the tracepoint of
mm_compaction_migratepages events, because after that all remaining unmigrated
pages are put back and their count is set to 0.

This patch therefore removes update_nr_listpages() completely, and changes the
tracepoint definition so that the manual counting is done only when the
tracepoint is enabled, and only when migrate_pages() returns a negative error
code.

Furthermore, migrate_pages() and the tracepoints won't be called when there's
nothing to migrate. This potentially avoids some wasted cycles and reduces the
volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
The mm_compaction_isolate_migratepages event is better for determining that
nothing was isolated for migration, and this one was just duplicating the info.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
 mm/compaction.c                   | 30 ++++++++----------------------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544e..bbd5e1f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,7 +5,9 @@
 #define _TRACE_COMPACTION_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <linux/tracepoint.h>
+#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
 
 TRACE_EVENT(mm_compaction_migratepages,
 
-	TP_PROTO(unsigned long nr_migrated,
-		unsigned long nr_failed),
+	TP_PROTO(unsigned long nr_all,
+		int migrate_rc,
+		struct list_head * migratepages),
 
-	TP_ARGS(nr_migrated, nr_failed),
+	TP_ARGS(nr_all, migrate_rc, migratepages),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, nr_migrated)
@@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
 	),
 
 	TP_fast_assign(
-		__entry->nr_migrated = nr_migrated;
+		unsigned long nr_failed = 0;
+		struct page *page;
+
+		/*
+		 * migrate_pages() returns either a non-negative number
+		 * with the number of pages that failed migration, or an
+		 * error code, in which case we need to count the remaining
+		 * pages manually
+		 */
+		if (migrate_rc >= 0)
+			nr_failed = migrate_rc;
+		else
+		        list_for_each_entry(page, migratepages, lru)
+				nr_failed++;
+
+		__entry->nr_migrated = nr_all - nr_failed;
 		__entry->nr_failed = nr_failed;
 	),
 
diff --git a/mm/compaction.c b/mm/compaction.c
index ae1d0ae..873d7de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
 	cc->nr_freepages++;
 }
 
-/*
- * We cannot control nr_migratepages fully when migration is running as
- * migrate_pages() has no knowledge of of compact_control.  When migration is
- * complete, we count the number of pages on the list by hand.
- */
-static void update_nr_listpages(struct compact_control *cc)
-{
-	int nr_migratepages = 0;
-	struct page *page;
-
-	list_for_each_entry(page, &cc->migratepages, lru)
-		nr_migratepages++;
-
-	cc->nr_migratepages = nr_migratepages;
-}
-
 /* possible outcome of isolate_migratepages */
 typedef enum {
 	ISOLATE_ABORT,		/* Abort compaction now */
@@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
-		unsigned long nr_migrate, nr_remaining;
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		nr_migrate = cc->nr_migratepages;
+		if (!cc->nr_migratepages)
+			continue;
+
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
-		update_nr_listpages(cc);
-		nr_remaining = cc->nr_migratepages;
 
-		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
-						nr_remaining);
+		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+							&cc->migratepages);
 
 		/* Release isolated pages not migrated */
 		if (err) {
@@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				ret = COMPACT_PARTIAL;
 				goto out;
 			}
+		} else {
+			/* All pages were successfully migrated */
+			cc->nr_migratepages = 0;
 		}
 	}
 
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 11:58         ` Mel Gorman
@ 2014-05-02 20:29             ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-02 20:29 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, 2 May 2014, Mel Gorman wrote:

> > The page locks I'm referring to is the lock_page() in __unmap_and_move() 
> > that gets called for sync compaction after the migrate_pages() iteration 
> > makes a few passes and unsuccessfully grabs it.  This becomes a forced 
> > migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
> > 
> 
> Can that be fixed then instead of disabling it entirely?
> 

We could return -EAGAIN when the trylock_page() fails for 
MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that 
currently for MIGRATE_ASYNC, and I could extend it to be ignored for 
MIGRATE_SYNC_LIGHT as well.
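
Roughly something like this in the __unmap_and_move() trylock path (an
untested sketch of the idea, not a posted patch; assumes the existing
trylock/force structure):

	if (!trylock_page(page)) {
		/*
		 * Sketch: treat a failed trylock as a retryable -EAGAIN for
		 * MIGRATE_SYNC_LIGHT as well, instead of forcing lock_page();
		 * only full MIGRATE_SYNC would still block on the page lock.
		 */
		if (!force || mode != MIGRATE_SYNC)
			goto out;	/* rc is still -EAGAIN */

		lock_page(page);
	}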

> > We have perf profiles from one workload in particular that shows 
> > contention on i_mmap_mutex (anon isn't interesting since the vast majority 
> > of memory on this workload [120GB on a 128GB machine] is has a gup pin and 
> > doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid 
> > isolating pinned pages")) between cpus all doing memory compaction trying 
> > to fault thp memory.
> > 
> 
> Abort SYNC_LIGHT compaction if the mutex is contended.
> 

Yeah, I have patches for that as well but we're waiting to see if they are 
actually needed when sync compaction is disabled for thp.  If we aren't 
actually going to disable it entirely, then I can revive those patches if 
the contention becomes such an issue.

> > That's one example that we've seen, but the fact remains that at times 
> > sync compaction will iterate the entire 128GB machine and not allow an 
> > order-9 page to be allocated and there's nothing to preempt it like the 
> > need_resched() or lock contention checks that async compaction has. 
> 
> Make compact_control->sync the same enum field and check for contention
> on the async/sync_light case but leave it for sync if compacting via the
> proc interface?
> 

Ok, that certainly can be done; I wasn't sure you would be happy with such 
a change.  I'm not sure there's much of a difference between the new 
compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now, 
though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from 
the page allocator, i.e. remove sync_migration entirely, and just retry 
with a second call to compaction before failing instead?
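
Something along these lines, as a sketch only (the field name is assumed,
nothing below has been posted):

	struct compact_control {
		/* ... */
		enum migrate_mode mode;		/* replaces "bool sync" */
	};

compact_zone() would then pass the mode straight through to migrate_pages():

	err = migrate_pages(&cc->migratepages, compaction_alloc,
			compaction_free, (unsigned long)cc,
			cc->mode, MR_COMPACTION);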

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-05  9:34       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-05  9:34 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/01/2014 11:35 PM, David Rientjes wrote:
> Each zone has a cached migration scanner pfn for memory compaction so that
> subsequent calls to memory compaction can start where the previous call left
> off.
>
> Currently, the compaction migration scanner only updates the per-zone cached pfn
> when pageblocks were not skipped for async compaction.  This creates a
> dependency on calling sync compaction to prevent subsequent calls to async
> compaction from scanning an enormous number of non-MOVABLE pageblocks each time
> it is called.  On large machines, this could be very expensive.

OK, that's due to my commit 50b5b094e6 ("mm: compaction: do not mark 
unmovable pageblocks as skipped in async compaction"); the intention 
was to avoid marking pageblocks as to-be-skipped just because they were 
ignored by async compaction, which would make the following sync 
compaction ignore them as well. However, it's true that 
update_pageblock_skip() also updates the cached pfns, and not updating 
them is a side effect of this change.

I didn't think that would be a problem, as skipping whole pageblocks due 
to being non-movable should be fast and done without taking locks. But if 
your testing shows that this is a problem, then OK.

> This patch adds a per-zone cached migration scanner pfn only for async
> compaction.  It is updated every time a pageblock has been scanned in its
> entirety and when no pages from it were successfully isolated.  The cached
> migration scanner pfn for sync compaction is updated only when called for sync
> compaction.

I think this might be overkill, and maybe just decoupling the cached 
pfn update from update_pageblock_skip() would be enough, i.e. 
restoring the pre-50b5b094e6 behavior for the cached pfn (but not for 
the skip bits)? I wonder if your new sync migration scanner would make any 
difference. Presumably, when async compaction finishes without success by 
having the scanners meet, compact_finished() will reset the cached pfns 
and sync compaction will not have a chance to use any previously 
cached value anyway?
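
Something like this minimal sketch of the decoupling is what I have in
mind (illustrative names only, not a real patch; compact_cached_migrate_pfn
is still the single pre-patch field here):

/* sketch against mm/compaction.c internals */
#include "internal.h"

/*
 * Illustrative sketch: update the cached migrate pfn whenever a whole
 * pageblock was scanned, independently of whether the skip bit gets set,
 * restoring the pre-50b5b094e6 behaviour for the cached pfn only.
 */
static void update_cached_migrate_pfn(struct zone *zone,
				      struct compact_control *cc,
				      unsigned long pfn)
{
	if (cc->finished_update_migrate)
		return;
	if (pfn > zone->compact_cached_migrate_pfn)
		zone->compact_cached_migrate_pfn = pfn;
}

/* update_pageblock_skip() would then manage only the skip bits. */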

> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   include/linux/mmzone.h |  5 +++--
>   mm/compaction.c        | 55 ++++++++++++++++++++++++++------------------------
>   2 files changed, 32 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -360,9 +360,10 @@ struct zone {
>   	/* Set to true when the PG_migrate_skip bits should be cleared */
>   	bool			compact_blockskip_flush;
>
> -	/* pfns where compaction scanners should start */
> +	/* pfn where compaction free scanner should start */
>   	unsigned long		compact_cached_free_pfn;
> -	unsigned long		compact_cached_migrate_pfn;
> +	/* pfn where async and sync compaction migration scanner should start */
> +	unsigned long		compact_cached_migrate_pfn[2];
>   #endif
>   #ifdef CONFIG_MEMORY_HOTPLUG
>   	/* see spanned/present_pages for more description */
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
>   	unsigned long end_pfn = zone_end_pfn(zone);
>   	unsigned long pfn;
>
> -	zone->compact_cached_migrate_pfn = start_pfn;
> +	zone->compact_cached_migrate_pfn[0] = start_pfn;
> +	zone->compact_cached_migrate_pfn[1] = start_pfn;
>   	zone->compact_cached_free_pfn = end_pfn;
>   	zone->compact_blockskip_flush = false;
>
> @@ -134,6 +135,7 @@ static void update_pageblock_skip(struct compact_control *cc,
>   			bool migrate_scanner)
>   {
>   	struct zone *zone = cc->zone;
> +	unsigned long pfn;
>
>   	if (cc->ignore_skip_hint)
>   		return;
> @@ -141,20 +143,25 @@ static void update_pageblock_skip(struct compact_control *cc,
>   	if (!page)
>   		return;
>
> -	if (!nr_isolated) {
> -		unsigned long pfn = page_to_pfn(page);
> -		set_pageblock_skip(page);
> -
> -		/* Update where compaction should restart */
> -		if (migrate_scanner) {
> -			if (!cc->finished_update_migrate &&
> -			    pfn > zone->compact_cached_migrate_pfn)
> -				zone->compact_cached_migrate_pfn = pfn;
> -		} else {
> -			if (!cc->finished_update_free &&
> -			    pfn < zone->compact_cached_free_pfn)
> -				zone->compact_cached_free_pfn = pfn;
> -		}
> +	if (nr_isolated)
> +		return;
> +
> +	set_pageblock_skip(page);
> +	pfn = page_to_pfn(page);
> +
> +	/* Update where async and sync compaction should restart */
> +	if (migrate_scanner) {
> +		if (cc->finished_update_migrate)
> +			return;
> +		if (pfn > zone->compact_cached_migrate_pfn[0])
> +			zone->compact_cached_migrate_pfn[0] = pfn;
> +		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
> +			zone->compact_cached_migrate_pfn[1] = pfn;
> +	} else {
> +		if (cc->finished_update_free)
> +			return;
> +		if (pfn < zone->compact_cached_free_pfn)
> +			zone->compact_cached_free_pfn = pfn;
>   	}
>   }
>   #else
> @@ -464,7 +471,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	unsigned long flags;
>   	bool locked = false;
>   	struct page *page = NULL, *valid_page = NULL;
> -	bool skipped_async_unsuitable = false;
>   	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
>   				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
>
> @@ -540,11 +546,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			 * the minimum amount of work satisfies the allocation
>   			 */
>   			mt = get_pageblock_migratetype(page);
> -			if (!cc->sync && !migrate_async_suitable(mt)) {
> -				cc->finished_update_migrate = true;
> -				skipped_async_unsuitable = true;
> +			if (!cc->sync && !migrate_async_suitable(mt))
>   				goto next_pageblock;
> -			}
>   		}
>
>   		/*
> @@ -646,10 +649,8 @@ next_pageblock:
>   	/*
>   	 * Update the pageblock-skip information and cached scanner pfn,
>   	 * if the whole pageblock was scanned without isolating any page.
> -	 * This is not done when pageblock was skipped due to being unsuitable
> -	 * for async compaction, so that eventual sync compaction can try.
>   	 */
> -	if (low_pfn == end_pfn && !skipped_async_unsuitable)
> +	if (low_pfn == end_pfn)
>   		update_pageblock_skip(cc, valid_page, nr_isolated, true);
>
>   	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
> @@ -875,7 +876,8 @@ static int compact_finished(struct zone *zone,
>   	/* Compaction run completes if the migrate and free scanner meet */
>   	if (cc->free_pfn <= cc->migrate_pfn) {
>   		/* Let the next compaction start anew. */
> -		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
>   		zone->compact_cached_free_pfn = zone_end_pfn(zone);
>
>   		/*
> @@ -1000,7 +1002,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	 * information on where the scanners should start but check that it
>   	 * is initialised by ensuring the values are within zone boundaries.
>   	 */
> -	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
> +	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
>   	cc->free_pfn = zone->compact_cached_free_pfn;
>   	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
>   		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> @@ -1008,7 +1010,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	}
>   	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
>   		cc->migrate_pfn = start_pfn;
> -		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
>   	}
>
>   	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-05  9:34       ` Vlastimil Babka
@ 2014-05-05  9:51         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-05  9:51 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Mon, 5 May 2014, Vlastimil Babka wrote:

> OK that's due to my commit 50b5b094e6 ("mm: compaction: do not mark unmovable
> pageblocks as skipped in async compaction") and the intention was to avoid
> marking pageblocks as to-be-skipped just because they were ignored by async
> compaction, which would make the following sync compaction ignore them as
> well. However it's true that update_pageblock_skip() also updates the cached
> pfn's and not updating them is a sideeffect of this change.
> 

It's not just that commit: update_pageblock_skip() won't do 
anything if cc->finished_update_migrate is true, which happens even 
without it.  This issue was noticed on a kernel without your commit.

> I didn't think that would be a problem as skipping whole pageblocks due to
> being non-movable should be fast and without taking locks. But if your testing
> shows that this is a problem, then OK.
> 

Async compaction terminates early when lru_lock is contended or 
need_resched() is true, and on zones as large as those of a 128GB machine 
this happens often.  A thp allocation then returns immediately because of 
contended_compaction in the page allocator.  When the next thp is 
allocated, async compaction starts from where the former iteration started 
because we don't do any caching of the pfns and nothing called sync 
compaction.  It's simply unnecessary overhead that can be prevented on the 
next call, and it leaves a potentially large part of the zone unscanned if 
we continuously fail because of contention.  This patch fixes that.
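
For reference, the early abort I am describing boils down to roughly this
(simplified sketch, not the exact mm/compaction.c code):

#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * Simplified sketch: async compaction bails out as soon as the zone
 * lru_lock is contended or the task needs to reschedule, so on a very
 * large zone each attempt covers only a small slice of it.
 */
static bool async_compaction_should_abort(spinlock_t *lru_lock)
{
	return need_resched() || spin_is_contended(lru_lock);
}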

> > This patch adds a per-zone cached migration scanner pfn only for async
> > compaction.  It is updated every time a pageblock has been scanned in its
> > entirety and when no pages from it were successfully isolated.  The cached
> > migration scanner pfn for sync compaction is updated only when called for
> > sync compaction.
> 
> I think this might be an overkill and maybe just decoupling the cached pfn
> update from the update_pageblock_skip() would be enough, i.e. restore
> pre-50b5b094e6 behavior for the cached pfn (but not for the skip bits)? I
> wonder if your new sync migration scanner would make any difference.
> Presumably when async compaction finishes without success by having the
> scanners meet, compact_finished() will reset the cached pfn's and the sync
> compaction will not have a chance to use any previously cached value anyway?
> 

When a zone has 32GB or 64GB to scan, as is the case here (and zones will 
become larger in the future), async compaction will always terminate 
early.  It may never cause a migration destination page to even be 
allocated, the freeing scanner may never move, and there's no guarantee 
the two scanners will ever meet if we never call sync compaction.

The logic in this patch avoids, for example, rescanning the non-movable 
pageblocks during async compaction until all other memory has been 
scanned.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-05  9:51         ` David Rientjes
@ 2014-05-05 14:24           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-05 14:24 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On 05/05/2014 11:51 AM, David Rientjes wrote:
> On Mon, 5 May 2014, Vlastimil Babka wrote:
>
>> OK that's due to my commit 50b5b094e6 ("mm: compaction: do not mark unmovable
>> pageblocks as skipped in async compaction") and the intention was to avoid
>> marking pageblocks as to-be-skipped just because they were ignored by async
>> compaction, which would make the following sync compaction ignore them as
>> well. However it's true that update_pageblock_skip() also updates the cached
>> pfn's and not updating them is a sideeffect of this change.
>>
>
> It's not necessary just that commit, update_pageblock_skip() won't do
> anything if cc->finished_update_migrate is true which still happens before
> the commit.  This issue was noticed on a kernel without your commit.

OK.

>> I didn't think that would be a problem as skipping whole pageblocks due to
>> being non-movable should be fast and without taking locks. But if your testing
>> shows that this is a problem, then OK.
>>
>
> Async compaction terminates early when lru_lock is contended or
> need_resched() and on zones that are so large for a 128GB machine, this
> happens often.  A thp allocation returns immediately because of
> contended_compaction in the page allocator.  When the next thp is
> allocated, async compaction starts from where the former iteration started
> because we don't do any caching of the pfns and nothing called sync
> compaction.  It's simply unnecessary overhead that can be prevented on the
> next call and it leaves a potentially large part of the zone unscanned if
> we continuously fail because of contention.  This patch fixes that.

OK.

>>> This patch adds a per-zone cached migration scanner pfn only for async
>>> compaction.  It is updated every time a pageblock has been scanned in its
>>> entirety and when no pages from it were successfully isolated.  The cached
>>> migration scanner pfn for sync compaction is updated only when called for
>>> sync compaction.
>>
>> I think this might be an overkill and maybe just decoupling the cached pfn
>> update from the update_pageblock_skip() would be enough, i.e. restore
>> pre-50b5b094e6 behavior for the cached pfn (but not for the skip bits)? I
>> wonder if your new sync migration scanner would make any difference.
>> Presumably when async compaction finishes without success by having the
>> scanners meet, compact_finished() will reset the cached pfn's and the sync
>> compaction will not have a chance to use any previously cached value anyway?
>>
>
> When a zone has 32GB or 64GB to scan, as is in this case (and will become
> larger in the future), async compaction will always terminate early.  It
> may never cause a migration destination page to even be allocated, the
> freeing scanner may never move and there's no guarantee they will ever
> meet if we never call sync compaction.
>
> The logic presented in this patch will avoid rescanning the non-movable
> pageblocks, for example, for async compaction until all other memory has
> been scanned.

I see, although I would still welcome some numbers to back such a change.
What I still don't like is the removal of the intent of commit 
50b5b094e6. You now again call set_pageblock_skip() unconditionally, 
thus also on pageblocks that async compaction skipped due to being 
non-MOVABLE. Sync compaction will thus ignore them.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 20:29             ` David Rientjes
@ 2014-05-05 14:48               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-05 14:48 UTC (permalink / raw)
  To: David Rientjes, Mel Gorman
  Cc: Andrew Morton, Rik van Riel, Joonsoo Kim, Greg Thelen,
	Hugh Dickins, linux-kernel, linux-mm

On 05/02/2014 10:29 PM, David Rientjes wrote:
> On Fri, 2 May 2014, Mel Gorman wrote:
>
>>> The page locks I'm referring to is the lock_page() in __unmap_and_move()
>>> that gets called for sync compaction after the migrate_pages() iteration
>>> makes a few passes and unsuccessfully grabs it.  This becomes a forced
>>> migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
>>>
>>
>> Can that be fixed then instead of disabling it entirely?
>>
>
> We could return -EAGAIN when the trylock_page() fails for
> MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that
> currently for MIGRATE_ASYNC, and I could extend it to be ignored for
> MIGRATE_SYNC_LIGHT as well.
>
>>> We have perf profiles from one workload in particular that shows
>>> contention on i_mmap_mutex (anon isn't interesting since the vast majority
>>> of memory on this workload [120GB on a 128GB machine] has a gup pin and
>>> doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid
>>> isolating pinned pages")) between cpus all doing memory compaction trying
>>> to fault thp memory.
>>>
>>
>> Abort SYNC_LIGHT compaction if the mutex is contended.
>>
>
> Yeah, I have patches for that as well but we're waiting to see if they are
> actually needed when sync compaction is disabled for thp.  If we aren't
> actually going to disable it entirely, then I can revive those patches if
> the contention becomes such an issue.
>
>>> That's one example that we've seen, but the fact remains that at times
>>> sync compaction will iterate the entire 128GB machine and not allow an
>>> order-9 page to be allocated and there's nothing to preempt it like the
>>> need_resched() or lock contention checks that async compaction has.
>>
>> Make compact_control->sync the same enum field and check for contention
>> on the async/sync_light case but leave it for sync if compacting via the
>> proc interface?
>>
>
> Ok, that certainly can be done, I wasn't sure you would be happy with such
> a change.  I'm not sure there's so much of a difference between the new
> compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now,
> though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from
> the page allocator, i.e. remove sync_migration entirely, and just retry
> with a second call to compaction before failing instead?

Maybe we should step back and rethink the conditions for all sources of 
compaction to better balance effort vs. desired outcome, distinguishing 4 
modes (just quick thoughts; a rough sketch in code follows the list):

1) kswapd - we don't want to slow it down too much, thus async, uses 
cached pfns, honors skip bits, but does not give up on order=X blocks of 
pages just because some page was unable to be isolated/migrated (so 
gradually it might clean up such block over time).

2) khugepaged - it can afford to wait, so async + sync, use cached pfns, 
ignore skip bits on sync, also do not give up on order=THP blocks (?)

3) direct compaction on allocation of order X - we want to avoid 
allocation latencies, so it should skip over order=X blocks of pages as 
soon as it cannot isolate some page in such block (and put back 
already-isolated pages instead of migrating them). As for other 
parameters I'm not sure. Sync should still be possible thanks to the 
deferred_compaction logic? Maybe somehow transform the yes/no answer of 
deferred compaction to a number of how many pageblocks it should try 
before giving up, so there are no latency spikes limited only by the 
size of the zone?

4) compaction from proc interface - sync, reset cached pfns, ignore 
deferring, ignore skip bits...
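
Expressed as code, the distinction could look roughly like this (purely
illustrative, no such enum exists today):

/* Purely illustrative: the four compaction "effort" modes above. */
enum compact_effort {
	COMPACT_EFFORT_KSWAPD,		/* async, cached pfns, honor skip bits */
	COMPACT_EFFORT_KHUGEPAGED,	/* async + sync, ignore skip bits on sync */
	COMPACT_EFFORT_DIRECT,		/* bound latency, give up on blocks early */
	COMPACT_EFFORT_PROC,		/* sync, reset cached pfns, ignore deferring */
};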

Vlastimil



^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-05 14:24           ` Vlastimil Babka
@ 2014-05-06  0:29             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-06  0:29 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Mon, 5 May 2014, Vlastimil Babka wrote:

> I see, although I would still welcome some numbers to back such change.

It's pretty difficult to capture numbers for this in real-world scenarios, 
since it happens rarely (and when it does, the latency is very 
significant) and it requires an instrumented kernel to determine how many 
pageblocks have been skipped.  I could create a synthetic example of it in 
the kernel and get numbers for a worst-case scenario with a 64GB zone if 
you'd like, but I'm not sure how representative it would be.

> What I still don't like is the removal of the intent of commit 50b5b094e6. You
> now again call set_pageblock_skip() unconditionally, thus also on pageblocks
> that async compaction skipped due to being non-MOVABLE. The sync compaction
> will thus ignore them.
> 

I'm not following you: with this patch there are two cached pfns for the 
migration scanner, one used for sync and one used for async.  When 
cc->sync == true, both cached pfns are updated (async is not going to 
succeed for a pageblock when sync failed for that pageblock); when 
cc->sync == false, only the async cached pfn is updated and we pick up 
again where we left off for subsequent async compactions.  Sync compaction 
will still begin where it last left off and consider these non-MOVABLE 
pageblocks.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 20:29             ` David Rientjes
@ 2014-05-06  8:55               ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-06  8:55 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, May 02, 2014 at 01:29:33PM -0700, David Rientjes wrote:
> On Fri, 2 May 2014, Mel Gorman wrote:
> 
> > > The page locks I'm referring to is the lock_page() in __unmap_and_move() 
> > > that gets called for sync compaction after the migrate_pages() iteration 
> > > makes a few passes and unsuccessfully grabs it.  This becomes a forced 
> > > migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
> > > 
> > 
> > Can that be fixed then instead of disabling it entirely?
> > 
> 
> We could return -EAGAIN when the trylock_page() fails for 
> MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that 
> currently for MIGRATE_ASYNC, and I could extend it to be ignored for 
> MIGRATE_SYNC_LIGHT as well.
> 
> > > We have perf profiles from one workload in particular that shows 
> > > contention on i_mmap_mutex (anon isn't interesting since the vast majority 
> > > of memory on this workload [120GB on a 128GB machine] has a gup pin and 
> > > doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid 
> > > isolating pinned pages")) between cpus all doing memory compaction trying 
> > > to fault thp memory.
> > > 
> > 
> > Abort SYNC_LIGHT compaction if the mutex is contended.
> > 
> 
> Yeah, I have patches for that as well but we're waiting to see if they are 
> actually needed when sync compaction is disabled for thp.  If we aren't 
> actually going to disable it entirely, then I can revive those patches if 
> the contention becomes such an issue.
> 
> > > That's one example that we've seen, but the fact remains that at times 
> > > sync compaction will iterate the entire 128GB machine and not allow an 
> > > order-9 page to be allocated and there's nothing to preempt it like the 
> > > need_resched() or lock contention checks that async compaction has. 
> > 
> > Make compact_control->sync the same enum field and check for contention
> > on the async/sync_light case but leave it for sync if compacting via the
> > proc interface?
> > 
> 
> Ok, that certainly can be done, I wasn't sure you would be happy with such 
> a change. 

I'm not super-keen, as the success rates are already very poor for allocations
under pressure. It's something Vlastimil is working on, so it may be possible
to get some of the success rates back. It has always been the case that
compaction should not severely impact overall performance, as THP gains
must offset the cost of compaction. While I'm not happy to reduce the success
rates further, I do not think we should leave a known performance impact on
128G machines waiting on Vlastimil's series.

Vlastimil, what do you think?

> I'm not sure there's so much of a difference between the new 
> compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now, 
> though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from 
> the page allocator, i.e. remove sync_migration entirely, and just retry 
> with a second call to compaction before failing instead? 

Would it work if only khugepaged entered SYNC_LIGHT migration while
kswapd and direct THP allocations used only MIGRATE_ASYNC? That would
allow khugepaged to continue locking pages and buffers in a slow path
while still not allowing it to issue IO or wait on writeback. It would
also give Vlastimil's series a chance to shake out a bit without him
having to reintroduce SYNC_LIGHT as part of that series.
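
In other words, something along these lines (hypothetical helper, not an
actual patch):

#include <linux/migrate_mode.h>
#include <linux/types.h>

/*
 * Hypothetical sketch of the above: only khugepaged asks for
 * MIGRATE_SYNC_LIGHT; kswapd and direct THP faults stay async.
 */
static enum migrate_mode thp_compaction_mode(bool from_khugepaged)
{
	return from_khugepaged ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC;
}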

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-06  0:29             ` David Rientjes
@ 2014-05-06 11:52               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-06 11:52 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On 05/06/2014 02:29 AM, David Rientjes wrote:
> On Mon, 5 May 2014, Vlastimil Babka wrote:
>
>> I see, although I would still welcome some numbers to back such change.
>
> It's pretty difficult to capture numbers for this in real-world scenarios
> since it happens rarely (and when it happens, it's very significant
> latency) and without an instrumented kernel that will determine how many
> pageblocks have been skipped.  I could create a synthetic example of it in
> the kernel and get numbers for a worst-case scenario with a 64GB zone if
> you'd like, I'm not sure how representative it will be.
>
>> What I still don't like is the removal of the intent of commit 50b5b094e6. You
>> now again call set_pageblock_skip() unconditionally, thus also on pageblocks
>> that async compaction skipped due to being non-MOVABLE. The sync compaction
>> will thus ignore them.
>>
>
> I'm not following you, with this patch there are two cached pfns for the
> migration scanner: one is used for sync and one is used for async.  When
> cc->sync == true, both cached pfns are updated (async is not going to
> succeed for a pageblock when sync failed for that pageblock); when
> cc->sync == false, the async cached pfn is updated only and we pick up
> again where we left off for subsequent async compactions.  Sync compaction
> will still begin where it last left off and consider these non-MOVABLE
> pageblocks.
>

Yeah, I understand; the cached pfns are not the problem. The problem is 
that with your patch, set_pageblock_skip() will be called through 
update_pageblock_skip() in async compaction, since you removed the 
skipped_async_unsuitable variable. So in sync compaction, such a pageblock 
will be skipped thanks to the isolation_suitable() check, which uses 
get_pageblock_skip() to read the bit set by async compaction.
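
The check I mean is essentially this (paraphrased from mm/compaction.c):

/*
 * Once async compaction has set the skip bit via set_pageblock_skip(),
 * a later sync run sees get_pageblock_skip() return true and skips the
 * pageblock, unless cc->ignore_skip_hint is set.
 */
static inline bool isolation_suitable(struct compact_control *cc,
				      struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}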

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
@ 2014-05-06 11:52               ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-06 11:52 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On 05/06/2014 02:29 AM, David Rientjes wrote:
> On Mon, 5 May 2014, Vlastimil Babka wrote:
>
>> I see, although I would still welcome some numbers to back such change.
>
> It's pretty difficult to capture numbers for this in real-world scenarios
> since it happens rarely (and when it happens, it's very significant
> latency) and without an instrumented kernel that will determine how many
> pageblocks have been skipped.  I could create a synthetic example of it in
> the kernel and get numbers for a worst-case scenario with a 64GB zone if
> you'd like, I'm not sure how representative it will be.
>
>> What I still don't like is the removal of the intent of commit 50b5b094e6. You
>> now again call set_pageblock_skip() unconditionally, thus also on pageblocks
>> that async compaction skipped due to being non-MOVABLE. The sync compaction
>> will thus ignore them.
>>
>
> I'm not following you, with this patch there are two cached pfns for the
> migration scanner: one is used for sync and one is used for async.  When
> cc->sync == true, both cached pfns are updated (async is not going to
> succeed for a pageblock when sync failed for that pageblock); when
> cc->sync == false, the async cached pfn is updated only and we pick up
> again where we left off for subsequent async compactions.  Sync compaction
> will still begin where it last left off and consider these non-MOVABLE
> pageblocks.
>

Yeah, I understand, the cached pfns are not the problem. The problem is 
that with your patch, set_pageblock_skip() will be called through 
update_pageblock_skip() in async compaction, since you removed the 
skipped_async_unsuitable variable. So in sync compaction, such a pageblock 
will be skipped thanks to the isolation_suitable() check, which uses 
get_pageblock_skip() to read the bit set by async compaction.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-06  8:55               ` Mel Gorman
@ 2014-05-06 15:05                 ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-06 15:05 UTC (permalink / raw)
  To: Mel Gorman, David Rientjes
  Cc: Andrew Morton, Rik van Riel, Joonsoo Kim, Greg Thelen,
	Hugh Dickins, linux-kernel, linux-mm

On 05/06/2014 10:55 AM, Mel Gorman wrote:
> On Fri, May 02, 2014 at 01:29:33PM -0700, David Rientjes wrote:
>> On Fri, 2 May 2014, Mel Gorman wrote:
>>
>>>> The page lock I'm referring to is the lock_page() in __unmap_and_move()
>>>> that gets called for sync compaction after the migrate_pages() iteration
>>>> makes a few passes and fails to grab it.  This becomes a forced
>>>> migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
>>>>
>>>
>>> Can that be fixed then instead of disabling it entirely?
>>>
>>
>> We could return -EAGAIN when the trylock_page() fails for
>> MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that
>> currently for MIGRATE_ASYNC, and I could extend it to be ignored for
>> MIGRATE_SYNC_LIGHT as well.
>>
>>>> We have perf profiles from one workload in particular that shows
>>>> contention on i_mmap_mutex (anon isn't interesting since the vast majority
>>>> of memory on this workload [120GB on a 128GB machine] has a gup pin and
>>>> doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid
>>>> isolating pinned pages")) between cpus all doing memory compaction trying
>>>> to fault thp memory.
>>>>
>>>
>>> Abort SYNC_LIGHT compaction if the mutex is contended.
>>>
>>
>> Yeah, I have patches for that as well but we're waiting to see if they are
>> actually needed when sync compaction is disabled for thp.  If we aren't
>> actually going to disable it entirely, then I can revive those patches if
>> the contention becomes such an issue.
>>
>>>> That's one example that we've seen, but the fact remains that at times
>>>> sync compaction will iterate the entire 128GB machine and not allow an
>>>> order-9 page to be allocated and there's nothing to preempt it like the
>>>> need_resched() or lock contention checks that async compaction has.
>>>
>>> Make compact_control->sync the same enum field and check for contention
>>> on the async/sync_light case but leave it for sync if compacting via the
>>> proc interface?
>>>
>>
>> Ok, that certainly can be done, I wasn't sure you would be happy with such
>> a change.
>
> I'm not super-keen as the success rates are already very poor for allocations
> under pressure. It's something Vlastimil is working on so it may be possible
> to get some of the success rates back. It has always been the case that
> compaction should not severely impact overall performance as THP gains
> must offset compaction. While I'm not happy to reduce the success rates
> further I do not think we should leave a known performance impact on
> 128G machines wait on Vlastimil's series.
>
> Vlastimil, what do you think?

I think that before giving up due to lock contention, the "give up on a 
pageblock when I won't be able to free all of it anyway" approach should be 
considered (as I've tried to explain in yesterday's reply, and I think 
David also suggested that already?). Giving up due to lock contention 
might, for example, mean that it will free half of the pageblock and then 
give up, wasting the work already done.

>> I'm not sure there's so much of a difference between the new
>> compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now,
>> though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from
>> the page allocator, i.e. remove sync_migration entirely, and just retry
>> with a second call to compaction before failing instead?
>
> Would it be possible if only khugepaged entered SYNC_LIGHT migration and
> kswapd and direct THP allocations used only MIGRATE_ASYNC? That would
> allow khugepaged to continue locking pages and buffers in a slow path
> while still not allowing it to issue IO or wait on writeback. It would
> also give a chance for Vlastimil's series to shake out a bit without him
> having to reintroduce SYNC_LIGHT as part of that series.

I agree that khugepaged should be more persistent. Unlike page faults 
and direct compaction, nothing is stalling on it, right?
Also, removing sync_migration completely would mean that nobody would try 
to compact non-MOVABLE pageblocks. I think this might increase 
fragmentation, as this is a mechanism that prevents non-MOVABLE 
pageblocks from being filled by MOVABLE pages through stealing, which then 
forces non-MOVABLE allocations to steal back from other MOVABLE 
pageblocks...


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
@ 2014-05-06 15:05                 ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-06 15:05 UTC (permalink / raw)
  To: Mel Gorman, David Rientjes
  Cc: Andrew Morton, Rik van Riel, Joonsoo Kim, Greg Thelen,
	Hugh Dickins, linux-kernel, linux-mm

On 05/06/2014 10:55 AM, Mel Gorman wrote:
> On Fri, May 02, 2014 at 01:29:33PM -0700, David Rientjes wrote:
>> On Fri, 2 May 2014, Mel Gorman wrote:
>>
>>>> The page lock I'm referring to is the lock_page() in __unmap_and_move()
>>>> that gets called for sync compaction after the migrate_pages() iteration
>>>> makes a few passes and fails to grab it.  This becomes a forced
>>>> migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
>>>>
>>>
>>> Can that be fixed then instead of disabling it entirely?
>>>
>>
>> We could return -EAGAIN when the trylock_page() fails for
>> MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that
>> currently for MIGRATE_ASYNC, and I could extend it to be ignored for
>> MIGRATE_SYNC_LIGHT as well.
>>
>>>> We have perf profiles from one workload in particular that shows
>>>> contention on i_mmap_mutex (anon isn't interesting since the vast majority
>>>> of memory on this workload [120GB on a 128GB machine] has a gup pin and
>>>> doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid
>>>> isolating pinned pages")) between cpus all doing memory compaction trying
>>>> to fault thp memory.
>>>>
>>>
>>> Abort SYNC_LIGHT compaction if the mutex is contended.
>>>
>>
>> Yeah, I have patches for that as well but we're waiting to see if they are
>> actually needed when sync compaction is disabled for thp.  If we aren't
>> actually going to disable it entirely, then I can revive those patches if
>> the contention becomes such an issue.
>>
>>>> That's one example that we've seen, but the fact remains that at times
>>>> sync compaction will iterate the entire 128GB machine and not allow an
>>>> order-9 page to be allocated and there's nothing to preempt it like the
>>>> need_resched() or lock contention checks that async compaction has.
>>>
>>> Make compact_control->sync the same enum field and check for contention
>>> on the async/sync_light case but leave it for sync if compacting via the
>>> proc interface?
>>>
>>
>> Ok, that certainly can be done, I wasn't sure you would be happy with such
>> a change.
>
> I'm not super-keen as the success rates are already very poor for allocations
> under pressure. It's something Vlastimil is working on so it may be possible
> to get some of the success rates back. It has always been the case that
> compaction should not severely impact overall performance as THP gains
> must offset compaction. While I'm not happy to reduce the success rates
> further I do not think we should leave a known performance impact on
> 128G machines wait on Vlastimil's series.
>
> Vlastimil, what do you think?

I think that before giving up due to lock contention, the "give up on a 
pageblock when I won't be able to free all of it anyway" approach should be 
considered (as I've tried to explain in yesterday's reply, and I think 
David also suggested that already?). Giving up due to lock contention 
might, for example, mean that it will free half of the pageblock and then 
give up, wasting the work already done.

>> I'm not sure there's so much of a difference between the new
>> compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now,
>> though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from
>> the page allocator, i.e. remove sync_migration entirely, and just retry
>> with a second call to compaction before failing instead?
>
> Would it be possible if only khugepaged entered SYNC_LIGHT migration and
> kswapd and direct THP allocations used only MIGRATE_ASYNC? That would
> allow khugepaged to continue locking pages and buffers in a slow path
> while still not allowing it to issue IO or wait on writeback. It would
> also give a chance for Vlastimil's series to shake out a bit without him
> having to reintroduce SYNC_LIGHT as part of that series.

I agree that khugepaged should be more persistent. Unlike page faults 
and direct compaction, nothing is stalling on it, right?
Also, removing sync_migration completely would mean that nobody would try 
to compact non-MOVABLE pageblocks. I think this might increase 
fragmentation, as this is a mechanism that prevents non-MOVABLE 
pageblocks from being filled by MOVABLE pages through stealing, which then 
forces non-MOVABLE allocations to steal back from other MOVABLE 
pageblocks...


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm/compaction: do not count migratepages when unnecessary
  2014-05-02 15:26       ` [PATCH] mm/compaction: do not count migratepages when unnecessary Vlastimil Babka
@ 2014-05-06 21:18         ` Naoya Horiguchi
       [not found]         ` <1399411134-k43fsr0p@n-horiguchi@ah.jp.nec.com>
  1 sibling, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-06 21:18 UTC (permalink / raw)
  To: vbabka
  Cc: Andrew Morton, Hugh Dickins, gthelen, linux-kernel, linux-mm,
	minchan, Mel Gorman, iamjoonsoo.kim, cl, Rik van Riel

On Fri, May 02, 2014 at 05:26:18PM +0200, Vlastimil Babka wrote:
> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
> 
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
> 
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
> 
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
> 
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.

I agree with this patch.
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

A few nitpicks below...

> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>  mm/compaction.c                   | 30 ++++++++----------------------
>  2 files changed, 30 insertions(+), 26 deletions(-)
> 
> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
> index 06f544e..bbd5e1f 100644
> --- a/include/trace/events/compaction.h
> +++ b/include/trace/events/compaction.h
> @@ -5,7 +5,9 @@
>  #define _TRACE_COMPACTION_H
>  
>  #include <linux/types.h>
> +#include <linux/list.h>
>  #include <linux/tracepoint.h>
> +#include <linux/mm_types.h>
>  #include <trace/events/gfpflags.h>
>  
>  DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
> @@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
>  
>  TRACE_EVENT(mm_compaction_migratepages,
>  
> -	TP_PROTO(unsigned long nr_migrated,
> -		unsigned long nr_failed),
> +	TP_PROTO(unsigned long nr_all,
> +		int migrate_rc,
> +		struct list_head * migratepages),

checkpatch.pl reports a coding style error for this line:
(ERROR: "foo * bar" should be "foo *bar")

>  
> -	TP_ARGS(nr_migrated, nr_failed),
> +	TP_ARGS(nr_all, migrate_rc, migratepages),
>  
>  	TP_STRUCT__entry(
>  		__field(unsigned long, nr_migrated)
> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->nr_migrated = nr_migrated;
> +		unsigned long nr_failed = 0;
> +		struct page *page;
> +
> +		/*
> +		 * migrate_pages() returns either a non-negative number
> +		 * with the number of pages that failed migration, or an
> +		 * error code, in which case we need to count the remaining
> +		 * pages manually
> +		 */
> +		if (migrate_rc >= 0)
> +			nr_failed = migrate_rc;
> +		else
> +		        list_for_each_entry(page, migratepages, lru)

This line is indented with spaces instead of a tab.

> +				nr_failed++;
> +
> +		__entry->nr_migrated = nr_all - nr_failed;
>  		__entry->nr_failed = nr_failed;
>  	),
>  
> diff --git a/mm/compaction.c b/mm/compaction.c
> index ae1d0ae..873d7de 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
>  	cc->nr_freepages++;
>  }
>  
> -/*
> - * We cannot control nr_migratepages fully when migration is running as
> - * migrate_pages() has no knowledge of of compact_control.  When migration is
> - * complete, we count the number of pages on the list by hand.
> - */
> -static void update_nr_listpages(struct compact_control *cc)
> -{
> -	int nr_migratepages = 0;
> -	struct page *page;
> -
> -	list_for_each_entry(page, &cc->migratepages, lru)
> -		nr_migratepages++;
> -
> -	cc->nr_migratepages = nr_migratepages;
> -}
> -
>  /* possible outcome of isolate_migratepages */
>  typedef enum {
>  	ISOLATE_ABORT,		/* Abort compaction now */
> @@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  	migrate_prep_local();
>  
>  	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
> -		unsigned long nr_migrate, nr_remaining;
>  		int err;
>  
>  		switch (isolate_migratepages(zone, cc)) {
> @@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  			;
>  		}
>  
> -		nr_migrate = cc->nr_migratepages;
> +		if (!cc->nr_migratepages)
> +			continue;
> +
>  		err = migrate_pages(&cc->migratepages, compaction_alloc,
>  				compaction_free, (unsigned long)cc,
>  				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>  				MR_COMPACTION);
> -		update_nr_listpages(cc);
> -		nr_remaining = cc->nr_migratepages;
>  
> -		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
> -						nr_remaining);
> +		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
> +							&cc->migratepages);
>  
>  		/* Release isolated pages not migrated */
>  		if (err) {
> @@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  				ret = COMPACT_PARTIAL;
>  				goto out;
>  			}
> +		} else {
> +			/* All pages were successfully migrated */
> +			cc->nr_migratepages = 0;

cc->nr_migratepages = 0 is also done when err != 0, so can it be done in a common path?
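
For illustration, hoisting it might look roughly like this (a sketch only,
against the hunk above; not a tested or proposed patch):

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc,
				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* migrated or about to be put back: nothing stays isolated */
		cc->nr_migratepages = 0;

		/* Release isolated pages not migrated */
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/* ... existing -ENOMEM handling unchanged ... */
		}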

Thanks,
Naoya Horiguchi


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-02 15:27         ` Vlastimil Babka
  (?)
@ 2014-05-06 22:19         ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-06 22:19 UTC (permalink / raw)
  To: vbabka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, gthelen,
	linux-kernel, linux-mm, minchan, Mel Gorman, iamjoonsoo.kim, cl,
	Rik van Riel

On Fri, May 02, 2014 at 05:27:55PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in a 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records the highest successful
> pageblock in zone->compact_cached_free_pfn remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  mm/compaction.c | 18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 873d7de..1967850 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -686,12 +686,6 @@ static void isolate_freepages(struct zone *zone,
>  	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
>  
>  	/*
> -	 * If no pages are isolated, the block_start_pfn < low_pfn check
> -	 * will kick in.
> -	 */
> -	next_free_pfn = 0;
> -
> -	/*
>  	 * Isolate free pages until enough are available to migrate the
>  	 * pages on cc->migratepages. We stop searching if the migrate
>  	 * and free page scanners meet or enough free pages are isolated.
> @@ -731,19 +725,19 @@ static void isolate_freepages(struct zone *zone,
>  			continue;
>  
>  		/* Found a block suitable for isolating free pages from */
> +		next_free_pfn = block_start_pfn;
>  		isolated = isolate_freepages_block(cc, block_start_pfn,
>  					block_end_pfn, freelist, false);
>  		nr_freepages += isolated;
>  
>  		/*
> -		 * Record the highest PFN we isolated pages from. When next
> -		 * looking for free pages, the search will restart here as
> -		 * page migration may have returned some pages to the allocator
> +		 * Set a flag that we successfully isolated in this pageblock.
> +		 * In the next loop iteration, zone->compact_cached_free_pfn
> +		 * will not be updated and thus it will effectively contain the
> +		 * highest pageblock we isolated pages from.
>  		 */
> -		if (isolated && next_free_pfn == 0) {
> +		if (isolated)
>  			cc->finished_update_free = true;
> -			next_free_pfn = block_start_pfn;
> -		}

Why don't you completely remove next_free_pfn and update cc->free_pfn directly?
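
As a rough sketch of that idea (illustration only, not a tested change; the
final cc->free_pfn assignment at the end of isolate_freepages() would need
adjusting accordingly), the loop body could record the restart point
directly in compact_control:

		/* Found a block suitable for isolating free pages from */
		cc->free_pfn = block_start_pfn;
		isolated = isolate_freepages_block(cc, block_start_pfn,
					block_end_pfn, freelist, false);
		nr_freepages += isolated;

		/*
		 * Flag a successful isolation so that, as in the patch above,
		 * zone->compact_cached_free_pfn is not advanced past this
		 * pageblock on the next iteration.
		 */
		if (isolated)
			cc->finished_update_free = true;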

Thanks,
Naoya Horiguchi


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 1/6] mm, migration: add destination page freeing callback
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-07  2:22     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however, it 
frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to determine 
how to free destination pages.  If a caller, such as memory compaction, builds 
its own freelist for migration targets, this can reuse already freed memory 
instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it is 
called when page migration fails.  Otherwise, it may pass NULL and freeing back 
to the system will be handled as usual.  This patch introduces no functional 
change.

Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 55 +++++++++++++++++++++++++++++++++++--------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,17 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
 	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
 	 */
-	putback_lru_page(newpage);
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1023,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1104,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1119,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 1/6] mm, migration: add destination page freeing callback
@ 2014-05-07  2:22     ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however, it 
frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to determine 
how to free destination pages.  If a caller, such as memory compaction, builds 
its own freelist for migration targets, this can reuse already freed memory 
instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it is 
called when page migration fails.  Otherwise, it may pass NULL and freeing back 
to the system will be handled as usual.  This patch introduces no functional 
change.

Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 55 +++++++++++++++++++++++++++++++++++--------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,17 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
 	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
 	 */
-	putback_lru_page(newpage);
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1023,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1104,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1119,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory compaction works by having a "freeing scanner" scan from one end of a 
zone, isolating pages as migration targets, while another "migrating scanner" 
scans from the other end of the same zone, isolating pages for migration.

When page migration fails for an isolated page, the target page is returned to 
the system rather than the freelist built by the freeing scanner.  This may 
needlessly require the freeing scanner to continue scanning memory after 
suitable migration targets have already been returned to the system.

This patch returns destination pages to the freeing scanner freelist when page 
migration fails.  This prevents unnecessary work done by the freeing scanner but 
also encourages memory to be as compacted as possible at the end of the zone.

Reported-by: Greg Thelen <gthelen@google.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -797,23 +797,32 @@ static struct page *compaction_alloc(struct page *migratepage,
 }
 
 /*
- * We cannot control nr_migratepages and nr_freepages fully when migration is
- * running as migrate_pages() has no knowledge of compact_control. When
- * migration is complete, we count the number of pages on the lists by hand.
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist.  All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/*
+ * We cannot control nr_migratepages fully when migration is running as
+ * migrate_pages() has no knowledge of of compact_control.  When migration is
+ * complete, we count the number of pages on the list by hand.
  */
 static void update_nr_listpages(struct compact_control *cc)
 {
 	int nr_migratepages = 0;
-	int nr_freepages = 0;
 	struct page *page;
 
 	list_for_each_entry(page, &cc->migratepages, lru)
 		nr_migratepages++;
-	list_for_each_entry(page, &cc->freepages, lru)
-		nr_freepages++;
 
 	cc->nr_migratepages = nr_migratepages;
-	cc->nr_freepages = nr_freepages;
 }
 
 /* possible outcome of isolate_migratepages */
@@ -1023,8 +1032,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
-				(unsigned long)cc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
@ 2014-05-07  2:22       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory compaction works by having a "freeing scanner" scan from one end of a 
zone, isolating pages as migration targets, while another "migrating scanner" 
scans from the other end of the same zone, isolating pages for migration.

When page migration fails for an isolated page, the target page is returned to 
the system rather than the freelist built by the freeing scanner.  This may 
needlessly require the freeing scanner to continue scanning memory after 
suitable migration targets have already been returned to the system.

This patch returns destination pages to the freeing scanner freelist when page 
migration fails.  This prevents unnecessary work done by the freeing scanner but 
also encourages memory to be as compacted as possible at the end of the zone.

Reported-by: Greg Thelen <gthelen@google.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -797,23 +797,32 @@ static struct page *compaction_alloc(struct page *migratepage,
 }
 
 /*
- * We cannot control nr_migratepages and nr_freepages fully when migration is
- * running as migrate_pages() has no knowledge of compact_control. When
- * migration is complete, we count the number of pages on the lists by hand.
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist.  All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/*
+ * We cannot control nr_migratepages fully when migration is running as
+ * migrate_pages() has no knowledge of of compact_control.  When migration is
+ * complete, we count the number of pages on the list by hand.
  */
 static void update_nr_listpages(struct compact_control *cc)
 {
 	int nr_migratepages = 0;
-	int nr_freepages = 0;
 	struct page *page;
 
 	list_for_each_entry(page, &cc->migratepages, lru)
 		nr_migratepages++;
-	list_for_each_entry(page, &cc->freepages, lru)
-		nr_freepages++;
 
 	cc->nr_migratepages = nr_migratepages;
-	cc->nr_freepages = nr_freepages;
 }
 
 /* possible outcome of isolate_migratepages */
@@ -1023,8 +1032,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
-				(unsigned long)cc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Each zone has a cached migration scanner pfn for memory compaction so that 
subsequent calls to memory compaction can start where the previous call left 
off.

Currently, the compaction migration scanner only updates the per-zone cached pfn 
when pageblocks were not skipped for async compaction.  This creates a 
dependency on calling sync compaction to keep subsequent calls to async 
compaction from scanning an enormous number of non-MOVABLE pageblocks each time 
it is called.  On large machines, this could potentially be very expensive.

This patch adds a per-zone cached migration scanner pfn only for async 
compaction.  It is updated every time a pageblock has been scanned in its 
entirety and no pages from it were successfully isolated.  The cached 
migration scanner pfn for sync compaction is updated only when called for sync 
compaction.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v3: do not update pageblock skip metadata when skipped due to async per
     Vlastimil.

 include/linux/mmzone.h |  5 ++--
 mm/compaction.c        | 66 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -360,9 +360,10 @@ struct zone {
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
 
-	/* pfns where compaction scanners should start */
+	/* pfn where compaction free scanner should start */
 	unsigned long		compact_cached_free_pfn;
-	unsigned long		compact_cached_migrate_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long		compact_cached_migrate_pfn[2];
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
-	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_migrate_pfn[0] = start_pfn;
+	zone->compact_cached_migrate_pfn[1] = start_pfn;
 	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_flush = false;
 
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+	unsigned long pfn;
 
 	if (cc->ignore_skip_hint)
 		return;
@@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (!page)
 		return;
 
-	if (!nr_isolated) {
-		unsigned long pfn = page_to_pfn(page);
+	if (nr_isolated)
+		return;
+
+	/*
+	 * Only skip pageblocks when all forms of compaction will be known to
+	 * fail in the near future.
+	 */
+	if (set_unsuitable)
 		set_pageblock_skip(page);
 
-		/* Update where compaction should restart */
-		if (migrate_scanner) {
-			if (!cc->finished_update_migrate &&
-			    pfn > zone->compact_cached_migrate_pfn)
-				zone->compact_cached_migrate_pfn = pfn;
-		} else {
-			if (!cc->finished_update_free &&
-			    pfn < zone->compact_cached_free_pfn)
-				zone->compact_cached_free_pfn = pfn;
-		}
+	pfn = page_to_pfn(page);
+
+	/* Update where async and sync compaction should restart */
+	if (migrate_scanner) {
+		if (cc->finished_update_migrate)
+			return;
+		if (pfn > zone->compact_cached_migrate_pfn[0])
+			zone->compact_cached_migrate_pfn[0] = pfn;
+		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+			zone->compact_cached_migrate_pfn[1] = pfn;
+	} else {
+		if (cc->finished_update_free)
+			return;
+		if (pfn < zone->compact_cached_free_pfn)
+			zone->compact_cached_free_pfn = pfn;
 	}
 }
 #else
@@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
 
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 }
 #endif /* CONFIG_COMPACTION */
@@ -329,7 +342,8 @@ isolate_fail:
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (blockpfn == end_pfn)
-		update_pageblock_skip(cc, valid_page, total_isolated, false);
+		update_pageblock_skip(cc, valid_page, total_isolated, true,
+				      false);
 
 	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
 	if (total_isolated)
@@ -464,7 +478,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
-	bool skipped_async_unsuitable = false;
+	bool set_unsuitable = true;
 	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
@@ -541,8 +555,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 */
 			mt = get_pageblock_migratetype(page);
 			if (!cc->sync && !migrate_async_suitable(mt)) {
-				cc->finished_update_migrate = true;
-				skipped_async_unsuitable = true;
+				set_unsuitable = false;
 				goto next_pageblock;
 			}
 		}
@@ -646,11 +659,10 @@ next_pageblock:
 	/*
 	 * Update the pageblock-skip information and cached scanner pfn,
 	 * if the whole pageblock was scanned without isolating any page.
-	 * This is not done when pageblock was skipped due to being unsuitable
-	 * for async compaction, so that eventual sync compaction can try.
 	 */
-	if (low_pfn == end_pfn && !skipped_async_unsuitable)
-		update_pageblock_skip(cc, valid_page, nr_isolated, true);
+	if (low_pfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, nr_isolated,
+				      set_unsuitable, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -877,7 +889,8 @@ static int compact_finished(struct zone *zone,
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
 		/* Let the next compaction start anew. */
-		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 		zone->compact_cached_free_pfn = zone_end_pfn(zone);
 
 		/*
@@ -1002,7 +1015,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1010,7 +1023,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
@ 2014-05-07  2:22       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Each zone has a cached migration scanner pfn for memory compaction so that 
subsequent calls to memory compaction can start where the previous call left 
off.

Currently, the compaction migration scanner only updates the per-zone cached pfn
when pageblocks were not skipped for async compaction.  This creates a dependency
on calling sync compaction to prevent subsequent calls to async compaction from
rescanning an enormous number of non-MOVABLE pageblocks each time it is called.
On large machines, this can be very expensive.

This patch adds a per-zone cached migration scanner pfn only for async
compaction.  It is updated every time a pageblock has been scanned in its
entirety and no pages from it were successfully isolated.  The cached migration
scanner pfn for sync compaction is updated only when compaction is invoked in
sync mode.
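
To illustrate the behaviour, here is a minimal userspace sketch of the idea
(illustration only: cached_migrate_pfn and update_cached_pfn are made-up names
modelling zone->compact_cached_migrate_pfn[] and are not part of the diff below):

  /* Index 0 caches the async scanner position, index 1 the sync position. */
  #include <stdbool.h>
  #include <stdio.h>

  static unsigned long cached_migrate_pfn[2];

  /* Called after a whole pageblock was scanned and nothing was isolated. */
  static void update_cached_pfn(bool sync, unsigned long pfn)
  {
  	/* async progress always advances the async cache... */
  	if (pfn > cached_migrate_pfn[0])
  		cached_migrate_pfn[0] = pfn;
  	/* ...but only a sync pass advances the sync cache */
  	if (sync && pfn > cached_migrate_pfn[1])
  		cached_migrate_pfn[1] = pfn;
  }

  int main(void)
  {
  	update_cached_pfn(false, 512);	/* async pass scanned up to pfn 512 */
  	update_cached_pfn(true, 1024);	/* sync pass scanned up to pfn 1024 */
  	update_cached_pfn(false, 2048);	/* async pass scanned up to pfn 2048 */
  	/* prints: async restarts at 2048, sync restarts at 1024 */
  	printf("async restarts at %lu, sync restarts at %lu\n",
  	       cached_migrate_pfn[0], cached_migrate_pfn[1]);
  	return 0;
  }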

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v3: do not update pageblock skip metadata when a pageblock is skipped as
     unsuitable for async compaction, per Vlastimil.

 include/linux/mmzone.h |  5 ++--
 mm/compaction.c        | 66 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -360,9 +360,10 @@ struct zone {
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
 
-	/* pfns where compaction scanners should start */
+	/* pfn where compaction free scanner should start */
 	unsigned long		compact_cached_free_pfn;
-	unsigned long		compact_cached_migrate_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long		compact_cached_migrate_pfn[2];
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
-	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_migrate_pfn[0] = start_pfn;
+	zone->compact_cached_migrate_pfn[1] = start_pfn;
 	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_flush = false;
 
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+	unsigned long pfn;
 
 	if (cc->ignore_skip_hint)
 		return;
@@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (!page)
 		return;
 
-	if (!nr_isolated) {
-		unsigned long pfn = page_to_pfn(page);
+	if (nr_isolated)
+		return;
+
+	/*
+	 * Only skip pageblocks when all forms of compaction will be known to
+	 * fail in the near future.
+	 */
+	if (set_unsuitable)
 		set_pageblock_skip(page);
 
-		/* Update where compaction should restart */
-		if (migrate_scanner) {
-			if (!cc->finished_update_migrate &&
-			    pfn > zone->compact_cached_migrate_pfn)
-				zone->compact_cached_migrate_pfn = pfn;
-		} else {
-			if (!cc->finished_update_free &&
-			    pfn < zone->compact_cached_free_pfn)
-				zone->compact_cached_free_pfn = pfn;
-		}
+	pfn = page_to_pfn(page);
+
+	/* Update where async and sync compaction should restart */
+	if (migrate_scanner) {
+		if (cc->finished_update_migrate)
+			return;
+		if (pfn > zone->compact_cached_migrate_pfn[0])
+			zone->compact_cached_migrate_pfn[0] = pfn;
+		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+			zone->compact_cached_migrate_pfn[1] = pfn;
+	} else {
+		if (cc->finished_update_free)
+			return;
+		if (pfn < zone->compact_cached_free_pfn)
+			zone->compact_cached_free_pfn = pfn;
 	}
 }
 #else
@@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
 
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 }
 #endif /* CONFIG_COMPACTION */
@@ -329,7 +342,8 @@ isolate_fail:
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (blockpfn == end_pfn)
-		update_pageblock_skip(cc, valid_page, total_isolated, false);
+		update_pageblock_skip(cc, valid_page, total_isolated, true,
+				      false);
 
 	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
 	if (total_isolated)
@@ -464,7 +478,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
-	bool skipped_async_unsuitable = false;
+	bool set_unsuitable = true;
 	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
@@ -541,8 +555,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 */
 			mt = get_pageblock_migratetype(page);
 			if (!cc->sync && !migrate_async_suitable(mt)) {
-				cc->finished_update_migrate = true;
-				skipped_async_unsuitable = true;
+				set_unsuitable = false;
 				goto next_pageblock;
 			}
 		}
@@ -646,11 +659,10 @@ next_pageblock:
 	/*
 	 * Update the pageblock-skip information and cached scanner pfn,
 	 * if the whole pageblock was scanned without isolating any page.
-	 * This is not done when pageblock was skipped due to being unsuitable
-	 * for async compaction, so that eventual sync compaction can try.
 	 */
-	if (low_pfn == end_pfn && !skipped_async_unsuitable)
-		update_pageblock_skip(cc, valid_page, nr_isolated, true);
+	if (low_pfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, nr_isolated,
+				      set_unsuitable, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -877,7 +889,8 @@ static int compact_finished(struct zone *zone,
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
 		/* Let the next compaction start anew. */
-		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 		zone->compact_cached_free_pfn = zone_end_pfn(zone);
 
 		/*
@@ -1002,7 +1015,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1010,7 +1023,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

We're going to want to manipulate the migration mode for compaction in the page 
allocator, and currently compact_control's sync field is only a bool.  

Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
thp allocations in the page fault path to avoid unnecessary latency.

This also alters compaction triggered from sysfs, either for the entire system 
or for a node, to force MIGRATE_SYNC.
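
For reference, the modes involved are those of enum migrate_mode from
include/linux/migrate_mode.h (the comments below paraphrase the header rather
than quote it verbatim):

  enum migrate_mode {
  	MIGRATE_ASYNC,		/* never block, abort on contention */
  	MIGRATE_SYNC_LIGHT,	/* may block on most operations, but not on
  				 * writeback of dirty page cache */
  	MIGRATE_SYNC,		/* may block on everything, including writeback */
  };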

Suggested-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/compaction.h |  4 ++--
 mm/compaction.c            | 36 +++++++++++++++++++-----------------
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 37 ++++++++++++++++---------------------
 4 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode sync, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc,
 			return;
 		if (pfn > zone->compact_cached_migrate_pfn[0])
 			zone->compact_cached_migrate_pfn[0] = pfn;
-		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+		if (cc->mode != MIGRATE_ASYNC &&
+		    pfn > zone->compact_cached_migrate_pfn[1])
 			zone->compact_cached_migrate_pfn[1] = pfn;
 	} else {
 		if (cc->finished_update_free)
@@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 		}
 
 		/* async aborts if taking too long or contended */
-		if (!cc->sync) {
+		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
 			return false;
 		}
@@ -479,7 +480,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
 	bool set_unsuitable = true;
-	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+					ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
 	/*
@@ -489,7 +491,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	 */
 	while (unlikely(too_many_isolated(zone))) {
 		/* async migration should just abort */
-		if (!cc->sync)
+		if (cc->mode == MIGRATE_ASYNC)
 			return 0;
 
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -554,7 +556,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 * the minimum amount of work satisfies the allocation
 			 */
 			mt = get_pageblock_migratetype(page);
-			if (!cc->sync && !migrate_async_suitable(mt)) {
+			if (cc->mode == MIGRATE_ASYNC &&
+			    !migrate_async_suitable(mt)) {
 				set_unsuitable = false;
 				goto next_pageblock;
 			}
@@ -990,6 +993,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -1015,7 +1019,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1049,8 +1053,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				compaction_free, (unsigned long)cc,
-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
@@ -1083,9 +1086,8 @@ out:
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
-				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone, int order,
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1094,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
-		.sync = sync,
+		.mode = mode,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1116,7 +1118,7 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
- * @sync: Whether migration is synchronous or not
+ * @sync: The migration mode for async, sync light, or sync migration
  * @contended: Return value that is true if compaction was aborted due to lock contention
  * @page: Optionally capture a free page of the requested order during compaction
  *
@@ -1124,7 +1126,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1189,7 +1191,7 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 						low_wmark_pages(zone), 0, 0))
 				compaction_defer_reset(zone, cc->order, false);
 			/* Currently async compaction is never deferred. */
-			else if (cc->sync)
+			else if (cc->mode != MIGRATE_ASYNC)
 				defer_compaction(zone, cc->order);
 		}
 
@@ -1202,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
-		.sync = false,
+		.mode = MIGRATE_ASYNC,
 	};
 
 	if (!order)
@@ -1215,7 +1217,7 @@ static void compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
-		.sync = true,
+		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 	};
 
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,7 +134,7 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
-	bool sync;			/* Synchronous migration */
+	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool finished_update_free;	/* True when the zone cached pfns are
 					 * no longer being updated
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2226,7 +2226,7 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
+	int migratetype, enum migrate_mode mode,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
@@ -2240,7 +2240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-						nodemask, sync_migration,
+						nodemask, mode,
 						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
@@ -2273,7 +2273,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * As async compaction considers a subset of pageblocks, only
 		 * defer if the failure was a sync compaction failure.
 		 */
-		if (sync_migration)
+		if (mode != MIGRATE_ASYNC)
 			defer_compaction(preferred_zone, order);
 
 		cond_resched();
@@ -2286,9 +2286,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
-	bool *contended_compaction, bool *deferred_compaction,
-	unsigned long *did_some_progress)
+	int migratetype, enum migrate_mode mode, bool *contended_compaction,
+	bool *deferred_compaction, unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2483,7 +2482,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	bool sync_migration = false;
+	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
 	bool contended_compaction = false;
 
@@ -2577,17 +2576,15 @@ rebalance:
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
 	 */
-	page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	sync_migration = true;
+	migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because
@@ -2662,12 +2659,10 @@ rebalance:
 		 * direct reclaim and reclaim/compaction depends on compaction
 		 * being called after reclaim so call directly if necessary
 		 */
-		page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 		if (page)
@@ -6254,7 +6249,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 4/6] mm, compaction: embed migration mode in compact_control
@ 2014-05-07  2:22       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

We're going to want to manipulate the migration mode for compaction in the page 
allocator, and currently compact_control's sync field is only a bool.  

Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
thp allocations in the page fault path to avoid unnecessary latency.

This also alters compaction triggered from sysfs, either for the entire system 
or for a node, to force MIGRATE_SYNC.

Suggested-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/compaction.h |  4 ++--
 mm/compaction.c            | 36 +++++++++++++++++++-----------------
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 37 ++++++++++++++++---------------------
 4 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode sync, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc,
 			return;
 		if (pfn > zone->compact_cached_migrate_pfn[0])
 			zone->compact_cached_migrate_pfn[0] = pfn;
-		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+		if (cc->mode != MIGRATE_ASYNC &&
+		    pfn > zone->compact_cached_migrate_pfn[1])
 			zone->compact_cached_migrate_pfn[1] = pfn;
 	} else {
 		if (cc->finished_update_free)
@@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 		}
 
 		/* async aborts if taking too long or contended */
-		if (!cc->sync) {
+		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
 			return false;
 		}
@@ -479,7 +480,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
 	bool set_unsuitable = true;
-	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+					ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
 	/*
@@ -489,7 +491,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	 */
 	while (unlikely(too_many_isolated(zone))) {
 		/* async migration should just abort */
-		if (!cc->sync)
+		if (cc->mode == MIGRATE_ASYNC)
 			return 0;
 
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -554,7 +556,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 * the minimum amount of work satisfies the allocation
 			 */
 			mt = get_pageblock_migratetype(page);
-			if (!cc->sync && !migrate_async_suitable(mt)) {
+			if (cc->mode == MIGRATE_ASYNC &&
+			    !migrate_async_suitable(mt)) {
 				set_unsuitable = false;
 				goto next_pageblock;
 			}
@@ -990,6 +993,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -1015,7 +1019,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1049,8 +1053,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				compaction_free, (unsigned long)cc,
-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
@@ -1083,9 +1086,8 @@ out:
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
-				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone, int order,
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1094,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
-		.sync = sync,
+		.mode = mode,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1116,7 +1118,7 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
- * @sync: Whether migration is synchronous or not
+ * @sync: The migration mode for async, sync light, or sync migration
  * @contended: Return value that is true if compaction was aborted due to lock contention
  * @page: Optionally capture a free page of the requested order during compaction
  *
@@ -1124,7 +1126,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1189,7 +1191,7 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 						low_wmark_pages(zone), 0, 0))
 				compaction_defer_reset(zone, cc->order, false);
 			/* Currently async compaction is never deferred. */
-			else if (cc->sync)
+			else if (cc->mode != MIGRATE_ASYNC)
 				defer_compaction(zone, cc->order);
 		}
 
@@ -1202,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
-		.sync = false,
+		.mode = MIGRATE_ASYNC,
 	};
 
 	if (!order)
@@ -1215,7 +1217,7 @@ static void compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
-		.sync = true,
+		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 	};
 
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,7 +134,7 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
-	bool sync;			/* Synchronous migration */
+	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool finished_update_free;	/* True when the zone cached pfns are
 					 * no longer being updated
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2226,7 +2226,7 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
+	int migratetype, enum migrate_mode mode,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
@@ -2240,7 +2240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-						nodemask, sync_migration,
+						nodemask, mode,
 						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
@@ -2273,7 +2273,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * As async compaction considers a subset of pageblocks, only
 		 * defer if the failure was a sync compaction failure.
 		 */
-		if (sync_migration)
+		if (mode != MIGRATE_ASYNC)
 			defer_compaction(preferred_zone, order);
 
 		cond_resched();
@@ -2286,9 +2286,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
-	bool *contended_compaction, bool *deferred_compaction,
-	unsigned long *did_some_progress)
+	int migratetype, enum migrate_mode mode, bool *contended_compaction,
+	bool *deferred_compaction, unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2483,7 +2482,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	bool sync_migration = false;
+	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
 	bool contended_compaction = false;
 
@@ -2577,17 +2576,15 @@ rebalance:
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
 	 */
-	page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	sync_migration = true;
+	migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because
@@ -2662,12 +2659,10 @@ rebalance:
 		 * direct reclaim and reclaim/compaction depends on compaction
 		 * being called after reclaim so call directly if necessary
 		 */
-		page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 		if (page)
@@ -6254,7 +6249,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 5/6] mm, thp: avoid excessive compaction latency during fault
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Synchronous memory compaction can be very expensive: it can iterate an enormous 
amount of memory without aborting, constantly rescheduling, waiting on page
locks and lru_lock, etc, if a pageblock cannot be defragmented.

Unfortunately, it's too expensive for transparent hugepage page faults, and
it's much better to simply fall back to regular pages.  On 128GB machines, we
find that synchronous memory compaction can take O(seconds) for a single thp
fault.

Now that async compaction remembers where it left off without strictly relying
on sync compaction, this makes thp allocations best-effort without causing
egregious latency during fault.  We still need to retry async compaction after
reclaim, but this won't stall for seconds.
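
The resulting policy can be sketched as a small helper (illustration only;
may_use_sync_light_compaction() is a hypothetical name, the real check lives
inline in the hunk below): thp-style allocations pass __GFP_NO_KSWAPD, and
among those callers only khugepaged (a kernel thread that is not kswapd) is
still allowed to escalate beyond async compaction:

  /* Sketch: may this __GFP_NO_KSWAPD allocation use MIGRATE_SYNC_LIGHT? */
  static bool may_use_sync_light_compaction(unsigned int task_flags)
  {
  	/* true only for kernel threads other than kswapd, i.e. khugepaged */
  	return (task_flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD;
  }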

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2584,7 +2584,17 @@ rebalance:
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	migration_mode = MIGRATE_SYNC_LIGHT;
+
+	if (gfp_mask & __GFP_NO_KSWAPD) {
+		/*
+		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
+		 * of this allocation isn't critical.  Everything else, however,
+		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
+		 * stalls during fault.
+		 */
+		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
+			migration_mode = MIGRATE_SYNC_LIGHT;
+	}
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 5/6] mm, thp: avoid excessive compaction latency during fault
@ 2014-05-07  2:22       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Synchronous memory compaction can be very expensive: it can iterate an enormous 
amount of memory without aborting, constantly rescheduling, waiting on page
locks and lru_lock, etc, if a pageblock cannot be defragmented.

Unfortunately, it's too expensive for transparent hugepage page faults, and
it's much better to simply fall back to regular pages.  On 128GB machines, we
find that synchronous memory compaction can take O(seconds) for a single thp
fault.

Now that async compaction remembers where it left off without strictly relying
on sync compaction, this makes thp allocations best-effort without causing
egregious latency during fault.  We still need to retry async compaction after
reclaim, but this won't stall for seconds.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2584,7 +2584,17 @@ rebalance:
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	migration_mode = MIGRATE_SYNC_LIGHT;
+
+	if (gfp_mask & __GFP_NO_KSWAPD) {
+		/*
+		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
+		 * of this allocation isn't critical.  Everything else, however,
+		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
+		 * stalls during fault.
+		 */
+		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
+			migration_mode = MIGRATE_SYNC_LIGHT;
+	}
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Async compaction terminates prematurely when need_resched(), see
compact_checklock_irqsave().  This can never trigger, however, if the 
cond_resched() in isolate_migratepages_range() always takes care of the 
scheduling.

If the cond_resched() actually triggers, then terminate this pageblock scan for 
async compaction as well.
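
In other words (rough sketch only; the hunk below is authoritative and keeps
the two checks separate), cond_resched() returns nonzero only when it actually
rescheduled, and that return value now doubles as the abort condition for an
async scan:

  /* if we really did reschedule, an async scan gives up on this pageblock */
  if (cond_resched() && cc->mode == MIGRATE_ASYNC)
  	return 0;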

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			return 0;
 	}
 
+	if (cond_resched()) {
+		/* Async terminates prematurely on need_resched() */
+		if (cc->mode == MIGRATE_ASYNC)
+			return 0;
+	}
+
 	/* Time to isolate some pages for migration */
-	cond_resched();
 	for (; low_pfn < end_pfn; low_pfn++) {
 		/* give a chance to irqs before checking need_resched() */
 		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
@ 2014-05-07  2:22       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Async compaction terminates prematurely when need_resched(), see
compact_checklock_irqsave().  This can never trigger, however, if the 
cond_resched() in isolate_migratepages_range() always takes care of the 
scheduling.

If the cond_resched() actually triggers, then terminate this pageblock scan for 
async compaction as well.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			return 0;
 	}
 
+	if (cond_resched()) {
+		/* Async terminates prematurely on need_resched() */
+		if (cc->mode == MIGRATE_ASYNC)
+			return 0;
+	}
+
 	/* Time to isolate some pages for migration */
-	cond_resched();
 	for (; low_pfn < end_pfn; low_pfn++) {
 		/* give a chance to irqs before checking need_resched() */
 		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
       [not found]         ` <1399414778-xakujfb3@n-horiguchi@ah.jp.nec.com>
@ 2014-05-07  9:22             ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:22 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, gthelen,
	linux-kernel, linux-mm, minchan, Mel Gorman, iamjoonsoo.kim, cl,
	Rik van Riel

On 05/07/2014 12:19 AM, Naoya Horiguchi wrote:
> On Fri, May 02, 2014 at 05:27:55PM +0200, Vlastimil Babka wrote:
>> The compaction free scanner in isolate_freepages() currently remembers PFN of
>> the highest pageblock where it successfully isolates, to be used as the
>> starting pageblock for the next invocation. The rationale behind this is that
>> page migration might return free pages to the allocator when migration fails
>> and we don't want to skip them if the compaction continues.
>>
>> Since migration now returns free pages back to compaction code where they can
>> be reused, this is no longer a concern. This patch changes isolate_freepages()
>> so that the PFN for restarting is updated with each pageblock where isolation
>> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
>> reduction of the pages scanned by the free scanner.
>>
>> Note that the somewhat similar functionality that records highest successful
>> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
>> used when the whole compaction is restarted, not for multiple invocations of
>> the free scanner during single compaction.
>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   mm/compaction.c | 18 ++++++------------
>>   1 file changed, 6 insertions(+), 12 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 873d7de..1967850 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -686,12 +686,6 @@ static void isolate_freepages(struct zone *zone,
>>   	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
>>
>>   	/*
>> -	 * If no pages are isolated, the block_start_pfn < low_pfn check
>> -	 * will kick in.
>> -	 */
>> -	next_free_pfn = 0;
>> -
>> -	/*
>>   	 * Isolate free pages until enough are available to migrate the
>>   	 * pages on cc->migratepages. We stop searching if the migrate
>>   	 * and free page scanners meet or enough free pages are isolated.
>> @@ -731,19 +725,19 @@ static void isolate_freepages(struct zone *zone,
>>   			continue;
>>
>>   		/* Found a block suitable for isolating free pages from */
>> +		next_free_pfn = block_start_pfn;
>>   		isolated = isolate_freepages_block(cc, block_start_pfn,
>>   					block_end_pfn, freelist, false);
>>   		nr_freepages += isolated;
>>
>>   		/*
>> -		 * Record the highest PFN we isolated pages from. When next
>> -		 * looking for free pages, the search will restart here as
>> -		 * page migration may have returned some pages to the allocator
>> +		 * Set a flag that we successfully isolated in this pageblock.
>> +		 * In the next loop iteration, zone->compact_cached_free_pfn
>> +		 * will not be updated and thus it will effectively contain the
>> +		 * highest pageblock we isolated pages from.
>>   		 */
>> -		if (isolated && next_free_pfn == 0) {
>> +		if (isolated)
>>   			cc->finished_update_free = true;
>> -			next_free_pfn = block_start_pfn;
>> -		}
>
> Why don't you completely remove next_free_pfn and update cc->free_pfn directly?

Hi,

well, you could ask the same about the nr_freepages variable. For me
personally, a local variable (now with a comment explaining next_free_pfn)
looks more readable. The function has recently received cleanup from several
people (including me), so I wouldn't want to change it again unless others
also think it would be better. From the compiler's standpoint, both variants
should be the same, I guess.

Vlastimil

> Thanks,
> Naoya Horiguchi
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
@ 2014-05-07  9:22             ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:22 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, gthelen,
	linux-kernel, linux-mm, minchan, Mel Gorman, iamjoonsoo.kim, cl,
	Rik van Riel

On 05/07/2014 12:19 AM, Naoya Horiguchi wrote:
> On Fri, May 02, 2014 at 05:27:55PM +0200, Vlastimil Babka wrote:
>> The compaction free scanner in isolate_freepages() currently remembers PFN of
>> the highest pageblock where it successfully isolates, to be used as the
>> starting pageblock for the next invocation. The rationale behind this is that
>> page migration might return free pages to the allocator when migration fails
>> and we don't want to skip them if the compaction continues.
>>
>> Since migration now returns free pages back to compaction code where they can
>> be reused, this is no longer a concern. This patch changes isolate_freepages()
>> so that the PFN for restarting is updated with each pageblock where isolation
>> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
>> reduction of the pages scanned by the free scanner.
>>
>> Note that the somewhat similar functionality that records highest successful
>> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
>> used when the whole compaction is restarted, not for multiple invocations of
>> the free scanner during single compaction.
>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   mm/compaction.c | 18 ++++++------------
>>   1 file changed, 6 insertions(+), 12 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 873d7de..1967850 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -686,12 +686,6 @@ static void isolate_freepages(struct zone *zone,
>>   	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
>>
>>   	/*
>> -	 * If no pages are isolated, the block_start_pfn < low_pfn check
>> -	 * will kick in.
>> -	 */
>> -	next_free_pfn = 0;
>> -
>> -	/*
>>   	 * Isolate free pages until enough are available to migrate the
>>   	 * pages on cc->migratepages. We stop searching if the migrate
>>   	 * and free page scanners meet or enough free pages are isolated.
>> @@ -731,19 +725,19 @@ static void isolate_freepages(struct zone *zone,
>>   			continue;
>>
>>   		/* Found a block suitable for isolating free pages from */
>> +		next_free_pfn = block_start_pfn;
>>   		isolated = isolate_freepages_block(cc, block_start_pfn,
>>   					block_end_pfn, freelist, false);
>>   		nr_freepages += isolated;
>>
>>   		/*
>> -		 * Record the highest PFN we isolated pages from. When next
>> -		 * looking for free pages, the search will restart here as
>> -		 * page migration may have returned some pages to the allocator
>> +		 * Set a flag that we successfully isolated in this pageblock.
>> +		 * In the next loop iteration, zone->compact_cached_free_pfn
>> +		 * will not be updated and thus it will effectively contain the
>> +		 * highest pageblock we isolated pages from.
>>   		 */
>> -		if (isolated && next_free_pfn == 0) {
>> +		if (isolated)
>>   			cc->finished_update_free = true;
>> -			next_free_pfn = block_start_pfn;
>> -		}
>
> Why don't you completely remove next_free_pfn and update cc->free_pfn directly?

Hi,

well, you could ask the same about the nr_freepages variable. For me
personally, a local variable (now with a comment explaining next_free_pfn)
looks more readable. The function has recently received cleanup from several
people (including me), so I wouldn't want to change it again unless others
also think it would be better. From the compiler's standpoint, both variants
should be the same, I guess.

Vlastimil

> Thanks,
> Naoya Horiguchi
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm/compaction: do not count migratepages when unnecessary
       [not found]         ` <1399411134-k43fsr0p@n-horiguchi@ah.jp.nec.com>
@ 2014-05-07  9:33             ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:33 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Hugh Dickins, gthelen, linux-kernel, linux-mm,
	minchan, Mel Gorman, iamjoonsoo.kim, cl, Rik van Riel

On 05/06/2014 11:18 PM, Naoya Horiguchi wrote:
> On Fri, May 02, 2014 at 05:26:18PM +0200, Vlastimil Babka wrote:
>> During compaction, update_nr_listpages() has been used to count remaining
>> non-migrated and free pages after a call to migrate_pages(). The freepages
>> counting has become unnecessary, and it turns out that migratepages counting
>> is also unnecessary in most cases.
>>
>> The only situation when it's needed to count cc->migratepages is when
>> migrate_pages() returns with a negative error code. Otherwise, the non-negative
>> return value is the number of pages that were not migrated, which is exactly
>> the count of remaining pages in the cc->migratepages list.
>>
>> Furthermore, any non-zero count is only interesting for the tracepoint of
>> mm_compaction_migratepages events, because after that all remaining unmigrated
>> pages are put back and their count is set to 0.
>>
>> This patch therefore removes update_nr_listpages() completely, and changes the
>> tracepoint definition so that the manual counting is done only when the
>> tracepoint is enabled, and only when migrate_pages() returns a negative error
>> code.
>>
>> Furthermore, migrate_pages() and the tracepoints won't be called when there's
>> nothing to migrate. This potentially avoids some wasted cycles and reduces the
>> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
>> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
>> The mm_compaction_isolate_migratepages event is better for determining that
>> nothing was isolated for migration, and this one was just duplicating the info.
>
> I agree with this patch.
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

Thanks.

> A few nitpicks below...
>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>>   mm/compaction.c                   | 30 ++++++++----------------------
>>   2 files changed, 30 insertions(+), 26 deletions(-)
>>
>> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
>> index 06f544e..bbd5e1f 100644
>> --- a/include/trace/events/compaction.h
>> +++ b/include/trace/events/compaction.h
>> @@ -5,7 +5,9 @@
>>   #define _TRACE_COMPACTION_H
>>
>>   #include <linux/types.h>
>> +#include <linux/list.h>
>>   #include <linux/tracepoint.h>
>> +#include <linux/mm_types.h>
>>   #include <trace/events/gfpflags.h>
>>
>>   DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
>> @@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
>>
>>   TRACE_EVENT(mm_compaction_migratepages,
>>
>> -	TP_PROTO(unsigned long nr_migrated,
>> -		unsigned long nr_failed),
>> +	TP_PROTO(unsigned long nr_all,
>> +		int migrate_rc,
>> +		struct list_head * migratepages),
>
> checkpatch.pl shows code violation message for this line.
> (ERROR: "foo * bar" should be "foo *bar")
>
>>
>> -	TP_ARGS(nr_migrated, nr_failed),
>> +	TP_ARGS(nr_all, migrate_rc, migratepages),
>>
>>   	TP_STRUCT__entry(
>>   		__field(unsigned long, nr_migrated)
>> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>>   	),
>>
>>   	TP_fast_assign(
>> -		__entry->nr_migrated = nr_migrated;
>> +		unsigned long nr_failed = 0;
>> +		struct page *page;
>> +
>> +		/*
>> +		 * migrate_pages() returns either a non-negative number
>> +		 * with the number of pages that failed migration, or an
>> +		 * error code, in which case we need to count the remaining
>> +		 * pages manually
>> +		 */
>> +		if (migrate_rc >= 0)
>> +			nr_failed = migrate_rc;
>> +		else
>> +		        list_for_each_entry(page, migratepages, lru)
>
> This line contains whitespace indent.

Oops, I'll fix those checkpatch issues...

>> +				nr_failed++;
>> +
>> +		__entry->nr_migrated = nr_all - nr_failed;
>>   		__entry->nr_failed = nr_failed;
>>   	),
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index ae1d0ae..873d7de 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
>>   	cc->nr_freepages++;
>>   }
>>
>> -/*
>> - * We cannot control nr_migratepages fully when migration is running as
>> - * migrate_pages() has no knowledge of of compact_control.  When migration is
>> - * complete, we count the number of pages on the list by hand.
>> - */
>> -static void update_nr_listpages(struct compact_control *cc)
>> -{
>> -	int nr_migratepages = 0;
>> -	struct page *page;
>> -
>> -	list_for_each_entry(page, &cc->migratepages, lru)
>> -		nr_migratepages++;
>> -
>> -	cc->nr_migratepages = nr_migratepages;
>> -}
>> -
>>   /* possible outcome of isolate_migratepages */
>>   typedef enum {
>>   	ISOLATE_ABORT,		/* Abort compaction now */
>> @@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>>   	migrate_prep_local();
>>
>>   	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
>> -		unsigned long nr_migrate, nr_remaining;
>>   		int err;
>>
>>   		switch (isolate_migratepages(zone, cc)) {
>> @@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>>   			;
>>   		}
>>
>> -		nr_migrate = cc->nr_migratepages;
>> +		if (!cc->nr_migratepages)
>> +			continue;
>> +
>>   		err = migrate_pages(&cc->migratepages, compaction_alloc,
>>   				compaction_free, (unsigned long)cc,
>>   				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>>   				MR_COMPACTION);
>> -		update_nr_listpages(cc);
>> -		nr_remaining = cc->nr_migratepages;
>>
>> -		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
>> -						nr_remaining);
>> +		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
>> +							&cc->migratepages);
>>
>>   		/* Release isolated pages not migrated */
>>   		if (err) {
>> @@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>>   				ret = COMPACT_PARTIAL;
>>   				goto out;
>>   			}
>> +		} else {
>> +			/* All pages were successfully migrated */
>> +			cc->nr_migratepages = 0;
>
> cc->nr_migratepages = 0 is also done in err != 0, so can it be done in common path?

Well, it would have to be done before 'if (err)', and thus also before
putback_movable_pages(), which is a bit awkward. But avoiding the 'else' is
also good, I guess, so I'll do that for the next version.

> Thanks,
> Naoya Horiguchi
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:34         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:34 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 04:22 AM, David Rientjes wrote:
> Each zone has a cached migration scanner pfn for memory compaction so that
> subsequent calls to memory compaction can start where the previous call left
> off.
>
> Currently, the compaction migration scanner only updates the per-zone cached pfn
> when pageblocks were not skipped for async compaction.  This creates a
> dependency on calling sync compaction to prevent subsequent calls to async
> compaction from scanning an enormous number of non-MOVABLE pageblocks each time
> it is called.  On large machines, this could be very expensive.
>
> This patch adds a per-zone cached migration scanner pfn only for async
> compaction.  It is updated every time a pageblock has been scanned in its
> entirety and no pages from it were successfully isolated.  The cached
> migration scanner pfn for sync compaction is updated only when called for sync
> compaction.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   v3: do not update pageblock skip metadata when skipped due to async per
>       Vlastimil.

Great.

Acked-by: Vlastimil Babka <vbabka@suse.cz>
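
(For anyone skimming the quoted hunks below, the net effect is that struct zone
now keeps two cached migration-scanner pfns instead of one,

	/* [0] is consulted by async compaction, [1] by sync compaction */
	unsigned long		compact_cached_migrate_pfn[2];

and compact_zone() picks its starting point with

	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];

where cc->sync is still a bool at this point in the series. Illustrative
summary only; the actual hunks follow.)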


>   include/linux/mmzone.h |  5 ++--
>   mm/compaction.c        | 66 ++++++++++++++++++++++++++++++--------------------
>   2 files changed, 43 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -360,9 +360,10 @@ struct zone {
>   	/* Set to true when the PG_migrate_skip bits should be cleared */
>   	bool			compact_blockskip_flush;
>
> -	/* pfns where compaction scanners should start */
> +	/* pfn where compaction free scanner should start */
>   	unsigned long		compact_cached_free_pfn;
> -	unsigned long		compact_cached_migrate_pfn;
> +	/* pfn where async and sync compaction migration scanner should start */
> +	unsigned long		compact_cached_migrate_pfn[2];
>   #endif
>   #ifdef CONFIG_MEMORY_HOTPLUG
>   	/* see spanned/present_pages for more description */
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
>   	unsigned long end_pfn = zone_end_pfn(zone);
>   	unsigned long pfn;
>
> -	zone->compact_cached_migrate_pfn = start_pfn;
> +	zone->compact_cached_migrate_pfn[0] = start_pfn;
> +	zone->compact_cached_migrate_pfn[1] = start_pfn;
>   	zone->compact_cached_free_pfn = end_pfn;
>   	zone->compact_blockskip_flush = false;
>
> @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
>    */
>   static void update_pageblock_skip(struct compact_control *cc,
>   			struct page *page, unsigned long nr_isolated,
> -			bool migrate_scanner)
> +			bool set_unsuitable, bool migrate_scanner)
>   {
>   	struct zone *zone = cc->zone;
> +	unsigned long pfn;
>
>   	if (cc->ignore_skip_hint)
>   		return;
> @@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc,
>   	if (!page)
>   		return;
>
> -	if (!nr_isolated) {
> -		unsigned long pfn = page_to_pfn(page);
> +	if (nr_isolated)
> +		return;
> +
> +	/*
> +	 * Only skip pageblocks when all forms of compaction will be known to
> +	 * fail in the near future.
> +	 */
> +	if (set_unsuitable)
>   		set_pageblock_skip(page);
>
> -		/* Update where compaction should restart */
> -		if (migrate_scanner) {
> -			if (!cc->finished_update_migrate &&
> -			    pfn > zone->compact_cached_migrate_pfn)
> -				zone->compact_cached_migrate_pfn = pfn;
> -		} else {
> -			if (!cc->finished_update_free &&
> -			    pfn < zone->compact_cached_free_pfn)
> -				zone->compact_cached_free_pfn = pfn;
> -		}
> +	pfn = page_to_pfn(page);
> +
> +	/* Update where async and sync compaction should restart */
> +	if (migrate_scanner) {
> +		if (cc->finished_update_migrate)
> +			return;
> +		if (pfn > zone->compact_cached_migrate_pfn[0])
> +			zone->compact_cached_migrate_pfn[0] = pfn;
> +		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
> +			zone->compact_cached_migrate_pfn[1] = pfn;
> +	} else {
> +		if (cc->finished_update_free)
> +			return;
> +		if (pfn < zone->compact_cached_free_pfn)
> +			zone->compact_cached_free_pfn = pfn;
>   	}
>   }
>   #else
> @@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
>
>   static void update_pageblock_skip(struct compact_control *cc,
>   			struct page *page, unsigned long nr_isolated,
> -			bool migrate_scanner)
> +			bool set_unsuitable, bool migrate_scanner)
>   {
>   }
>   #endif /* CONFIG_COMPACTION */
> @@ -329,7 +342,8 @@ isolate_fail:
>
>   	/* Update the pageblock-skip if the whole pageblock was scanned */
>   	if (blockpfn == end_pfn)
> -		update_pageblock_skip(cc, valid_page, total_isolated, false);
> +		update_pageblock_skip(cc, valid_page, total_isolated, true,
> +				      false);
>
>   	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
>   	if (total_isolated)
> @@ -464,7 +478,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	unsigned long flags;
>   	bool locked = false;
>   	struct page *page = NULL, *valid_page = NULL;
> -	bool skipped_async_unsuitable = false;
> +	bool set_unsuitable = true;
>   	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
>   				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
>
> @@ -541,8 +555,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			 */
>   			mt = get_pageblock_migratetype(page);
>   			if (!cc->sync && !migrate_async_suitable(mt)) {
> -				cc->finished_update_migrate = true;
> -				skipped_async_unsuitable = true;
> +				set_unsuitable = false;
>   				goto next_pageblock;
>   			}
>   		}
> @@ -646,11 +659,10 @@ next_pageblock:
>   	/*
>   	 * Update the pageblock-skip information and cached scanner pfn,
>   	 * if the whole pageblock was scanned without isolating any page.
> -	 * This is not done when pageblock was skipped due to being unsuitable
> -	 * for async compaction, so that eventual sync compaction can try.
>   	 */
> -	if (low_pfn == end_pfn && !skipped_async_unsuitable)
> -		update_pageblock_skip(cc, valid_page, nr_isolated, true);
> +	if (low_pfn == end_pfn)
> +		update_pageblock_skip(cc, valid_page, nr_isolated,
> +				      set_unsuitable, true);
>
>   	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
>
> @@ -877,7 +889,8 @@ static int compact_finished(struct zone *zone,
>   	/* Compaction run completes if the migrate and free scanner meet */
>   	if (cc->free_pfn <= cc->migrate_pfn) {
>   		/* Let the next compaction start anew. */
> -		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
>   		zone->compact_cached_free_pfn = zone_end_pfn(zone);
>
>   		/*
> @@ -1002,7 +1015,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	 * information on where the scanners should start but check that it
>   	 * is initialised by ensuring the values are within zone boundaries.
>   	 */
> -	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
> +	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
>   	cc->free_pfn = zone->compact_cached_free_pfn;
>   	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
>   		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> @@ -1010,7 +1023,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	}
>   	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
>   		cc->migrate_pfn = start_pfn;
> -		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
>   	}
>
>   	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 5/6] mm, thp: avoid excessive compaction latency during fault
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:39         ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-07  9:39 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:50PM -0700, David Rientjes wrote:
> Synchronous memory compaction can be very expensive: it can iterate an enormous 
> amount of memory without aborting, constantly rescheduling, waiting on page
> locks and lru_lock, etc, if a pageblock cannot be defragmented.
> 
> Unfortunately, it's too expensive for transparent hugepage page faults and
> it's much better to simply fall back to pages.  On 128GB machines, we find that
> synchronous memory compaction can take O(seconds) for a single thp fault.
> 
> Now that async compaction remembers where it left off without strictly relying
> on sync compaction, this makes thp allocations best-effort without causing
> egregious latency during fault.  We still need to retry async compaction after
> reclaim, but this won't stall for seconds.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:41         ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-07  9:41 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:52PM -0700, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched() is true; see
> compact_checklock_irqsave().  This can never trigger, however, if the
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>
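
(Sketch of the idea only, not the actual hunk: the unconditional cond_resched()
in isolate_migratepages_range() becomes, roughly,

	if (cond_resched() && cc->mode == MIGRATE_ASYNC)
		break;	/* give up on this pageblock scan for now */

so an async scan stops as soon as it is asked to reschedule.)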

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:55         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:55 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 04:22 AM, David Rientjes wrote:
> We're going to want to manipulate the migration mode for compaction in the page
> allocator, and currently compact_control's sync field is only a bool.
>
> Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending
> on the value of this bool.  Convert the bool to enum migrate_mode and pass the
> migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for
> thp allocations in the pagefault patch to avoid unnecessary latency.
>
> This also alters compaction triggered from sysfs, either for the entire system
> or for a node, to force MIGRATE_SYNC.
>
> Suggested-by: Mel Gorman <mgorman@suse.de>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   include/linux/compaction.h |  4 ++--
>   mm/compaction.c            | 36 +++++++++++++++++++-----------------
>   mm/internal.h              |  2 +-
>   mm/page_alloc.c            | 37 ++++++++++++++++---------------------
>   4 files changed, 38 insertions(+), 41 deletions(-)
>
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
>   extern int fragmentation_index(struct zone *zone, unsigned int order);
>   extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
>   			int order, gfp_t gfp_mask, nodemask_t *mask,
> -			bool sync, bool *contended);
> +			enum migrate_mode sync, bool *contended);

Everywhere else it's 'mode'; only in this function is it still called
'sync', which is confusing.
After fixing that:

Acked-by: Vlastimil Babka <vbabka@suse.cz>
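
(For context, the three modes being plumbed through here are, paraphrasing
include/linux/migrate_mode.h of that era:

	enum migrate_mode {
		MIGRATE_ASYNC,		/* never blocks */
		MIGRATE_SYNC_LIGHT,	/* may block, but not on writeback */
		MIGRATE_SYNC,		/* may block on anything */
	};

listed here only for readers of the archive, not as part of the patch.)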

>   extern void compact_pgdat(pg_data_t *pgdat, int order);
>   extern void reset_isolation_suitable(pg_data_t *pgdat);
>   extern unsigned long compaction_suitable(struct zone *zone, int order);
> @@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
>   #else
>   static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
>   			int order, gfp_t gfp_mask, nodemask_t *nodemask,
> -			bool sync, bool *contended)
> +			enum migrate_mode sync, bool *contended)
>   {
>   	return COMPACT_CONTINUE;
>   }
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc,
>   			return;
>   		if (pfn > zone->compact_cached_migrate_pfn[0])
>   			zone->compact_cached_migrate_pfn[0] = pfn;
> -		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
> +		if (cc->mode != MIGRATE_ASYNC &&
> +		    pfn > zone->compact_cached_migrate_pfn[1])
>   			zone->compact_cached_migrate_pfn[1] = pfn;
>   	} else {
>   		if (cc->finished_update_free)
> @@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>   		}
>
>   		/* async aborts if taking too long or contended */
> -		if (!cc->sync) {
> +		if (cc->mode == MIGRATE_ASYNC) {
>   			cc->contended = true;
>   			return false;
>   		}
> @@ -479,7 +480,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	bool locked = false;
>   	struct page *page = NULL, *valid_page = NULL;
>   	bool set_unsuitable = true;
> -	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
> +	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
> +					ISOLATE_ASYNC_MIGRATE : 0) |
>   				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
>
>   	/*
> @@ -489,7 +491,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	 */
>   	while (unlikely(too_many_isolated(zone))) {
>   		/* async migration should just abort */
> -		if (!cc->sync)
> +		if (cc->mode == MIGRATE_ASYNC)
>   			return 0;
>
>   		congestion_wait(BLK_RW_ASYNC, HZ/10);
> @@ -554,7 +556,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			 * the minimum amount of work satisfies the allocation
>   			 */
>   			mt = get_pageblock_migratetype(page);
> -			if (!cc->sync && !migrate_async_suitable(mt)) {
> +			if (cc->mode == MIGRATE_ASYNC &&
> +			    !migrate_async_suitable(mt)) {
>   				set_unsuitable = false;
>   				goto next_pageblock;
>   			}
> @@ -990,6 +993,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	int ret;
>   	unsigned long start_pfn = zone->zone_start_pfn;
>   	unsigned long end_pfn = zone_end_pfn(zone);
> +	const bool sync = cc->mode != MIGRATE_ASYNC;
>
>   	ret = compaction_suitable(zone, cc->order);
>   	switch (ret) {
> @@ -1015,7 +1019,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	 * information on where the scanners should start but check that it
>   	 * is initialised by ensuring the values are within zone boundaries.
>   	 */
> -	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
> +	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
>   	cc->free_pfn = zone->compact_cached_free_pfn;
>   	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
>   		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> @@ -1049,8 +1053,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>
>   		nr_migrate = cc->nr_migratepages;
>   		err = migrate_pages(&cc->migratepages, compaction_alloc,
> -				compaction_free, (unsigned long)cc,
> -				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
> +				compaction_free, (unsigned long)cc, cc->mode,
>   				MR_COMPACTION);
>   		update_nr_listpages(cc);
>   		nr_remaining = cc->nr_migratepages;
> @@ -1083,9 +1086,8 @@ out:
>   	return ret;
>   }
>
> -static unsigned long compact_zone_order(struct zone *zone,
> -				 int order, gfp_t gfp_mask,
> -				 bool sync, bool *contended)
> +static unsigned long compact_zone_order(struct zone *zone, int order,
> +		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
>   {
>   	unsigned long ret;
>   	struct compact_control cc = {
> @@ -1094,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone,
>   		.order = order,
>   		.migratetype = allocflags_to_migratetype(gfp_mask),
>   		.zone = zone,
> -		.sync = sync,
> +		.mode = mode,
>   	};
>   	INIT_LIST_HEAD(&cc.freepages);
>   	INIT_LIST_HEAD(&cc.migratepages);
> @@ -1116,7 +1118,7 @@ int sysctl_extfrag_threshold = 500;
>    * @order: The order of the current allocation
>    * @gfp_mask: The GFP mask of the current allocation
>    * @nodemask: The allowed nodes to allocate from
> - * @sync: Whether migration is synchronous or not
> + * @sync: The migration mode for async, sync light, or sync migration
>    * @contended: Return value that is true if compaction was aborted due to lock contention
>    * @page: Optionally capture a free page of the requested order during compaction
>    *
> @@ -1124,7 +1126,7 @@ int sysctl_extfrag_threshold = 500;
>    */
>   unsigned long try_to_compact_pages(struct zonelist *zonelist,
>   			int order, gfp_t gfp_mask, nodemask_t *nodemask,
> -			bool sync, bool *contended)
> +			enum migrate_mode sync, bool *contended)
>   {
>   	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
>   	int may_enter_fs = gfp_mask & __GFP_FS;
> @@ -1189,7 +1191,7 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
>   						low_wmark_pages(zone), 0, 0))
>   				compaction_defer_reset(zone, cc->order, false);
>   			/* Currently async compaction is never deferred. */
> -			else if (cc->sync)
> +			else if (cc->mode != MIGRATE_ASYNC)
>   				defer_compaction(zone, cc->order);
>   		}
>
> @@ -1202,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
>   {
>   	struct compact_control cc = {
>   		.order = order,
> -		.sync = false,
> +		.mode = MIGRATE_ASYNC,
>   	};
>
>   	if (!order)
> @@ -1215,7 +1217,7 @@ static void compact_node(int nid)
>   {
>   	struct compact_control cc = {
>   		.order = -1,
> -		.sync = true,
> +		.mode = MIGRATE_SYNC,
>   		.ignore_skip_hint = true,
>   	};
>
> diff --git a/mm/internal.h b/mm/internal.h
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -134,7 +134,7 @@ struct compact_control {
>   	unsigned long nr_migratepages;	/* Number of pages to migrate */
>   	unsigned long free_pfn;		/* isolate_freepages search base */
>   	unsigned long migrate_pfn;	/* isolate_migratepages search base */
> -	bool sync;			/* Synchronous migration */
> +	enum migrate_mode mode;		/* Async or sync migration mode */
>   	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
>   	bool finished_update_free;	/* True when the zone cached pfns are
>   					 * no longer being updated
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2226,7 +2226,7 @@ static struct page *
>   __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>   	struct zonelist *zonelist, enum zone_type high_zoneidx,
>   	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> -	int migratetype, bool sync_migration,
> +	int migratetype, enum migrate_mode mode,
>   	bool *contended_compaction, bool *deferred_compaction,
>   	unsigned long *did_some_progress)
>   {
> @@ -2240,7 +2240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>
>   	current->flags |= PF_MEMALLOC;
>   	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
> -						nodemask, sync_migration,
> +						nodemask, mode,
>   						contended_compaction);
>   	current->flags &= ~PF_MEMALLOC;
>
> @@ -2273,7 +2273,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>   		 * As async compaction considers a subset of pageblocks, only
>   		 * defer if the failure was a sync compaction failure.
>   		 */
> -		if (sync_migration)
> +		if (mode != MIGRATE_ASYNC)
>   			defer_compaction(preferred_zone, order);
>
>   		cond_resched();
> @@ -2286,9 +2286,8 @@ static inline struct page *
>   __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>   	struct zonelist *zonelist, enum zone_type high_zoneidx,
>   	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> -	int migratetype, bool sync_migration,
> -	bool *contended_compaction, bool *deferred_compaction,
> -	unsigned long *did_some_progress)
> +	int migratetype, enum migrate_mode mode, bool *contended_compaction,
> +	bool *deferred_compaction, unsigned long *did_some_progress)
>   {
>   	return NULL;
>   }
> @@ -2483,7 +2482,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>   	int alloc_flags;
>   	unsigned long pages_reclaimed = 0;
>   	unsigned long did_some_progress;
> -	bool sync_migration = false;
> +	enum migrate_mode migration_mode = MIGRATE_ASYNC;
>   	bool deferred_compaction = false;
>   	bool contended_compaction = false;
>
> @@ -2577,17 +2576,15 @@ rebalance:
>   	 * Try direct compaction. The first pass is asynchronous. Subsequent
>   	 * attempts after direct reclaim are synchronous
>   	 */
> -	page = __alloc_pages_direct_compact(gfp_mask, order,
> -					zonelist, high_zoneidx,
> -					nodemask,
> -					alloc_flags, preferred_zone,
> -					migratetype, sync_migration,
> -					&contended_compaction,
> +	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
> +					high_zoneidx, nodemask, alloc_flags,
> +					preferred_zone, migratetype,
> +					migration_mode, &contended_compaction,
>   					&deferred_compaction,
>   					&did_some_progress);
>   	if (page)
>   		goto got_pg;
> -	sync_migration = true;
> +	migration_mode = MIGRATE_SYNC_LIGHT;
>
>   	/*
>   	 * If compaction is deferred for high-order allocations, it is because
> @@ -2662,12 +2659,10 @@ rebalance:
>   		 * direct reclaim and reclaim/compaction depends on compaction
>   		 * being called after reclaim so call directly if necessary
>   		 */
> -		page = __alloc_pages_direct_compact(gfp_mask, order,
> -					zonelist, high_zoneidx,
> -					nodemask,
> -					alloc_flags, preferred_zone,
> -					migratetype, sync_migration,
> -					&contended_compaction,
> +		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
> +					high_zoneidx, nodemask, alloc_flags,
> +					preferred_zone, migratetype,
> +					migration_mode, &contended_compaction,
>   					&deferred_compaction,
>   					&did_some_progress);
>   		if (page)
> @@ -6254,7 +6249,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>   		.nr_migratepages = 0,
>   		.order = -1,
>   		.zone = page_zone(pfn_to_page(start)),
> -		.sync = true,
> +		.sync = MIGRATE_SYNC_LIGHT,
>   		.ignore_skip_hint = true,
>   	};
>   	INIT_LIST_HEAD(&cc.migratepages);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v4 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07  9:55         ` Vlastimil Babka
@ 2014-05-07 10:36           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 10:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

We're going to want to manipulate the migration mode for compaction in the page 
allocator, and currently compact_control's sync field is only a bool.  

Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
thp allocations in the pagefault patch to avoid unnecessary latency.

This also alters compaction triggered from sysfs, either for the entire system 
or for a node, to force MIGRATE_SYNC.

Suggested-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 v4 of this patch only: renamed the formal parameter from "sync" to "mode" in
                        try_to_compact_pages(), per Vlastimil
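
 For readers following the series, the allocator slow path after this patch
 carries the migration mode explicitly and escalates it between attempts,
 roughly (abridged sketch, not a hunk from this patch):

	enum migrate_mode migration_mode = MIGRATE_ASYNC;

	/* the first direct compaction attempt is async */
	page = __alloc_pages_direct_compact(..., migration_mode, ...);
	if (page)
		goto got_pg;
	/* attempts after direct reclaim use sync light */
	migration_mode = MIGRATE_SYNC_LIGHT;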

 include/linux/compaction.h |  4 ++--
 mm/compaction.c            | 38 ++++++++++++++++++++------------------
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 37 ++++++++++++++++---------------------
 4 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode mode, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode mode, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc,
 			return;
 		if (pfn > zone->compact_cached_migrate_pfn[0])
 			zone->compact_cached_migrate_pfn[0] = pfn;
-		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+		if (cc->mode != MIGRATE_ASYNC &&
+		    pfn > zone->compact_cached_migrate_pfn[1])
 			zone->compact_cached_migrate_pfn[1] = pfn;
 	} else {
 		if (cc->finished_update_free)
@@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 		}
 
 		/* async aborts if taking too long or contended */
-		if (!cc->sync) {
+		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
 			return false;
 		}
@@ -479,7 +480,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
 	bool set_unsuitable = true;
-	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+					ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
 	/*
@@ -489,7 +491,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	 */
 	while (unlikely(too_many_isolated(zone))) {
 		/* async migration should just abort */
-		if (!cc->sync)
+		if (cc->mode == MIGRATE_ASYNC)
 			return 0;
 
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -554,7 +556,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 * the minimum amount of work satisfies the allocation
 			 */
 			mt = get_pageblock_migratetype(page);
-			if (!cc->sync && !migrate_async_suitable(mt)) {
+			if (cc->mode == MIGRATE_ASYNC &&
+			    !migrate_async_suitable(mt)) {
 				set_unsuitable = false;
 				goto next_pageblock;
 			}
@@ -990,6 +993,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -1015,7 +1019,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1049,8 +1053,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				compaction_free, (unsigned long)cc,
-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
@@ -1083,9 +1086,8 @@ out:
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
-				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone, int order,
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1094,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
-		.sync = sync,
+		.mode = mode,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1116,7 +1118,7 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
- * @sync: Whether migration is synchronous or not
+ * @mode: The migration mode for async, sync light, or sync migration
  * @contended: Return value that is true if compaction was aborted due to lock contention
  * @page: Optionally capture a free page of the requested order during compaction
  *
@@ -1124,7 +1126,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode mode, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1149,7 +1151,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 								nodemask) {
 		int status;
 
-		status = compact_zone_order(zone, order, gfp_mask, sync,
+		status = compact_zone_order(zone, order, gfp_mask, mode,
 						contended);
 		rc = max(status, rc);
 
@@ -1189,7 +1191,7 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 						low_wmark_pages(zone), 0, 0))
 				compaction_defer_reset(zone, cc->order, false);
 			/* Currently async compaction is never deferred. */
-			else if (cc->sync)
+			else if (cc->mode != MIGRATE_ASYNC)
 				defer_compaction(zone, cc->order);
 		}
 
@@ -1202,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
-		.sync = false,
+		.mode = MIGRATE_ASYNC,
 	};
 
 	if (!order)
@@ -1215,7 +1217,7 @@ static void compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
-		.sync = true,
+		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 	};
 
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,7 +134,7 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
-	bool sync;			/* Synchronous migration */
+	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool finished_update_free;	/* True when the zone cached pfns are
 					 * no longer being updated
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2226,7 +2226,7 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
+	int migratetype, enum migrate_mode mode,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
@@ -2240,7 +2240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-						nodemask, sync_migration,
+						nodemask, mode,
 						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
@@ -2273,7 +2273,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * As async compaction considers a subset of pageblocks, only
 		 * defer if the failure was a sync compaction failure.
 		 */
-		if (sync_migration)
+		if (mode != MIGRATE_ASYNC)
 			defer_compaction(preferred_zone, order);
 
 		cond_resched();
@@ -2286,9 +2286,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
-	bool *contended_compaction, bool *deferred_compaction,
-	unsigned long *did_some_progress)
+	int migratetype, enum migrate_mode mode, bool *contended_compaction,
+	bool *deferred_compaction, unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2483,7 +2482,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	bool sync_migration = false;
+	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
 	bool contended_compaction = false;
 
@@ -2577,17 +2576,15 @@ rebalance:
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
 	 */
-	page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	sync_migration = true;
+	migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because
@@ -2662,12 +2659,10 @@ rebalance:
 		 * direct reclaim and reclaim/compaction depends on compaction
 		 * being called after reclaim so call directly if necessary
 		 */
-		page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 		if (page)
@@ -6254,7 +6249,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 12:09         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07 12:09 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

During compaction, update_nr_listpages() has been used to count remaining
non-migrated and free pages after a call to migrate_pages(). The freepages
counting has become unnecessary, and it turns out that the migratepages counting
is also unnecessary in most cases.

The only situation in which cc->migratepages still needs to be counted is when
migrate_pages() returns a negative error code. Otherwise, the non-negative
return value is the number of pages that were not migrated, which is exactly
the count of pages remaining on the cc->migratepages list.
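
To make the convention concrete, here is a minimal sketch of the caller side,
using the migrate_pages() signature from this series (error handling elided):

	int err;

	err = migrate_pages(&cc->migratepages, compaction_alloc,
			compaction_free, (unsigned long)cc, cc->mode,
			MR_COMPACTION);
	/*
	 * err >= 0: number of pages that failed to migrate, i.e. the number
	 *           of pages still sitting on cc->migratepages.
	 * err <  0: hard error such as -ENOMEM; the remaining pages can only
	 *           be counted by walking cc->migratepages.
	 */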

Furthermore, any non-zero count is only interesting for the
mm_compaction_migratepages tracepoint, because after that all remaining
unmigrated pages are put back and their count is set to 0.

This patch therefore removes update_nr_listpages() completely, and changes the
tracepoint definition so that the manual counting is done only when the
tracepoint is enabled, and only when migrate_pages() returns a negative error
code.

Furthermore, migrate_pages() and the tracepoints won't be called when there's
nothing to migrate. This potentially avoids some wasted cycles and reduces the
volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
The mm_compaction_isolate_migratepages event is better for determining that
nothing was isolated for migration, and this one was just duplicating the info.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi

 include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
 mm/compaction.c                   | 31 +++++++------------------------
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544e..aacaf0f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,7 +5,9 @@
 #define _TRACE_COMPACTION_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <linux/tracepoint.h>
+#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
 
 TRACE_EVENT(mm_compaction_migratepages,
 
-	TP_PROTO(unsigned long nr_migrated,
-		unsigned long nr_failed),
+	TP_PROTO(unsigned long nr_all,
+		int migrate_rc,
+		struct list_head *migratepages),
 
-	TP_ARGS(nr_migrated, nr_failed),
+	TP_ARGS(nr_all, migrate_rc, migratepages),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, nr_migrated)
@@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
 	),
 
 	TP_fast_assign(
-		__entry->nr_migrated = nr_migrated;
+		unsigned long nr_failed = 0;
+		struct page *page;
+
+		/*
+		 * migrate_pages() returns either a non-negative number
+		 * with the number of pages that failed migration, or an
+		 * error code, in which case we need to count the remaining
+		 * pages manually
+		 */
+		if (migrate_rc >= 0)
+			nr_failed = migrate_rc;
+		else
+			list_for_each_entry(page, migratepages, lru)
+				nr_failed++;
+
+		__entry->nr_migrated = nr_all - nr_failed;
 		__entry->nr_failed = nr_failed;
 	),
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 6f4b218..383d562 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -819,22 +819,6 @@ static void compaction_free(struct page *page, unsigned long data)
 	cc->nr_freepages++;
 }
 
-/*
- * We cannot control nr_migratepages fully when migration is running as
- * migrate_pages() has no knowledge of of compact_control.  When migration is
- * complete, we count the number of pages on the list by hand.
- */
-static void update_nr_listpages(struct compact_control *cc)
-{
-	int nr_migratepages = 0;
-	struct page *page;
-
-	list_for_each_entry(page, &cc->migratepages, lru)
-		nr_migratepages++;
-
-	cc->nr_migratepages = nr_migratepages;
-}
-
 /* possible outcome of isolate_migratepages */
 typedef enum {
 	ISOLATE_ABORT,		/* Abort compaction now */
@@ -1029,7 +1013,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
-		unsigned long nr_migrate, nr_remaining;
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1044,20 +1027,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		nr_migrate = cc->nr_migratepages;
+		if (!cc->nr_migratepages)
+			continue;
+
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
-		update_nr_listpages(cc);
-		nr_remaining = cc->nr_migratepages;
 
-		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
-						nr_remaining);
+		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+							&cc->migratepages);
 
-		/* Release isolated pages not migrated */
+		/* All pages were either migrated or will be released */
+		cc->nr_migratepages = 0;
 		if (err) {
 			putback_movable_pages(&cc->migratepages);
-			cc->nr_migratepages = 0;
 			/*
 			 * migrate_pages() may return -ENOMEM when scanners meet
 			 * and we want compact_finished() to detect it
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09         ` Vlastimil Babka
@ 2014-05-07 12:09           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07 12:09 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

The compaction free scanner in isolate_freepages() currently remembers the PFN
of the highest pageblock where it successfully isolated pages, to be used as the
starting pageblock for the next invocation. The rationale behind this is that
page migration might return free pages to the allocator when migration fails,
and we don't want to skip them if the compaction continues.

Since migration now returns free pages back to the compaction code where they
can be reused, this is no longer a concern. This patch changes
isolate_freepages() so that the restart PFN is updated for each pageblock where
isolation is attempted. Using stress-highalloc from mmtests, this resulted in a
10% reduction in the pages scanned by the free scanner.

Note that the somewhat similar functionality that records the highest successful
pageblock in zone->compact_cached_free_pfn remains unchanged. That cache is
used when the whole compaction is restarted, not for multiple invocations of
the free scanner during a single compaction.
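
For orientation, a condensed sketch of the pageblock loop body in
isolate_freepages() after this change; this assumes the loop structure from this
series, with declarations, suitability checks and locking elided:

	/* Found a block suitable for isolating free pages from */
	next_free_pfn = block_start_pfn;	/* restart point advances on every attempt */
	isolated = isolate_freepages_block(cc, block_start_pfn,
				block_end_pfn, freelist, false);
	nr_freepages += isolated;

	/*
	 * Only record that isolation succeeded here; the separate
	 * zone->compact_cached_free_pfn cache is managed as before.
	 */
	if (isolated)
		cc->finished_update_free = true;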

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 v2: no changes, just keep patches together

 mm/compaction.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 383d562..83ca6f9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -706,12 +706,6 @@ static void isolate_freepages(struct zone *zone,
 	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
 
 	/*
-	 * If no pages are isolated, the block_start_pfn < low_pfn check
-	 * will kick in.
-	 */
-	next_free_pfn = 0;
-
-	/*
 	 * Isolate free pages until enough are available to migrate the
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
@@ -751,19 +745,19 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Found a block suitable for isolating free pages from */
+		next_free_pfn = block_start_pfn;
 		isolated = isolate_freepages_block(cc, block_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
 
 		/*
-		 * Record the highest PFN we isolated pages from. When next
-		 * looking for free pages, the search will restart here as
-		 * page migration may have returned some pages to the allocator
+		 * Set a flag that we successfully isolated in this pageblock.
+		 * In the next loop iteration, zone->compact_cached_free_pfn
+		 * will not be updated and thus it will effectively contain the
+		 * highest pageblock we isolated pages from.
 		 */
-		if (isolated && next_free_pfn == 0) {
+		if (isolated)
 			cc->finished_update_free = true;
-			next_free_pfn = block_start_pfn;
-		}
 	}
 
 	/* split_free_page does not map the pages */
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 12:10         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07 12:10 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 04:22 AM, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the
> cond_resched() in isolate_migratepages_range() always takes care of the
> scheduling.
>
> If the cond_resched() actually triggers, then terminate this pageblock scan for
> async compaction as well.
>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>   mm/compaction.c | 7 ++++++-
>   1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			return 0;
>   	}
>
> +	if (cond_resched()) {
> +		/* Async terminates prematurely on need_resched() */
> +		if (cc->mode == MIGRATE_ASYNC)
> +			return 0;
> +	}
> +
>   	/* Time to isolate some pages for migration */
> -	cond_resched();
>   	for (; low_pfn < end_pfn; low_pfn++) {
>   		/* give a chance to irqs before checking need_resched() */
>   		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07  2:22       ` David Rientjes
  (?)
@ 2014-05-07 14:14       ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-07 14:14 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:43PM -0700, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a 
> zone which isolates pages as migration targets while another "migrating scanner" 
> scans from the other end of the same zone which isolates pages for migration.
> 
> When page migration fails for an isolated page, the target page is returned to 
> the system rather than the freelist built by the freeing scanner.  This may 
> require the freeing scanner to continue scanning memory after suitable migration 
> targets have already been returned to the system needlessly.
> 
> This patch returns destination pages to the freeing scanner freelist when page 
> migration fails.  This prevents unnecessary work done by the freeing scanner but 
> also encourages memory to be as compacted as possible at the end of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>
> Acked-by: Mel Gorman <mgorman@suse.de>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: David Rientjes <rientjes@google.com>

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-07  2:22       ` David Rientjes
  (?)
  (?)
@ 2014-05-07 20:56       ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-07 20:56 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Joonsoo Kim, Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:45PM -0700, David Rientjes wrote:
> Each zone has a cached migration scanner pfn for memory compaction so that 
> subsequent calls to memory compaction can start where the previous call left 
> off.
> 
> Currently, the compaction migration scanner only updates the per-zone cached pfn 
> when pageblocks were not skipped for async compaction.  This creates a 
> dependency on calling sync compaction to avoid having subsequent calls to async 
> compaction from scanning an enormous amount of non-MOVABLE pageblocks each time 
> it is called.  On large machines, this could be potentially very expensive.
> 
> This patch adds a per-zone cached migration scanner pfn only for async 
> compaction.  It is updated everytime a pageblock has been scanned in its 
> entirety and when no pages from it were successfully isolated.  The cached 
> migration scanner pfn for sync compaction is updated only when called for sync 
> compaction.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  v3: do not update pageblock skip metadata when skipped due to async per
>      Vlastimil.

Looks good to me.
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 21:15         ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-07 21:15 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> Memory compaction works by having a "freeing scanner" scan from one end of a 
> zone which isolates pages as migration targets while another "migrating scanner" 
> scans from the other end of the same zone which isolates pages for migration.
> 
> When page migration fails for an isolated page, the target page is returned to 
> the system rather than the freelist built by the freeing scanner.  This may 
> require the freeing scanner to continue scanning memory after suitable migration 
> targets have already been returned to the system needlessly.
> 
> This patch returns destination pages to the freeing scanner freelist when page 
> migration fails.  This prevents unnecessary work done by the freeing scanner but 
> also encourages memory to be as compacted as possible at the end of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>

What did Greg actually report?  IOW, what, if any, observable problem is
being fixed here?

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 21:20         ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-07 21:20 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 6 May 2014 19:22:52 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.
> 
> ..
>
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			return 0;
>  	}
>  
> +	if (cond_resched()) {
> +		/* Async terminates prematurely on need_resched() */
> +		if (cc->mode == MIGRATE_ASYNC)
> +			return 0;
> +	}

The comment states the obvious.  What is less obvious is *why* we do this.


Someone please remind me why sync and async compaction use different
scanning cursors?


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
@ 2014-05-07 21:20         ` Andrew Morton
  0 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-07 21:20 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 6 May 2014 19:22:52 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.
> 
> ..
>
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			return 0;
>  	}
>  
> +	if (cond_resched()) {
> +		/* Async terminates prematurely on need_resched() */
> +		if (cc->mode == MIGRATE_ASYNC)
> +			return 0;
> +	}

Comment comments the obvious.  What is less obvious is *why* we do this.


Someone please remind me why sync and async compaction use different
scanning cursors?


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:15         ` Andrew Morton
@ 2014-05-07 21:21           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > Memory compaction works by having a "freeing scanner" scan from one end of a 
> > zone which isolates pages as migration targets while another "migrating scanner" 
> > scans from the other end of the same zone which isolates pages for migration.
> > 
> > When page migration fails for an isolated page, the target page is returned to 
> > the system rather than the freelist built by the freeing scanner.  This may 
> > require the freeing scanner to continue scanning memory after suitable migration 
> > targets have already been returned to the system needlessly.
> > 
> > This patch returns destination pages to the freeing scanner freelist when page 
> > migration fails.  This prevents unnecessary work done by the freeing scanner but 
> > also encourages memory to be as compacted as possible at the end of the zone.
> > 
> > Reported-by: Greg Thelen <gthelen@google.com>
> 
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?
> 

Greg found by code inspection that isolated free pages were being returned 
back to the VM rather than to the compaction freelist.  This leaves holes 
behind the free scanner and forces it to isolate additional memory later if 
more is needed.
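
For reference, the mechanism this series adds is small: migrate_pages() now 
calls a caller-supplied free callback when migration of a page fails, and 
for compaction that callback just puts the target page back on the 
freelist.  Roughly (a sketch of the approach; see patches 1 and 2 for the 
real code):

	static void compaction_free(struct page *page, unsigned long data)
	{
		struct compact_control *cc = (struct compact_control *)data;

		list_add(&page->lru, &cc->freepages);
		cc->nr_freepages++;
	}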

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
@ 2014-05-07 21:21           ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > Memory compaction works by having a "freeing scanner" scan from one end of a 
> > zone which isolates pages as migration targets while another "migrating scanner" 
> > scans from the other end of the same zone which isolates pages for migration.
> > 
> > When page migration fails for an isolated page, the target page is returned to 
> > the system rather than the freelist built by the freeing scanner.  This may 
> > require the freeing scanner to continue scanning memory after suitable migration 
> > targets have already been returned to the system needlessly.
> > 
> > This patch returns destination pages to the freeing scanner freelist when page 
> > migration fails.  This prevents unnecessary work done by the freeing scanner but 
> > also encourages memory to be as compacted as possible at the end of the zone.
> > 
> > Reported-by: Greg Thelen <gthelen@google.com>
> 
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?
> 

Greg found by code inspection that isolated free pages were being returned 
back to the VM rather than to the compaction freelist.  This leaves holes 
behind the free scanner and forces it to isolate additional memory later if 
more is needed.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07 21:20         ` Andrew Morton
@ 2014-05-07 21:28           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> >  			return 0;
> >  	}
> >  
> > +	if (cond_resched()) {
> > +		/* Async terminates prematurely on need_resched() */
> > +		if (cc->mode == MIGRATE_ASYNC)
> > +			return 0;
> > +	}
> 
> Comment comments the obvious.  What is less obvious is *why* we do this.
> 

Async compaction is most prevalent for thp pagefaults, and without 
zone->lru_lock contention we have no other termination criteria.  Without 
this, we would scan a potentially very long zone (zones 64GB in length in 
my testing) and that would be very expensive for a pagefault.  Async is best 
effort, so if it is becoming too expensive then it's better to just fall 
back to PAGE_SIZE pages instead and rely on khugepaged to collapse them 
later.
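
For reference, the existing async bail-out this mirrors sits in 
compact_checklock_irqsave() and looks roughly like this (simplified sketch, 
not the exact code):

	if (need_resched() || spin_is_contended(lock)) {
		if (locked) {
			spin_unlock_irqrestore(lock, *flags);
			locked = false;
		}

		if (cc->mode == MIGRATE_ASYNC) {
			/* async gives up instead of waiting or rescheduling */
			cc->contended = true;
			return false;
		}

		cond_resched();
	}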

> Someone please remind me why sync and async compaction use different
> scanning cursors?
> 

It's introduced in this patchset.  Async compaction does not consider 
pageblocks unless they are MIGRATE_MOVABLE, since it is best effort; sync 
compaction considers all pageblocks.  In the past, we only updated the 
cursor for sync compaction, since it would be wrong to update it for async 
compaction when it can skip certain pageblocks.  Unfortunately, if async 
compaction is relied upon solely for certain allocations (such as thp 
pagefaults), it is possible to pointlessly scan an enormous amount of a 
64GB zone, for example, every time if none of the memory can be isolated.

The result is that sync compaction always updates both scanners and async 
compaction only updates its own scanner.  Either scanner is only updated 
if the new cursor is "beyond" the previous cursor.  ("Beyond" means _after_ 
the previous migration scanner pfn and _before_ the previous free scanner 
pfn.)
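
For the migration scanner that ends up as two cached cursors per zone; the 
update is roughly this (a sketch following the per-zone pfn cache patch, 
where index 0 is the async cursor and 1 the sync cursor):

	/* "beyond" for the migration scanner means a higher pfn */
	if (pfn > zone->compact_cached_migrate_pfn[0])
		zone->compact_cached_migrate_pfn[0] = pfn;	/* async */
	if (cc->mode != MIGRATE_ASYNC &&
	    pfn > zone->compact_cached_migrate_pfn[1])
		zone->compact_cached_migrate_pfn[1] = pfn;	/* sync */

The free scanner's cached pfn moves the other way, i.e. it is only ever 
updated with a lower pfn.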

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
@ 2014-05-07 21:28           ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> >  			return 0;
> >  	}
> >  
> > +	if (cond_resched()) {
> > +		/* Async terminates prematurely on need_resched() */
> > +		if (cc->mode == MIGRATE_ASYNC)
> > +			return 0;
> > +	}
> 
> Comment comments the obvious.  What is less obvious is *why* we do this.
> 

Async compaction is most prevalent for thp pagefaults, and without 
zone->lru_lock contention we have no other termination criteria.  Without 
this, we would scan a potentially very long zone (zones 64GB in length in 
my testing) and that would be very expensive for a pagefault.  Async is best 
effort, so if it is becoming too expensive then it's better to just fall 
back to PAGE_SIZE pages instead and rely on khugepaged to collapse them 
later.

> Someone please remind me why sync and async compaction use different
> scanning cursors?
> 

It's introduced in this patchset.  Async compaction does not consider 
pageblocks unless they are MIGRATE_MOVABLE, since it is best effort; sync 
compaction considers all pageblocks.  In the past, we only updated the 
cursor for sync compaction, since it would be wrong to update it for async 
compaction when it can skip certain pageblocks.  Unfortunately, if async 
compaction is relied upon solely for certain allocations (such as thp 
pagefaults), it is possible to pointlessly scan an enormous amount of a 
64GB zone, for example, every time if none of the memory can be isolated.

The result is that sync compaction always updates both scanners and async 
compaction only updates its own scanner.  Either scanner is only updated 
if the new cursor is "beyond" the previous cursor.  ("Beyond" means _after_ 
the previous migration scanner pfn and _before_ the previous free scanner 
pfn.)


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:15         ` Andrew Morton
@ 2014-05-07 21:39           ` Greg Thelen
  -1 siblings, 0 replies; 267+ messages in thread
From: Greg Thelen @ 2014-05-07 21:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Joonsoo Kim, Hugh Dickins, linux-kernel, linux-mm


On Wed, May 07 2014, Andrew Morton <akpm@linux-foundation.org> wrote:

> On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
>
>> Memory compaction works by having a "freeing scanner" scan from one end of a 
>> zone which isolates pages as migration targets while another "migrating scanner" 
>> scans from the other end of the same zone which isolates pages for migration.
>> 
>> When page migration fails for an isolated page, the target page is returned to 
>> the system rather than the freelist built by the freeing scanner.  This may 
>> require the freeing scanner to continue scanning memory after suitable migration 
>> targets have already been returned to the system needlessly.
>> 
>> This patch returns destination pages to the freeing scanner freelist when page 
>> migration fails.  This prevents unnecessary work done by the freeing scanner but 
>> also encourages memory to be as compacted as possible at the end of the zone.
>> 
>> Reported-by: Greg Thelen <gthelen@google.com>
>
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?

I detected the problem at runtime by seeing that ext4 metadata pages 
(especially the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") 
were constantly revisited by compaction's calls to migrate_pages().  These 
pages had a non-zero b_count, which caused fallback_migrate_page() ->
try_to_release_page() -> try_to_free_buffers() to fail.
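
For context, try_to_free_buffers() gives up as soon as any buffer head on 
the page is busy, and a held group descriptor buffer is exactly that; 
fs/buffer.c has roughly:

	static inline int buffer_busy(struct buffer_head *bh)
	{
		return atomic_read(&bh->b_count) |
			(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
	}

so such pages can never actually be migrated and just keep getting 
revisited.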

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
@ 2014-05-07 21:39           ` Greg Thelen
  0 siblings, 0 replies; 267+ messages in thread
From: Greg Thelen @ 2014-05-07 21:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Joonsoo Kim, Hugh Dickins, linux-kernel, linux-mm


On Wed, May 07 2014, Andrew Morton <akpm@linux-foundation.org> wrote:

> On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
>
>> Memory compaction works by having a "freeing scanner" scan from one end of a 
>> zone which isolates pages as migration targets while another "migrating scanner" 
>> scans from the other end of the same zone which isolates pages for migration.
>> 
>> When page migration fails for an isolated page, the target page is returned to 
>> the system rather than the freelist built by the freeing scanner.  This may 
>> require the freeing scanner to continue scanning memory after suitable migration 
>> targets have already been returned to the system needlessly.
>> 
>> This patch returns destination pages to the freeing scanner freelist when page 
>> migration fails.  This prevents unnecessary work done by the freeing scanner but 
>> also encourages memory to be as compacted as possible at the end of the zone.
>> 
>> Reported-by: Greg Thelen <gthelen@google.com>
>
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?

I detected the problem at runtime by seeing that ext4 metadata pages 
(especially the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") 
were constantly revisited by compaction's calls to migrate_pages().  These 
pages had a non-zero b_count, which caused fallback_migrate_page() ->
try_to_release_page() -> try_to_free_buffers() to fail.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-07 12:09         ` Vlastimil Babka
@ 2014-05-07 21:44           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:44 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
> 
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
> 
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
> 
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
> 
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>

I like this; before our two patches, update_nr_listpages() was expensive 
when called for each pageblock, and being able to remove it is certainly a 
step in the right direction toward making compaction as fast as possible.
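
For readers without the tree handy, the function being removed walks both 
lists on every call, which is what made it costly per pageblock; it was 
roughly:

	static unsigned long update_nr_listpages(struct compact_control *cc)
	{
		int nr_migratepages = 0;
		int nr_freepages = 0;
		struct page *page;

		list_for_each_entry(page, &cc->migratepages, lru)
			nr_migratepages++;
		list_for_each_entry(page, &cc->freepages, lru)
			nr_freepages++;

		cc->nr_migratepages = nr_migratepages;
		cc->nr_freepages = nr_freepages;

		return nr_migratepages;
	}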

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
@ 2014-05-07 21:44           ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:44 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
> 
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
> 
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
> 
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
> 
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>

I like this; before our two patches, update_nr_listpages() was expensive 
when called for each pageblock, and being able to remove it is certainly a 
step in the right direction toward making compaction as fast as possible.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-07 21:47             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:47 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
@ 2014-05-07 21:47             ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:47 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
  (?)
  (?)
@ 2014-05-07 22:06           ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-07 22:06 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, minchan, Mel Gorman, Joonsoo Kim,
	b.zolnierkie, mina86, cl, Rik van Riel

On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: no changes, just keep patches together

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-08  5:17         ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:17 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:52PM -0700, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.

Hello,

I think the same logic would be helpful for the cond_resched() in
isolate_freepages().  Also, isolate_freepages() doesn't have exit logic
when it finds zone->lock contention.  I think that fixing it is also
helpful.
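
Something like the following, perhaps (purely illustrative, not a tested 
patch):

	/* in the isolate_freepages() loop, before taking zone->lock */
	if (cond_resched() && cc->mode == MIGRATE_ASYNC) {
		cc->contended = true;
		break;
	}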

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
@ 2014-05-08  5:17         ` Joonsoo Kim
  0 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:17 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:52PM -0700, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.

Hello,

I think the same logic would be helpful for the cond_resched() in
isolate_freepages().  Also, isolate_freepages() doesn't have exit logic
when it finds zone->lock contention.  I think that fixing it is also
helpful.

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-08  5:28             ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:28 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.

Hello,

Although this patch could reduce the number of pages scanned, it makes it 
possible to skip scanning a fresh pageblock.  If there is zone lock 
contention and we are in async compaction, we stop scanning this pageblock 
immediately and then continue to scan the next pageblock.  With this patch, 
next_free_pfn is updated in this case, so we never come back to this 
pageblock again.  Possibly this lowers the compaction success rate, doesn't 
it?

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
@ 2014-05-08  5:28             ` Joonsoo Kim
  0 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:28 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.

Hello,

Although this patch could reduce the number of pages scanned, it makes it 
possible to skip scanning a fresh pageblock.  If there is zone lock 
contention and we are in async compaction, we stop scanning this pageblock 
immediately and then continue to scan the next pageblock.  With this patch, 
next_free_pfn is updated in this case, so we never come back to this 
pageblock again.  Possibly this lowers the compaction success rate, doesn't 
it?

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-08  5:30         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-08  5:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

mm-thp-avoid-excessive-compaction-latency-during-fault.patch excludes sync
compaction for all high order allocations other than thp.  What we really
want to do is suppress sync compaction for thp, but only during the page
fault path.

Orders greater than PAGE_ALLOC_COSTLY_ORDER aren't necessarily going to
loop again so this is the only way to exhaust our capabilities before
declaring that we can't allocate.

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2585,16 +2585,13 @@ rebalance:
 	if (page)
 		goto got_pg;
 
-	if (gfp_mask & __GFP_NO_KSWAPD) {
-		/*
-		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
-		 * of this allocation isn't critical.  Everything else, however,
-		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
-		 * stalls during fault.
-		 */
-		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
-			migration_mode = MIGRATE_SYNC_LIGHT;
-	}
+	/*
+	 * It can become very expensive to allocate transparent hugepages at
+	 * fault, so use asynchronous memory compaction for THP unless it is
+	 * khugepaged trying to collapse.
+	 */
+	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+		migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
@ 2014-05-08  5:30         ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-08  5:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

mm-thp-avoid-excessive-compaction-latency-during-fault.patch excludes sync
compaction for all high order allocations other than thp.  What we really
want to do is suppress sync compaction for thp, but only during the page
fault path.

Orders greater than PAGE_ALLOC_COSTLY_ORDER aren't necessarily going to
loop again so this is the only way to exhaust our capabilities before
declaring that we can't allocate.

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2585,16 +2585,13 @@ rebalance:
 	if (page)
 		goto got_pg;
 
-	if (gfp_mask & __GFP_NO_KSWAPD) {
-		/*
-		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
-		 * of this allocation isn't critical.  Everything else, however,
-		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
-		 * stalls during fault.
-		 */
-		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
-			migration_mode = MIGRATE_SYNC_LIGHT;
-	}
+	/*
+	 * It can become very expensive to allocate transparent hugepages at
+	 * fault, so use asynchronous memory compaction for THP unless it is
+	 * khugepaged trying to collapse.
+	 */
+	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+		migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-07 12:09         ` Vlastimil Babka
@ 2014-05-09 15:48           ` Michal Nazarewicz
  -1 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:48 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Christoph Lameter, Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
>
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
>
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
>
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
>
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

One tiny comment below:

> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi
>
>  include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>  mm/compaction.c                   | 31 +++++++------------------------
>  2 files changed, 29 insertions(+), 28 deletions(-)
>
> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
> index 06f544e..aacaf0f 100644
> --- a/include/trace/events/compaction.h
> +++ b/include/trace/events/compaction.h
> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->nr_migrated = nr_migrated;
> +		unsigned long nr_failed = 0;
> +		struct page *page;
> +
> +		/*
> +		 * migrate_pages() returns either a non-negative number
> +		 * with the number of pages that failed migration, or an
> +		 * error code, in which case we need to count the remaining
> +		 * pages manually
> +		 */
> +		if (migrate_rc >= 0)
> +			nr_failed = migrate_rc;
> +		else
> +			list_for_each_entry(page, migratepages, lru)
> +				nr_failed++;

list_for_each would suffice here.
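
I.e. something along these lines (untested):

		struct list_head *pos;

		list_for_each(pos, migratepages)
			nr_failed++;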

> +
> +		__entry->nr_migrated = nr_all - nr_failed;
>  		__entry->nr_failed = nr_failed;
>  	),
>  

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
@ 2014-05-09 15:48           ` Michal Nazarewicz
  0 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:48 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Christoph Lameter, Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
>
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
>
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
>
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
>
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

One tiny comment below:

> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi
>
>  include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>  mm/compaction.c                   | 31 +++++++------------------------
>  2 files changed, 29 insertions(+), 28 deletions(-)
>
> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
> index 06f544e..aacaf0f 100644
> --- a/include/trace/events/compaction.h
> +++ b/include/trace/events/compaction.h
> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->nr_migrated = nr_migrated;
> +		unsigned long nr_failed = 0;
> +		struct page *page;
> +
> +		/*
> +		 * migrate_pages() returns either a non-negative number
> +		 * with the number of pages that failed migration, or an
> +		 * error code, in which case we need to count the remaining
> +		 * pages manually
> +		 */
> +		if (migrate_rc >= 0)
> +			nr_failed = migrate_rc;
> +		else
> +			list_for_each_entry(page, migratepages, lru)
> +				nr_failed++;

list_for_each would suffice here.

> +
> +		__entry->nr_migrated = nr_all - nr_failed;
>  		__entry->nr_failed = nr_failed;
>  	),
>  

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-09 15:49             ` Michal Nazarewicz
  -1 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:49 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
>
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
>
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: no changes, just keep patches together
>
>  mm/compaction.c | 18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)
>
-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
@ 2014-05-09 15:49             ` Michal Nazarewicz
  0 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:49 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
>
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
>
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: no changes, just keep patches together
>
>  mm/compaction.c | 18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)
>
-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v4 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07 10:36           ` David Rientjes
@ 2014-05-09 22:03             ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-09 22:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014 03:36:46 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> We're going to want to manipulate the migration mode for compaction in the page 
> allocator, and currently compact_control's sync field is only a bool.  
> 
> Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
> on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
> migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
> thp allocations in the pagefault path to avoid unnecessary latency.
> 
> This also alters compaction triggered from sysfs, either for the entire system 
> or for a node, to force MIGRATE_SYNC.

mm/page_alloc.c: In function 'alloc_contig_range':
mm/page_alloc.c:6255: error: unknown field 'sync' specified in initializer

--- a/mm/page_alloc.c~mm-compaction-embed-migration-mode-in-compact_control-fix
+++ a/mm/page_alloc.c
@@ -6252,7 +6252,7 @@ int alloc_contig_range(unsigned long sta
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = MIGRATE_SYNC_LIGHT,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);


Please check that you sent the correct version of this?

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v4 4/6] mm, compaction: embed migration mode in compact_control
@ 2014-05-09 22:03             ` Andrew Morton
  0 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-09 22:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014 03:36:46 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> We're going to want to manipulate the migration mode for compaction in the page 
> allocator, and currently compact_control's sync field is only a bool.  
> 
> Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
> on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
> migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
> thp allocations in the pagefault path to avoid unnecessary latency.
> 
> This also alters compaction triggered from sysfs, either for the entire system 
> or for a node, to force MIGRATE_SYNC.

mm/page_alloc.c: In function 'alloc_contig_range':
mm/page_alloc.c:6255: error: unknown field 'sync' specified in initializer

--- a/mm/page_alloc.c~mm-compaction-embed-migration-mode-in-compact_control-fix
+++ a/mm/page_alloc.c
@@ -6252,7 +6252,7 @@ int alloc_contig_range(unsigned long sta
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = MIGRATE_SYNC_LIGHT,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);


Please check that you sent the correct version of this?


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:21           ` David Rientjes
@ 2014-05-12  8:35             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  8:35 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 11:21 PM, David Rientjes wrote:
> On Wed, 7 May 2014, Andrew Morton wrote:
>
>>> Memory compaction works by having a "freeing scanner" scan from one end of a
>>> zone which isolates pages as migration targets while another "migrating scanner"
>>> scans from the other end of the same zone which isolates pages for migration.
>>>
>>> When page migration fails for an isolated page, the target page is returned to
>>> the system rather than the freelist built by the freeing scanner.  This may
>>> require the freeing scanner to continue scanning memory after suitable migration
>>> targets have already been returned to the system needlessly.
>>>
>>> This patch returns destination pages to the freeing scanner freelist when page
>>> migration fails.  This prevents unnecessary work done by the freeing scanner but
>>> also encourages memory to be as compacted as possible at the end of the zone.
>>>
>>> Reported-by: Greg Thelen <gthelen@google.com>
>>
>> What did Greg actually report?  IOW, what if any observable problem is
>> being fixed here?
>>
>
> Greg reported by code inspection that he found isolated free pages were
> returned back to the VM rather than the compaction freelist.  This will
> cause holes behind the free scanner and cause it to reallocate additional
> memory if necessary later.

More precisely, there shouldn't be holes, as the free scanner restarts at 
the highest pageblock where it isolated something, exactly to avoid making 
holes due to returned pages.  But that restart can now be avoided, which is 
what my patch does.
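
The pre-patch behaviour being discussed is roughly this in 
isolate_freepages() (a sketch, not the exact code):

	/*
	 * Remember the restart point only where isolation succeeded, so
	 * that pages returned by failed migration are not skipped later.
	 */
	if (isolated)
		next_free_pfn = block_start_pfn;

and my patch 2/2 updates the restart pfn for every pageblock where 
isolation is attempted, which is safe now that failed migration targets go 
back to cc->freepages.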


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:39           ` Greg Thelen
@ 2014-05-12  8:37             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  8:37 UTC (permalink / raw)
  To: Greg Thelen, Andrew Morton
  Cc: David Rientjes, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Hugh Dickins, linux-kernel, linux-mm

On 05/07/2014 11:39 PM, Greg Thelen wrote:
>
> On Wed, May 07 2014, Andrew Morton <akpm@linux-foundation.org> wrote:
>
>> On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
>>
>>> Memory compaction works by having a "freeing scanner" scan from one end of a
>>> zone which isolates pages as migration targets while another "migrating scanner"
>>> scans from the other end of the same zone which isolates pages for migration.
>>>
>>> When page migration fails for an isolated page, the target page is returned to
>>> the system rather than the freelist built by the freeing scanner.  This may
>>> require the freeing scanner to continue scanning memory after suitable migration
>>> targets have already been returned to the system needlessly.
>>>
>>> This patch returns destination pages to the freeing scanner freelist when page
>>> migration fails.  This prevents unnecessary work done by the freeing scanner but
>>> also encourages memory to be as compacted as possible at the end of the zone.
>>>
>>> Reported-by: Greg Thelen <gthelen@google.com>
>>
>> What did Greg actually report?  IOW, what if any observable problem is
>> being fixed here?
>
> I detected the problem at runtime seeing that ext4 metadata pages (esp
> the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") were
> constantly visited by compaction calls of migrate_pages().  These pages
> had a non-zero b_count which caused fallback_migrate_page() ->
> try_to_release_page() -> try_to_free_buffers() to fail.

That sounds like something the "mm, compaction: add per-zone migration 
pfn cache for async compaction" patch would fix, not this one, though.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-08  5:28             ` Joonsoo Kim
@ 2014-05-12  9:09               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  9:09 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 05/08/2014 07:28 AM, Joonsoo Kim wrote:
> On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
>> The compaction free scanner in isolate_freepages() currently remembers PFN of
>> the highest pageblock where it successfully isolates, to be used as the
>> starting pageblock for the next invocation. The rationale behind this is that
>> page migration might return free pages to the allocator when migration fails
>> and we don't want to skip them if the compaction continues.
>>
>> Since migration now returns free pages back to compaction code where they can
>> be reused, this is no longer a concern. This patch changes isolate_freepages()
>> so that the PFN for restarting is updated with each pageblock where isolation
>> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
>> reduction of the pages scanned by the free scanner.
>
> Hello,
>
> Although this patch could reduce page scanned, it is possible to skip
> scanning fresh pageblock. If there is zone lock contention and we are on
> async compaction, we stop scanning this pageblock immediately. And
> then, we will continue to scan next pageblock. With this patch,
> next_free_pfn is updated in this case, so we never come back again to this
> pageblock. Possibly this makes compaction success rate low, doesn't
> it?

Hm, you're right and thanks for catching that, but I think this is a 
sign of a worse and older issue than skipping a pageblock?
When isolate_freepages_block() breaks out of its loop due to lock contention, then 
isolate_freepages() (which called it) should also immediately quit its 
loop. Trying another pageblock in the same zone with the same zone->lock 
makes no sense here? If this is fixed, then the issue you're pointing 
out will also be fixed as next_free_pfn will still point to the 
pageblock where the break occurred.

> Thanks.
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-09 15:48           ` Michal Nazarewicz
@ 2014-05-12  9:51             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  9:51 UTC (permalink / raw)
  To: Michal Nazarewicz, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Christoph Lameter, Rik van Riel

On 05/09/2014 05:48 PM, Michal Nazarewicz wrote:
> On Wed, May 07 2014, Vlastimil Babka wrote:
>> During compaction, update_nr_listpages() has been used to count remaining
>> non-migrated and free pages after a call to migrate_pages(). The freepages
>> counting has become unnecessary, and it turns out that migratepages counting
>> is also unnecessary in most cases.
>>
>> The only situation when it's needed to count cc->migratepages is when
>> migrate_pages() returns with a negative error code. Otherwise, the non-negative
>> return value is the number of pages that were not migrated, which is exactly
>> the count of remaining pages in the cc->migratepages list.
>>
>> Furthermore, any non-zero count is only interesting for the tracepoint of
>> mm_compaction_migratepages events, because after that all remaining unmigrated
>> pages are put back and their count is set to 0.
>>
>> This patch therefore removes update_nr_listpages() completely, and changes the
>> tracepoint definition so that the manual counting is done only when the
>> tracepoint is enabled, and only when migrate_pages() returns a negative error
>> code.
>>
>> Furthermore, migrate_pages() and the tracepoints won't be called when there's
>> nothing to migrate. This potentially avoids some wasted cycles and reduces the
>> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
>> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
>> The mm_compaction_isolate_migratepages event is better for determining that
>> nothing was isolated for migration, and this one was just duplicating the info.
>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
>> Cc: Michal Nazarewicz <mina86@mina86.com>
> 
> Acked-by: Michal Nazarewicz <mina86@mina86.com>
> 
> One tiny comment below:
> 
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi
>>
>>   include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>>   mm/compaction.c                   | 31 +++++++------------------------
>>   2 files changed, 29 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
>> index 06f544e..aacaf0f 100644
>> --- a/include/trace/events/compaction.h
>> +++ b/include/trace/events/compaction.h
>> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>>   	),
>>   
>>   	TP_fast_assign(
>> -		__entry->nr_migrated = nr_migrated;
>> +		unsigned long nr_failed = 0;
>> +		struct page *page;
>> +
>> +		/*
>> +		 * migrate_pages() returns either a non-negative number
>> +		 * with the number of pages that failed migration, or an
>> +		 * error code, in which case we need to count the remaining
>> +		 * pages manually
>> +		 */
>> +		if (migrate_rc >= 0)
>> +			nr_failed = migrate_rc;
>> +		else
>> +			list_for_each_entry(page, migratepages, lru)
>> +				nr_failed++;
> 
> list_for_each would suffice here.
> 
>> +
>> +		__entry->nr_migrated = nr_all - nr_failed;
>>   		__entry->nr_failed = nr_failed;
>>   	),
>>   

Right, thanks!

-----8<-----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 12 May 2014 10:56:11 +0200
Subject: [PATCH] mm-compaction-do-not-count-migratepages-when-unnecessary-fix

list_for_each is enough for counting the list length. We also avoid including
the struct page definition this way.

Suggested-by: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/trace/events/compaction.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index aacaf0f..c6814b9 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -7,7 +7,6 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/tracepoint.h>
-#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -62,7 +61,7 @@ TRACE_EVENT(mm_compaction_migratepages,
 
 	TP_fast_assign(
 		unsigned long nr_failed = 0;
-		struct page *page;
+		struct list_head *page_lru;
 
 		/*
 		 * migrate_pages() returns either a non-negative number
@@ -73,7 +72,7 @@ TRACE_EVENT(mm_compaction_migratepages,
 		if (migrate_rc >= 0)
 			nr_failed = migrate_rc;
 		else
-			list_for_each_entry(page, migratepages, lru)
+			list_for_each(page_lru, migratepages)
 				nr_failed++;
 
 		__entry->nr_migrated = nr_all - nr_failed;
-- 
1.8.4.5




^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-08  5:17         ` Joonsoo Kim
@ 2014-05-12 14:15           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12 14:15 UTC (permalink / raw)
  To: Joonsoo Kim, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

Compaction uses compact_checklock_irqsave() function to periodically check for
lock contention and need_resched() to either abort async compaction, or to
free the lock, schedule and retake the lock. When aborting, cc->contended is
set to signal the contended state to the caller. Two problems have been
identified in this mechanism.

First, compaction also calls cond_resched() directly in both scanners when no
lock is yet taken. This call neither aborts async compaction nor sets
cc->contended appropriately. This patch introduces a new
compact_check_resched() function to achieve both.

Second, isolate_freepages() does not check whether isolate_freepages_block()
aborted due to contention, and advances to the next pageblock. This violates
the principle of aborting on contention, and might result in pageblocks not
being scanned completely, since the scanning cursor is advanced. This patch
makes isolate_freepages() check the cc->contended flag and abort.

Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 83ca6f9..b34ab7c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 	return true;
 }
 
+/*
+ * Similar to compact_checklock_irqsave() (see its comment) for places where
+ * a zone lock is not concerned.
+ *
+ * Returns false when compaction should abort.
+ */
+static inline bool compact_check_resched(struct compact_control *cc)
+{
+	/* async compaction aborts if contended */
+	if (need_resched()) {
+		if (cc->mode == MIGRATE_ASYNC) {
+			cc->contended = true;
+			return false;
+		}
+
+		cond_resched();
+	}
+
+	return true;
+}
+
 /* Returns true if the page is within a block suitable for migration to */
 static bool suitable_migration_target(struct page *page)
 {
@@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			return 0;
 	}
 
-	if (cond_resched()) {
-		/* Async terminates prematurely on need_resched() */
-		if (cc->mode == MIGRATE_ASYNC)
-			return 0;
-	}
+	if (!compact_check_resched(cc))
+		return 0;
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
@@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
 		/*
 		 * This can iterate a massively long zone without finding any
 		 * suitable migration targets, so periodically check if we need
-		 * to schedule.
+		 * to schedule, or even abort async compaction.
 		 */
-		cond_resched();
+		if (!compact_check_resched(cc))
+			break;
 
 		if (!pfn_valid(block_start_pfn))
 			continue;
@@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
 		 */
 		if (isolated)
 			cc->finished_update_free = true;
+
+		/*
+		 * isolate_freepages_block() might have aborted due to async
+		 * compaction being contended
+		 */
+		if (cc->contended)
+			break;
 	}
 
 	/* split_free_page does not map the pages */
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
  (?)
@ 2014-05-12 15:34           ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-12 15:34 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, minchan, Mel Gorman,
	b.zolnierkie, mina86, cl, Rik van Riel

On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls cond_resched() directly in both scanners when no
> lock is yet taken. This call neither aborts async compaction nor sets
> cc->contended appropriately. This patch introduces a new
> compact_check_resched() function to achieve both.
> 
> Second, isolate_freepages() does not check whether isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages() check the cc->contended flag and abort.
> 
> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>  1 file changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 83ca6f9..b34ab7c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  	return true;
>  }
>  
> +/*
> + * Similar to compact_checklock_irqsave() (see its comment) for places where
> + * a zone lock is not concerned.
> + *
> + * Returns false when compaction should abort.
> + */
> +static inline bool compact_check_resched(struct compact_control *cc)
> +{
> +	/* async compaction aborts if contended */
> +	if (need_resched()) {
> +		if (cc->mode == MIGRATE_ASYNC) {
> +			cc->contended = true;

This changes the meaning of contended in struct compact_control (not just
indicating lock contention,) so please update the comment in mm/internal.h too.

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

Thanks,
Naoya Horiguchi

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
       [not found]           ` <1399908847-ouuxeneo@n-horiguchi@ah.jp.nec.com>
@ 2014-05-12 15:45               ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12 15:45 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, minchan, Mel Gorman,
	b.zolnierkie, mina86, cl, Rik van Riel

On 05/12/2014 05:34 PM, Naoya Horiguchi wrote:
> On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls cond_resched() directly in both scanners when no
>> lock is yet taken. This call neither aborts async compaction nor sets
>> cc->contended appropriately. This patch introduces a new
>> compact_check_resched() function to achieve both.
>>
>> Second, isolate_freepages() does not check whether isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages() check the cc->contended flag and abort.
>>
>> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
>> Cc: Michal Nazarewicz <mina86@mina86.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>>   1 file changed, 33 insertions(+), 7 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 83ca6f9..b34ab7c 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>>   	return true;
>>   }
>>
>> +/*
>> + * Similar to compact_checklock_irqsave() (see its comment) for places where
>> + * a zone lock is not concerned.
>> + *
>> + * Returns false when compaction should abort.
>> + */
>> +static inline bool compact_check_resched(struct compact_control *cc)
>> +{
>> +	/* async compaction aborts if contended */
>> +	if (need_resched()) {
>> +		if (cc->mode == MIGRATE_ASYNC) {
>> +			cc->contended = true;
>
> This changes the meaning of contended in struct compact_control (not just
> indicating lock contention,) so please update the comment in mm/internal.h too.

It doesn't change it, since compact_checklock_irqsave() already has this 
semantic:
if (should_release_lock(lock) && cc->mode == MIGRATE_ASYNC)
	cc->contended = true;

and should_release_lock() is:
	need_resched() || spin_is_contended(lock)

So the comment was already outdated, I will update it in v2.

> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

Thanks.

> Thanks,
> Naoya Horiguchi
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 15:45               ` Vlastimil Babka
  (?)
@ 2014-05-12 15:53               ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-12 15:53 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, minchan, Mel Gorman,
	b.zolnierkie, mina86, cl, Rik van Riel

On Mon, May 12, 2014 at 05:45:31PM +0200, Vlastimil Babka wrote:
...
> >>+/*
> >>+ * Similar to compact_checklock_irqsave() (see its comment) for places where
> >>+ * a zone lock is not concerned.
> >>+ *
> >>+ * Returns false when compaction should abort.
> >>+ */
> >>+static inline bool compact_check_resched(struct compact_control *cc)
> >>+{
> >>+	/* async compaction aborts if contended */
> >>+	if (need_resched()) {
> >>+		if (cc->mode == MIGRATE_ASYNC) {
> >>+			cc->contended = true;
> >
> >This changes the meaning of contended in struct compact_control (not just
> >indicating lock contention,) so please update the comment in mm/internal.h too.
> 
> It doesn't change it, since compact_checklock_irqsave() already has
> this semantic:
> if (should_release_lock(lock) && cc->mode == MIGRATE_ASYNC)
> 	cc->contended = true;
> 
> and should_release_lock() is:
> 	need_resched() || spin_is_contended(lock)
> 
> So the comment was already outdated, I will update it in v2.

Ah OK. Sorry for falsely blaming you :)

Naoya

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
@ 2014-05-12 20:28             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-12 20:28 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Mon, 12 May 2014, Vlastimil Babka wrote:

> diff --git a/mm/compaction.c b/mm/compaction.c
> index 83ca6f9..b34ab7c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  	return true;
>  }
>  
> +/*
> + * Similar to compact_checklock_irqsave() (see its comment) for places where
> + * a zone lock is not concerned.
> + *
> + * Returns false when compaction should abort.
> + */

I think we should have sufficient commentary in the code that
describes why we do this.

> +static inline bool compact_check_resched(struct compact_control *cc)
> +{

I'm not sure that compact_check_resched() is the appropriate name.  Sure,
it describes what the current implementation does, but what it's really
doing is determining when compaction should abort prematurely.

Something like compact_should_abort()?
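
For illustration, the rename being suggested, applied to the
compact_check_resched() helper from the patch above, might look roughly like
this (an untested sketch, not code posted in this thread; returning true on
abort, to match the new name, is an assumption):

/*
 * Decide whether compaction should abort prematurely: async compaction
 * aborts when a reschedule is needed, and records that in cc->contended.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}

		cond_resched();
	}

	return false;
}

The call sites would then read as "if (compact_should_abort(cc)) return 0;" in
isolate_migratepages_range() and "if (compact_should_abort(cc)) break;" in
isolate_freepages().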

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
@ 2014-05-13  0:44             ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-13  0:44 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls cond_resched() directly in both scanners when no
> lock is yet taken. This call neither aborts async compaction nor sets
> cc->contended appropriately. This patch introduces a new
> compact_check_resched() function to achieve both.
> 
> Second, isolate_freepages() does not check whether isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages() check the cc->contended flag and abort.
> 
> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>  1 file changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 83ca6f9..b34ab7c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  	return true;
>  }
>  
> +/*
> + * Similar to compact_checklock_irqsave() (see its comment) for places where
> + * a zone lock is not concerned.
> + *
> + * Returns false when compaction should abort.
> + */
> +static inline bool compact_check_resched(struct compact_control *cc)
> +{
> +	/* async compaction aborts if contended */
> +	if (need_resched()) {
> +		if (cc->mode == MIGRATE_ASYNC) {
> +			cc->contended = true;
> +			return false;
> +		}
> +
> +		cond_resched();
> +	}
> +
> +	return true;
> +}
> +
>  /* Returns true if the page is within a block suitable for migration to */
>  static bool suitable_migration_target(struct page *page)
>  {
> @@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			return 0;
>  	}
>  
> -	if (cond_resched()) {
> -		/* Async terminates prematurely on need_resched() */
> -		if (cc->mode == MIGRATE_ASYNC)
> -			return 0;
> -	}
> +	if (!compact_check_resched(cc))
> +		return 0;
>  
>  	/* Time to isolate some pages for migration */
>  	for (; low_pfn < end_pfn; low_pfn++) {
> @@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
>  		/*
>  		 * This can iterate a massively long zone without finding any
>  		 * suitable migration targets, so periodically check if we need
> -		 * to schedule.
> +		 * to schedule, or even abort async compaction.
>  		 */
> -		cond_resched();
> +		if (!compact_check_resched(cc))
> +			break;
>  
>  		if (!pfn_valid(block_start_pfn))
>  			continue;
> @@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
>  		 */
>  		if (isolated)
>  			cc->finished_update_free = true;
> +
> +		/*
> +		 * isolate_freepages_block() might have aborted due to async
> +		 * compaction being contended
> +		 */
> +		if (cc->contended)
> +			break;
>  	}

Hello,

I think that we can go further.

The problem is that this cc->contended is checked only in
isolate_migratepages() to break out of the compaction. So if there are
free pages we have already taken, compaction wouldn't stop
immediately, and isolate_freepages() could be invoked again on the next
compaction_alloc(). If there is no contention at this time, we would try
to get free pages from one pageblock, because the cc->contended check is
at the bottom of the loop in isolate_migratepages() and compaction will
continue to run. AFAIK, we want to stop the compaction in this case.

Moreover, if this isolate_freepages() doesn't stop the compaction, the
next isolate_migratepages() will be invoked, and it would only be stopped
by the cc->contended check after it has already isolated some pages for
migration. This is useless overhead and should be removed.
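
To make the ordering concrete, here is a minimal userspace sketch of the flow
being described (simplified assumptions throughout; the names only loosely
mirror compaction_alloc()/isolate_freepages()/isolate_migratepages(), and this
is not the actual kernel code):

#include <stdbool.h>
#include <stdio.h>

struct toy_cc {
	bool contended;			/* models compact_control.contended */
};

/* stand-in for isolate_freepages(): the zone lock turns out to be contended */
static void refill_freelist(struct toy_cc *cc)
{
	cc->contended = true;
	printf("  refill: zone lock contended, contended flag set\n");
}

/* stand-in for compaction_alloc(): refills the free list when it runs dry */
static void alloc_target(struct toy_cc *cc, int round)
{
	if (round == 2)			/* the free list happens to run dry here */
		refill_freelist(cc);
}

int main(void)
{
	struct toy_cc cc = { .contended = false };
	int round;

	for (round = 1; round <= 4; round++) {
		printf("round %d: isolate pages for migration\n", round);
		/* the flag is only consulted here, after isolating */
		if (cc.contended) {
			printf("round %d: contended, aborting compaction\n", round);
			break;
		}
		printf("round %d: migrate the batch\n", round);
		alloc_target(&cc, round);	/* may set cc.contended */
	}
	return 0;
}

Under these assumptions, the contention detected while refilling the free list
in round 2 is only acted upon after round 3 has already isolated more pages for
migration, which is the extra work described above.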

Thanks.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-12  9:09               ` Vlastimil Babka
@ 2014-05-13  1:15                 ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-13  1:15 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Mon, May 12, 2014 at 11:09:25AM +0200, Vlastimil Babka wrote:
> On 05/08/2014 07:28 AM, Joonsoo Kim wrote:
> >On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> >>The compaction free scanner in isolate_freepages() currently remembers PFN of
> >>the highest pageblock where it successfully isolates, to be used as the
> >>starting pageblock for the next invocation. The rationale behind this is that
> >>page migration might return free pages to the allocator when migration fails
> >>and we don't want to skip them if the compaction continues.
> >>
> >>Since migration now returns free pages back to compaction code where they can
> >>be reused, this is no longer a concern. This patch changes isolate_freepages()
> >>so that the PFN for restarting is updated with each pageblock where isolation
> >>is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> >>reduction of the pages scanned by the free scanner.
> >
> >Hello,
> >
> >Although this patch could reduce page scanned, it is possible to skip
> >scanning fresh pageblock. If there is zone lock contention and we are on
> >asyn compaction, we stop scanning this pageblock immediately. And
> >then, we will continue to scan next pageblock. With this patch,
> >next_free_pfn is updated in this case, so we never come back again to this
> >pageblock. Possibly this makes compaction success rate low, doesn't
> >it?
> 
> Hm, you're right and thanks for catching that, but I think this is a
> sign of a worse and older issue than skipping a pageblock?
> When isolate_freepages_block() breaks loop due to lock contention,
> then isolate_freepages() (which called it) should also immediately
> quit its loop. Trying another pageblock in the same zone with the
> same zone->lock makes no sense here? If this is fixed, then the
> issue you're pointing out will also be fixed as next_free_pfn will
> still point to the pageblock where the break occured.
> 

Yes. It can be fixed by your approach. :)

Thanks.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 20:28             ` David Rientjes
@ 2014-05-13  8:50               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-13  8:50 UTC (permalink / raw)
  To: David Rientjes
  Cc: Joonsoo Kim, Andrew Morton, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 05/12/2014 10:28 PM, David Rientjes wrote:
> On Mon, 12 May 2014, Vlastimil Babka wrote:
>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 83ca6f9..b34ab7c 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>>   	return true;
>>   }
>>
>> +/*
>> + * Similar to compact_checklock_irqsave() (see its comment) for places where
>> + * a zone lock is not concerned.
>> + *
>> + * Returns false when compaction should abort.
>> + */
>
> I think we should have some sufficient commentary in the code that
> describes why we do this.

Well I can of course mostly duplicate the comment of 
compact_checklock_irqsave() instead of referring to it, if you think 
that's better.

>> +static inline bool compact_check_resched(struct compact_control *cc)
>> +{
>
> I'm not sure that compact_check_resched() is the appropriate name.  Sure,
> it specifies what the current implementation is, but what it's really
> actually doing is determining when compaction should abort prematurely.
>
> Something like compact_should_abort()?

I tried to be somewhat analogous to the name of 
compact_checklock_irqsave(). compact_should_abort() doesn't indicate 
that there might be a resched().

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-13  0:44             ` Joonsoo Kim
@ 2014-05-13  8:54               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-13  8:54 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 05/13/2014 02:44 AM, Joonsoo Kim wrote:
> On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls directly cond_resched() in both scanners when no
>> lock is yet taken. This call either does not abort async compaction, or set
>> cc->contended appropriately. This patch introduces a new
>> compact_check_resched() function to achieve both.
>>
>> Second, isolate_freepages() does not check if isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages_block() check the cc->contended flag and abort.
>>
>> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
>> Cc: Michal Nazarewicz <mina86@mina86.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>>   1 file changed, 33 insertions(+), 7 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 83ca6f9..b34ab7c 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>>   	return true;
>>   }
>>
>> +/*
>> + * Similar to compact_checklock_irqsave() (see its comment) for places where
>> + * a zone lock is not concerned.
>> + *
>> + * Returns false when compaction should abort.
>> + */
>> +static inline bool compact_check_resched(struct compact_control *cc)
>> +{
>> +	/* async compaction aborts if contended */
>> +	if (need_resched()) {
>> +		if (cc->mode == MIGRATE_ASYNC) {
>> +			cc->contended = true;
>> +			return false;
>> +		}
>> +
>> +		cond_resched();
>> +	}
>> +
>> +	return true;
>> +}
>> +
>>   /* Returns true if the page is within a block suitable for migration to */
>>   static bool suitable_migration_target(struct page *page)
>>   {
>> @@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>>   			return 0;
>>   	}
>>
>> -	if (cond_resched()) {
>> -		/* Async terminates prematurely on need_resched() */
>> -		if (cc->mode == MIGRATE_ASYNC)
>> -			return 0;
>> -	}
>> +	if (!compact_check_resched(cc))
>> +		return 0;
>>
>>   	/* Time to isolate some pages for migration */
>>   	for (; low_pfn < end_pfn; low_pfn++) {
>> @@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
>>   		/*
>>   		 * This can iterate a massively long zone without finding any
>>   		 * suitable migration targets, so periodically check if we need
>> -		 * to schedule.
>> +		 * to schedule, or even abort async compaction.
>>   		 */
>> -		cond_resched();
>> +		if (!compact_check_resched(cc))
>> +			break;
>>
>>   		if (!pfn_valid(block_start_pfn))
>>   			continue;
>> @@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
>>   		 */
>>   		if (isolated)
>>   			cc->finished_update_free = true;
>> +
>> +		/*
>> +		 * isolate_freepages_block() might have aborted due to async
>> +		 * compaction being contended
>> +		 */
>> +		if (cc->contended)
>> +			break;
>>   	}
>
> Hello,
>
> I think that we can go further.
>
> The problem is that this cc->contended is checked only in
> isolate_migratepages() to break out of the compaction. So if there are
> free pages we have already taken, compaction wouldn't be stopped
> immediately and isolate_freepages() could be invoked again on the next
> compaction_alloc(). If there is no contention at this time, we would try
> to get free pages from one pageblock, because the cc->contended check is
> at the bottom of the loop in isolate_migratepages(), and compaction will
> continue to run. AFAIK, we want to stop the compaction in this case.
>
> Moreover, if this isolate_freepages() doesn't stop the compaction, the
> next isolate_migratepages() will be invoked and it would only be stopped
> by the cc->contended check after isolating some pages for migration.
> This is useless overhead and should be removed.

Good catch again, thanks! So that means checking the flag also in
compaction_alloc(). But what should we do if we managed to isolate
something and then found out we are contended? Put all the pages back
and go home, or try to migrate what we have?

I'm becoming worried that all these changes will mean that async
compaction will have a near-zero probability of finishing anything
before hitting contention. And then everything it did up to the
contention would be wasted work.

> Thanks.
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-08  5:30         ` David Rientjes
@ 2014-05-13 10:00           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-13 10:00 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/08/2014 07:30 AM, David Rientjes wrote:
> mm-thp-avoid-excessive-compaction-latency-during-fault.patch excludes sync
> compaction for all high order allocations other than thp.  What we really
> want to do is suppress sync compaction for thp, but only during the page
> fault path.
>
> Orders greater than PAGE_ALLOC_COSTLY_ORDER aren't necessarily going to
> loop again so this is the only way to exhaust our capabilities before
> declaring that we can't allocate.
>
> Reported-by: Hugh Dickins <hughd@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   mm/page_alloc.c | 17 +++++++----------
>   1 file changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2585,16 +2585,13 @@ rebalance:
>   	if (page)
>   		goto got_pg;
>
> -	if (gfp_mask & __GFP_NO_KSWAPD) {
> -		/*
> -		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
> -		 * of this allocation isn't critical.  Everything else, however,
> -		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
> -		 * stalls during fault.
> -		 */
> -		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
> -			migration_mode = MIGRATE_SYNC_LIGHT;
> -	}
> +	/*
> +	 * It can become very expensive to allocate transparent hugepages at
> +	 * fault, so use asynchronous memory compaction for THP unless it is
> +	 * khugepaged trying to collapse.
> +	 */
> +	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
> +		migration_mode = MIGRATE_SYNC_LIGHT;

I wonder about a process doing e.g. mmap() with MAP_POPULATE. It
seems to me that it would only get MIGRATE_ASYNC here, right? Since
gfp_mask would include __GFP_NO_KSWAPD and the task won't have PF_KTHREAD.
I think that goes against the idea that with MAP_POPULATE you are saying
you are willing to wait to have everything in place before you actually
use the memory. So I guess you are also willing to wait for hugepages in
that situation?
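
Just to spell out the case I mean, here is a toy trace of the proposed
condition (the flag values below are made-up placeholders, not the
kernel's; this is only an illustration):

	#include <stdbool.h>

	/* placeholder bits, only to trace the logic of the check above */
	#define __GFP_NO_KSWAPD	0x1u
	#define PF_KTHREAD	0x2u

	static bool sync_light_allowed(unsigned int gfp_mask,
				       unsigned int task_flags)
	{
		return !(gfp_mask & __GFP_NO_KSWAPD) ||
			(task_flags & PF_KTHREAD);
	}

	int main(void)
	{
		/*
		 * THP fault from a userspace MAP_POPULATE mapping: gfp_mask
		 * has __GFP_NO_KSWAPD and the task is not a kernel thread,
		 * so this returns 0 and the allocation is left with
		 * MIGRATE_ASYNC compaction only.
		 */
		return sync_light_allowed(__GFP_NO_KSWAPD, 0);
	}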

>
>   	/*
>   	 * If compaction is deferred for high-order allocations, it is because
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-13  8:54               ` Vlastimil Babka
@ 2014-05-15  2:21                 ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-15  2:21 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Tue, May 13, 2014 at 10:54:58AM +0200, Vlastimil Babka wrote:
> On 05/13/2014 02:44 AM, Joonsoo Kim wrote:
> >On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
> >>Compaction uses compact_checklock_irqsave() function to periodically check for
> >>lock contention and need_resched() to either abort async compaction, or to
> >>free the lock, schedule and retake the lock. When aborting, cc->contended is
> >>set to signal the contended state to the caller. Two problems have been
> >>identified in this mechanism.
> >>
> >>First, compaction also calls directly cond_resched() in both scanners when no
> >>lock is yet taken. This call either does not abort async compaction, or set
> >>cc->contended appropriately. This patch introduces a new
> >>compact_check_resched() function to achieve both.
> >>
> >>Second, isolate_freepages() does not check if isolate_freepages_block()
> >>aborted due to contention, and advances to the next pageblock. This violates
> >>the principle of aborting on contention, and might result in pageblocks not
> >>being scanned completely, since the scanning cursor is advanced. This patch
> >>makes isolate_freepages_block() check the cc->contended flag and abort.
> >>
> >>Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> >>Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >>Cc: Minchan Kim <minchan@kernel.org>
> >>Cc: Mel Gorman <mgorman@suse.de>
> >>Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> >>Cc: Michal Nazarewicz <mina86@mina86.com>
> >>Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> >>Cc: Christoph Lameter <cl@linux.com>
> >>Cc: Rik van Riel <riel@redhat.com>
> >>---
> >>  mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
> >>  1 file changed, 33 insertions(+), 7 deletions(-)
> >>
> >>diff --git a/mm/compaction.c b/mm/compaction.c
> >>index 83ca6f9..b34ab7c 100644
> >>--- a/mm/compaction.c
> >>+++ b/mm/compaction.c
> >>@@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
> >>  	return true;
> >>  }
> >>
> >>+/*
> >>+ * Similar to compact_checklock_irqsave() (see its comment) for places where
> >>+ * a zone lock is not concerned.
> >>+ *
> >>+ * Returns false when compaction should abort.
> >>+ */
> >>+static inline bool compact_check_resched(struct compact_control *cc)
> >>+{
> >>+	/* async compaction aborts if contended */
> >>+	if (need_resched()) {
> >>+		if (cc->mode == MIGRATE_ASYNC) {
> >>+			cc->contended = true;
> >>+			return false;
> >>+		}
> >>+
> >>+		cond_resched();
> >>+	}
> >>+
> >>+	return true;
> >>+}
> >>+
> >>  /* Returns true if the page is within a block suitable for migration to */
> >>  static bool suitable_migration_target(struct page *page)
> >>  {
> >>@@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> >>  			return 0;
> >>  	}
> >>
> >>-	if (cond_resched()) {
> >>-		/* Async terminates prematurely on need_resched() */
> >>-		if (cc->mode == MIGRATE_ASYNC)
> >>-			return 0;
> >>-	}
> >>+	if (!compact_check_resched(cc))
> >>+		return 0;
> >>
> >>  	/* Time to isolate some pages for migration */
> >>  	for (; low_pfn < end_pfn; low_pfn++) {
> >>@@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
> >>  		/*
> >>  		 * This can iterate a massively long zone without finding any
> >>  		 * suitable migration targets, so periodically check if we need
> >>-		 * to schedule.
> >>+		 * to schedule, or even abort async compaction.
> >>  		 */
> >>-		cond_resched();
> >>+		if (!compact_check_resched(cc))
> >>+			break;
> >>
> >>  		if (!pfn_valid(block_start_pfn))
> >>  			continue;
> >>@@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
> >>  		 */
> >>  		if (isolated)
> >>  			cc->finished_update_free = true;
> >>+
> >>+		/*
> >>+		 * isolate_freepages_block() might have aborted due to async
> >>+		 * compaction being contended
> >>+		 */
> >>+		if (cc->contended)
> >>+			break;
> >>  	}
> >
> >Hello,
> >
> >I think that we can go further.
> >
> >The problem is that this cc->contended is checked only in
> >isolate_migratepages() to break out of the compaction. So if there are
> >free pages we have already taken, compaction wouldn't be stopped
> >immediately and isolate_freepages() could be invoked again on the next
> >compaction_alloc(). If there is no contention at this time, we would try
> >to get free pages from one pageblock, because the cc->contended check is
> >at the bottom of the loop in isolate_migratepages(), and compaction will
> >continue to run. AFAIK, we want to stop the compaction in this case.
> >
> >Moreover, if this isolate_freepages() doesn't stop the compaction, the
> >next isolate_migratepages() will be invoked and it would only be stopped
> >by the cc->contended check after isolating some pages for migration.
> >This is useless overhead and should be removed.
> 
> Good catch again, thanks! So that means checking the flag also in
> compaction_alloc(). But what should we do if we managed to isolate
> something and then found out we are contended? Put all the pages back
> and go home, or try to migrate what we have?

I think that 'try to migrate what we have' is better, because it
doesn't cause any more contention on the zone lock until the freepages
are exhausted. If there is contention on something else, such as a page
lock, migration will just skip that page, so continuing would not be a
problem, I think.

> 
> I'm becoming worried that all these changes will mean that async
> compaction will have a near-zero probability of finishing anything
> before hitting contention. And then everything it did up to the
> contention would be wasted work.

Yes, but I think this logic would not lower the success rate much
compared to the current logic, because, without this change, compaction
stops after the next isolate_migratepages() anyway.
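
To make compaction really stop right after that migration, I think it
would be enough to also bail out of compact_finished() when contended,
e.g. with something like this (untested) check at the top of that
function, next to the existing fatal_signal_pending() test:

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_PARTIAL;

Then the scanners are not invoked again and the contended run is simply
reported as COMPACT_PARTIAL.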

Thanks.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
@ 2014-05-16  9:47             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-16  9:47 UTC (permalink / raw)
  To: Joonsoo Kim, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

Compaction uses the compact_checklock_irqsave() function to periodically check
for lock contention and need_resched() to either abort async compaction, or to
free the lock, schedule and retake the lock. When aborting, cc->contended is
set to signal the contended state to the caller. Two problems have been
identified in this mechanism.

First, compaction also directly calls cond_resched() in both scanners when no
lock is yet taken. This call either does not abort async compaction, or does
not set cc->contended appropriately. This patch introduces a new
compact_should_abort() function to achieve both. In isolate_freepages(), the
check frequency is reduced to once per SWAP_CLUSTER_MAX pageblocks to match
what the migration scanner does in its preliminary page checks. In case a
pageblock is found suitable for calling isolate_freepages_block(), the checks
within that function are done at a higher frequency.

Second, isolate_freepages() does not check whether isolate_freepages_block()
aborted due to contention, and advances to the next pageblock. This violates
the principle of aborting on contention, and might result in pageblocks not
being scanned completely, since the scanning cursor is advanced. This patch
makes isolate_freepages() check the cc->contended flag and abort.

In case isolate_freepages() has already isolated some pages before aborting
due to contention, page migration will proceed, which is OK since we do not
want to waste the work that has been done, and page migration has its own
checks for contention. However, we do not want another isolation attempt by
either of the scanners, so a cc->contended flag check is added also to
compaction_alloc() and compact_finished() to make sure compaction is aborted
right after the migration.

Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
v2: update struct compact_control comment (per Naoya Horiguchi)
    rename to compact_should_abort() and add comments (per David Rientjes)
    add cc->contended checks in compaction_alloc() and compact_finished()
    (per Joonsoo Kim)
    reduce frequency of checks in isolate_freepages() 

 mm/compaction.c | 54 ++++++++++++++++++++++++++++++++++++++++++++----------
 mm/internal.h   |  5 ++++-
 2 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 83ca6f9..6fc9f18 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -222,6 +222,30 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 	return true;
 }
 
+/*
+ * Aside from avoiding lock contention, compaction also periodically checks
+ * need_resched() and either schedules in sync compaction, or aborts async
+ * compaction. This is similar to compact_checklock_irqsave() does, but used
+ * where no lock is concerned.
+ *
+ * Returns false when no scheduling was needed, or sync compaction scheduled.
+ * Returns true when async compaction should abort.
+ */
+static inline bool compact_should_abort(struct compact_control *cc)
+{
+	/* async compaction aborts if contended */
+	if (need_resched()) {
+		if (cc->mode == MIGRATE_ASYNC) {
+			cc->contended = true;
+			return false;
+		}
+
+		cond_resched();
+	}
+
+	return true;
+}
+
 /* Returns true if the page is within a block suitable for migration to */
 static bool suitable_migration_target(struct page *page)
 {
@@ -491,11 +515,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			return 0;
 	}
 
-	if (cond_resched()) {
-		/* Async terminates prematurely on need_resched() */
-		if (cc->mode == MIGRATE_ASYNC)
-			return 0;
-	}
+	if (compact_should_abort(cc))
+		return 0;
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
@@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
 		/*
 		 * This can iterate a massively long zone without finding any
 		 * suitable migration targets, so periodically check if we need
-		 * to schedule.
+		 * to schedule, or even abort async compaction.
 		 */
-		cond_resched();
+		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+						&& compact_should_abort(cc))
+			break;
 
 		if (!pfn_valid(block_start_pfn))
 			continue;
@@ -758,6 +781,13 @@ static void isolate_freepages(struct zone *zone,
 		 */
 		if (isolated)
 			cc->finished_update_free = true;
+
+		/*
+		 * isolate_freepages_block() might have aborted due to async
+		 * compaction being contended
+		 */
+		if (cc->contended)
+			break;
 	}
 
 	/* split_free_page does not map the pages */
@@ -785,9 +815,13 @@ static struct page *compaction_alloc(struct page *migratepage,
 	struct compact_control *cc = (struct compact_control *)data;
 	struct page *freepage;
 
-	/* Isolate free pages if necessary */
+	/*
+	 * Isolate free pages if necessary, and if we are not aborting due to
+	 * contention.
+	 */
 	if (list_empty(&cc->freepages)) {
-		isolate_freepages(cc->zone, cc);
+		if (!cc->contended)
+			isolate_freepages(cc->zone, cc);
 
 		if (list_empty(&cc->freepages))
 			return NULL;
@@ -857,7 +891,7 @@ static int compact_finished(struct zone *zone,
 	unsigned int order;
 	unsigned long watermark;
 
-	if (fatal_signal_pending(current))
+	if (cc->contended || fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
 
 	/* Compaction run completes if the migrate and free scanner meet */
diff --git a/mm/internal.h b/mm/internal.h
index a25424a..ad844ab 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -144,7 +144,10 @@ struct compact_control {
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
-	bool contended;			/* True if a lock was contended */
+	bool contended;			/* True if a lock was contended, or
+					 * need_resched() true during async
+					 * compaction
+					 */
 };
 
 unsigned long
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-16 17:33               ` Michal Nazarewicz
  -1 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-16 17:33 UTC (permalink / raw)
  To: Vlastimil Babka, Joonsoo Kim, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Christoph Lameter, Rik van Riel

On Fri, May 16 2014, Vlastimil Babka wrote:
> Compaction uses the compact_checklock_irqsave() function to periodically check
> for lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
>
> First, compaction also directly calls cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or does
> not set cc->contended appropriately. This patch introduces a new
> compact_should_abort() function to achieve both. In isolate_freepages(), the
> check frequency is reduced to once per SWAP_CLUSTER_MAX pageblocks to match
> what the migration scanner does in its preliminary page checks. In case a
> pageblock is found suitable for calling isolate_freepages_block(), the checks
> within that function are done at a higher frequency.
>
> Second, isolate_freepages() does not check whether isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages() check the cc->contended flag and abort.
>
> In case isolate_freepages() has already isolated some pages before aborting
> due to contention, page migration will proceed, which is OK since we do not
> want to waste the work that has been done, and page migration has its own
> checks for contention. However, we do not want another isolation attempt by
> either of the scanners, so a cc->contended flag check is added also to
> compaction_alloc() and compact_finished() to make sure compaction is aborted
> right after the migration.
>
> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
> v2: update struct compact_control comment (per Naoya Horiguchi)
>     rename to compact_should_abort() and add comments (per David Rientjes)
>     add cc->contended checks in compaction_alloc() and compact_finished()
>     (per Joonsoo Kim)
>     reduce frequency of checks in isolate_freepages() 
>
-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-19 10:14             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-19 10:14 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

I wonder why nobody complained about the build warning... sorry.

----8<----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 19 May 2014 12:02:38 +0200
Subject: mm-compaction-avoid-rescanning-pageblocks-in-isolate_freepages-fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a (spurious) build warning:

mm/compaction.c:860:15: warning: ‘next_free_pfn’ may be used uninitialized in this function [-Wmaybe-uninitialized]

Seems like the compiler cannot prove that exiting the for loop without updating
next_free_pfn there will mean that the check for crossing the scanners will
trigger. So let's not confuse people who try to see why this warning occurs.

Instead of initializing next_free_pfn to zero with an explaining comment, just
drop the damned variable altogether and work with cc->free_pfn directly as
Nayoa originally suggested.

Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/compaction.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 8db9820..b0f939b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -760,7 +760,6 @@ static void isolate_freepages(struct zone *zone,
 	unsigned long block_start_pfn;	/* start of current pageblock */
 	unsigned long block_end_pfn;	/* end of current pageblock */
 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
-	unsigned long next_free_pfn; /* start pfn for scaning at next round */
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -822,7 +821,7 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Found a block suitable for isolating free pages from */
-		next_free_pfn = block_start_pfn;
+		cc->free_pfn = block_start_pfn;
 		isolated = isolate_freepages_block(cc, block_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
@@ -852,9 +851,8 @@ static void isolate_freepages(struct zone *zone,
 	 * so that compact_finished() may detect this
 	 */
 	if (block_start_pfn < low_pfn)
-		next_free_pfn = cc->migrate_pfn;
+		cc->free_pfn = cc->migrate_pfn;
 
-	cc->free_pfn = next_free_pfn;
 	cc->nr_freepages = nr_freepages;
 }
 
-- 
1.8.4.5



^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-19 23:37               ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-19 23:37 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On Fri, 16 May 2014 11:47:53 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:

> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls directly cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or set
> cc->contended appropriately. This patch introduces a new compact_should_abort()
> function to achieve both. In isolate_freepages(), the check frequency is
> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
> scanner does in the preliminary page checks. In case a pageblock is found
> suitable for calling isolate_freepages_block(), the checks within there are
> done on higher frequency.
> 
> Second, isolate_freepages() does not check if isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages_block() check the cc->contended flag and abort.
> 
> In case isolate_freepages() has already isolated some pages before aborting
> due to contention, page migration will proceed, which is OK since we do not
> want to waste the work that has been done, and page migration has own checks
> for contention. However, we do not want another isolation attempt by either
> of the scanners, so cc->contended flag check is added also to
> compaction_alloc() and compact_finished() to make sure compaction is aborted
> right after the migration.

What are the runtime effects of this change?

> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

What did Joonsoo report?  Perhaps this is the same thing..

>
> ...
>
> @@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
>  		/*
>  		 * This can iterate a massively long zone without finding any
>  		 * suitable migration targets, so periodically check if we need
> -		 * to schedule.
> +		 * to schedule, or even abort async compaction.
>  		 */
> -		cond_resched();
> +		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
> +						&& compact_should_abort(cc))

This seems rather gratuitously inefficient and isn't terribly clear. 
What's wrong with

	if ((++foo % SWAP_CLUSTER_MAX) == 0 && compact_should_abort(cc))

?

(Assumes that SWAP_CLUSTER_MAX is power-of-2 and that the compiler will
use &)


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-19 23:37               ` Andrew Morton
@ 2014-05-21 14:13                 ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-21 14:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On 05/20/2014 01:37 AM, Andrew Morton wrote:
> On Fri, 16 May 2014 11:47:53 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
> 
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls directly cond_resched() in both scanners when no
>> lock is yet taken. This call either does not abort async compaction, or set
>> cc->contended appropriately. This patch introduces a new compact_should_abort()
>> function to achieve both. In isolate_freepages(), the check frequency is
>> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
>> scanner does in the preliminary page checks. In case a pageblock is found
>> suitable for calling isolate_freepages_block(), the checks within there are
>> done on higher frequency.
>>
>> Second, isolate_freepages() does not check if isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages_block() check the cc->contended flag and abort.
>>
>> In case isolate_freepages() has already isolated some pages before aborting
>> due to contention, page migration will proceed, which is OK since we do not
>> want to waste the work that has been done, and page migration has own checks
>> for contention. However, we do not want another isolation attempt by either
>> of the scanners, so cc->contended flag check is added also to
>> compaction_alloc() and compact_finished() to make sure compaction is aborted
>> right after the migration.
> 
> What are the runtime effects of this change?
> 
>> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 
> What did Joonsoo report?  Perhaps this is the same thing..

Updated log message:

Compaction uses compact_checklock_irqsave() function to periodically check for
lock contention and need_resched() to either abort async compaction, or to
free the lock, schedule and retake the lock. When aborting, cc->contended is
set to signal the contended state to the caller. Two problems have been
identified in this mechanism.

First, compaction also directly calls cond_resched() in both scanners when no
lock is yet taken. This call neither aborts async compaction nor sets
cc->contended appropriately. This patch introduces a new compact_should_abort()
function to achieve both. In isolate_freepages(), the check frequency is
reduced to once per SWAP_CLUSTER_MAX pageblocks to match what the migration
scanner does in the preliminary page checks. In case a pageblock is found
suitable for calling isolate_freepages_block(), the checks within it are
done at a higher frequency.

Second, isolate_freepages() does not check if isolate_freepages_block()
aborted due to contention, and advances to the next pageblock. This violates
the principle of aborting on contention, and might result in pageblocks not
being scanned completely, since the scanning cursor is advanced. This problem
has been noticed in the code by Joonsoo Kim when reviewing related patches.
This patch makes isolate_freepages_block() check the cc->contended flag and
abort.

In case isolate_freepages() has already isolated some pages before aborting
due to contention, page migration will proceed, which is OK since we do not
want to waste the work that has been done, and page migration has its own
checks for contention. However, we do not want another isolation attempt by
either of the scanners, so a cc->contended flag check is also added to
compaction_alloc() and compact_finished() to make sure compaction is aborted
right after the migration.

The outcome of the patch should be reduced lock contention by async compaction
and lower latencies for higher-order allocations where direct compaction is
involved.
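
To illustrate, the new helper can look roughly like this (a simplified sketch
only; the exact form in the patch, e.g. how async mode is detected and how
cc->contended is represented, may differ):

static inline bool compact_should_abort(struct compact_control *cc)
{
	/*
	 * If a reschedule is needed, async compaction aborts and records
	 * the contended state for the caller; sync compaction just yields.
	 */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}

		cond_resched();
	}

	return false;
}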

>>
>> ...
>>
>> @@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
>>   		/*
>>   		 * This can iterate a massively long zone without finding any
>>   		 * suitable migration targets, so periodically check if we need
>> -		 * to schedule.
>> +		 * to schedule, or even abort async compaction.
>>   		 */
>> -		cond_resched();
>> +		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
>> +						&& compact_should_abort(cc))
> 
> This seems rather gratuitously inefficient and isn't terribly clear.
> What's wrong with
> 
> 	if ((++foo % SWAP_CLUSTER_MAX) == 0 && compact_should_abort(cc))

It's a new variable and it differs from how isolate_migratepages_range() does this.
But yeah, I might change it later there as well. There it makes even more sense.
E.g. when skipping a whole pageblock there, pfn % SWAP_CLUSTER_MAX will always be
zero, so the periodicity varies.
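(Worked example, assuming 2MB pageblocks with 4KB base pages: pageblock_nr_pages
is 512 and SWAP_CLUSTER_MAX is 32, and 512 is a multiple of 32, so a pfn advanced
in whole-pageblock steps always satisfies pfn % SWAP_CLUSTER_MAX == 0.)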
 
> ?
> 
> (Assumes that SWAP_CLUSTER_MAX is power-of-2 and that the compiler will
> use &)
 
I hoped that the compiler would be smart enough about SWAP_CLUSTER_MAX * pageblock_nr_pages
as well, as those are constants and also powers of two. But I didn't check the assembly.

-----8<-----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 21 May 2014 16:07:21 +0200
Subject: [PATCH] 
 mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix

Use a separate counter variable (not pfn) to trigger abort checks.

Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/compaction.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index bbe6a26..23c7439 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -779,6 +779,7 @@ static void isolate_freepages(struct zone *zone,
 	unsigned long block_start_pfn;	/* start of current pageblock */
 	unsigned long block_end_pfn;	/* end of current pageblock */
 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
+	unsigned long nr_blocks_scanned = 0; /* for periodical abort checks */
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -813,7 +814,7 @@ static void isolate_freepages(struct zone *zone,
 		 * suitable migration targets, so periodically check if we need
 		 * to schedule, or even abort async compaction.
 		 */
-		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+		if ((++nr_blocks_scanned % SWAP_CLUSTER_MAX) == 0
 						&& compact_should_abort(cc))
 			break;
 
-- 
1.8.4.5




^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-21 14:13                 ` Vlastimil Babka
@ 2014-05-21 20:11                   ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-21 20:11 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On Wed, 21 May 2014 16:13:39 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:

> >>
> >> ...
> >>
> >> @@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
> >>   		/*
> >>   		 * This can iterate a massively long zone without finding any
> >>   		 * suitable migration targets, so periodically check if we need
> >> -		 * to schedule.
> >> +		 * to schedule, or even abort async compaction.
> >>   		 */
> >> -		cond_resched();
> >> +		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
> >> +						&& compact_should_abort(cc))
> > 
> > This seems rather gratuitously inefficient and isn't terribly clear.
> > What's wrong with
> > 
> > 	if ((++foo % SWAP_CLUSTER_MAX) == 0 && compact_should_abort(cc))
> 
> It's a new variable and it differs from how isolate_migratepages_range() does this.
> But yeah, I might change it later there as well. There it makes even more sense.
> E.g. when skipping whole pageblock there, pfn % SWAP_CLUSTER_MAX will be always zero
> so the periodicity varies.
>  
> > ?
> > 
> > (Assumes that SWAP_CLUSTER_MAX is power-of-2 and that the compiler will
> > use &)
>  
> I hoped that compiler would be smart enough about SWAP_CLUSTER_MAX * pageblock_nr_pages
> as well, as those are constants and also power-of-2. But I didn't check the assembly.

Always check the assembly!  Just a quick `size mm/compaction.o' is
enough to tell if you're on the right track.

> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -779,6 +779,7 @@ static void isolate_freepages(struct zone *zone,
>  	unsigned long block_start_pfn;	/* start of current pageblock */
>  	unsigned long block_end_pfn;	/* end of current pageblock */
>  	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
> +	unsigned long nr_blocks_scanned = 0; /* for periodical abort checks */
>  	int nr_freepages = cc->nr_freepages;
>  	struct list_head *freelist = &cc->freepages;
>  
> @@ -813,7 +814,7 @@ static void isolate_freepages(struct zone *zone,
>  		 * suitable migration targets, so periodically check if we need
>  		 * to schedule, or even abort async compaction.
>  		 */
> -		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
> +		if ((++nr_blocks_scanned % SWAP_CLUSTER_MAX) == 0
>  						&& compact_should_abort(cc))
>  			break;

This change actually makes the code worse, and the .o file gets larger.

For some stupid reason we went and made pageblock_nr_pages all lower
case, but surprise surprise, it's actually a literal constant.  So the
compiler does the multiplication at compile time and converts the
modulus operation into a bitwise AND.  Duh.
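
In other words (illustrative only, not from any patch), because both factors
are compile-time power-of-two constants,

	if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))

should compile to the same thing as

	if (!(block_start_pfn & (SWAP_CLUSTER_MAX * pageblock_nr_pages - 1)))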


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-13 10:00           ` Vlastimil Babka
@ 2014-05-22  2:49             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  2:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 13 May 2014, Vlastimil Babka wrote:

> I wonder what about a process doing e.g. mmap() with MAP_POPULATE. It seems to
> me that it would get only MIGRATE_ASYNC here, right? Since gfp_mask would
> include __GFP_NO_KSWAPD and it won't have PF_KTHREAD.
> I think that goes against the idea that with MAP_POPULATE you say you are
> willing to wait to have everything in place before you actually use the
> memory. So I guess you are also willing to wait for hugepages in that
> situation?
> 

I don't understand the distinction you're making between MAP_POPULATE and 
simply a prefault of the anon memory.  What is the difference in semantics 
between using MAP_POPULATE and touching a byte every page size along the 
range?  In the latter, you'd be faulting thp with MIGRATE_ASYNC, so I 
don't understand how MAP_POPULATE is any different or implies any 
preference for hugepages.
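
To make the comparison concrete (an illustrative userspace snippet, not from
the thread; needs <sys/mman.h> and <unistd.h>, error handling omitted, and
len is assumed to be some size_t length):

	long psz = sysconf(_SC_PAGESIZE);

	/* prefault the whole range at mmap() time */
	char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	/* versus mapping normally and touching a byte in every page */
	char *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	for (size_t off = 0; off < len; off += psz)
		b[off] = 0;

In both cases the whole range is faulted in; the difference is only whether
that happens inside mmap() or on first touch.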

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-19 10:14             ` Vlastimil Babka
@ 2014-05-22  2:51               ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  2:51 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 814 bytes --]

On Mon, 19 May 2014, Vlastimil Babka wrote:

> Fix a (spurious) build warning:
> 
> mm/compaction.c:860:15: warning: ‘next_free_pfn’ may be used uninitialized in this function [-Wmaybe-uninitialized]
> 
> Seems like the compiler cannot prove that exiting the for loop without updating
> next_free_pfn there will mean that the check for crossing the scanners will
> trigger. So let's not confuse people who try to see why this warning occurs.
> 
> Instead of initializing next_free_pfn to zero with an explaining comment, just
> drop the damned variable altogether and work with cc->free_pfn directly as
> Nayoa originally suggested.
> 

s/Nayoa/Naoya/

> Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* compaction is still too expensive for thp (was: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention)
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-22  3:20               ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  3:20 UTC (permalink / raw)
  To: Vlastimil Babka, Mel Gorman, Andrew Morton
  Cc: Joonsoo Kim, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel

On Fri, 16 May 2014, Vlastimil Babka wrote:

> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls directly cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or set
> cc->contended appropriately. This patch introduces a new compact_should_abort()
> function to achieve both. In isolate_freepages(), the check frequency is
> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
> scanner does in the preliminary page checks. In case a pageblock is found
> suitable for calling isolate_freepages_block(), the checks within there are
> done on higher frequency.
> 
> Second, isolate_freepages() does not check if isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages_block() check the cc->contended flag and abort.
> 
> In case isolate_freepages() has already isolated some pages before aborting
> due to contention, page migration will proceed, which is OK since we do not
> want to waste the work that has been done, and page migration has own checks
> for contention. However, we do not want another isolation attempt by either
> of the scanners, so cc->contended flag check is added also to
> compaction_alloc() and compact_finished() to make sure compaction is aborted
> right after the migration.
> 

We have a pretty significant problem with async compaction related to thp 
faults and it's not limited to this patch but was intended to be addressed 
in my series as well.  Since this is the latest patch to be proposed for 
aborting async compaction when it's too expensive, it's probably a good 
idea to discuss it here.

With -mm, it turns out that while egregious thp fault latencies were 
reduced, faulting 64MB of memory backed by thp on a fragmented 128GB 
machine can result in latencies of 1-3s for the entire 64MB.  Collecting 
compaction stats from older kernels that give more insight into 
regressions, one such incident is as follows.

Baseline:
compact_blocks_moved 8181986
compact_pages_moved 6549560
compact_pagemigrate_failed 1612070
compact_stall 101959
compact_fail 100895
compact_success 1064

5s later:
compact_blocks_moved 8182447
compact_pages_moved 6550286
compact_pagemigrate_failed 1612092
compact_stall 102023
compact_fail 100959
compact_success 1064

This represents faulting two 64MB ranges of anonymous memory.  As you can 
see, it results in falling back to 4KB pages because all 64 faults of 
hugepages end up triggering compaction and failing to allocate.  Over the 
64 async compactions, we scan on average 7.2 pageblocks per call, 
successfully migrate 11.3 pages per call, and fail migrating 0.34 pages 
per call.

If each async compaction scans 7.2 pageblocks per call, it would have to 
be called 9103 times to scan all memory on this 128GB machine.  We're 
simply not scanning enough memory as a result of ISOLATE_ABORT due to 
need_resched().
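
(For reference, assuming 2MB pageblocks on x86_64:
	128GB / 2MB per pageblock = 65536 pageblocks
	65536 pageblocks / 7.2 pageblocks per call ≈ 9103 calls, rounding up
which is where the figure above comes from.)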

So the net result is that -mm is much better than Linus's tree, where such 
faulting of 64MB ranges could stall 8-9s, but we're still very expensive.  
We may need to consider scanning more memory on a single call to async 
compaction even when need_resched(), and, if we are unsuccessful in 
allocating a hugepage, deferring async compaction in subsequent calls up to 
1 << COMPACT_MAX_DEFER_SHIFT.  Today, we defer on sync compaction but that 
is now never done for thp faults since it is reliant solely on async 
compaction.
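
For reference, the existing deferral mechanism works roughly as follows (a
simplified sketch of the defer_compaction()/compaction_deferred() pair; the
exact code differs between kernel versions):

/* called after a failed compaction: back off exponentially, up to the cap */
void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

/* returns true if compaction should be skipped this time around */
bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (order < zone->compact_order_failed)
		return false;

	/* avoid possible overflow */
	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	return zone->compact_considered < defer_limit;
}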

I have a few improvements in mind, but thought it would be better to 
get feedback on it first because it's a substantial rewrite of the 
pageblock migration:

 - For all async compaction, avoid migrating memory unless enough 
   contiguous memory is migrated to allow a cc->order allocation.  This
   would remove the COMPACT_CLUSTER_MAX restriction on pageblock
   compaction and keep pages on the cc->migratepages list between
   calls to isolate_migratepages_range().

   When an unmigratable page is encountered or memory hole is found,
   put all pages on cc->migratepages back on the lru lists unless
   cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
   enough contiguous memory has been isolated.

 - Remove the need_resched() checks entirely from compaction and
   consider only doing a set amount of scanning for each call, such
   as 1GB per call.

   If there is contention on zone->lru_lock, then we can still abort
   to avoid excessive stalls, but need_resched() is a poor heuristic
   to determine when async compaction is taking too long.

   The expense of calling async compaction if this is done is easily
   quantified since we're not migrating any memory unless it is
   sufficient for the page allocation: it would simply be the iteration
   over 1GB of memory and avoiding contention on zone->lru_lock.

We may also need to consider deferring async compaction for subsequent 
faults in the near future even though scanning the previous 1GB does not 
decrease or have any impact whatsoever on the success of defragmenting the 
next 1GB.

Any other suggestions that may be helpful?  

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
  2014-05-22  3:20               ` David Rientjes
@ 2014-05-22  8:10                 ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22  8:10 UTC (permalink / raw)
  To: David Rientjes, Mel Gorman, Andrew Morton
  Cc: Joonsoo Kim, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel

On 05/22/2014 05:20 AM, David Rientjes wrote:
> On Fri, 16 May 2014, Vlastimil Babka wrote:
>
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls directly cond_resched() in both scanners when no
>> lock is yet taken. This call either does not abort async compaction, or set
>> cc->contended appropriately. This patch introduces a new compact_should_abort()
>> function to achieve both. In isolate_freepages(), the check frequency is
>> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
>> scanner does in the preliminary page checks. In case a pageblock is found
>> suitable for calling isolate_freepages_block(), the checks within there are
>> done on higher frequency.
>>
>> Second, isolate_freepages() does not check if isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages_block() check the cc->contended flag and abort.
>>
>> In case isolate_freepages() has already isolated some pages before aborting
>> due to contention, page migration will proceed, which is OK since we do not
>> want to waste the work that has been done, and page migration has own checks
>> for contention. However, we do not want another isolation attempt by either
>> of the scanners, so cc->contended flag check is added also to
>> compaction_alloc() and compact_finished() to make sure compaction is aborted
>> right after the migration.
>>
>
> We have a pretty significant problem with async compaction related to thp
> faults and it's not limited to this patch but was intended to be addressed
> in my series as well.  Since this is the latest patch to be proposed for
> aborting async compaction when it's too expensive, it's probably a good
> idea to discuss it here.

I already tried to call for some higher-level discussion earlier in your 
series, so it's good to hear that you now also agree it might be useful :)

> With -mm, it turns out that while egregious thp fault latencies were
> reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
> machine can result in latencies of 1-3s for the entire 64MB.  Collecting
> compaction stats from older kernels that give more insight into
> regressions, one such incident is as follows.
>
> Baseline:
> compact_blocks_moved 8181986
> compact_pages_moved 6549560
> compact_pagemigrate_failed 1612070
> compact_stall 101959
> compact_fail 100895
> compact_success 1064
>
> 5s later:
> compact_blocks_moved 8182447
> compact_pages_moved 6550286
> compact_pagemigrate_failed 1612092
> compact_stall 102023
> compact_fail 100959
> compact_success 1064
>
> This represents faulting two 64MB ranges of anonymous memory.  As you can
> see, it results in falling back to 4KB pages because all 64 faults of
> hugepages ends up triggering compaction and failing to allocate.  Over the
> 64 async compactions, we scan on average 7.2 pageblocks per call,
> successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
> per call.
>
> If each async compaction scans 7.2 pageblocks per call, it would have to
> be called 9103 times to scan all memory on this 128GB machine.  We're
> simply not scanning enough memory as a result of ISOLATE_ABORT due to
> need_resched().

Well, the two objectives of not being expensive and at the same time 
scanning "enough memory" (which is hard to define as well) are clearly 
quite opposite :/

> So the net result is that -mm is much better than Linus's tree, where such
> faulting of 64MB ranges could stall 8-9s, but we're still very expensive.

So I guess the difference here is mainly thanks to not doing sync 
compaction? Or do you have any insight into which patch helped the most?

> We may need to consider scanning more memory on a single call to async
> compaction even when need_resched() and if we are unsuccessful in
> allocating a hugepage to defer async compaction in subsequent calls up to
> 1 << COMPACT_MAX_DEFER_SHIFT.  Today, we defer on sync compaction but that
> is now never done for thp faults since it is reliant solely on async
> compaction.

So if I understand correctly, your intention is to scan more in a single 
scan, but balance the increased latencies by introducing deferring for 
async compaction.

Offhand I can think of two issues with that.

1) the result will be that often the latency will be low thanks to 
defer, but then there will be a huge (?) spike by scanning whole 1GB (as 
you suggest further in the mail) at once. I think that's similar to what 
you had now with the sync compaction?

2) 1GB could have a good chance of being successful (?) so there would 
be no defer anyway.

I have some other suggestion at the end of my mail.

> I have a few improvements in mind, but thought it would be better to
> get feedback on it first because it's a substantial rewrite of the
> pageblock migration:
>
>   - For all async compaction, avoid migrating memory unless enough
>     contiguous memory is migrated to allow a cc->order allocation.

Yes I suggested something like this earlier. Also in the scanner, skip 
to the next cc->order aligned block as soon as any page fails the 
isolation and is not PageBuddy.
I would just distinguish kswapd and direct compaction, not "all async 
compaction". Or maybe kswapd could be switched to sync compaction.

>     This
>     would remove the COMPACT_CLUSTER_MAX restriction on pageblock
>     compaction

Yes.

>     and keep pages on the cc->migratepages list between
>     calls to isolate_migratepages_range().

This might not be needed. It's called within a single pageblock (except 
maybe CMA but that's quite a different thing) and I think we can ignore 
order > pageblock_nr_order here.

>     When an unmigratable page is encountered or memory hole is found,
>     put all pages on cc->migratepages back on the lru lists unless
>     cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
>     enough contiguous memory has been isolated.
>
>   - Remove the need_resched() checks entirely from compaction and
>     consider only doing a set amount of scanning for each call, such
>     as 1GB per call.
>
>     If there is contention on zone->lru_lock, then we can still abort
>     to avoid excessive stalls, but need_resched() is a poor heuristic
>     to determine when async compaction is taking too long.

I tend to agree. I've also realized that because need_resched() leads to 
cc->contended = true, the direct reclaim and second compaction that would 
normally follow (used to be sync, now only in khugepaged) are skipped. 
need_resched() indeed seems unsuitable for this.
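
For completeness, the check from the patch above boils down to roughly 
this (a sketch assuming the cc->mode field from earlier in the series and 
the bool cc->contended; the real function also has to cope with the 
lock-already-held cases):

static inline bool compact_should_abort(struct compact_control *cc)
{
	/*
	 * Rough sketch: on need_resched(), async compaction aborts and
	 * records the contended state for the caller, while sync
	 * compaction just reschedules and carries on.
	 */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}
		cond_resched();
	}

	return false;
}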

>     The expense of calling async compaction if this is done is easily
>     quantified since we're not migrating any memory unless it is
>     sufficient for the page allocation: it would simply be the iteration
>     over 1GB of memory and avoiding contention on zone->lru_lock.
>
> We may also need to consider deferring async compaction for subsequent
> faults in the near future even though scanning the previous 1GB does not
> decrease or have any impact whatsoever in the success of defragmenting the
> next 1GB.
>
> Any other suggestions that may be helpful?

I suggested already the idea of turning the deferred compaction into a 
live-tuned amount of how much to scan, instead of doing a whole zone 
(before) or an arbitrary amount like 1GB, with the hope of reducing the 
latency spikes. But I realize this would be tricky.

Even less concretely, it might be worth revisiting the rules we use for 
deciding whether compaction is worth trying at all, and the decisions 
whether to continue or whether we believe the allocation will succeed. 
They rely heavily on watermark checks that also consider the order, and I 
found them somewhat fragile. For example, in the alloc slowpath -> direct 
compaction path, a check in compaction concludes that the allocation 
should succeed and finishes the compaction, and a few moments later the 
same check in the allocation will conclude that everything is fine up to 
order 8, but there probably isn't a page of order 9. In between, 
nr_free_pages has *increased*. I suspect it's because the watermark 
checking is racy and the counters drift, but I'm not sure yet.
However, this particular problem should be gone when I finish my series 
that would capture the page that compaction has just freed. But still, 
the decisions regarding compaction could be suboptimal.

Vlastimil


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-22  2:49             ` David Rientjes
@ 2014-05-22  8:43               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22  8:43 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On 05/22/2014 04:49 AM, David Rientjes wrote:
> On Tue, 13 May 2014, Vlastimil Babka wrote:
>
>> I wonder what about a process doing e.g. mmap() with MAP_POPULATE. It seems to
>> me that it would get only MIGRATE_ASYNC here, right? Since gfp_mask would
>> include __GFP_NO_KSWAPD and it won't have PF_KTHREAD.
>> I think that goes against the idea that with MAP_POPULATE you say you are
>> willing to wait to have everything in place before you actually use the
>> memory. So I guess you are also willing to wait for hugepages in that
>> situation?
>>
>
> I don't understand the distinction you're making between MAP_POPULATE and
> simply a prefault of the anon memory.  What is the difference in semantics
> between using MAP_POPULATE and touching a byte every page size along the
> range?  In the latter, you'd be faulting thp with MIGRATE_ASYNC, so I
> don't understand how MAP_POPULATE is any different or implies any
> preference for hugepages.

Hm, OK. It's right we cannot distinguish populating by touching the 
pages manually. Nevermind then.



^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
  2014-05-22  8:10                 ` Vlastimil Babka
@ 2014-05-22  8:55                   ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  8:55 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Mel Gorman, Andrew Morton, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On Thu, 22 May 2014, Vlastimil Babka wrote:

> > With -mm, it turns out that while egregious thp fault latencies were
> > reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
> > machine can result in latencies of 1-3s for the entire 64MB.  Collecting
> > compaction stats from older kernels that give more insight into
> > regressions, one such incident is as follows.
> > 
> > Baseline:
> > compact_blocks_moved 8181986
> > compact_pages_moved 6549560
> > compact_pagemigrate_failed 1612070
> > compact_stall 101959
> > compact_fail 100895
> > compact_success 1064
> > 
> > 5s later:
> > compact_blocks_moved 8182447
> > compact_pages_moved 6550286
> > compact_pagemigrate_failed 1612092
> > compact_stall 102023
> > compact_fail 100959
> > compact_success 1064
> > 
> > This represents faulting two 64MB ranges of anonymous memory.  As you can
> > see, it results in falling back to 4KB pages because all 64 faults of
> > hugepages ends up triggering compaction and failing to allocate.  Over the
> > 64 async compactions, we scan on average 7.2 pageblocks per call,
> > successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
> > per call.
> > 
> > If each async compaction scans 7.2 pageblocks per call, it would have to
> > be called 9103 times to scan all memory on this 128GB machine.  We're
> > simply not scanning enough memory as a result of ISOLATE_ABORT due to
> > need_resched().
> 
> Well, the two objectives of not being expensive and at the same time scanning
> "enough memory" (which is hard to define as well) are clearly quite opposite
> :/
> 

Agreed.

> > So the net result is that -mm is much better than Linus's tree, where such
> > faulting of 64MB ranges could stall 8-9s, but we're still very expensive.
> 
> So I guess the difference here is mainly thanks to not doing sync compaction?

Not doing sync compaction for thp and caching the migration pfn for async 
so that it doesn't iterate over a ton of memory that may not be eligible 
for async compaction every time it is called.  But when we avoid sync 
compaction, we also lose deferred compaction.
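
(By caching I mean roughly the following; the per-mode split of the cache 
comes from the -mm patch, the scalar form here is just to illustrate the 
idea.)

	/*
	 * Illustration only: compact_zone() starts the migration scanner
	 * from zone->compact_cached_migrate_pfn instead of the zone start,
	 * and pushes that cache forward as the scanner advances, so the
	 * next async call does not re-walk memory it already rejected.
	 */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
	...
	if (cc->migrate_pfn > zone->compact_cached_migrate_pfn)
		zone->compact_cached_migrate_pfn = cc->migrate_pfn;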

> So if I understand correctly, your intention is to scan more in a single scan,

More will be scanned than the ~7 pageblocks per call to async compaction 
in the data that I presented, but the idea is also to reduce how expensive 
every pageblock scan is by avoiding needlessly migrating memory (and 
dealing with rmap locks) when it will not result in 2MB of contiguous 
memory for thp faults.

> but balance the increased latencies by introducing deferring for async
> compaction.
> 
> Offhand I can think of two issues with that.
> 
> 1) the result will be that often the latency will be low thanks to defer, but
> then there will be a huge (?) spike by scanning whole 1GB (as you suggest
> further in the mail) at once. I think that's similar to what you had now with
> the sync compaction?
> 

Not at all, with MIGRATE_SYNC_LIGHT before there is no termination other 
than an entire scan of memory so we were potentially scanning 128GB and 
failing if thp cannot be allocated.

If we are to avoid migrating memory needlessly that will not result in 
cc->order memory being allocated, then the cost should be relatively 
constant for a span of memory.  My 32GB system can iterate all memory with 
MIGRATE_ASYNC and no need_resched() aborts in ~530ms.

> 2) 1GB could have a good chance of being successful (?) so there would be no
> defer anyway.
> 

If we terminate early because order-9 is allocatable or we end up scanning 
the entire 1GB and the hugepage is allocated, then we have prevented 511 
other pagefaults in my testcase where faulting 64MB of memory with thp 
enabled can currently take 1-3s on a 128GB machine with fragmentation.  I 
think the deferral is unnecessary in such a case.

Are you suggesting we should try without the deferral first?

> > I have a few improvements in mind, but thought it would be better to
> > get feedback on it first because it's a substantial rewrite of the
> > pageblock migration:
> > 
> >   - For all async compaction, avoid migrating memory unless enough
> >     contiguous memory is migrated to allow a cc->order allocation.
> 
> Yes I suggested something like this earlier. Also in the scanner, skip to the
> next cc->order aligned block as soon as any page fails the isolation and is
> not PageBuddy.

Agreed.

> I would just distinguish kswapd and direct compaction, not "all async
> compaction". Or maybe kswapd could be switched to sync compaction.
> 

To generalize this, I'm thinking that it is pointless for async compaction 
to migrate memory in a contiguous span if it will not cause a cc->order 
page allocation to succeed.

> >     This
> >     would remove the COMPACT_CLUSTER_MAX restriction on pageblock
> >     compaction
> 
> Yes.
> 
> >     and keep pages on the cc->migratepages list between
> >     calls to isolate_migratepages_range().
> 
> This might not be needed. It's called within a single pageblock (except maybe
> CMA but that's quite a different thing) and I think we can ignore order >
> pageblock_nr_order here.
> 

Ok, I guess pageblocks within a zone are always pageblock_order aligned 
for all platforms, so if we encounter any memory that is neither 
migratable nor PageBuddy in a block where pageblock_order == 
HPAGE_PMD_ORDER, then we can abort that block immediately for thp faults.

> >     When an unmigratable page is encountered or memory hole is found,
> >     put all pages on cc->migratepages back on the lru lists unless
> >     cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
> >     enough contiguous memory has been isolated.
> > 
> >   - Remove the need_resched() checks entirely from compaction and
> >     consider only doing a set amount of scanning for each call, such
> >     as 1GB per call.
> > 
> >     If there is contention on zone->lru_lock, then we can still abort
> >     to avoid excessive stalls, but need_resched() is a poor heuristic
> >     to determine when async compaction is taking too long.
> 
> I tend to agree. I've also realized that because need_resched() leads to
> cc->contended = true, direct reclaim and second compaction that would normally
> follow (used to be sync, now only in hugepaged) is skipped. need_resched()
> seems to be indeed unsuitable for this.
> 

It's hard to replace with an alternative, though, to determine when enough 
is enough :)  1GB might be a sane starting point, though, and then try 
reclaim and avoid the second call to async compaction on failure.  I'm not 
sure if the deferral would be needed in this case or not.

> >     The expense of calling async compaction if this is done is easily
> >     quantified since we're not migrating any memory unless it is
> >     sufficient for the page allocation: it would simply be the iteration
> >     over 1GB of memory and avoiding contention on zone->lru_lock.
> > 
> > We may also need to consider deferring async compaction for subsequent
> > faults in the near future even though scanning the previous 1GB does not
> > decrease or have any impact whatsoever in the success of defragmenting the
> > next 1GB.
> > 
> > Any other suggestions that may be helpful?
> 
> I suggested already the idea of turning the deferred compaction into a
> live-tuned amount of how much to scan, instead of doing a whole zone (before)
> or an arbitrary amount like 1GB, with the hope of reducing the latency spikes.
> But I realize this would be tricky.
> 

By live-tuned, do you mean something that is tuned by the kernel over time 
depending on how successful compaction is or do you mean something that 
the user would alter?  If we are to go this route, I agree that we can 
allow the user to tune the 1GB.

> Even less concrete, it might be worth revisiting the rules we use for deciding
> if compaction is worth trying, and the decisions if to continue or we believe
> the allocation will succeed.

Ah, tuning compaction_suitable() might be another opportunity.  I'm 
wondering if we should be considering thp specially here since it's in the 
fault path.

> It relies heavily on watermark checks that also
> consider the order, and I found it somewhat fragile. For example, in the alloc
> slowpath -> direct compaction path, a check in compaction concludes that
> allocation should succeed and finishes the compaction, and few moments later
> the same check in the allocation will conclude that everything is fine up to
> order 8, but there probably isn't a page of order 9. In between, the
> nr_free_pages has *increased*. I suspect it's because the watermark checking
> is racy and the counters drift, but I'm not sure yet.

The watermark checks both in compaction_suitable() and 
compact_finished() are indeed racy with respect to the actual page 
allocation.
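
(To make the race concrete, both sides reduce to something like the 
sketch below, executed at different times against vmstat counters that 
keep drifting; the exact thresholds differ between the callers.)

	/* compaction side: believe a cc->order allocation should now succeed */
	watermark = low_wmark_pages(zone) + (1 << cc->order);
	if (zone_watermark_ok(zone, cc->order, watermark, 0, 0))
		return COMPACT_PARTIAL;	/* tell the allocator to retry */

	/*
	 * Allocator side, a moment later: get_page_from_freelist() repeats
	 * essentially the same zone_watermark_ok(zone, order, ...) check
	 * against counters that have meanwhile drifted, so it can still
	 * fail for order-9 even though nr_free_pages has increased.
	 */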

> However, this particular problem should be gone when I finish my series that
> would capture the page that compaction has just freed. But still, the
> decisions regarding compaction could be suboptimal.
> 

This should also avoid the race between COMPACT_PARTIAL, returning to the 
page allocator, and finding that the high-order memory you thought was now 
available has been allocated by someone else.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
  2014-05-22  8:55                   ` David Rientjes
@ 2014-05-22 12:03                     ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22 12:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Andrew Morton, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On 05/22/2014 10:55 AM, David Rientjes wrote:
> On Thu, 22 May 2014, Vlastimil Babka wrote:
>
>>> With -mm, it turns out that while egregious thp fault latencies were
>>> reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
>>> machine can result in latencies of 1-3s for the entire 64MB.  Collecting
>>> compaction stats from older kernels that give more insight into
>>> regressions, one such incident is as follows.
>>>
>>> Baseline:
>>> compact_blocks_moved 8181986
>>> compact_pages_moved 6549560
>>> compact_pagemigrate_failed 1612070
>>> compact_stall 101959
>>> compact_fail 100895
>>> compact_success 1064
>>>
>>> 5s later:
>>> compact_blocks_moved 8182447
>>> compact_pages_moved 6550286
>>> compact_pagemigrate_failed 1612092
>>> compact_stall 102023
>>> compact_fail 100959
>>> compact_success 1064
>>>
>>> This represents faulting two 64MB ranges of anonymous memory.  As you can
>>> see, it results in falling back to 4KB pages because all 64 faults of
>>> hugepages ends up triggering compaction and failing to allocate.  Over the
>>> 64 async compactions, we scan on average 7.2 pageblocks per call,
>>> successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
>>> per call.
>>>
>>> If each async compaction scans 7.2 pageblocks per call, it would have to
>>> be called 9103 times to scan all memory on this 128GB machine.  We're
>>> simply not scanning enough memory as a result of ISOLATE_ABORT due to
>>> need_resched().
>>
>> Well, the two objectives of not being expensive and at the same time scanning
>> "enough memory" (which is hard to define as well) are clearly quite opposite
>> :/
>>
>
> Agreed.
>
>>> So the net result is that -mm is much better than Linus's tree, where such
>>> faulting of 64MB ranges could stall 8-9s, but we're still very expensive.
>>
>> So I guess the difference here is mainly thanks to not doing sync compaction?
>
> Not doing sync compaction for thp and caching the migration pfn for async
> so that it doesn't iterate over a ton of memory that may not be eligible
> for async compaction every time it is called.  But when we avoid sync
> compaction, we also lose deferred compaction.
>
>> So if I understand correctly, your intention is to scan more in a single scan,
>
> More will be scanned instead of ~7 pageblocks for every call to async
> compaction with the data that I presented but also reduce how expensive
> every pageblock scan is by avoiding needlessly migrating memory (and
> dealing with rmap locks) when it will not result in 2MB of contiguous
> memory for thp faults.
>
>> but balance the increased latencies by introducing deferring for async
>> compaction.
>>
>> Offhand I can think of two issues with that.
>>
>> 1) the result will be that often the latency will be low thanks to defer, but
>> then there will be a huge (?) spike by scanning whole 1GB (as you suggest
>> further in the mail) at once. I think that's similar to what you had now with
>> the sync compaction?
>>
>
> Not at all, with MIGRATE_SYNC_LIGHT before there is no termination other
> than an entire scan of memory so we were potentially scanning 128GB and
> failing if thp cannot be allocated.

OK.

> If we are to avoid migrating memory needlessly that will not result in
> cc->order memory being allocated, then the cost should be relatively
> constant for a span of memory.  My 32GB system can iterate all memory with
> MIGRATE_ASYNC and no need_resched() aborts in ~530ms.

OK

>> 2) 1GB could have a good chance of being successful (?) so there would be no
>> defer anyway.
>>
>
> If we terminate early because order-9 is allocatable or we end up scanning
> the entire 1GB and the hugepage is allocated, then we have prevented 511
> other pagefaults in my testcase where faulting 64MB of memory with thp
> enabled can currently take 1-3s on a 128GB machine with fragmentation.  I
> think the deferral is unnecessary in such a case.
>
> Are you suggesting we should try without the deferral first?

Might be an option, or a less aggressive back-off than the sync deferral, 
as it's limited to 1GB.

>>> I have a few improvements in mind, but thought it would be better to
>>> get feedback on it first because it's a substantial rewrite of the
>>> pageblock migration:
>>>
>>>    - For all async compaction, avoid migrating memory unless enough
>>>      contiguous memory is migrated to allow a cc->order allocation.
>>
>> Yes I suggested something like this earlier. Also in the scanner, skip to the
>> next cc->order aligned block as soon as any page fails the isolation and is
>> not PageBuddy.
>
> Agreed.
>
>> I would just distinguish kswapd and direct compaction, not "all async
>> compaction". Or maybe kswapd could be switched to sync compaction.
>>
>
> To generalize this, I'm thinking that it is pointless for async compaction
> to migrate memory in a contiguous span if it will not cause a cc->order
> page allocation to succeed.

Well I think it's pointless for page faults and maybe khugepaged. But 
still there should be some daemon trying to migrate even pages that do 
not immediately lead to a contiguous block of some order, as a general 
incremental defragmentation. For example I believe (and hope to analyze 
and improve that eventually) that MOVABLE pages allocated in 
UNMOVABLE/RECLAIMABLE pageblocks as a fallback should, when possible, be 
migrated to more appropriate pageblocks, so that UNMOVABLE/RECLAIMABLE 
pageblocks have space for the pages they are intended for and those 
allocations don't have to fall back and pollute MOVABLE pageblocks, which 
is much worse.

So if we say that "async compaction" now means it won't do anything that 
does not lead to contiguous space immediately, then kswapd should switch 
to sync compaction IMHO (and I'm still not sure about khugepaged). 
Or maybe we shouldn't conflate the two things - one being the original 
migration mode, the other being the "when to skip pages or give up" mode.

>>>      This
>>>      would remove the COMPACT_CLUSTER_MAX restriction on pageblock
>>>      compaction
>>
>> Yes.
>>
>>>      and keep pages on the cc->migratepages list between
>>>      calls to isolate_migratepages_range().
>>
>> This might not be needed. It's called within a single pageblock (except maybe
>> CMA but that's quite a different thing) and I think we can ignore order >
>> pageblock_nr_order here.
>>
>
> Ok, I guess pageblocks within a zone are always pageblock_order aligned
> for all platforms so if we encounter any non-migratable (or PageBuddy)
> memory in a block where pageblock_order == HPAGE_PMD_NR, then we can abort
> that block immediately for thp faults.

Yeah I believe they are aligned, otherwise it would be quite a mess :)

>>>      When an unmigratable page is encountered or memory hole is found,
>>>      put all pages on cc->migratepages back on the lru lists unless
>>>      cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
>>>      enough contiguous memory has been isolated.
>>>
>>>    - Remove the need_resched() checks entirely from compaction and
>>>      consider only doing a set amount of scanning for each call, such
>>>      as 1GB per call.
>>>
>>>      If there is contention on zone->lru_lock, then we can still abort
>>>      to avoid excessive stalls, but need_resched() is a poor heuristic
>>>      to determine when async compaction is taking too long.
>>
>> I tend to agree. I've also realized that because need_resched() leads to
>> cc->contended = true, direct reclaim and second compaction that would normally
>> follow (used to be sync, now only in hugepaged) is skipped. need_resched()
>> seems to be indeed unsuitable for this.
>>
>
> It's hard to replace with an alternative, though, to determine when enough
> is enough :)  1GB might be a sane starting point, though, and then try
> reclaim and avoid the second call to async compaction on failure.  I'm not
> sure if the deferral would be needed in this case or not.

I think the second call to compaction is there on the assumption that 
direct reclaim has freed some memory which will allow compaction to 
succeed. I doubt reclaim itself has a chance of freeing a THP-order 
page. But yeah the heuristic is weak since we might not scan in the same 
1GB area as where reclaim helped...

>>>      The expense of calling async compaction if this is done is easily
>>>      quantified since we're not migrating any memory unless it is
>>>      sufficient for the page allocation: it would simply be the iteration
>>>      over 1GB of memory and avoiding contention on zone->lru_lock.
>>>
>>> We may also need to consider deferring async compaction for subsequent
>>> faults in the near future even though scanning the previous 1GB does not
>>> decrease or have any impact whatsoever in the success of defragmenting the
>>> next 1GB.
>>>
>>> Any other suggestions that may be helpful?
>>
>> I suggested already the idea of turning the deferred compaction into a
>> live-tuned amount of how much to scan, instead of doing a whole zone (before)
>> or an arbitrary amount like 1GB, with the hope of reducing the latency spikes.
>> But I realize this would be tricky.
>>
>
> By live-tuned, do you mean something that is tuned by the kernel over time
> depending on how successful compaction is or do you mean something that
> the user would alter?  If we are to go this route, I agree that we can
> allow the user to tune the 1GB.

Actually I meant automatically by the kernel depending on the success, 
but maybe we could allow the user to alter the aggressiveness.

But I guess something like 1GB and no deferral would be a first step, and 
we could see how it works. Also maybe we could be more precise by not 
limiting by memory size, but by the number of pages visited by the 
scanner(s), because e.g. covering 1GB a whole pageblock per iteration 
(because it's already a THP, or of a wrong migratetype) is obviously a 
much smaller cost than visiting all the pages.
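
Something like the sketch below is what I mean (field and constant names 
are made up for illustration):

/*
 * Made-up budget: bail out of this compaction call once the scanners
 * have actually visited this many pages, regardless of the pfn range
 * covered, so cheaply skipped pageblocks count less than fully scanned
 * ones.
 */
#define COMPACT_SCAN_BUDGET	(SZ_1G >> PAGE_SHIFT)

static bool compact_scan_budget_exceeded(struct compact_control *cc)
{
	/* cc->pages_visited would be a new, hypothetical counter */
	return cc->pages_visited > COMPACT_SCAN_BUDGET;
}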

>> Even less concrete, it might be worth revisiting the rules we use for deciding
>> if compaction is worth trying, and the decisions if to continue or we believe
>> the allocation will succeed.
>
> Ah, tuning compaction_suitable() might be another opportunity.  I'm
> wondering if we should be considering thp specially here since it's in the
> fault path.

Maybe, but how?

>> It relies heavily on watermark checks that also
>> consider the order, and I found it somewhat fragile. For example, in the alloc
>> slowpath -> direct compaction path, a check in compaction concludes that
>> allocation should succeed and finishes the compaction, and few moments later
>> the same check in the allocation will conclude that everything is fine up to
>> order 8, but there probably isn't a page of order 9. In between, the
>> nr_free_pages has *increased*. I suspect it's because the watermark checking
>> is racy and the counters drift, but I'm not sure yet.
>
> The watermark checks both in compaction_suitable() and and
> compact_finished() are indeed racy with respect to the actual page
> allocation.
>
>> However, this particular problem should be gone when I finish my series that
>> would capture the page that compaction has just freed. But still, the
>> decisions regarding compaction could be suboptimal.
>>
>
> This should also avoid the race between COMPACT_PARTIAL, returning to the
> page allocator, and finding that the high-order memory you thought was now
> available has been allocated by someone else.

Yeah, I'll try to finish this attempt soon. The idea was also to avoid 
racing with parallel allocations during the compaction itself, by using 
memory isolation on the pageblock (similar to what CMA is doing). But 
that has to be done before even attempting to migrate-scan the 
pageblock, so it might add useless latency for a pageblock that will 
then be skipped by the async compaction. But capturing the page as soon 
as it becomes available and holding on to it all the way back from 
compaction to the alloc slowpath would be an improvement by itself. The 
isolation could then be added to the cases where latencies are not 
critical, such as khugepaged.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
@ 2014-05-22 12:03                     ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22 12:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Andrew Morton, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On 05/22/2014 10:55 AM, David Rientjes wrote:
> On Thu, 22 May 2014, Vlastimil Babka wrote:
>
>>> With -mm, it turns out that while egregious thp fault latencies were
>>> reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
>>> machine can result in latencies of 1-3s for the entire 64MB.  Collecting
>>> compaction stats from older kernels that give more insight into
>>> regressions, one such incident is as follows.
>>>
>>> Baseline:
>>> compact_blocks_moved 8181986
>>> compact_pages_moved 6549560
>>> compact_pagemigrate_failed 1612070
>>> compact_stall 101959
>>> compact_fail 100895
>>> compact_success 1064
>>>
>>> 5s later:
>>> compact_blocks_moved 8182447
>>> compact_pages_moved 6550286
>>> compact_pagemigrate_failed 1612092
>>> compact_stall 102023
>>> compact_fail 100959
>>> compact_success 1064
>>>
>>> This represents faulting two 64MB ranges of anonymous memory.  As you can
>>> see, it results in falling back to 4KB pages because all 64 faults of
>>> hugepages ends up triggering compaction and failing to allocate.  Over the
>>> 64 async compactions, we scan on average 7.2 pageblocks per call,
>>> successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
>>> per call.
>>>
>>> If each async compaction scans 7.2 pageblocks per call, it would have to
>>> be called 9103 times to scan all memory on this 128GB machine.  We're
>>> simply not scanning enough memory as a result of ISOLATE_ABORT due to
>>> need_resched().
>>
>> Well, the two objectives of not being expensive and at the same time scanning
>> "enough memory" (which is hard to define as well) are clearly quite opposite
>> :/
>>
>
> Agreed.
>
>>> So the net result is that -mm is much better than Linus's tree, where such
>>> faulting of 64MB ranges could stall 8-9s, but we're still very expensive.
>>
>> So I guess the difference here is mainly thanks to not doing sync compaction?
>
> Not doing sync compaction for thp and caching the migration pfn for async
> so that it doesn't iterate over a ton of memory that may not be eligible
> for async compaction every time it is called.  But when we avoid sync
> compaction, we also lose deferred compaction.
>
>> So if I understand correctly, your intention is to scan more in a single scan,
>
> More will be scanned instead of ~7 pageblocks for every call to async
> compaction with the data that I presented but also reduce how expensive
> every pageblock scan is by avoiding needlessly migrating memory (and
> dealing with rmap locks) when it will not result in 2MB of contiguous
> memory for thp faults.
>
>> but balance the increased latencies by introducing deferring for async
>> compaction.
>>
>> Offhand I can think of two issues with that.
>>
>> 1) the result will be that often the latency will be low thanks to defer, but
>> then there will be a huge (?) spike by scanning whole 1GB (as you suggest
>> further in the mail) at once. I think that's similar to what you had now with
>> the sync compaction?
>>
>
> Not at all, with MIGRATE_SYNC_LIGHT before there is no termination other
> than an entire scan of memory so we were potentially scanning 128GB and
> failing if thp cannot be allocated.

OK.

> If we are to avoid migrating memory needlessly that will not result in
> cc->order memory being allocated, then the cost should be relatively
> constant for a span of memory.  My 32GB system can iterate all memory with
> MIGRATE_ASYNC and no need_resched() aborts in ~530ms.

OK

>> 2) 1GB could have a good chance of being successful (?) so there would be no
>> defer anyway.
>>
>
> If we terminate early because order-9 is allocatable or we end up scanning
> the entire 1GB and the hugepage is allocated, then we have prevented 511
> other pagefaults in my testcase where faulting 64MB of memory with thp
> enabled can currently take 1-3s on a 128GB machine with fragmentation.  I
> think the deferral is unnecessary in such a case.
>
> Are you suggesting we should try without the deferral first?

Might be an option, or less aggressive back off than the sync deferral, 
as it's limited to 1GB.

>>> I have a few improvements in mind, but thought it would be better to
>>> get feedback on it first because it's a substantial rewrite of the
>>> pageblock migration:
>>>
>>>    - For all async compaction, avoid migrating memory unless enough
>>>      contiguous memory is migrated to allow a cc->order allocation.
>>
>> Yes I suggested something like this earlier. Also in the scanner, skip to the
>> next cc->order aligned block as soon as any page fails the isolation and is
>> not PageBuddy.
>
> Agreed.
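
FWIW I imagine the skip roughly like this inside the migrate scanner loop
(untested sketch: "isolated" stands for the result of the isolation attempt,
low_pfn is the scanner cursor as in isolate_migratepages_range(), and it
assumes cc->migratepages only holds pages from the current aligned block):

	if (!isolated && !PageBuddy(page)) {
		/*
		 * This cc->order aligned block can no longer produce a
		 * free page of cc->order, so put back whatever was
		 * isolated from it and jump to the next aligned block.
		 */
		putback_movable_pages(&cc->migratepages);
		cc->nr_migratepages = 0;
		low_pfn = ALIGN(low_pfn + 1, 1UL << cc->order);
		continue;
	}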
>
>> I would just distinguish kswapd and direct compaction, not "all async
>> compaction". Or maybe kswapd could be switched to sync compaction.
>>
>
> To generalize this, I'm thinking that it is pointless for async compaction
> to migrate memory in a contiguous span if it will not cause a cc->order
> page allocation to succeed.

Well I think it's pointless for page faults and maybe khugepaged. But
still there should be some daemon trying to migrate even pages that do
not immediately lead to a contiguous block of some order, as a general
incremental defragmentation. For example I believe (and hope to analyze
and improve that eventually) that MOVABLE pages allocated in
UNMOVABLE/RECLAIMABLE pageblocks as a fallback should be migrated to
more appropriate pageblocks when possible, so that
UNMOVABLE/RECLAIMABLE pageblocks have space for the pages they are
intended for and those allocations don't have to fall back and pollute
MOVABLE pageblocks, which is much worse.
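
Roughly I mean a check like this when such a daemon scans a pageblock (just
a sketch of the candidate test; the actual isolation and migration would
follow the usual path, and "defrag_candidate" is a made-up name):

	int mt = get_pageblock_migratetype(page);

	/*
	 * An LRU (movable) page sitting in an UNMOVABLE or RECLAIMABLE
	 * pageblock was a fallback allocation and is a candidate for
	 * incremental defragmentation, even if moving it does not
	 * immediately produce a high-order free block.
	 */
	if (PageLRU(page) &&
	    (mt == MIGRATE_UNMOVABLE || mt == MIGRATE_RECLAIMABLE))
		defrag_candidate = true;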

So if we say that "async compaction" now means it won't do anything that
does not lead to contiguous space immediately, then kswapd should switch
to sync compaction IMHO (and I'm still not sure about khugepaged).
Or maybe we shouldn't conflate the two things - one being the
original migration mode, the other being the "when to skip pages or give
up" policy.

>>>      This
>>>      would remove the COMPACT_CLUSTER_MAX restriction on pageblock
>>>      compaction
>>
>> Yes.
>>
>>>      and keep pages on the cc->migratepages list between
>>>      calls to isolate_migratepages_range().
>>
>> This might not be needed. It's called within a single pageblock (except maybe
>> CMA, but that's quite a different thing) and I think we can ignore order >
>> pageblock_order here.
>>
>
> Ok, I guess pageblocks within a zone are always pageblock_order aligned
> for all platforms, so if we encounter any memory that is neither migratable
> nor PageBuddy in a block where pageblock_order == HPAGE_PMD_ORDER, then we
> can abort that block immediately for thp faults.

Yeah I believe they are aligned, otherwise it would be quite a mess :)

>>>      When an unmigratable page is encountered or memory hole is found,
>>>      put all pages on cc->migratepages back on the lru lists unless
>>>      cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
>>>      enough contiguous memory has been isolated.
>>>
>>>    - Remove the need_resched() checks entirely from compaction and
>>>      consider only doing a set amount of scanning for each call, such
>>>      as 1GB per call.
>>>
>>>      If there is contention on zone->lru_lock, then we can still abort
>>>      to avoid excessive stalls, but need_resched() is a poor heuristic
>>>      to determine when async compaction is taking too long.
>>
>> I tend to agree. I've also realized that because need_resched() leads to
>> cc->contended = true, the direct reclaim and second compaction that would
>> normally follow (used to be sync, now only in khugepaged) are skipped.
>> need_resched() indeed seems to be unsuitable for this.
>>
>
> It's hard to find an alternative, though, to determine when enough
> is enough :)  1GB might be a sane starting point; then try
> reclaim and avoid the second call to async compaction on failure.  I'm not
> sure if the deferral would be needed in this case or not.

I think the second call to compaction is there on the assumption that 
direct reclaim has freed some memory which will allow compaction to 
succeed. I doubt reclaim itself has a chance of freeing a THP-order 
page. But yeah the heuristic is weak since we might not scan in the same 
1GB area as where reclaim helped...
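
For illustration, the bounded scan could look roughly like this (untested
sketch; the fixed budget replaces the need_resched() abort, and only real
lru_lock contention aborts the async case):

	/* scan at most ~1GB worth of pfns per async compaction call */
	unsigned long budget = (1UL << 30) >> PAGE_SHIFT;

	for (; low_pfn < end_pfn && budget; low_pfn++, budget--) {
		if (cc->mode == MIGRATE_ASYNC &&
		    spin_is_contended(&zone->lru_lock)) {
			cc->contended = true;
			break;
		}
		cond_resched();
		/* ... existing per-page isolation logic ... */
	}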

>>>      The expense of calling async compaction if this is done is easily
>>>      quantified since we're not migrating any memory unless it is
>>>      sufficient for the page allocation: it would simply be the iteration
>>>      over 1GB of memory and avoiding contention on zone->lru_lock.
>>>
>>> We may also need to consider deferring async compaction for subsequent
>>> faults in the near future even though scanning the previous 1GB does not
>>> decrease or have any impact whatsoever in the success of defragmenting the
>>> next 1GB.
>>>
>>> Any other suggestions that may be helpful?
>>
>> I suggested already the idea of turning the deferred compaction into a
>> live-tuned amount of how much to scan, instead of doing a whole zone (before)
>> or an arbitrary amount like 1GB, with the hope of reducing the latency spikes.
>> But I realize this would be tricky.
>>
>
> By live-tuned, do you mean something that is tuned by the kernel over time
> depending on how successful compaction is or do you mean something that
> the user would alter?  If we are to go this route, I agree that we can
> allow the user to tune the 1GB.

Actually I meant automatically by the kernel depending on the success, 
but maybe we could allow the user to alter the aggressiveness.

But I guess something like 1GB and no deferral would be a first step, to
see how it works. Also maybe we could be more precise by not limiting by
memory size but by the number of pages visited by the scanner(s),
because e.g. scanning 1GB a whole pageblock at a time (because
it's already a THP, or of a wrong migratetype) is obviously a much
smaller cost than visiting all the pages.
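
E.g. something like this in the scanner, where the cost is charged per page
actually visited (the "budget" counter and "skip_whole_pageblock" condition
are made up, just to illustrate the accounting):

	if (skip_whole_pageblock) {
		/* already a THP, wrong migratetype, ... */
		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
		budget--;	/* skipping the block is one visit, not 512 */
		continue;
	}

	budget--;		/* an ordinary page inspection is one visit */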

>> Even less concrete, it might be worth revisiting the rules we use for deciding
>> if compaction is worth trying, and the decisions whether to continue or whether
>> we believe the allocation will succeed.
>
> Ah, tuning compaction_suitable() might be another opportunity.  I'm
> wondering if we should be considering thp specially here since it's in the
> fault path.

Maybe, but how?

>> It relies heavily on watermark checks that also
>> consider the order, and I found it somewhat fragile. For example, in the alloc
>> slowpath -> direct compaction path, a check in compaction concludes that
>> allocation should succeed and finishes the compaction, and a few moments later
>> the same check in the allocation will conclude that everything is fine up to
>> order 8, but there probably isn't a page of order 9. In between, the
>> nr_free_pages has *increased*. I suspect it's because the watermark checking
>> is racy and the counters drift, but I'm not sure yet.
>
> The watermark checks both in compaction_suitable() and
> compact_finished() are indeed racy with respect to the actual page
> allocation.
>
>> However, this particular problem should be gone when I finish my series that
>> would capture the page that compaction has just freed. But still, the
>> decisions regarding compaction could be suboptimal.
>>
>
> This should also avoid the race between COMPACT_PARTIAL, returning to the
> page allocator, and finding that the high-order memory you thought was now
> available has been allocated by someone else.

Yeah, I'll try to finish this attempt soon. The idea was also to avoid
racing with parallel allocations during the compaction itself, by using
memory isolation on the pageblock (similar to what CMA is doing). But
that has to be done before even attempting to migrate-scan the
pageblock, so it might add useless latency for a pageblock that will
then be skipped by the async compaction. But capturing the page as soon
as it becomes available and holding on to it all the way back from
compaction to the alloc slowpath would be an improvement by itself. The
isolation could then be added to the cases where latencies are not
critical, such as khugepaged.
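
Very roughly, the capture side of that series looks like this (hypothetical
"captured" member of struct compact_control, just to sketch the flow):

	/*
	 * Remember the first free page of at least cc->order produced
	 * while compacting; the series would also take it off the free
	 * list and return it to the allocation slowpath, instead of
	 * racing with other allocators for it afterwards.
	 */
	if (cc->order > 0 && !cc->captured &&
	    PageBuddy(page) && page_order(page) >= cc->order)
		cc->captured = page;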


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-22 23:49               ` Kevin Hilman
  -1 siblings, 0 replies; 267+ messages in thread
From: Kevin Hilman @ 2014-05-22 23:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, LKML, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel, Olof Johansson, Stephen Warren

On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.

This patch (or a later version) has hit next-20140522 (in the form of
commit 645ceea9331bfd851bc21eea456dda27862a10f4) and, according to my
bisect, appears to be the culprit of several boot failures on ARM
platforms.

Unfortunately, there isn't much useful information in the logs since the
failing boards mostly hang silently.  However, on one platform (Marvell
Armada 370 Mirabox), it reports a failure to allocate memory, and the RCU
stall detection kicks in:

[    1.298234] xhci_hcd 0000:02:00.0: xHCI Host Controller
[    1.303485] xhci_hcd 0000:02:00.0: new USB bus registered, assigned bus number 1
[    1.310966] xhci_hcd 0000:02:00.0: Couldn't initialize memory
[   22.245395] INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 0, t=2102 jiffies, g=-282, c=-283, q=16)
[   22.255886] INFO: Stall ended before state dump start
[   48.095396] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1]

Reverting this commit makes them all happy again.

Kevin

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-22 23:49               ` Kevin Hilman
  (?)
@ 2014-05-23  2:48                 ` Shawn Guo
  -1 siblings, 0 replies; 267+ messages in thread
From: Shawn Guo @ 2014-05-23  2:48 UTC (permalink / raw)
  To: Kevin Hilman
  Cc: Vlastimil Babka, Joonsoo Kim, Andrew Morton, David Rientjes,
	Hugh Dickins, Greg Thelen, LKML, linux-mm, Minchan Kim,
	Mel Gorman, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel, Olof Johansson, Stephen Warren,
	linux-arm-kernel

On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>
> This patch (or later version) has hit next-20140522 (in the form
> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
> bisect, appears to be the culprit of several boot failures on ARM
> platforms.

On i.MX6 where CMA is enabled, the commit causes the drivers calling
dma_alloc_coherent() to fail to probe.  Tracing it a little bit, it seems
dma_alloc_from_contiguous() always returns a NULL page after this
commit.

Shawn

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  2:48                 ` Shawn Guo
  (?)
@ 2014-05-23  8:34                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-23  8:34 UTC (permalink / raw)
  To: Shawn Guo, Kevin Hilman, Andrew Morton
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen, LKML,
	linux-mm, Minchan Kim, Mel Gorman, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel,
	Olof Johansson, Stephen Warren, linux-arm-kernel

On 05/23/2014 04:48 AM, Shawn Guo wrote:
> On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
>> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>>> Compaction uses compact_checklock_irqsave() function to periodically check for
>>> lock contention and need_resched() to either abort async compaction, or to
>>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>>> set to signal the contended state to the caller. Two problems have been
>>> identified in this mechanism.
>>
>> This patch (or later version) has hit next-20140522 (in the form
>> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
>> bisect, appears to be the culprit of several boot failures on ARM
>> platforms.
> 
> On i.MX6 where CMA is enabled, the commit causes the drivers calling
> dma_alloc_coherent() fail to probe.  Tracing it a little bit, it seems
> dma_alloc_from_contiguous() always return page as NULL after this
> commit.
> 
> Shawn
> 

Really sorry, guys :/

-----8<-----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 23 May 2014 10:18:56 +0200
Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2

Step 1: Change function name and comment between v1 and v2 so that the return
        value signals the opposite thing.
Step 2: Change the call sites to reflect the opposite return value.
Step 3: ???
Step 4: Make a complete fool of yourself.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/compaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index a525cd4..5175019 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -237,13 +237,13 @@ static inline bool compact_should_abort(struct compact_control *cc)
 	if (need_resched()) {
 		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
-			return false;
+			return true;
 		}
 
 		cond_resched();
 	}
 
-	return true;
+	return false;
 }
 
 /* Returns true if the page is within a block suitable for migration to */
-- 
1.8.4.5
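
For clarity, a hypothetical call site then reads as intended again:

	/*
	 * With the fix, a true return value really means "abort": async
	 * compaction backs off and cc->contended tells the caller why
	 * nothing was isolated.
	 */
	if (compact_should_abort(cc))
		return 0;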



^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-05-23 10:49                     ` Shawn Guo
  -1 siblings, 0 replies; 267+ messages in thread
From: Shawn Guo @ 2014-05-23 10:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Kevin Hilman, Andrew Morton, Joonsoo Kim, David Rientjes,
	Hugh Dickins, Greg Thelen, LKML, linux-mm, Minchan Kim,
	Mel Gorman, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel, Olof Johansson, Stephen Warren,
	linux-arm-kernel

On Fri, May 23, 2014 at 10:34:55AM +0200, Vlastimil Babka wrote:
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
> 
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Tested-by: Shawn Guo <shawn.guo@linaro.org>

> ---
>  mm/compaction.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index a525cd4..5175019 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -237,13 +237,13 @@ static inline bool compact_should_abort(struct compact_control *cc)
>  	if (need_resched()) {
>  		if (cc->mode == MIGRATE_ASYNC) {
>  			cc->contended = true;
> -			return false;
> +			return true;
>  		}
>  
>  		cond_resched();
>  	}
>  
> -	return true;
> +	return false;
>  }
>  
>  /* Returns true if the page is within a block suitable for migration to */
> -- 
> 1.8.4.5
> 
> 

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-05-23 15:07                     ` Kevin Hilman
  -1 siblings, 0 replies; 267+ messages in thread
From: Kevin Hilman @ 2014-05-23 15:07 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Shawn Guo, Andrew Morton, Joonsoo Kim, David Rientjes,
	Hugh Dickins, Greg Thelen, LKML, linux-mm, Minchan Kim,
	Mel Gorman, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel, Olof Johansson, Stephen Warren,
	linux-arm-kernel

Vlastimil Babka <vbabka@suse.cz> writes:

> On 05/23/2014 04:48 AM, Shawn Guo wrote:
>> On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
>>> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>>>> Compaction uses compact_checklock_irqsave() function to periodically check for
>>>> lock contention and need_resched() to either abort async compaction, or to
>>>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>>>> set to signal the contended state to the caller. Two problems have been
>>>> identified in this mechanism.
>>>
>>> This patch (or later version) has hit next-20140522 (in the form
>>> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
>>> bisect, appears to be the culprit of several boot failures on ARM
>>> platforms.
>> 
>> On i.MX6 where CMA is enabled, the commit causes the drivers calling
>> dma_alloc_coherent() fail to probe.  Tracing it a little bit, it seems
>> dma_alloc_from_contiguous() always return page as NULL after this
>> commit.
>> 
>> Shawn
>> 
>
> Really sorry, guys :/
>
> -----8<-----
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
>
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Tested-by: Kevin Hilman <khilman@linaro.org>

I verified that this fixes the boot failures I've seen on ARM (i.MX6 and
Marvell Armada 370).

Thanks for the quick fix.

Kevin

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-05-30 16:59                     ` Stephen Warren
  -1 siblings, 0 replies; 267+ messages in thread
From: Stephen Warren @ 2014-05-30 16:59 UTC (permalink / raw)
  To: Vlastimil Babka, Shawn Guo, Kevin Hilman, Andrew Morton
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen, LKML,
	linux-mm, Minchan Kim, Mel Gorman, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel,
	Olof Johansson, linux-arm-kernel

On 05/23/2014 02:34 AM, Vlastimil Babka wrote:
> On 05/23/2014 04:48 AM, Shawn Guo wrote:
>> On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
>>> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>>>> Compaction uses compact_checklock_irqsave() function to periodically check for
>>>> lock contention and need_resched() to either abort async compaction, or to
>>>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>>>> set to signal the contended state to the caller. Two problems have been
>>>> identified in this mechanism.
>>>
>>> This patch (or later version) has hit next-20140522 (in the form
>>> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
>>> bisect, appears to be the culprit of several boot failures on ARM
>>> platforms.
>>
>> On i.MX6 where CMA is enabled, the commit causes the drivers calling
>> dma_alloc_coherent() fail to probe.  Tracing it a little bit, it seems
>> dma_alloc_from_contiguous() always return page as NULL after this
>> commit.
>>
>> Shawn
>>
> 
> Really sorry, guys :/
> 
> -----8<-----
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
> 
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.

Tested-by: Stephen Warren <swarren@nvidia.com>

This fix doesn't seem to be in linux-next yet :-(

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-06-02 13:35                     ` Fabio Estevam
  -1 siblings, 0 replies; 267+ messages in thread
From: Fabio Estevam @ 2014-06-02 13:35 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Shawn Guo, Kevin Hilman, Andrew Morton, Rik van Riel,
	Stephen Warren, Minchan Kim, Bartlomiej Zolnierkiewicz,
	Hugh Dickins, LKML, Michal Nazarewicz, linux-mm, Mel Gorman,
	David Rientjes, Olof Johansson, Greg Thelen, Joonsoo Kim,
	Christoph Lameter, linux-arm-kernel

Vlastimil,

On Fri, May 23, 2014 at 5:34 AM, Vlastimil Babka <vbabka@suse.cz> wrote:

> Really sorry, guys :/
>
> -----8<-----
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
>
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/compaction.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> index a525cd4..5175019 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -237,13 +237,13 @@ static inline bool compact_should_abort(struct compact_control *cc)
>         if (need_resched()) {
>                 if (cc->mode == MIGRATE_ASYNC) {
>                         cc->contended = true;
> -                       return false;
> +