* [patch 1/2] mm, migration: add destination page freeing callback
@ 2014-05-01  0:45 ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however, it 
frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to determine 
how to free destination pages.  If a caller, such as memory compaction, builds 
its own freelist for migration targets, this can reuse already freed memory 
instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it is 
called when page migration fails.  Otherwise, it may pass NULL and freeing back 
to the system will be handled as usual.  This patch introduces no functional 
change.
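
For illustration only (a hypothetical sketch, not part of this patch): a caller
that maintains its own pool of target pages could wire up the two callbacks as
below.  my_freelist, my_alloc_target(), and my_free_target() are made-up names.

	/* Hypothetical caller-side sketch -- not part of this patch. */
	static LIST_HEAD(my_freelist);

	/* new_page_t: hand out a target page from the caller's own list */
	static struct page *my_alloc_target(struct page *page,
					    unsigned long private, int **reason)
	{
		struct page *target;

		if (list_empty(&my_freelist))
			return NULL;	/* no target available */
		target = list_first_entry(&my_freelist, struct page, lru);
		list_del(&target->lru);
		return target;
	}

	/* free_page_t: migration failed, keep the unused target for reuse */
	static void my_free_target(struct page *page, unsigned long private)
	{
		list_add(&page->lru, &my_freelist);
	}

	/* later, from whatever path drives the migration: */
		err = migrate_pages(&pagelist, my_alloc_target, my_free_target,
					0, MIGRATE_SYNC, MR_COMPACTION);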

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 54 ++++++++++++++++++++++++++++++++-----------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,12 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
-	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
-	 */
-	putback_lru_page(newpage);
+
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1018,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1099,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1114,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1137,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1290,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1746,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);

* [patch 2/2] mm, compaction: return failed migration target pages back to freelist
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-01  0:45   ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory compaction works by having a "freeing scanner" scan from one end of a
zone, isolating pages to be used as migration targets, while a "migrating
scanner" scans from the other end of the same zone, isolating pages to be
migrated.

When page migration fails for an isolated page, the target page is returned to
the system rather than to the freelist built by the freeing scanner.  This may
needlessly force the freeing scanner to keep scanning memory even though
suitable migration targets have already been found and returned to the system.

This patch returns destination pages to the freeing scanner's freelist when
page migration fails.  This not only avoids unnecessary work by the freeing
scanner but also encourages memory to be as compacted as possible at the end
of the zone.

Reported-by: Greg Thelen <gthelen@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -797,6 +797,19 @@ static struct page *compaction_alloc(struct page *migratepage,
 }
 
 /*
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist.  All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/*
  * We cannot control nr_migratepages and nr_freepages fully when migration is
  * running as migrate_pages() has no knowledge of compact_control. When
  * migration is complete, we count the number of pages on the lists by hand.
@@ -1023,8 +1036,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
-				(unsigned long)cc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);

* Re: [patch 1/2] mm, migration: add destination page freeing callback
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-01  5:08 ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-01  5:08 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

Hi David,

On Wed, Apr 30, 2014 at 05:45:24PM -0700, David Rientjes wrote:
> Memory migration uses a callback defined by the caller to determine how to
> allocate destination pages.  When migration fails for a source page, however, it 
> frees the destination page back to the system.
> 
> This patch adds a memory migration callback defined by the caller to determine 
> how to free destination pages.  If a caller, such as memory compaction, builds 
> its own freelist for migration targets, this can reuse already freed memory 
> instead of scanning additional memory.
> 
> If the caller provides a function to handle freeing of destination pages, it is 
> called when page migration fails.  Otherwise, it may pass NULL and freeing back 
> to the system will be handled as usual.  This patch introduces no functional 
> change.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Looks good to me.
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

I have one comment below ...

[snip]

> @@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
>  	if (!page_mapped(hpage))
>  		rc = move_to_new_page(new_hpage, hpage, 1, mode);
>  
> -	if (rc)
> +	if (rc != MIGRATEPAGE_SUCCESS)
>  		remove_migration_ptes(hpage, hpage);
>  
>  	if (anon_vma)
>  		put_anon_vma(anon_vma);
>  
> -	if (!rc)
> +	if (rc == MIGRATEPAGE_SUCCESS)
>  		hugetlb_cgroup_migrate(hpage, new_hpage);
>  
>  	unlock_page(hpage);
>  out:
>  	if (rc != -EAGAIN)
>  		putback_active_hugepage(hpage);
> -	put_page(new_hpage);
> +
> +	/*
> +	 * If migration was not successful and there's a freeing callback, use
> +	 * it.  Otherwise, put_page() will drop the reference grabbed during
> +	 * isolation.
> +	 */

This comment is true both for normal pages and huge pages, and people are more
likely to look at unmap_and_move() first, so it had better also be in
unmap_and_move().

Thanks,
Naoya Horiguchi

* Re: [patch 2/2] mm, compaction: return failed migration target pages back to freelist
  2014-05-01  0:45   ` David Rientjes
@ 2014-05-01  5:10   ` Naoya Horiguchi
  2014-05-01 21:02       ` David Rientjes
  -1 siblings, 1 reply; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-01  5:10 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, Apr 30, 2014 at 05:45:27PM -0700, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a
> zone, isolating pages to be used as migration targets, while a "migrating
> scanner" scans from the other end of the same zone, isolating pages to be
> migrated.
> 
> When page migration fails for an isolated page, the target page is returned to
> the system rather than to the freelist built by the freeing scanner.  This may
> needlessly force the freeing scanner to keep scanning memory even though
> suitable migration targets have already been found and returned to the system.
> 
> This patch returns destination pages to the freeing scanner's freelist when
> page migration fails.  This not only avoids unnecessary work by the freeing
> scanner but also encourages memory to be as compacted as possible at the end
> of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  mm/compaction.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -797,6 +797,19 @@ static struct page *compaction_alloc(struct page *migratepage,
>  }
>  
>  /*
> + * This is a migrate-callback that "frees" freepages back to the isolated
> + * freelist.  All pages on the freelist are from the same zone, so there is no
> + * special handling needed for NUMA.
> + */
> +static void compaction_free(struct page *page, unsigned long data)
> +{
> +	struct compact_control *cc = (struct compact_control *)data;
> +
> +	list_add(&page->lru, &cc->freepages);
> +	cc->nr_freepages++;

With this change, migrate_pages() handles cc->nr_freepages consistently, so
we no longer have to walk the freelist to update this count in
update_nr_listpages()?

I'm not sure whether the same is true for cc->nr_migratepages, but if it is,
we can remove update_nr_listpages() completely.

Thanks,
Naoya Horiguchi

* Re: [patch 2/2] mm, compaction: return failed migration target pages back to freelist
  2014-05-01  5:10   ` Naoya Horiguchi
@ 2014-05-01 21:02       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:02 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, 1 May 2014, Naoya Horiguchi wrote:

> > diff --git a/mm/compaction.c b/mm/compaction.c
> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -797,6 +797,19 @@ static struct page *compaction_alloc(struct page *migratepage,
> >  }
> >  
> >  /*
> > + * This is a migrate-callback that "frees" freepages back to the isolated
> > + * freelist.  All pages on the freelist are from the same zone, so there is no
> > + * special handling needed for NUMA.
> > + */
> > +static void compaction_free(struct page *page, unsigned long data)
> > +{
> > +	struct compact_control *cc = (struct compact_control *)data;
> > +
> > +	list_add(&page->lru, &cc->freepages);
> > +	cc->nr_freepages++;
> 
> With this change, migrate_pages() handles cc->nr_freepages consistently, so
> we no longer have to walk the freelist to update this count in
> update_nr_listpages()?
> 

Good optimization, I'll add it!
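
Roughly, the direction I have in mind for compact_zone() is something like this
untested sketch; it leans on migrate_pages() returning the number of pages that
were not migrated and on compaction_alloc()/compaction_free() keeping
cc->nr_freepages accurate:

		nr_migrate = cc->nr_migratepages;
		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc,
				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
				MR_COMPACTION);
		/*
		 * Pages still on cc->migratepages are the ones that were not
		 * migrated, and migrate_pages() returns that count, so the
		 * list walks in update_nr_listpages() would not be needed
		 * (error path ignored in this sketch).
		 */
		cc->nr_migratepages = err > 0 ? err : 0;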

* Re: [patch 1/2] mm, migration: add destination page freeing callback
       [not found] ` <5361d71e.236ec20a.1b3d.ffffc8aeSMTPIN_ADDED_BROKEN@mx.google.com>
@ 2014-05-01 21:02     ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:02 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, 1 May 2014, Naoya Horiguchi wrote:

> Looks good to me.
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> 

Thanks!

> I have one comment below ...
> 
> [snip]
> 
> > @@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
> >  	if (!page_mapped(hpage))
> >  		rc = move_to_new_page(new_hpage, hpage, 1, mode);
> >  
> > -	if (rc)
> > +	if (rc != MIGRATEPAGE_SUCCESS)
> >  		remove_migration_ptes(hpage, hpage);
> >  
> >  	if (anon_vma)
> >  		put_anon_vma(anon_vma);
> >  
> > -	if (!rc)
> > +	if (rc == MIGRATEPAGE_SUCCESS)
> >  		hugetlb_cgroup_migrate(hpage, new_hpage);
> >  
> >  	unlock_page(hpage);
> >  out:
> >  	if (rc != -EAGAIN)
> >  		putback_active_hugepage(hpage);
> > -	put_page(new_hpage);
> > +
> > +	/*
> > +	 * If migration was not successful and there's a freeing callback, use
> > +	 * it.  Otherwise, put_page() will drop the reference grabbed during
> > +	 * isolation.
> > +	 */
> 
> This comment is true both for normal pages and huge pages, and people are more
> likely to look at unmap_and_move() first, so it had better also be in
> unmap_and_move().
> 

Ok!

* [patch v2 1/4] mm, migration: add destination page freeing callback
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-01 21:35   ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however, it 
frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to determine 
how to free destination pages.  If a caller, such as memory compaction, builds 
its own freelist for migration targets, this can reuse already freed memory 
instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it is 
called when page migration fails.  Otherwise, it may pass NULL and freeing back 
to the system will be handled as usual.  This patch introduces no functional 
change.

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 55 +++++++++++++++++++++++++++++++++++--------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,17 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
 	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
 	 */
-	putback_lru_page(newpage);
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1023,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1104,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1119,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 1/4] mm, migration: add destination page freeing callback
@ 2014-05-01 21:35   ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however, it 
frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to determine 
how to free destination pages.  If a caller, such as memory compaction, builds 
its own freelist for migration targets, this can reuse already freed memory 
instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it is 
called when page migration fails.  Otherwise, it may pass NULL and freeing back 
to the system will be handled as usual.  This patch introduces no functional 
change.

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 55 +++++++++++++++++++++++++++++++++++--------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,17 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
 	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
 	 */
-	putback_lru_page(newpage);
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1023,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1104,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1119,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);
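
To illustrate the new interface, a minimal caller-side sketch (not part of
this patch; my_pool, pool_take_page() and pool_return_page() are hypothetical
placeholders for a caller that maintains its own pool of target pages):

	static struct page *my_alloc_target(struct page *page,
					    unsigned long private, int **result)
	{
		struct my_pool *pool = (struct my_pool *)private;

		/* hand out a target page from the caller's private pool */
		return pool_take_page(pool);
	}

	static void my_free_target(struct page *page, unsigned long private)
	{
		struct my_pool *pool = (struct my_pool *)private;

		/* invoked only when migration to this target page failed */
		pool_return_page(pool, page);
	}

	/* at the call site, with pool and pagelist set up by the caller: */
	err = migrate_pages(&pagelist, my_alloc_target, my_free_target,
			    (unsigned long)pool, MIGRATE_SYNC, MR_SYSCALL);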


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 2/4] mm, compaction: return failed migration target pages back to freelist
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-01 21:35     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory compaction works by having a "freeing scanner" scan from one end of a
zone to isolate pages as migration targets while another "migrating scanner"
scans from the other end of the same zone to isolate pages to be migrated.

When page migration fails for an isolated page, the target page is returned to
the system rather than to the freelist built by the freeing scanner.  This may
needlessly require the freeing scanner to continue scanning memory even though
suitable migration targets have already been returned to the system.

This patch returns destination pages to the freeing scanner's freelist when
page migration fails.  This not only prevents unnecessary work by the freeing
scanner but also encourages memory to be as compacted as possible at the end
of the zone.
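
For context, the pages handed back by the new freeing callback are the same
ones the allocation callback gives out: compaction_alloc() (paraphrased below
from kernels of this era, so details may differ slightly) pops from
cc->freepages and only runs the free scanner when that list is empty, so a
returned target is reused before any further scanning is done:

	static struct page *compaction_alloc(struct page *migratepage,
					     unsigned long data, int **result)
	{
		struct compact_control *cc = (struct compact_control *)data;
		struct page *freepage;

		/* refill the private freelist only when it runs dry */
		if (list_empty(&cc->freepages)) {
			isolate_freepages(cc->zone, cc);
			if (list_empty(&cc->freepages))
				return NULL;
		}

		freepage = list_entry(cc->freepages.next, struct page, lru);
		list_del(&freepage->lru);
		cc->nr_freepages--;

		return freepage;
	}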

Reported-by: Greg Thelen <gthelen@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -797,23 +797,32 @@ static struct page *compaction_alloc(struct page *migratepage,
 }
 
 /*
- * We cannot control nr_migratepages and nr_freepages fully when migration is
- * running as migrate_pages() has no knowledge of compact_control. When
- * migration is complete, we count the number of pages on the lists by hand.
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist.  All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/*
+ * We cannot control nr_migratepages fully when migration is running as
+ * migrate_pages() has no knowledge of compact_control.  When migration is
+ * complete, we count the number of pages on the list by hand.
  */
 static void update_nr_listpages(struct compact_control *cc)
 {
 	int nr_migratepages = 0;
-	int nr_freepages = 0;
 	struct page *page;
 
 	list_for_each_entry(page, &cc->migratepages, lru)
 		nr_migratepages++;
-	list_for_each_entry(page, &cc->freepages, lru)
-		nr_freepages++;
 
 	cc->nr_migratepages = nr_migratepages;
-	cc->nr_freepages = nr_freepages;
 }
 
 /* possible outcome of isolate_migratepages */
@@ -1023,8 +1032,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
-				(unsigned long)cc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-01 21:35     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Each zone has a cached migration scanner pfn for memory compaction so that 
subsequent calls to memory compaction can start where the previous call left 
off.

Currently, the compaction migration scanner only updates the per-zone cached
pfn when pageblocks were not skipped for async compaction.  This creates a
dependency on calling sync compaction to keep subsequent calls to async
compaction from scanning an enormous number of non-MOVABLE pageblocks each
time async compaction is called.  On large machines, this can be very
expensive.

This patch adds a per-zone cached migration scanner pfn only for async
compaction.  It is updated every time a pageblock has been scanned in its
entirety and no pages from it were successfully isolated.  The cached
migration scanner pfn for sync compaction is updated only when compaction is
called in sync mode.
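
Distilled from the diff below, the cache becomes a two-element array indexed
by cc->sync (0 for async, 1 for sync), selected when compact_zone() starts
and advanced by the migration scanner:

	/* scanner start: pick the cache matching this compaction mode */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];

	/* after fully scanning a pageblock without isolating any pages */
	if (pfn > zone->compact_cached_migrate_pfn[0])
		zone->compact_cached_migrate_pfn[0] = pfn;
	if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
		zone->compact_cached_migrate_pfn[1] = pfn;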

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mmzone.h |  5 +++--
 mm/compaction.c        | 55 ++++++++++++++++++++++++++------------------------
 2 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -360,9 +360,10 @@ struct zone {
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
 
-	/* pfns where compaction scanners should start */
+	/* pfn where compaction free scanner should start */
 	unsigned long		compact_cached_free_pfn;
-	unsigned long		compact_cached_migrate_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long		compact_cached_migrate_pfn[2];
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
-	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_migrate_pfn[0] = start_pfn;
+	zone->compact_cached_migrate_pfn[1] = start_pfn;
 	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_flush = false;
 
@@ -134,6 +135,7 @@ static void update_pageblock_skip(struct compact_control *cc,
 			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+	unsigned long pfn;
 
 	if (cc->ignore_skip_hint)
 		return;
@@ -141,20 +143,25 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (!page)
 		return;
 
-	if (!nr_isolated) {
-		unsigned long pfn = page_to_pfn(page);
-		set_pageblock_skip(page);
-
-		/* Update where compaction should restart */
-		if (migrate_scanner) {
-			if (!cc->finished_update_migrate &&
-			    pfn > zone->compact_cached_migrate_pfn)
-				zone->compact_cached_migrate_pfn = pfn;
-		} else {
-			if (!cc->finished_update_free &&
-			    pfn < zone->compact_cached_free_pfn)
-				zone->compact_cached_free_pfn = pfn;
-		}
+	if (nr_isolated)
+		return;
+
+	set_pageblock_skip(page);
+	pfn = page_to_pfn(page);
+
+	/* Update where async and sync compaction should restart */
+	if (migrate_scanner) {
+		if (cc->finished_update_migrate)
+			return;
+		if (pfn > zone->compact_cached_migrate_pfn[0])
+			zone->compact_cached_migrate_pfn[0] = pfn;
+		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+			zone->compact_cached_migrate_pfn[1] = pfn;
+	} else {
+		if (cc->finished_update_free)
+			return;
+		if (pfn < zone->compact_cached_free_pfn)
+			zone->compact_cached_free_pfn = pfn;
 	}
 }
 #else
@@ -464,7 +471,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
-	bool skipped_async_unsuitable = false;
 	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
@@ -540,11 +546,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 * the minimum amount of work satisfies the allocation
 			 */
 			mt = get_pageblock_migratetype(page);
-			if (!cc->sync && !migrate_async_suitable(mt)) {
-				cc->finished_update_migrate = true;
-				skipped_async_unsuitable = true;
+			if (!cc->sync && !migrate_async_suitable(mt))
 				goto next_pageblock;
-			}
 		}
 
 		/*
@@ -646,10 +649,8 @@ next_pageblock:
 	/*
 	 * Update the pageblock-skip information and cached scanner pfn,
 	 * if the whole pageblock was scanned without isolating any page.
-	 * This is not done when pageblock was skipped due to being unsuitable
-	 * for async compaction, so that eventual sync compaction can try.
 	 */
-	if (low_pfn == end_pfn && !skipped_async_unsuitable)
+	if (low_pfn == end_pfn)
 		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -875,7 +876,8 @@ static int compact_finished(struct zone *zone,
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
 		/* Let the next compaction start anew. */
-		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 		zone->compact_cached_free_pfn = zone_end_pfn(zone);
 
 		/*
@@ -1000,7 +1002,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1008,7 +1010,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-01 21:35     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-01 21:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Synchronous memory compaction can be very expensive: it can iterate an
enormous amount of memory without aborting, constantly rescheduling, waiting
on page locks and lru_lock, etc., if a pageblock cannot be defragmented.

Unfortunately, it's too expensive at page fault time for transparent hugepages,
and it's much better to simply fall back to regular pages.  On 128GB machines,
we find that synchronous memory compaction can take O(seconds) for a single
thp fault.

Now that async compaction remembers where it left off without strictly relying
on sync compaction, this makes thp allocations best-effort without causing
egregious latency during page faults.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2656,7 +2656,7 @@ rebalance:
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
-	} else {
+	} else if (!(gfp_mask & __GFP_NO_KSWAPD)) {
 		/*
 		 * High-order allocations do not necessarily loop after
 		 * direct reclaim and reclaim/compaction depends on compaction

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 1/4] mm, migration: add destination page freeing callback
  2014-05-01 21:35   ` David Rientjes
                     ` (3 preceding siblings ...)
  (?)
@ 2014-05-02 10:10   ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 10:10 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, May 01, 2014 at 02:35:37PM -0700, David Rientjes wrote:
> Memory migration uses a callback defined by the caller to determine how to
> allocate destination pages.  When migration fails for a source page, however, it 
> frees the destination page back to the system.
> 
> This patch adds a memory migration callback defined by the caller to determine 
> how to free destination pages.  If a caller, such as memory compaction, builds 
> its own freelist for migration targets, this can reuse already freed memory 
> instead of scanning additional memory.
> 
> If the caller provides a function to handle freeing of destination pages, it is 
> called when page migration fails.  Otherwise, it may pass NULL and freeing back 
> to the system will be handled as usual.  This patch introduces no functional 
> change.
> 
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 2/4] mm, compaction: return failed migration target pages back to freelist
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-02 10:11       ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 10:11 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, May 01, 2014 at 02:35:42PM -0700, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a 
> zone which isolates pages as migration targets while another "migrating scanner" 
> scans from the other end of the same zone which isolates pages for migration.
> 
> When page migration fails for an isolated page, the target page is returned to 
> the system rather than the freelist built by the freeing scanner.  This may 
> require the freeing scanner to continue scanning memory after suitable migration 
> targets have already been returned to the system needlessly.
> 
> This patch returns destination pages to the freeing scanner freelist when page 
> migration fails.  This prevents unnecessary work done by the freeing scanner but 
> also encourages memory to be as compacted as possible at the end of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-02 10:22       ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 10:22 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Thu, May 01, 2014 at 02:35:48PM -0700, David Rientjes wrote:
> Synchronous memory compaction can be very expensive: it can iterate an enormous 
> amount of memory without aborting, constantly rescheduling, waiting on page
> locks and lru_lock, etc, if a pageblock cannot be defragmented.
> 
> Unfortunately, it's too expensive for pagefault for transparent hugepages and 
> it's much better to simply fallback to pages.  On 128GB machines, we find that 
> synchronous memory compaction can take O(seconds) for a single thp fault.
> 
> Now that async compaction remembers where it left off without strictly relying
> on sync compaction, this makes thp allocations best-effort without causing
> egregious latency during pagefault.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Compaction uses MIGRATE_SYNC_LIGHT, which in the current implementation
avoids calling ->writepage to clean a dirty page in the fallback migrate
case:

static int fallback_migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page, enum migrate_mode mode)
{
        if (PageDirty(page)) {
                /* Only writeback pages in full synchronous migration */
                if (mode != MIGRATE_SYNC)
                        return -EBUSY;
                return writeout(mapping, page);
        }

or waiting on page writeback in other cases

                /*
                 * Only in the case of a full synchronous migration is it
                 * necessary to wait for PageWriteback. In the async case,
                 * the retry loop is too short and in the sync-light case,
                 * the overhead of stalling is too much
                 */
                if (mode != MIGRATE_SYNC) {
                        rc = -EBUSY;
                        goto uncharge;
                }

or on acquiring the page lock for unforced migrations.

However, buffers still get locked in the SYNC_LIGHT case, causing stalls in
buffer_migrate_lock_buffers, which may be undesirable, or maybe you are
hitting some other case.  It would be preferable to identify what is getting
stalled in SYNC_LIGHT compaction and fix that rather than disabling it
entirely.  You may also want to distinguish between direct compaction by a
process and collapsing huge pages as done by khugepaged.
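
For reference, the blocking branch in question (buffer_migrate_lock_buffers()
paraphrased from this era; both MIGRATE_SYNC and MIGRATE_SYNC_LIGHT take it):

	/* simple case, sync compaction: block on every buffer lock */
	if (mode != MIGRATE_ASYNC) {
		do {
			get_bh(bh);
			lock_buffer(bh);
			bh = bh->b_this_page;
		} while (bh != head);
		return true;
	}
	/* async compaction uses trylock_buffer() and gives up on contention */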

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 10:22       ` Mel Gorman
@ 2014-05-02 11:22         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-02 11:22 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, 2 May 2014, Mel Gorman wrote:

> > Synchronous memory compaction can be very expensive: it can iterate an enormous 
> > amount of memory without aborting, constantly rescheduling, waiting on page
> > locks and lru_lock, etc, if a pageblock cannot be defragmented.
> > 
> > Unfortunately, it's too expensive for pagefault for transparent hugepages and 
> > it's much better to simply fallback to pages.  On 128GB machines, we find that 
> > synchronous memory compaction can take O(seconds) for a single thp fault.
> > 
> > Now that async compaction remembers where it left off without strictly relying
> > on sync compaction, this makes thp allocations best-effort without causing
> > egregious latency during pagefault.
> > 
> > Signed-off-by: David Rientjes <rientjes@google.com>
> 
> Compaction uses MIGRATE_SYNC_LIGHT which in the current implementation
> avoids calling ->writepage to clean dirty page in the fallback migrate
> case
> 
> static int fallback_migrate_page(struct address_space *mapping,
>         struct page *newpage, struct page *page, enum migrate_mode mode)
> {
>         if (PageDirty(page)) {
>                 /* Only writeback pages in full synchronous migration */
>                 if (mode != MIGRATE_SYNC)
>                         return -EBUSY;
>                 return writeout(mapping, page);
>         }
> 
> or waiting on page writeback in other cases
> 
>                 /*
>                  * Only in the case of a full synchronous migration is it
>                  * necessary to wait for PageWriteback. In the async case,
>                  * the retry loop is too short and in the sync-light case,
>                  * the overhead of stalling is too much
>                  */
>                 if (mode != MIGRATE_SYNC) {
>                         rc = -EBUSY;
>                         goto uncharge;
>                 }
> 
> or on acquiring the page lock for unforced migrations.
> 

The page lock I'm referring to is the lock_page() in __unmap_and_move()
that gets called for sync compaction after the migrate_pages() iteration
makes a few passes and fails to grab it with trylock.  This becomes a forced
migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
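
For reference, the path being described, paraphrased from __unmap_and_move()
of this era (simplified; the exact guards may differ):

	if (!trylock_page(page)) {
		if (!force || mode == MIGRATE_ASYNC)
			goto out;	/* surfaces as -EAGAIN and is retried */

		lock_page(page);	/* later passes set force and block here */
	}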

> However, buffers still get locked in the SYNC_LIGHT case causing stalls
> in buffer_migrate_lock_buffers which may be undesirable or maybe you are
> hitting some other case.

We have perf profiles from one workload in particular that show contention
on i_mmap_mutex (anon isn't interesting since the vast majority of memory on
this workload [120GB on a 128GB machine] has a gup pin and doesn't get
isolated because of 119d6d59dcc0 ("mm, compaction: avoid isolating pinned
pages")) between cpus all doing memory compaction while trying to fault thp
memory.

That's one example that we've seen, but the fact remains that at times sync
compaction will iterate the entire 128GB machine and not allow an order-9
page to be allocated, and there's nothing to preempt it like the
need_resched() or lock contention checks that async compaction has.  (I say
"at times" because deferred compaction does help if it's triggered and the
cached pfn values help, but the worst case scenario is when the entire
machine is iterated.)  Would it make sense, when HPAGE_PMD_ORDER ==
pageblock_order (which is usually the case), to not even try to migrate
pages unless the entire pageblock is successfully isolated and gfp_mask &
__GFP_NO_KSWAPD is set?  That's the only other thing I can think of that
would avoid such a problem other than this patch.

We're struggling to find the logic that ensures sync compaction does not
become too expensive at page fault time, and unless that can be guaranteed,
it seemed best to avoid it entirely.

> It would be preferable to identify what is
> getting stalled in SYNC_LIGHT compaction and fix that rather than
> disabling it entirely. You may also want to distinguish between a direct
> compaction by a process and collapsing huge pages as done by khugepaged.
> 

I'm referring to latencies of 8-9 seconds to fault 64MB of transparent 
hugepages.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 11:22         ` David Rientjes
  (?)
@ 2014-05-02 11:58         ` Mel Gorman
  2014-05-02 20:29             ` David Rientjes
  -1 siblings, 1 reply; 267+ messages in thread
From: Mel Gorman @ 2014-05-02 11:58 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, May 02, 2014 at 04:22:03AM -0700, David Rientjes wrote:
> On Fri, 2 May 2014, Mel Gorman wrote:
> 
> > > Synchronous memory compaction can be very expensive: it can iterate an enormous 
> > > amount of memory without aborting, constantly rescheduling, waiting on page
> > > locks and lru_lock, etc, if a pageblock cannot be defragmented.
> > > 
> > > Unfortunately, it's too expensive for pagefault for transparent hugepages and 
> > > it's much better to simply fallback to pages.  On 128GB machines, we find that 
> > > synchronous memory compaction can take O(seconds) for a single thp fault.
> > > 
> > > Now that async compaction remembers where it left off without strictly relying
> > > on sync compaction, this makes thp allocations best-effort without causing
> > > egregious latency during pagefault.
> > > 
> > > Signed-off-by: David Rientjes <rientjes@google.com>
> > 
> > Compaction uses MIGRATE_SYNC_LIGHT which in the current implementation
> > avoids calling ->writepage to clean dirty page in the fallback migrate
> > case
> > 
> > static int fallback_migrate_page(struct address_space *mapping,
> >         struct page *newpage, struct page *page, enum migrate_mode mode)
> > {
> >         if (PageDirty(page)) {
> >                 /* Only writeback pages in full synchronous migration */
> >                 if (mode != MIGRATE_SYNC)
> >                         return -EBUSY;
> >                 return writeout(mapping, page);
> >         }
> > 
> > or waiting on page writeback in other cases
> > 
> >                 /*
> >                  * Only in the case of a full synchronous migration is it
> >                  * necessary to wait for PageWriteback. In the async case,
> >                  * the retry loop is too short and in the sync-light case,
> >                  * the overhead of stalling is too much
> >                  */
> >                 if (mode != MIGRATE_SYNC) {
> >                         rc = -EBUSY;
> >                         goto uncharge;
> >                 }
> > 
> > or on acquiring the page lock for unforced migrations.
> > 
> 
> The page locks I'm referring to is the lock_page() in __unmap_and_move() 
> that gets called for sync compaction after the migrate_pages() iteration 
> makes a few passes and unsuccessfully grabs it.  This becomes a forced 
> migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
> 

Can that be fixed then instead of disabling it entirely?

> > However, buffers still get locked in the SYNC_LIGHT case causing stalls
> > in buffer_migrate_lock_buffers which may be undesirable or maybe you are
> > hitting some other case.
> 
> We have perf profiles from one workload in particular that shows 
> contention on i_mmap_mutex (anon isn't interesting since the vast majority 
> of memory on this workload [120GB on a 128GB machine] is has a gup pin and 
> doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid 
> isolating pinned pages")) between cpus all doing memory compaction trying 
> to fault thp memory.
> 

Abort SYNC_LIGHT compaction if the mutex is contended.

> That's one example that we've seen, but the fact remains that at times 
> sync compaction will iterate the entire 128GB machine and not allow an 
> order-9 page to be allocated and there's nothing to preempt it like the 
> need_resched() or lock contention checks that async compaction has. 

Make compact_control->sync the same enum migrate_mode field, and check for
contention in the async/sync_light cases but leave it alone for sync
compaction via the proc interface?
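
A sketch of what that could look like (compaction_should_abort() here is a
hypothetical helper, not an existing interface):

	struct compact_control {
		/* ... existing fields ... */
		enum migrate_mode mode;		/* replaces "bool sync" */
	};

	static bool compaction_should_abort(struct compact_control *cc,
					    bool contended)
	{
		/* async and sync_light back off on contention; full sync
		 * compaction (e.g. the proc trigger) keeps going */
		return contended && cc->mode != MIGRATE_SYNC;
	}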

> (I 
> say "at times" because deferred compaction does help if it's triggered and 
> the cached pfn values help, but the worst case scenario is when the entire 
> machine is iterated.)  Would it make sense when HPAGE_PMD_ORDER == 
> pageblock_order, which is usually the case, that we don't even try to 
> migrate pages unless the entire pageblock is successfully isolated and 
> gfp_mask & __GFP_NO_KSWAPD? 

That would be ok for async and sync_light compaction, but again, leave it as
is for sync compaction via the proc interface to give us a basis for
comparison.

> That's the only other thing that I can think 
> that will avoid such a problem other than this patch.
> 
> We're struggling to find the logic that ensures sync compaction does not 
> become too expensive at pagefault and unless that can be guaranteed, it 
> seemed best to avoid it entirely.
> 

The intent of sync_light was to avoid compaction being too expensive at page
fault time.  At the time it would have been evaluated against machines with
4G of memory.  You have found that it did not go far enough.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch 1/2] mm, migration: add destination page freeing callback
  2014-05-01  0:45 ` David Rientjes
@ 2014-05-02 13:16   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 13:16 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/01/2014 02:45 AM, David Rientjes wrote:
> Memory migration uses a callback defined by the caller to determine how to
> allocate destination pages.  When migration fails for a source page, however, it
> frees the destination page back to the system.
>
> This patch adds a memory migration callback defined by the caller to determine
> how to free destination pages.  If a caller, such as memory compaction, builds
> its own freelist for migration targets, this can reuse already freed memory
> instead of scanning additional memory.
>
> If the caller provides a function to handle freeing of destination pages, it is
> called when page migration fails.  Otherwise, it may pass NULL and freeing back
> to the system will be handled as usual.  This patch introduces no functional
> change.
>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>   include/linux/migrate.h | 11 ++++++----
>   mm/compaction.c         |  2 +-
>   mm/memory-failure.c     |  4 ++--
>   mm/memory_hotplug.c     |  2 +-
>   mm/mempolicy.c          |  4 ++--
>   mm/migrate.c            | 54 ++++++++++++++++++++++++++++++++-----------------
>   mm/page_alloc.c         |  2 +-
>   7 files changed, 50 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -5,7 +5,9 @@
>   #include <linux/mempolicy.h>
>   #include <linux/migrate_mode.h>
>
> -typedef struct page *new_page_t(struct page *, unsigned long private, int **);
> +typedef struct page *new_page_t(struct page *page, unsigned long private,
> +				int **reason);
> +typedef void free_page_t(struct page *page, unsigned long private);
>
>   /*
>    * Return values from addresss_space_operations.migratepage():
> @@ -38,7 +40,7 @@ enum migrate_reason {
>   extern void putback_movable_pages(struct list_head *l);
>   extern int migrate_page(struct address_space *,
>   			struct page *, struct page *, enum migrate_mode);
> -extern int migrate_pages(struct list_head *l, new_page_t x,
> +extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
>   		unsigned long private, enum migrate_mode mode, int reason);
>
>   extern int migrate_prep(void);
> @@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
>   #else
>
>   static inline void putback_movable_pages(struct list_head *l) {}
> -static inline int migrate_pages(struct list_head *l, new_page_t x,
> -		unsigned long private, enum migrate_mode mode, int reason)
> +static inline int migrate_pages(struct list_head *l, new_page_t new,
> +		free_page_t free, unsigned long private, enum migrate_mode mode,
> +		int reason)
>   	{ return -ENOSYS; }
>
>   static inline int migrate_prep(void) { return -ENOSYS; }
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   		}
>
>   		nr_migrate = cc->nr_migratepages;
> -		err = migrate_pages(&cc->migratepages, compaction_alloc,
> +		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
>   				(unsigned long)cc,
>   				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>   				MR_COMPACTION);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
>
>   	/* Keep page count to indicate a given hugepage is isolated. */
>   	list_move(&hpage->lru, &pagelist);
> -	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
> +	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
>   				MIGRATE_SYNC, MR_MEMORY_FAILURE);
>   	if (ret) {
>   		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
> @@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
>   		inc_zone_page_state(page, NR_ISOLATED_ANON +
>   					page_is_file_cache(page));
>   		list_add(&page->lru, &pagelist);
> -		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
> +		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
>   					MIGRATE_SYNC, MR_MEMORY_FAILURE);
>   		if (ret) {
>   			if (!list_empty(&pagelist)) {
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
>   		 * alloc_migrate_target should be improooooved!!
>   		 * migrate_pages returns # of failed pages.
>   		 */
> -		ret = migrate_pages(&source, alloc_migrate_target, 0,
> +		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
>   					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
>   		if (ret)
>   			putback_movable_pages(&source);
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
>   			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
>
>   	if (!list_empty(&pagelist)) {
> -		err = migrate_pages(&pagelist, new_node_page, dest,
> +		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
>   					MIGRATE_SYNC, MR_SYSCALL);
>   		if (err)
>   			putback_movable_pages(&pagelist);
> @@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
>   		if (!list_empty(&pagelist)) {
>   			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
>   			nr_failed = migrate_pages(&pagelist, new_vma_page,
> -					(unsigned long)vma,
> +					NULL, (unsigned long)vma,
>   					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
>   			if (nr_failed)
>   				putback_movable_pages(&pagelist);
> diff --git a/mm/migrate.c b/mm/migrate.c
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -938,8 +938,9 @@ out:
>    * Obtain the lock on page, remove all ptes and migrate the page
>    * to the newly allocated page in newpage.
>    */
> -static int unmap_and_move(new_page_t get_new_page, unsigned long private,
> -			struct page *page, int force, enum migrate_mode mode)
> +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
> +			unsigned long private, struct page *page, int force,
> +			enum migrate_mode mode)
>   {
>   	int rc = 0;
>   	int *result = NULL;
> @@ -983,11 +984,12 @@ out:
>   				page_is_file_cache(page));
>   		putback_lru_page(page);
>   	}
> -	/*
> -	 * Move the new page to the LRU. If migration was not successful
> -	 * then this will free the page.
> -	 */
> -	putback_lru_page(newpage);
> +
> +	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
> +		put_new_page(newpage, private);
> +	else
> +		putback_lru_page(newpage);
> +
>   	if (result) {
>   		if (rc)
>   			*result = rc;
> @@ -1016,8 +1018,9 @@ out:
>    * will wait in the page fault for migration to complete.
>    */
>   static int unmap_and_move_huge_page(new_page_t get_new_page,
> -				unsigned long private, struct page *hpage,
> -				int force, enum migrate_mode mode)
> +				free_page_t put_new_page, unsigned long private,
> +				struct page *hpage, int force,
> +				enum migrate_mode mode)
>   {
>   	int rc = 0;
>   	int *result = NULL;
> @@ -1056,20 +1059,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
>   	if (!page_mapped(hpage))
>   		rc = move_to_new_page(new_hpage, hpage, 1, mode);
>
> -	if (rc)
> +	if (rc != MIGRATEPAGE_SUCCESS)
>   		remove_migration_ptes(hpage, hpage);
>
>   	if (anon_vma)
>   		put_anon_vma(anon_vma);
>
> -	if (!rc)
> +	if (rc == MIGRATEPAGE_SUCCESS)
>   		hugetlb_cgroup_migrate(hpage, new_hpage);
>
>   	unlock_page(hpage);
>   out:
>   	if (rc != -EAGAIN)
>   		putback_active_hugepage(hpage);
> -	put_page(new_hpage);
> +
> +	/*
> +	 * If migration was not successful and there's a freeing callback, use
> +	 * it.  Otherwise, put_page() will drop the reference grabbed during
> +	 * isolation.
> +	 */
> +	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
> +		put_new_page(new_hpage, private);
> +	else
> +		put_page(new_hpage);
> +
>   	if (result) {
>   		if (rc)
>   			*result = rc;
> @@ -1086,6 +1099,8 @@ out:
>    * @from:		The list of pages to be migrated.
>    * @get_new_page:	The function used to allocate free pages to be used
>    *			as the target of the page migration.
> + * @put_new_page:	The function used to free target pages if migration
> + *			fails, or NULL if no special handling is necessary.
>    * @private:		Private data to be passed on to get_new_page()
>    * @mode:		The migration mode that specifies the constraints for
>    *			page migration, if any.
> @@ -1099,7 +1114,8 @@ out:
>    * Returns the number of pages that were not migrated, or an error code.
>    */
>   int migrate_pages(struct list_head *from, new_page_t get_new_page,
> -		unsigned long private, enum migrate_mode mode, int reason)
> +		free_page_t put_new_page, unsigned long private,
> +		enum migrate_mode mode, int reason)
>   {
>   	int retry = 1;
>   	int nr_failed = 0;
> @@ -1121,10 +1137,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>
>   			if (PageHuge(page))
>   				rc = unmap_and_move_huge_page(get_new_page,
> -						private, page, pass > 2, mode);
> +						put_new_page, private, page,
> +						pass > 2, mode);
>   			else
> -				rc = unmap_and_move(get_new_page, private,
> -						page, pass > 2, mode);
> +				rc = unmap_and_move(get_new_page, put_new_page,
> +						private, page, pass > 2, mode);
>
>   			switch(rc) {
>   			case -ENOMEM:
> @@ -1273,7 +1290,7 @@ set_status:
>
>   	err = 0;
>   	if (!list_empty(&pagelist)) {
> -		err = migrate_pages(&pagelist, new_page_node,
> +		err = migrate_pages(&pagelist, new_page_node, NULL,
>   				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
>   		if (err)
>   			putback_movable_pages(&pagelist);
> @@ -1729,7 +1746,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
>
>   	list_add(&page->lru, &migratepages);
>   	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
> -				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
> +				     NULL, node, MIGRATE_ASYNC,
> +				     MR_NUMA_MISPLACED);
>   	if (nr_remaining) {
>   		if (!list_empty(&migratepages)) {
>   			list_del(&page->lru);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>   		cc->nr_migratepages -= nr_reclaimed;
>
>   		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
> -				    0, MIGRATE_SYNC, MR_CMA);
> +				    NULL, 0, MIGRATE_SYNC, MR_CMA);
>   	}
>   	if (ret < 0) {
>   		putback_movable_pages(&cc->migratepages);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread
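
As a rough illustration of the interface added by this patch, a hypothetical caller that maintains its own list of target pages could wire up the callback pair as sketched below. The struct and function names (struct my_migrate_ctx, my_alloc_target, my_free_target) are invented for the sketch and are not part of the patch; only the migrate_pages() signature and the new_page_t/free_page_t typedefs come from the change quoted above.

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/list.h>

struct my_migrate_ctx {
	struct list_head freelist;	/* target pages this caller isolated */
	unsigned long nr_free;
};

static struct page *my_alloc_target(struct page *src, unsigned long private,
				    int **result)
{
	struct my_migrate_ctx *ctx = (struct my_migrate_ctx *)private;
	struct page *newpage;

	if (list_empty(&ctx->freelist))
		return NULL;	/* no target: migration of src will fail */

	newpage = list_first_entry(&ctx->freelist, struct page, lru);
	list_del(&newpage->lru);
	ctx->nr_free--;
	return newpage;
}

static void my_free_target(struct page *page, unsigned long private)
{
	struct my_migrate_ctx *ctx = (struct my_migrate_ctx *)private;

	/* called only when migration to this target failed: keep it for reuse */
	list_add(&page->lru, &ctx->freelist);
	ctx->nr_free++;
}

/*
 * err = migrate_pages(&source_pages, my_alloc_target, my_free_target,
 *		       (unsigned long)&ctx, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 */

If such a caller passes NULL instead of my_free_target, unmap_and_move() falls back to putback_lru_page()/put_page() exactly as before the patch.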


* Re: [patch v2 2/4] mm, compaction: return failed migration target pages back to freelist
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-02 15:23       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:23 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/01/2014 11:35 PM, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a
> zone which isolates pages as migration targets while another "migrating scanner"
> scans from the other end of the same zone which isolates pages for migration.
>
> When page migration fails for an isolated page, the target page is returned to
> the system rather than the freelist built by the freeing scanner.  This may
> require the freeing scanner to continue scanning memory after suitable migration
> targets have already been returned to the system needlessly.
>
> This patch returns destination pages to the freeing scanner freelist when page
> migration fails.  This prevents unnecessary work done by the freeing scanner but
> also encourages memory to be as compacted as possible at the end of the zone.

Note that the free scanner's work can be reduced further, as it no longer
needs to restart at the highest pageblock where it isolated something. It
currently does that precisely because pages might have been returned to the
allocator and we don't want to miss them.
I'll reply with a patch for that shortly (it applies on top of the
isolate_freepages() changes in -mm plus Joonsoo's not-yet-included patch
from http://marc.info/?l=linux-mm&m=139841452326021&w=2 )

> Reported-by: Greg Thelen <gthelen@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>   mm/compaction.c | 27 ++++++++++++++++++---------
>   1 file changed, 18 insertions(+), 9 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -797,23 +797,32 @@ static struct page *compaction_alloc(struct page *migratepage,
>   }
>
>   /*
> - * We cannot control nr_migratepages and nr_freepages fully when migration is
> - * running as migrate_pages() has no knowledge of compact_control. When
> - * migration is complete, we count the number of pages on the lists by hand.
> + * This is a migrate-callback that "frees" freepages back to the isolated
> + * freelist.  All pages on the freelist are from the same zone, so there is no
> + * special handling needed for NUMA.
> + */
> +static void compaction_free(struct page *page, unsigned long data)
> +{
> +	struct compact_control *cc = (struct compact_control *)data;
> +
> +	list_add(&page->lru, &cc->freepages);
> +	cc->nr_freepages++;
> +}
> +
> +/*
> + * We cannot control nr_migratepages fully when migration is running as
> + * migrate_pages() has no knowledge of of compact_control.  When migration is
> + * complete, we count the number of pages on the list by hand.

Actually, migrate_pages() returns the number of failed pages, except when 
it returns an error, and we only use the value for the tracepoint anyway. 
I'll send a follow-up patch for this.

>    */
>   static void update_nr_listpages(struct compact_control *cc)
>   {
>   	int nr_migratepages = 0;
> -	int nr_freepages = 0;
>   	struct page *page;
>
>   	list_for_each_entry(page, &cc->migratepages, lru)
>   		nr_migratepages++;
> -	list_for_each_entry(page, &cc->freepages, lru)
> -		nr_freepages++;
>
>   	cc->nr_migratepages = nr_migratepages;
> -	cc->nr_freepages = nr_freepages;
>   }
>
>   /* possible outcome of isolate_migratepages */
> @@ -1023,8 +1032,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   		}
>
>   		nr_migrate = cc->nr_migratepages;
> -		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
> -				(unsigned long)cc,
> +		err = migrate_pages(&cc->migratepages, compaction_alloc,
> +				compaction_free, (unsigned long)cc,
>   				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>   				MR_COMPACTION);
>   		update_nr_listpages(cc);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread


* [PATCH] mm/compaction: do not count migratepages when unnecessary
  2014-05-02 15:23       ` Vlastimil Babka
  (?)
@ 2014-05-02 15:26       ` Vlastimil Babka
  2014-05-06 21:18         ` Naoya Horiguchi
       [not found]         ` <1399411134-k43fsr0p@n-horiguchi@ah.jp.nec.com>
  -1 siblings, 2 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:26 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

During compaction, update_nr_listpages() has been used to count remaining
non-migrated and free pages after a call to migrate_pages(). The freepages
counting has become unnecessary, and it turns out that migratepages counting
is also unnecessary in most cases.

The only situation in which cc->migratepages still needs to be counted is when
migrate_pages() returns a negative error code. Otherwise, the non-negative
return value is the number of pages that were not migrated, which is exactly
the count of remaining pages in the cc->migratepages list.

Furthermore, any non-zero count is only interesting for the tracepoint of
mm_compaction_migratepages events, because after that all remaining unmigrated
pages are put back and their count is set to 0.

This patch therefore removes update_nr_listpages() completely, and changes the
tracepoint definition so that the manual counting is done only when the
tracepoint is enabled, and only when migrate_pages() returns a negative error
code.

Furthermore, migrate_pages() and the tracepoints won't be called when there's
nothing to migrate. This potentially avoids some wasted cycles and reduces the
volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
The mm_compaction_isolate_migratepages event is better for determining that
nothing was isolated for migration, and this one was just duplicating the info.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
 mm/compaction.c                   | 30 ++++++++----------------------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544e..bbd5e1f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,7 +5,9 @@
 #define _TRACE_COMPACTION_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <linux/tracepoint.h>
+#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
 
 TRACE_EVENT(mm_compaction_migratepages,
 
-	TP_PROTO(unsigned long nr_migrated,
-		unsigned long nr_failed),
+	TP_PROTO(unsigned long nr_all,
+		int migrate_rc,
+		struct list_head * migratepages),
 
-	TP_ARGS(nr_migrated, nr_failed),
+	TP_ARGS(nr_all, migrate_rc, migratepages),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, nr_migrated)
@@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
 	),
 
 	TP_fast_assign(
-		__entry->nr_migrated = nr_migrated;
+		unsigned long nr_failed = 0;
+		struct page *page;
+
+		/*
+		 * migrate_pages() returns either a non-negative number
+		 * with the number of pages that failed migration, or an
+		 * error code, in which case we need to count the remaining
+		 * pages manually
+		 */
+		if (migrate_rc >= 0)
+			nr_failed = migrate_rc;
+		else
+		        list_for_each_entry(page, migratepages, lru)
+				nr_failed++;
+
+		__entry->nr_migrated = nr_all - nr_failed;
 		__entry->nr_failed = nr_failed;
 	),
 
diff --git a/mm/compaction.c b/mm/compaction.c
index ae1d0ae..873d7de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
 	cc->nr_freepages++;
 }
 
-/*
- * We cannot control nr_migratepages fully when migration is running as
- * migrate_pages() has no knowledge of of compact_control.  When migration is
- * complete, we count the number of pages on the list by hand.
- */
-static void update_nr_listpages(struct compact_control *cc)
-{
-	int nr_migratepages = 0;
-	struct page *page;
-
-	list_for_each_entry(page, &cc->migratepages, lru)
-		nr_migratepages++;
-
-	cc->nr_migratepages = nr_migratepages;
-}
-
 /* possible outcome of isolate_migratepages */
 typedef enum {
 	ISOLATE_ABORT,		/* Abort compaction now */
@@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
-		unsigned long nr_migrate, nr_remaining;
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		nr_migrate = cc->nr_migratepages;
+		if (!cc->nr_migratepages)
+			continue;
+
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
-		update_nr_listpages(cc);
-		nr_remaining = cc->nr_migratepages;
 
-		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
-						nr_remaining);
+		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+							&cc->migratepages);
 
 		/* Release isolated pages not migrated */
 		if (err) {
@@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				ret = COMPACT_PARTIAL;
 				goto out;
 			}
+		} else {
+			/* All pages were successfully migrated */
+			cc->nr_migratepages = 0;
 		}
 	}
 
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread
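
One detail worth spelling out about the patch above: TP_fast_assign() only executes when the tracepoint is enabled, which is why moving the counting into the event avoids the list walk in the common case. Stripped of the TRACE_EVENT machinery, the counting amounts to the following; this is an illustrative helper written for this explanation (assuming the usual <linux/list.h> and <linux/mm_types.h>), not code from the patch.

static unsigned long migratepages_nr_failed(unsigned long nr_all,
					    int migrate_rc,
					    struct list_head *migratepages)
{
	unsigned long nr_failed = 0;
	struct page *page;

	if (migrate_rc >= 0) {
		/* migrate_pages() already reported how many pages failed */
		nr_failed = migrate_rc;
	} else {
		/* error return: count the pages still left on the list */
		list_for_each_entry(page, migratepages, lru)
			nr_failed++;
	}

	/* the event then reports nr_migrated = nr_all - nr_failed */
	return nr_failed;
}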

* [PATCH 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-02 15:23       ` Vlastimil Babka
@ 2014-05-02 15:27         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:27 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel

The compaction free scanner in isolate_freepages() currently remembers the PFN
of the highest pageblock where it successfully isolated pages, to be used as
the starting pageblock for the next invocation. The rationale behind this is
that page migration might return free pages to the allocator when migration
fails, and we don't want to skip them if the compaction continues.

Since migration now returns free pages to the compaction code where they can
be reused, this is no longer a concern. This patch changes isolate_freepages()
so that the restart PFN is updated for each pageblock where isolation is
attempted. Using stress-highalloc from mmtests, this resulted in a 10%
reduction in the pages scanned by the free scanner.

Note that the somewhat similar functionality that records the highest
successful pageblock in zone->compact_cached_free_pfn remains unchanged. That
cache is used when the whole compaction is restarted, not for multiple
invocations of the free scanner during a single compaction.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 mm/compaction.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 873d7de..1967850 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -686,12 +686,6 @@ static void isolate_freepages(struct zone *zone,
 	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
 
 	/*
-	 * If no pages are isolated, the block_start_pfn < low_pfn check
-	 * will kick in.
-	 */
-	next_free_pfn = 0;
-
-	/*
 	 * Isolate free pages until enough are available to migrate the
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
@@ -731,19 +725,19 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Found a block suitable for isolating free pages from */
+		next_free_pfn = block_start_pfn;
 		isolated = isolate_freepages_block(cc, block_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
 
 		/*
-		 * Record the highest PFN we isolated pages from. When next
-		 * looking for free pages, the search will restart here as
-		 * page migration may have returned some pages to the allocator
+		 * Set a flag that we successfully isolated in this pageblock.
+		 * In the next loop iteration, zone->compact_cached_free_pfn
+		 * will not be updated and thus it will effectively contain the
+		 * highest pageblock we isolated pages from.
 		 */
-		if (isolated && next_free_pfn == 0) {
+		if (isolated)
 			cc->finished_update_free = true;
-			next_free_pfn = block_start_pfn;
-		}
 	}
 
 	/* split_free_page does not map the pages */
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread


* [PATCH 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-02 15:23       ` Vlastimil Babka
@ 2014-05-02 15:29         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-02 15:29 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel

During compaction, update_nr_listpages() has been used to count remaining
non-migrated and free pages after a call to migrate_pages(). The freepages
counting has become unnecessary, and it turns out that migratepages counting
is also unnecessary in most cases.

The only situation in which cc->migratepages still needs to be counted is when
migrate_pages() returns a negative error code. Otherwise, the non-negative
return value is the number of pages that were not migrated, which is exactly
the count of remaining pages in the cc->migratepages list.

Furthermore, any non-zero count is only interesting for the tracepoint of
mm_compaction_migratepages events, because after that all remaining unmigrated
pages are put back and their count is set to 0.

This patch therefore removes update_nr_listpages() completely, and changes the
tracepoint definition so that the manual counting is done only when the
tracepoint is enabled, and only when migrate_pages() returns a negative error
code.

Furthermore, migrate_pages() and the tracepoints won't be called when there's
nothing to migrate. This potentially avoids some wasted cycles and reduces the
volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
The mm_compaction_isolate_migratepages event is better for determining that
nothing was isolated for migration, and this one was just duplicating the info.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
 mm/compaction.c                   | 30 ++++++++----------------------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544e..bbd5e1f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,7 +5,9 @@
 #define _TRACE_COMPACTION_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <linux/tracepoint.h>
+#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
 
 TRACE_EVENT(mm_compaction_migratepages,
 
-	TP_PROTO(unsigned long nr_migrated,
-		unsigned long nr_failed),
+	TP_PROTO(unsigned long nr_all,
+		int migrate_rc,
+		struct list_head * migratepages),
 
-	TP_ARGS(nr_migrated, nr_failed),
+	TP_ARGS(nr_all, migrate_rc, migratepages),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, nr_migrated)
@@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
 	),
 
 	TP_fast_assign(
-		__entry->nr_migrated = nr_migrated;
+		unsigned long nr_failed = 0;
+		struct page *page;
+
+		/*
+		 * migrate_pages() returns either a non-negative number
+		 * with the number of pages that failed migration, or an
+		 * error code, in which case we need to count the remaining
+		 * pages manually
+		 */
+		if (migrate_rc >= 0)
+			nr_failed = migrate_rc;
+		else
+		        list_for_each_entry(page, migratepages, lru)
+				nr_failed++;
+
+		__entry->nr_migrated = nr_all - nr_failed;
 		__entry->nr_failed = nr_failed;
 	),
 
diff --git a/mm/compaction.c b/mm/compaction.c
index ae1d0ae..873d7de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
 	cc->nr_freepages++;
 }
 
-/*
- * We cannot control nr_migratepages fully when migration is running as
- * migrate_pages() has no knowledge of of compact_control.  When migration is
- * complete, we count the number of pages on the list by hand.
- */
-static void update_nr_listpages(struct compact_control *cc)
-{
-	int nr_migratepages = 0;
-	struct page *page;
-
-	list_for_each_entry(page, &cc->migratepages, lru)
-		nr_migratepages++;
-
-	cc->nr_migratepages = nr_migratepages;
-}
-
 /* possible outcome of isolate_migratepages */
 typedef enum {
 	ISOLATE_ABORT,		/* Abort compaction now */
@@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
-		unsigned long nr_migrate, nr_remaining;
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		nr_migrate = cc->nr_migratepages;
+		if (!cc->nr_migratepages)
+			continue;
+
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
-		update_nr_listpages(cc);
-		nr_remaining = cc->nr_migratepages;
 
-		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
-						nr_remaining);
+		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+							&cc->migratepages);
 
 		/* Release isolated pages not migrated */
 		if (err) {
@@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				ret = COMPACT_PARTIAL;
 				goto out;
 			}
+		} else {
+			/* All pages were successfully migrated */
+			cc->nr_migratepages = 0;
 		}
 	}
 
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread


* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 11:58         ` Mel Gorman
@ 2014-05-02 20:29             ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-02 20:29 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, 2 May 2014, Mel Gorman wrote:

> > The page locks I'm referring to is the lock_page() in __unmap_and_move() 
> > that gets called for sync compaction after the migrate_pages() iteration 
> > makes a few passes and unsuccessfully grabs it.  This becomes a forced 
> > migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
> > 
> 
> Can that be fixed then instead of disabling it entirely?
> 

We could return -EAGAIN when the trylock_page() fails for 
MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that 
currently for MIGRATE_ASYNC, and I could extend it to be ignored for 
MIGRATE_SYNC_LIGHT as well.

> > We have perf profiles from one workload in particular that shows 
> > contention on i_mmap_mutex (anon isn't interesting since the vast majority 
> > of memory on this workload [120GB on a 128GB machine] is has a gup pin and 
> > doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid 
> > isolating pinned pages")) between cpus all doing memory compaction trying 
> > to fault thp memory.
> > 
> 
> Abort SYNC_LIGHT compaction if the mutex is contended.
> 

Yeah, I have patches for that as well but we're waiting to see if they are 
actually needed when sync compaction is disabled for thp.  If we aren't 
actually going to disable it entirely, then I can revive those patches if 
the contention becomes such an issue.

> > That's one example that we've seen, but the fact remains that at times 
> > sync compaction will iterate the entire 128GB machine and not allow an 
> > order-9 page to be allocated and there's nothing to preempt it like the 
> > need_resched() or lock contention checks that async compaction has. 
> 
> Make compact_control->sync the same enum field and check for contention
> on the async/sync_light case but leave it for sync if compacting via the
> proc interface?
> 

Ok, that certainly can be done, I wasn't sure you would be happy with such 
a change.  I'm not sure there's so much of a difference between the new 
compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now, 
though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from 
the page allocator, i.e. remove sync_migration entirely, and just retry 
with a second call to compaction before failing instead? 

^ permalink raw reply	[flat|nested] 267+ messages in thread
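
For context on the trylock discussion above, the page-lock path in __unmap_and_move() at this point in the series looks roughly like the first snippet below, and the relaxation David describes (letting MIGRATE_SYNC_LIGHT also skip when the trylock fails) would look something like the second. Both are simplified sketches for illustration only, not an actual posted patch.

	/* current behaviour, simplified */
	if (!trylock_page(page)) {
		if (!force || mode == MIGRATE_ASYNC)
			goto out;		/* give up with -EAGAIN */
		lock_page(page);		/* sync modes may block here */
	}

	/* sketch of the discussed relaxation */
	if (!trylock_page(page)) {
		if (!force || mode != MIGRATE_SYNC)
			goto out;		/* ASYNC and SYNC_LIGHT both skip */
		lock_page(page);
	}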


* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-01 21:35     ` David Rientjes
@ 2014-05-05  9:34       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-05  9:34 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/01/2014 11:35 PM, David Rientjes wrote:
> Each zone has a cached migration scanner pfn for memory compaction so that
> subsequent calls to memory compaction can start where the previous call left
> off.
>
> Currently, the compaction migration scanner only updates the per-zone cached pfn
> when pageblocks were not skipped for async compaction.  This creates a
> dependency on calling sync compaction to avoid having subsequent calls to async
> compaction from scanning an enormous amount of non-MOVABLE pageblocks each time
> it is called.  On large machines, this could be potentially very expensive.

OK, that's due to my commit 50b5b094e6 ("mm: compaction: do not mark 
unmovable pageblocks as skipped in async compaction"); the intention 
was to avoid marking pageblocks as to-be-skipped just because they were 
ignored by async compaction, which would make the following sync 
compaction ignore them as well. However, it's true that 
update_pageblock_skip() also updates the cached pfns, and not updating 
them is a side effect of this change.

I didn't think that would be a problem as skipping whole pageblocks due 
to being non-movable should be fast and without taking locks. But if 
your testing shows that this is a problem, then OK.

> This patch adds a per-zone cached migration scanner pfn only for async
> compaction.  It is updated everytime a pageblock has been scanned in its
> entirety and when no pages from it were successfully isolated.  The cached
> migration scanner pfn for sync compaction is updated only when called for sync
> compaction.

I think this might be overkill, and maybe just decoupling the cached 
pfn update from update_pageblock_skip() would be enough, i.e. 
restore pre-50b5b094e6 behavior for the cached pfn (but not for the skip 
bits)? I wonder if your new sync migration scanner would make any 
difference. Presumably, when async compaction finishes without success by 
having the scanners meet, compact_finished() will reset the cached pfns 
and the sync compaction will not have a chance to use any previously 
cached value anyway?

> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   include/linux/mmzone.h |  5 +++--
>   mm/compaction.c        | 55 ++++++++++++++++++++++++++------------------------
>   2 files changed, 32 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -360,9 +360,10 @@ struct zone {
>   	/* Set to true when the PG_migrate_skip bits should be cleared */
>   	bool			compact_blockskip_flush;
>
> -	/* pfns where compaction scanners should start */
> +	/* pfn where compaction free scanner should start */
>   	unsigned long		compact_cached_free_pfn;
> -	unsigned long		compact_cached_migrate_pfn;
> +	/* pfn where async and sync compaction migration scanner should start */
> +	unsigned long		compact_cached_migrate_pfn[2];
>   #endif
>   #ifdef CONFIG_MEMORY_HOTPLUG
>   	/* see spanned/present_pages for more description */
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
>   	unsigned long end_pfn = zone_end_pfn(zone);
>   	unsigned long pfn;
>
> -	zone->compact_cached_migrate_pfn = start_pfn;
> +	zone->compact_cached_migrate_pfn[0] = start_pfn;
> +	zone->compact_cached_migrate_pfn[1] = start_pfn;
>   	zone->compact_cached_free_pfn = end_pfn;
>   	zone->compact_blockskip_flush = false;
>
> @@ -134,6 +135,7 @@ static void update_pageblock_skip(struct compact_control *cc,
>   			bool migrate_scanner)
>   {
>   	struct zone *zone = cc->zone;
> +	unsigned long pfn;
>
>   	if (cc->ignore_skip_hint)
>   		return;
> @@ -141,20 +143,25 @@ static void update_pageblock_skip(struct compact_control *cc,
>   	if (!page)
>   		return;
>
> -	if (!nr_isolated) {
> -		unsigned long pfn = page_to_pfn(page);
> -		set_pageblock_skip(page);
> -
> -		/* Update where compaction should restart */
> -		if (migrate_scanner) {
> -			if (!cc->finished_update_migrate &&
> -			    pfn > zone->compact_cached_migrate_pfn)
> -				zone->compact_cached_migrate_pfn = pfn;
> -		} else {
> -			if (!cc->finished_update_free &&
> -			    pfn < zone->compact_cached_free_pfn)
> -				zone->compact_cached_free_pfn = pfn;
> -		}
> +	if (nr_isolated)
> +		return;
> +
> +	set_pageblock_skip(page);
> +	pfn = page_to_pfn(page);
> +
> +	/* Update where async and sync compaction should restart */
> +	if (migrate_scanner) {
> +		if (cc->finished_update_migrate)
> +			return;
> +		if (pfn > zone->compact_cached_migrate_pfn[0])
> +			zone->compact_cached_migrate_pfn[0] = pfn;
> +		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
> +			zone->compact_cached_migrate_pfn[1] = pfn;
> +	} else {
> +		if (cc->finished_update_free)
> +			return;
> +		if (pfn < zone->compact_cached_free_pfn)
> +			zone->compact_cached_free_pfn = pfn;
>   	}
>   }
>   #else
> @@ -464,7 +471,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	unsigned long flags;
>   	bool locked = false;
>   	struct page *page = NULL, *valid_page = NULL;
> -	bool skipped_async_unsuitable = false;
>   	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
>   				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
>
> @@ -540,11 +546,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			 * the minimum amount of work satisfies the allocation
>   			 */
>   			mt = get_pageblock_migratetype(page);
> -			if (!cc->sync && !migrate_async_suitable(mt)) {
> -				cc->finished_update_migrate = true;
> -				skipped_async_unsuitable = true;
> +			if (!cc->sync && !migrate_async_suitable(mt))
>   				goto next_pageblock;
> -			}
>   		}
>
>   		/*
> @@ -646,10 +649,8 @@ next_pageblock:
>   	/*
>   	 * Update the pageblock-skip information and cached scanner pfn,
>   	 * if the whole pageblock was scanned without isolating any page.
> -	 * This is not done when pageblock was skipped due to being unsuitable
> -	 * for async compaction, so that eventual sync compaction can try.
>   	 */
> -	if (low_pfn == end_pfn && !skipped_async_unsuitable)
> +	if (low_pfn == end_pfn)
>   		update_pageblock_skip(cc, valid_page, nr_isolated, true);
>
>   	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
> @@ -875,7 +876,8 @@ static int compact_finished(struct zone *zone,
>   	/* Compaction run completes if the migrate and free scanner meet */
>   	if (cc->free_pfn <= cc->migrate_pfn) {
>   		/* Let the next compaction start anew. */
> -		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
>   		zone->compact_cached_free_pfn = zone_end_pfn(zone);
>
>   		/*
> @@ -1000,7 +1002,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	 * information on where the scanners should start but check that it
>   	 * is initialised by ensuring the values are within zone boundaries.
>   	 */
> -	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
> +	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
>   	cc->free_pfn = zone->compact_cached_free_pfn;
>   	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
>   		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> @@ -1008,7 +1010,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	}
>   	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
>   		cc->migrate_pfn = start_pfn;
> -		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
>   	}
>
>   	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-05  9:34       ` Vlastimil Babka
@ 2014-05-05  9:51         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-05  9:51 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Mon, 5 May 2014, Vlastimil Babka wrote:

> OK that's due to my commit 50b5b094e6 ("mm: compaction: do not mark unmovable
> pageblocks as skipped in async compaction") and the intention was to avoid
> marking pageblocks as to-be-skipped just because they were ignored by async
> compaction, which would make the following sync compaction ignore them as
> well. However it's true that update_pageblock_skip() also updates the cached
> pfn's and not updating them is a sideeffect of this change.
> 

It's not necessarily just that commit: update_pageblock_skip() won't do 
anything if cc->finished_update_migrate is true, which already happened before 
the commit.  This issue was noticed on a kernel without your commit.

> I didn't think that would be a problem as skipping whole pageblocks due to
> being non-movable should be fast and without taking locks. But if your testing
> shows that this is a problem, then OK.
> 

Async compaction terminates early when lru_lock is contended or 
need_resched() is true, and on zones as large as those of a 128GB machine 
this happens often.  A thp allocation returns immediately because of 
contended_compaction in the page allocator.  When the next thp is 
allocated, async compaction starts from where the former iteration started 
because we don't do any caching of the pfns and nothing called sync 
compaction.  It's simply unnecessary overhead that can be prevented on the 
next call and it leaves a potentially large part of the zone unscanned if 
we continuously fail because of contention.  This patch fixes that.
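
(A toy model of that behaviour, in case it helps: each call resumes the
migration scan from a cached position and saves where it stopped when it
bails out on contention. All names and sizes here are made up for the
example, this is not the kernel code:)

#include <stdbool.h>
#include <stdio.h>

#define ZONE_BLOCKS	16

struct toy_zone {
	unsigned long cached_async_block;	/* resume point for async runs */
};

/* made-up stand-in for the need_resched()/lru_lock contention checks */
static bool contended(unsigned long scanned)
{
	return scanned >= 4;	/* pretend we are preempted after 4 blocks */
}

static void async_compact(struct toy_zone *zone)
{
	unsigned long block = zone->cached_async_block;
	unsigned long scanned = 0;

	while (block < ZONE_BLOCKS) {
		if (contended(++scanned))
			break;			/* terminate early */
		block++;
	}
	zone->cached_async_block = block;	/* remember for the next fault */
	printf("stopped at block %lu\n", block);
}

int main(void)
{
	struct toy_zone zone = { 0 };

	/* successive THP faults make forward progress through the zone
	 * instead of rescanning the same pageblocks from the start */
	for (int i = 0; i < 3; i++)
		async_compact(&zone);
	return 0;
}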

> > This patch adds a per-zone cached migration scanner pfn only for async
> > compaction.  It is updated everytime a pageblock has been scanned in its
> > entirety and when no pages from it were successfully isolated.  The cached
> > migration scanner pfn for sync compaction is updated only when called for
> > sync
> > compaction.
> 
> I think this might be an overkill and maybe just decoupling the cached pfn
> update from the update_pageblock_skip() would be enough, i.e. restore
> pre-50b5b094e6 behavior for the cached pfn (but not for the skip bits)? I
> wonder if your new sync migration scanner would make any difference.
> Presumably when async compaction finishes without success by having the
> scanners meet, compact_finished() will reset the cached pfn's and the sync
> compaction will not have a chance to use any previously cached value anyway?
> 

When a zone has 32GB or 64GB to scan, as is the case here (and zones will 
become larger in the future), async compaction will always terminate early.  It 
may never cause a migration destination page to even be allocated, the 
freeing scanner may never move and there's no guarantee they will ever 
meet if we never call sync compaction.

The logic presented in this patch will avoid rescanning the non-movable 
pageblocks, for example, for async compaction until all other memory has 
been scanned.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-05  9:51         ` David Rientjes
@ 2014-05-05 14:24           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-05 14:24 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On 05/05/2014 11:51 AM, David Rientjes wrote:
> On Mon, 5 May 2014, Vlastimil Babka wrote:
>
>> OK that's due to my commit 50b5b094e6 ("mm: compaction: do not mark unmovable
>> pageblocks as skipped in async compaction") and the intention was to avoid
>> marking pageblocks as to-be-skipped just because they were ignored by async
>> compaction, which would make the following sync compaction ignore them as
>> well. However it's true that update_pageblock_skip() also updates the cached
>> pfn's and not updating them is a sideeffect of this change.
>>
>
> It's not necessary just that commit, update_pageblock_skip() won't do
> anything if cc->finished_update_migrate is true which still happens before
> the commit.  This issue was noticed on a kernel without your commit.

OK.

>> I didn't think that would be a problem as skipping whole pageblocks due to
>> being non-movable should be fast and without taking locks. But if your testing
>> shows that this is a problem, then OK.
>>
>
> Async compaction terminates early when lru_lock is contended or
> need_resched() and on zones that are so large for a 128GB machine, this
> happens often.  A thp allocation returns immediately because of
> contended_compaction in the page allocator.  When the next thp is
> allocated, async compaction starts from where the former iteration started
> because we don't do any caching of the pfns and nothing called sync
> compaction.  It's simply unnecessary overhead that can be prevented on the
> next call and it leaves a potentially large part of the zone unscanned if
> we continuously fail because of contention.  This patch fixes that.

OK.

>>> This patch adds a per-zone cached migration scanner pfn only for async
>>> compaction.  It is updated everytime a pageblock has been scanned in its
>>> entirety and when no pages from it were successfully isolated.  The cached
>>> migration scanner pfn for sync compaction is updated only when called for
>>> sync
>>> compaction.
>>
>> I think this might be an overkill and maybe just decoupling the cached pfn
>> update from the update_pageblock_skip() would be enough, i.e. restore
>> pre-50b5b094e6 behavior for the cached pfn (but not for the skip bits)? I
>> wonder if your new sync migration scanner would make any difference.
>> Presumably when async compaction finishes without success by having the
>> scanners meet, compact_finished() will reset the cached pfn's and the sync
>> compaction will not have a chance to use any previously cached value anyway?
>>
>
> When a zone has 32GB or 64GB to scan, as is in this case (and will become
> larger in the future), async compaction will always terminate early.  It
> may never cause a migration destination page to even be allocated, the
> freeing scanner may never move and there's no guarantee they will ever
> meet if we never call sync compaction.
>
> The logic presented in this patch will avoid rescanning the non-movable
> pageblocks, for example, for async compaction until all other memory has
> been scanned.

I see, although I would still welcome some numbers to back such a change.
What I still don't like is the removal of the intent of commit 
50b5b094e6. You now again call set_pageblock_skip() unconditionally, 
thus also on pageblocks that async compaction skipped due to being 
non-MOVABLE. The sync compaction will thus ignore them.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 20:29             ` David Rientjes
@ 2014-05-05 14:48               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-05 14:48 UTC (permalink / raw)
  To: David Rientjes, Mel Gorman
  Cc: Andrew Morton, Rik van Riel, Joonsoo Kim, Greg Thelen,
	Hugh Dickins, linux-kernel, linux-mm

On 05/02/2014 10:29 PM, David Rientjes wrote:
> On Fri, 2 May 2014, Mel Gorman wrote:
>
>>> The page locks I'm referring to is the lock_page() in __unmap_and_move()
>>> that gets called for sync compaction after the migrate_pages() iteration
>>> makes a few passes and unsuccessfully grabs it.  This becomes a forced
>>> migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
>>>
>>
>> Can that be fixed then instead of disabling it entirely?
>>
>
> We could return -EAGAIN when the trylock_page() fails for
> MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that
> currently for MIGRATE_ASYNC, and I could extend it to be ignored for
> MIGRATE_SYNC_LIGHT as well.
>
>>> We have perf profiles from one workload in particular that shows
>>> contention on i_mmap_mutex (anon isn't interesting since the vast majority
>>> of memory on this workload [120GB on a 128GB machine] has a gup pin and
>>> doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid
>>> isolating pinned pages")) between cpus all doing memory compaction trying
>>> to fault thp memory.
>>>
>>
>> Abort SYNC_LIGHT compaction if the mutex is contended.
>>
>
> Yeah, I have patches for that as well but we're waiting to see if they are
> actually needed when sync compaction is disabled for thp.  If we aren't
> actually going to disable it entirely, then I can revive those patches if
> the contention becomes such an issue.
>
>>> That's one example that we've seen, but the fact remains that at times
>>> sync compaction will iterate the entire 128GB machine and not allow an
>>> order-9 page to be allocated and there's nothing to preempt it like the
>>> need_resched() or lock contention checks that async compaction has.
>>
>> Make compact_control->sync the same enum field and check for contention
>> on the async/sync_light case but leave it for sync if compacting via the
>> proc interface?
>>
>
> Ok, that certainly can be done, I wasn't sure you would be happy with such
> a change.  I'm not sure there's so much of a difference between the new
> compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now,
> though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from
> the page allocator, i.e. remove sync_migration entirely, and just retry
> with a second call to compaction before failing instead?

Maybe we should step back and rethink the conditions for all sources of 
compaction to better balance effort vs. desired outcome, and distinguish 4 
modes (just quick thoughts):

1) kswapd - we don't want to slow it down too much, thus async, uses 
cached pfns, honors skip bits, but does not give up on order=X blocks of 
pages just because some page was unable to be isolated/migrated (so 
gradually it might clean up such block over time).

2) khugepaged - it can afford to wait, so async + sync, use cached pfns, 
ignore skip bits on sync, also do not give up on order=THP blocks (?)

3) direct compaction on allocation of order X - we want to avoid 
allocation latencies, so it should skip over order=X blocks of pages as 
soon as it cannot isolate some page in such block (and put back 
already-isolated pages instead of migrating them). As for other 
parameters I'm not sure. Sync should still be possible thanks to the 
deferred_compaction logic? Maybe somehow transform the yes/no answer of 
deferred compaction to a number of how many pageblocks it should try 
before giving up, so there are no latency spikes limited only by the 
size of the zone?

4) compaction from proc interface - sync, reset cached pfn's, ignore 
deferring, ignore skip bits...
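
(Rough sketch of how such a per-source policy could be written down; the
names, fields and numbers below are invented for illustration, not a
proposal for the actual structures:)

#include <stdbool.h>

/* hypothetical compaction sources, one per case above */
enum compact_source {
	COMPACT_KSWAPD,
	COMPACT_KHUGEPAGED,
	COMPACT_DIRECT,
	COMPACT_PROC,
};

struct compact_policy {
	bool sync;		/* may block on page locks / writeback */
	bool use_cached_pfns;	/* resume from the per-zone cached pfns */
	bool honor_skip_bits;	/* respect the pageblock skip hints */
	bool give_up_on_block;	/* abandon an order-X block on first failure */
	int  max_pageblocks;	/* 0 == no explicit cap */
};

static const struct compact_policy policy[] = {
	[COMPACT_KSWAPD]     = { false, true,  true,  false, 0 },
	[COMPACT_KHUGEPAGED] = { true,  true,  false, false, 0 },
	[COMPACT_DIRECT]     = { false, true,  true,  true,  32 },
	[COMPACT_PROC]       = { true,  false, false, false, 0 },
};

int main(void)
{
	/* trivial use of the table */
	return policy[COMPACT_PROC].sync ? 0 : 1;
}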

Vlastimil



^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-05 14:24           ` Vlastimil Babka
@ 2014-05-06  0:29             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-06  0:29 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Mon, 5 May 2014, Vlastimil Babka wrote:

> I see, although I would still welcome some numbers to back such change.

It's pretty difficult to capture numbers for this in real-world scenarios 
since it happens rarely (and when it happens, the latency is very 
significant) and measuring it needs an instrumented kernel that can count how many 
pageblocks have been skipped.  I could create a synthetic example of it in 
the kernel and get numbers for a worst-case scenario with a 64GB zone if 
you'd like, though I'm not sure how representative it would be.

> What I still don't like is the removal of the intent of commit 50b5b094e6. You
> now again call set_pageblock_skip() unconditionally, thus also on pageblocks
> that async compaction skipped due to being non-MOVABLE. The sync compaction
> will thus ignore them.
> 

I'm not following you; with this patch there are two cached pfns for the 
migration scanner: one is used for sync and one is used for async.  When 
cc->sync == true, both cached pfns are updated (async is not going to 
succeed for a pageblock when sync failed for that pageblock); when 
cc->sync == false, the async cached pfn is updated only and we pick up 
again where we left off for subsequent async compactions.  Sync compaction 
will still begin where it last left off and consider these non-MOVABLE 
pageblocks.
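
(A minimal sketch of that update rule, with simplified stand-ins for struct
zone and struct compact_control rather than the real definitions:)

#include <stdbool.h>
#include <stdio.h>

struct zone_cache {
	unsigned long migrate_pfn[2];	/* [0] = async, [1] = sync */
};

struct compact_ctl {
	bool sync;
	bool finished_update_migrate;
};

/* async runs only advance the async pfn; sync runs advance both, since
 * async is not going to succeed where sync already failed */
static void cache_migrate_pfn(struct zone_cache *zc, struct compact_ctl *cc,
			      unsigned long pfn)
{
	if (cc->finished_update_migrate)
		return;
	if (pfn > zc->migrate_pfn[0])
		zc->migrate_pfn[0] = pfn;
	if (cc->sync && pfn > zc->migrate_pfn[1])
		zc->migrate_pfn[1] = pfn;
}

int main(void)
{
	struct zone_cache zc = { { 0, 0 } };
	struct compact_ctl async = { .sync = false };
	struct compact_ctl sync = { .sync = true };

	cache_migrate_pfn(&zc, &async, 512);	/* only [0] moves */
	cache_migrate_pfn(&zc, &sync, 1024);	/* both move */
	printf("async=%lu sync=%lu\n", zc.migrate_pfn[0], zc.migrate_pfn[1]);
	return 0;
}

A given run then starts from migrate_pfn[cc->sync], as the hunk in
compact_zone() does with the real compact_cached_migrate_pfn[] fields.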

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-02 20:29             ` David Rientjes
@ 2014-05-06  8:55               ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-06  8:55 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Fri, May 02, 2014 at 01:29:33PM -0700, David Rientjes wrote:
> On Fri, 2 May 2014, Mel Gorman wrote:
> 
> > > The page locks I'm referring to is the lock_page() in __unmap_and_move() 
> > > that gets called for sync compaction after the migrate_pages() iteration 
> > > makes a few passes and unsuccessfully grabs it.  This becomes a forced 
> > > migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
> > > 
> > 
> > Can that be fixed then instead of disabling it entirely?
> > 
> 
> We could return -EAGAIN when the trylock_page() fails for 
> MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that 
> currently for MIGRATE_ASYNC, and I could extend it to be ignored for 
> MIGRATE_SYNC_LIGHT as well.
> 
> > > We have perf profiles from one workload in particular that shows 
> > > contention on i_mmap_mutex (anon isn't interesting since the vast majority 
> > > of memory on this workload [120GB on a 128GB machine] has a gup pin and 
> > > doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid 
> > > isolating pinned pages")) between cpus all doing memory compaction trying 
> > > to fault thp memory.
> > > 
> > 
> > Abort SYNC_LIGHT compaction if the mutex is contended.
> > 
> 
> Yeah, I have patches for that as well but we're waiting to see if they are 
> actually needed when sync compaction is disabled for thp.  If we aren't 
> actually going to disable it entirely, then I can revive those patches if 
> the contention becomes such an issue.
> 
> > > That's one example that we've seen, but the fact remains that at times 
> > > sync compaction will iterate the entire 128GB machine and not allow an 
> > > order-9 page to be allocated and there's nothing to preempt it like the 
> > > need_resched() or lock contention checks that async compaction has. 
> > 
> > Make compact_control->sync the same enum field and check for contention
> > on the async/sync_light case but leave it for sync if compacting via the
> > proc interface?
> > 
> 
> Ok, that certainly can be done, I wasn't sure you would be happy with such 
> a change. 

I'm not super-keen as the success rates are already very poor for allocations
under pressure. It's something Vlastimil is working on so it may be possible
to get some of the success rates back. It has always been the case that
compaction should not severely impact overall performance as THP gains
must offset compaction. While I'm not happy to reduce the success rates
further, I do not think we should leave a known performance impact on
128GB machines waiting on Vlastimil's series.

Vlastimil, what do you think?

> I'm not sure there's so much of a difference between the new 
> compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now, 
> though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from 
> the page allocator, i.e. remove sync_migration entirely, and just retry 
> with a second call to compaction before failing instead? 

Would it be possible if only khugepaged entered SYNC_LIGHT migration and
kswapd and direct THP allocations used only MIGRATE_ASYNC? That would
allow khugepaged to continue locking pages and buffers in a slow path
while still not allowing it to issue IO or wait on writeback. It would
also give a chance for Vlastimil's series to shake out a bit without him
having to reintroduce SYNC_LIGHT as part of that series.
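
(A hedged sketch of that split, with a made-up helper standing in for however
the caller would be identified; not the real allocator/khugepaged code:)

#include <stdbool.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

/* hypothetical compaction callers */
enum compact_caller { CALLER_KSWAPD, CALLER_PAGE_FAULT, CALLER_KHUGEPAGED };

/*
 * khugepaged may block on page locks and buffers (SYNC_LIGHT) but still
 * never issues IO or waits on writeback; kswapd and THP page faults stay
 * fully asynchronous so they add no latency to the fault path.
 */
static enum migrate_mode compact_mode(enum compact_caller caller)
{
	return caller == CALLER_KHUGEPAGED ? MIGRATE_SYNC_LIGHT
					   : MIGRATE_ASYNC;
}

int main(void)
{
	return compact_mode(CALLER_PAGE_FAULT) == MIGRATE_ASYNC ? 0 : 1;
}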

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 3/4] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-06  0:29             ` David Rientjes
@ 2014-05-06 11:52               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-06 11:52 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On 05/06/2014 02:29 AM, David Rientjes wrote:
> On Mon, 5 May 2014, Vlastimil Babka wrote:
>
>> I see, although I would still welcome some numbers to back such change.
>
> It's pretty difficult to capture numbers for this in real-world scenarios
> since it happens rarely (and when it happens, it's very significant
> latency) and without an instrumented kernel that will determine how many
> pageblocks have been skipped.  I could create a synthetic example of it in
> the kernel and get numbers for a worst-case scenario with a 64GB zone if
> you'd like, I'm not sure how representative it will be.
>
>> What I still don't like is the removal of the intent of commit 50b5b094e6. You
>> now again call set_pageblock_skip() unconditionally, thus also on pageblocks
>> that async compaction skipped due to being non-MOVABLE. The sync compaction
>> will thus ignore them.
>>
>
> I'm not following you, with this patch there are two cached pfns for the
> migration scanner: one is used for sync and one is used for async.  When
> cc->sync == true, both cached pfns are updated (async is not going to
> succeed for a pageblock when sync failed for that pageblock); when
> cc->sync == false, the async cached pfn is updated only and we pick up
> again where we left off for subsequent async compactions.  Sync compaction
> will still begin where it last left off and consider these non-MOVABLE
> pageblocks.
>

Yeah, I understand, the cached pfns are not the problem. The problem is 
that with your patch, set_pageblock_skip() will be called through 
update_pageblock_skip() in async compaction, since you removed the 
skipped_async_unsuitable variable. So in sync compaction, such pageblock 
will be skipped thanks to the isolation_suitable() check which uses 
get_pageblock_skip() to read the bit set by the async compaction.
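
(In other words, something like the toy sketch below would keep both
behaviours: the cached pfn still advances, but the skip bit is only set for
pageblocks that were really scanned, not the ones async compaction passed
over as non-MOVABLE. Simplified, made-up types, not the kernel code:)

#include <stdbool.h>

struct toy_zone {
	unsigned long cached_migrate_pfn[2];	/* [0] async, [1] sync */
	bool skip_bit;				/* one pageblock's skip hint */
};

struct toy_cc {
	bool sync;
	bool skipped_async_unsuitable;	/* the removed flag, restored */
};

static void update_pageblock(struct toy_zone *zone, struct toy_cc *cc,
			     unsigned long pfn)
{
	/* the cached pfns always advance, so the next call resumes here */
	if (pfn > zone->cached_migrate_pfn[0])
		zone->cached_migrate_pfn[0] = pfn;
	if (cc->sync && pfn > zone->cached_migrate_pfn[1])
		zone->cached_migrate_pfn[1] = pfn;

	/*
	 * ...but don't mark the block to-be-skipped when async compaction
	 * merely ignored it for being non-MOVABLE, so that a later sync
	 * pass still considers it.
	 */
	if (!cc->skipped_async_unsuitable)
		zone->skip_bit = true;
}

int main(void)
{
	struct toy_zone zone = { { 0, 0 }, false };
	struct toy_cc async = { .sync = false, .skipped_async_unsuitable = true };

	update_pageblock(&zone, &async, 512);
	/* the pfn is cached, but the block stays visible to sync compaction */
	return zone.skip_bit ? 1 : 0;
}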

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v2 4/4] mm, thp: do not perform sync compaction on pagefault
  2014-05-06  8:55               ` Mel Gorman
@ 2014-05-06 15:05                 ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-06 15:05 UTC (permalink / raw)
  To: Mel Gorman, David Rientjes
  Cc: Andrew Morton, Rik van Riel, Joonsoo Kim, Greg Thelen,
	Hugh Dickins, linux-kernel, linux-mm

On 05/06/2014 10:55 AM, Mel Gorman wrote:
> On Fri, May 02, 2014 at 01:29:33PM -0700, David Rientjes wrote:
>> On Fri, 2 May 2014, Mel Gorman wrote:
>>
>>>> The page lock I'm referring to is the lock_page() in __unmap_and_move()
>>>> that gets called for sync compaction after the migrate_pages() iteration
>>>> makes a few passes and fails to grab it.  This becomes a forced
>>>> migration since __unmap_and_move() returns -EAGAIN when the trylock fails.
>>>>
>>>
>>> Can that be fixed then instead of disabling it entirely?
>>>
>>
>> We could return -EAGAIN when the trylock_page() fails for
>> MIGRATE_SYNC_LIGHT.  It would become a forced migration but we ignore that
>> currently for MIGRATE_ASYNC, and I could extend it to be ignored for
>> MIGRATE_SYNC_LIGHT as well.
>>
>>>> We have perf profiles from one workload in particular that show
>>>> contention on i_mmap_mutex (anon isn't interesting since the vast majority
>>>> of memory on this workload [120GB on a 128GB machine] has a gup pin and
>>>> doesn't get isolated because of 119d6d59dcc0 ("mm, compaction: avoid
>>>> isolating pinned pages")) between cpus all doing memory compaction trying
>>>> to fault thp memory.
>>>>
>>>
>>> Abort SYNC_LIGHT compaction if the mutex is contended.
>>>
>>
>> Yeah, I have patches for that as well but we're waiting to see if they are
>> actually needed when sync compaction is disabled for thp.  If we aren't
>> actually going to disable it entirely, then I can revive those patches if
>> the contention becomes such an issue.
>>
>>>> That's one example that we've seen, but the fact remains that at times
>>>> sync compaction will iterate the entire 128GB machine and not allow an
>>>> order-9 page to be allocated and there's nothing to preempt it like the
>>>> need_resched() or lock contention checks that async compaction has.
>>>
>>> Make compact_control->sync the same enum field and check for contention
>>> on the async/sync_light case but leave it for sync if compacting via the
>>> proc interface?
>>>
>>
>> Ok, that certainly can be done, I wasn't sure you would be happy with such
>> a change.
>
> I'm not super-keen as the success rates are already very poor for allocations
> under pressure. It's something Vlastimil is working on so it may be possible
> to get some of the success rates back. It has always been the case that
> compaction should not severely impact overall performance as THP gains
> must offset compaction. While I'm not happy to reduce the success rates
> further, I do not think we should leave a known performance impact on
> 128G machines waiting on Vlastimil's series.
>
> Vlastimil, what do you think?

I think before giving up due to lock contention, the "give up on a 
pageblock when I won't be able to free all of it anyway" approach should 
be considered (as I've tried to explain in yesterday's reply, and I think 
David also suggested that already?). Giving up due to lock contention 
might for example mean that compaction frees half of the pageblock and 
then gives up, wasting the work already done. A rough illustration is 
sketched below.
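
(Standalone toy, not kernel code, just to make the wasted-work point
concrete; the pageblock size and the contention point are made up:)

/* abort_toy.c - build with: gcc -Wall -o abort_toy abort_toy.c */
#include <stdio.h>

#define PAGES_PER_BLOCK 512

/* returns how many pages of isolation work end up thrown away */
static int scan_pageblock(int contended_at, int abort_on_contention)
{
	int isolated = 0;
	int page;

	for (page = 0; page < PAGES_PER_BLOCK; page++) {
		if (abort_on_contention && page == contended_at)
			return isolated;	/* these get put back again */
		isolated++;
	}
	return 0;	/* whole pageblock processed, nothing wasted */
}

int main(void)
{
	printf("abort mid-pageblock:  %d pages of work wasted\n",
	       scan_pageblock(PAGES_PER_BLOCK / 2, 1));
	printf("finish the pageblock: %d pages of work wasted\n",
	       scan_pageblock(PAGES_PER_BLOCK / 2, 0));
	return 0;
}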

>> I'm not sure there's so much of a difference between the new
>> compact_control->sync == MIGRATE_ASYNC and == MIGRATE_SYNC_LIGHT now,
>> though.  Would it make sense to remove MIGRATE_SYNC_LIGHT entirely from
>> the page allocator, i.e. remove sync_migration entirely, and just retry
>> with a second call to compaction before failing instead?
>
> Would it be possible if only khugepaged entered SYNC_LIGHT migration and
> kswapd and direct THP allocations used only MIGRATE_ASYNC? That would
> allow khugepaged to continue locking pages and buffers in a slow path
> while still not allowing it to issue IO or wait on writeback. It would
> also give a chance for Vlastimil's series to shake out a bit without him
> having to reintroduce SYNC_LIGHT as part of that series.

I agree that khugepaged should be more persistent. Unlike page faults 
and direct compaction, nothing is stalling on it, right?
Also, removing sync_migration completely would mean that nobody would try 
to compact non-MOVABLE pageblocks. I think this might increase 
fragmentation, as this is a mechanism that prevents non-MOVABLE 
pageblocks from being filled by MOVABLE pages through stealing, and then 
non-MOVABLE allocations having to steal back from other MOVABLE 
pageblocks...


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm/compaction: do not count migratepages when unnecessary
  2014-05-02 15:26       ` [PATCH] mm/compaction: do not count migratepages when unnecessary Vlastimil Babka
@ 2014-05-06 21:18         ` Naoya Horiguchi
       [not found]         ` <1399411134-k43fsr0p@n-horiguchi@ah.jp.nec.com>
  1 sibling, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-06 21:18 UTC (permalink / raw)
  To: vbabka
  Cc: Andrew Morton, Hugh Dickins, gthelen, linux-kernel, linux-mm,
	minchan, Mel Gorman, iamjoonsoo.kim, cl, Rik van Riel

On Fri, May 02, 2014 at 05:26:18PM +0200, Vlastimil Babka wrote:
> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
> 
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
> 
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
> 
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
> 
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.

I agree with this patch.
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

A few nitpicks below...

> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>  mm/compaction.c                   | 30 ++++++++----------------------
>  2 files changed, 30 insertions(+), 26 deletions(-)
> 
> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
> index 06f544e..bbd5e1f 100644
> --- a/include/trace/events/compaction.h
> +++ b/include/trace/events/compaction.h
> @@ -5,7 +5,9 @@
>  #define _TRACE_COMPACTION_H
>  
>  #include <linux/types.h>
> +#include <linux/list.h>
>  #include <linux/tracepoint.h>
> +#include <linux/mm_types.h>
>  #include <trace/events/gfpflags.h>
>  
>  DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
> @@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
>  
>  TRACE_EVENT(mm_compaction_migratepages,
>  
> -	TP_PROTO(unsigned long nr_migrated,
> -		unsigned long nr_failed),
> +	TP_PROTO(unsigned long nr_all,
> +		int migrate_rc,
> +		struct list_head * migratepages),

checkpatch.pl reports a coding style error for this line:
(ERROR: "foo * bar" should be "foo *bar")

>  
> -	TP_ARGS(nr_migrated, nr_failed),
> +	TP_ARGS(nr_all, migrate_rc, migratepages),
>  
>  	TP_STRUCT__entry(
>  		__field(unsigned long, nr_migrated)
> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->nr_migrated = nr_migrated;
> +		unsigned long nr_failed = 0;
> +		struct page *page;
> +
> +		/*
> +		 * migrate_pages() returns either a non-negative number
> +		 * with the number of pages that failed migration, or an
> +		 * error code, in which case we need to count the remaining
> +		 * pages manually
> +		 */
> +		if (migrate_rc >= 0)
> +			nr_failed = migrate_rc;
> +		else
> +		        list_for_each_entry(page, migratepages, lru)

This line is indented with spaces instead of a tab.

> +				nr_failed++;
> +
> +		__entry->nr_migrated = nr_all - nr_failed;
>  		__entry->nr_failed = nr_failed;
>  	),
>  
> diff --git a/mm/compaction.c b/mm/compaction.c
> index ae1d0ae..873d7de 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
>  	cc->nr_freepages++;
>  }
>  
> -/*
> - * We cannot control nr_migratepages fully when migration is running as
> - * migrate_pages() has no knowledge of of compact_control.  When migration is
> - * complete, we count the number of pages on the list by hand.
> - */
> -static void update_nr_listpages(struct compact_control *cc)
> -{
> -	int nr_migratepages = 0;
> -	struct page *page;
> -
> -	list_for_each_entry(page, &cc->migratepages, lru)
> -		nr_migratepages++;
> -
> -	cc->nr_migratepages = nr_migratepages;
> -}
> -
>  /* possible outcome of isolate_migratepages */
>  typedef enum {
>  	ISOLATE_ABORT,		/* Abort compaction now */
> @@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  	migrate_prep_local();
>  
>  	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
> -		unsigned long nr_migrate, nr_remaining;
>  		int err;
>  
>  		switch (isolate_migratepages(zone, cc)) {
> @@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  			;
>  		}
>  
> -		nr_migrate = cc->nr_migratepages;
> +		if (!cc->nr_migratepages)
> +			continue;
> +
>  		err = migrate_pages(&cc->migratepages, compaction_alloc,
>  				compaction_free, (unsigned long)cc,
>  				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>  				MR_COMPACTION);
> -		update_nr_listpages(cc);
> -		nr_remaining = cc->nr_migratepages;
>  
> -		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
> -						nr_remaining);
> +		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
> +							&cc->migratepages);
>  
>  		/* Release isolated pages not migrated */
>  		if (err) {
> @@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  				ret = COMPACT_PARTIAL;
>  				goto out;
>  			}
> +		} else {
> +			/* All pages were successfully migrated */
> +			cc->nr_migratepages = 0;

cc->nr_migratepages = 0 is also done when err != 0, so could it be done in a common path?
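
For illustration, an untested sketch of what I mean, on top of your patch
(the error handling inside the if (err) branch stays as you have it, minus
the then-redundant reset):

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/*
		 * From here on, every page on cc->migratepages has either
		 * been migrated or will be put back, so reset the count once.
		 */
		cc->nr_migratepages = 0;

		/* Release isolated pages not migrated */
		if (err) {
			/* ... error handling as in your patch ... */
		}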

Thanks,
Naoya Horiguchi

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-02 15:27         ` Vlastimil Babka
  (?)
@ 2014-05-06 22:19         ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-06 22:19 UTC (permalink / raw)
  To: vbabka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, gthelen,
	linux-kernel, linux-mm, minchan, Mel Gorman, iamjoonsoo.kim, cl,
	Rik van Riel

On Fri, May 02, 2014 at 05:27:55PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers the PFN
> of the highest pageblock where it successfully isolated pages, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records the highest successful
> pageblock in zone->compact_cached_free_pfn remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  mm/compaction.c | 18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 873d7de..1967850 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -686,12 +686,6 @@ static void isolate_freepages(struct zone *zone,
>  	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
>  
>  	/*
> -	 * If no pages are isolated, the block_start_pfn < low_pfn check
> -	 * will kick in.
> -	 */
> -	next_free_pfn = 0;
> -
> -	/*
>  	 * Isolate free pages until enough are available to migrate the
>  	 * pages on cc->migratepages. We stop searching if the migrate
>  	 * and free page scanners meet or enough free pages are isolated.
> @@ -731,19 +725,19 @@ static void isolate_freepages(struct zone *zone,
>  			continue;
>  
>  		/* Found a block suitable for isolating free pages from */
> +		next_free_pfn = block_start_pfn;
>  		isolated = isolate_freepages_block(cc, block_start_pfn,
>  					block_end_pfn, freelist, false);
>  		nr_freepages += isolated;
>  
>  		/*
> -		 * Record the highest PFN we isolated pages from. When next
> -		 * looking for free pages, the search will restart here as
> -		 * page migration may have returned some pages to the allocator
> +		 * Set a flag that we successfully isolated in this pageblock.
> +		 * In the next loop iteration, zone->compact_cached_free_pfn
> +		 * will not be updated and thus it will effectively contain the
> +		 * highest pageblock we isolated pages from.
>  		 */
> -		if (isolated && next_free_pfn == 0) {
> +		if (isolated)
>  			cc->finished_update_free = true;
> -			next_free_pfn = block_start_pfn;
> -		}

Why don't you completely remove next_free_pfn and update cc->free_pfn directly?
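
E.g. an untested sketch of just the hunk above, assuming nothing later in
isolate_freepages() still needs the old next_free_pfn value:

		/* Found a block suitable for isolating free pages from */
		cc->free_pfn = block_start_pfn;
		isolated = isolate_freepages_block(cc, block_start_pfn,
					block_end_pfn, freelist, false);
		nr_freepages += isolated;

		/*
		 * Set a flag that we successfully isolated in this pageblock.
		 * In the next loop iteration, zone->compact_cached_free_pfn
		 * will not be updated and thus it will effectively contain the
		 * highest pageblock we isolated pages from.
		 */
		if (isolated)
			cc->finished_update_free = true;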

Thanks,
Naoya Horiguchi

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 1/6] mm, migration: add destination page freeing callback
  2014-05-01 21:35   ` David Rientjes
@ 2014-05-07  2:22     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory migration uses a callback defined by the caller to determine how to
allocate destination pages.  When migration fails for a source page, however, it 
frees the destination page back to the system.

This patch adds a memory migration callback defined by the caller to determine 
how to free destination pages.  If a caller, such as memory compaction, builds 
its own freelist for migration targets, this can reuse already freed memory 
instead of scanning additional memory.

If the caller provides a function to handle freeing of destination pages, it is 
called when page migration fails.  Otherwise, it may pass NULL and freeing back 
to the system will be handled as usual.  This patch introduces no functional 
change.

Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/migrate.h | 11 ++++++----
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  4 ++--
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            | 55 +++++++++++++++++++++++++++++++++++--------------
 mm/page_alloc.c         |  2 +-
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -38,7 +40,7 @@ enum migrate_reason {
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int migrate_prep(void);
@@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1023,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
 				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1500,7 +1500,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	list_move(&hpage->lru, &pagelist);
-	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1581,7 +1581,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			if (!list_empty(&pagelist)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * alloc_migrate_target should be improooooved!!
 		 * migrate_pages returns # of failed pages.
 		 */
-		ret = migrate_pages(&source, alloc_migrate_target, 0,
+		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
+					NULL, (unsigned long)vma,
 					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -938,8 +938,9 @@ out:
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+			unsigned long private, struct page *page, int force,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -983,11 +984,17 @@ out:
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
+
 	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
 	 */
-	putback_lru_page(newpage);
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(newpage, private);
+	else
+		putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1016,8 +1023,9 @@ out:
  * will wait in the page fault for migration to complete.
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
-				unsigned long private, struct page *hpage,
-				int force, enum migrate_mode mode)
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (!page_mapped(hpage))
 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-	if (!rc)
+	if (rc == MIGRATEPAGE_SUCCESS)
 		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
 		putback_active_hugepage(hpage);
-	put_page(new_hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -1086,6 +1104,8 @@ out:
  * @from:		The list of pages to be migrated.
  * @get_new_page:	The function used to allocate free pages to be used
  *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
  * @private:		Private data to be passed on to get_new_page()
  * @mode:		The migration mode that specifies the constraints for
  *			page migration, if any.
@@ -1099,7 +1119,8 @@ out:
  * Returns the number of pages that were not migrated, or an error code.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
-		unsigned long private, enum migrate_mode mode, int reason)
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
-						private, page, pass > 2, mode);
+						put_new_page, private, page,
+						pass > 2, mode);
 			else
-				rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, mode);
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1273,7 +1295,7 @@ set_status:
 
 	err = 0;
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_page_node,
+		err = migrate_pages(&pagelist, new_page_node, NULL,
 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
 	list_add(&page->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6215,7 +6215,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		cc->nr_migratepages -= nr_reclaimed;
 
 		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    0, MIGRATE_SYNC, MR_CMA);
+				    NULL, 0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Memory compaction works by having a "freeing scanner" scan from one end of a 
zone which isolates pages as migration targets while another "migrating scanner" 
scans from the other end of the same zone which isolates pages for migration.

When page migration fails for an isolated page, the target page is returned to 
the system rather than the freelist built by the freeing scanner.  This may 
needlessly require the freeing scanner to continue scanning memory after 
suitable migration targets have already been returned to the system.

This patch returns destination pages to the freeing scanner freelist when page 
migration fails.  This prevents unnecessary work done by the freeing scanner but 
also encourages memory to be as compacted as possible at the end of the zone.

Reported-by: Greg Thelen <gthelen@google.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -797,23 +797,32 @@ static struct page *compaction_alloc(struct page *migratepage,
 }
 
 /*
- * We cannot control nr_migratepages and nr_freepages fully when migration is
- * running as migrate_pages() has no knowledge of compact_control. When
- * migration is complete, we count the number of pages on the lists by hand.
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist.  All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/*
+ * We cannot control nr_migratepages fully when migration is running as
+ * migrate_pages() has no knowledge of of compact_control.  When migration is
+ * complete, we count the number of pages on the list by hand.
  */
 static void update_nr_listpages(struct compact_control *cc)
 {
 	int nr_migratepages = 0;
-	int nr_freepages = 0;
 	struct page *page;
 
 	list_for_each_entry(page, &cc->migratepages, lru)
 		nr_migratepages++;
-	list_for_each_entry(page, &cc->freepages, lru)
-		nr_freepages++;
 
 	cc->nr_migratepages = nr_migratepages;
-	cc->nr_freepages = nr_freepages;
 }
 
 /* possible outcome of isolate_migratepages */
@@ -1023,8 +1032,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		}
 
 		nr_migrate = cc->nr_migratepages;
-		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
-				(unsigned long)cc,
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Each zone has a cached migration scanner pfn for memory compaction so that 
subsequent calls to memory compaction can start where the previous call left 
off.

Currently, the compaction migration scanner only updates the per-zone cached pfn 
when pageblocks were not skipped for async compaction.  This creates a 
dependency on calling sync compaction to keep subsequent calls to async 
compaction from scanning an enormous number of non-MOVABLE pageblocks each time 
it is called.  On large machines, this could potentially be very expensive.

This patch adds a per-zone cached migration scanner pfn only for async 
compaction.  It is updated every time a pageblock has been scanned in its 
entirety and no pages from it were successfully isolated.  The cached 
migration scanner pfn for sync compaction is updated only when called for sync 
compaction.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v3: do not update pageblock skip metadata when skipped due to async per
     Vlastimil.

 include/linux/mmzone.h |  5 ++--
 mm/compaction.c        | 66 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -360,9 +360,10 @@ struct zone {
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
 
-	/* pfns where compaction scanners should start */
+	/* pfn where compaction free scanner should start */
 	unsigned long		compact_cached_free_pfn;
-	unsigned long		compact_cached_migrate_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long		compact_cached_migrate_pfn[2];
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
-	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_migrate_pfn[0] = start_pfn;
+	zone->compact_cached_migrate_pfn[1] = start_pfn;
 	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_flush = false;
 
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+	unsigned long pfn;
 
 	if (cc->ignore_skip_hint)
 		return;
@@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (!page)
 		return;
 
-	if (!nr_isolated) {
-		unsigned long pfn = page_to_pfn(page);
+	if (nr_isolated)
+		return;
+
+	/*
+	 * Only skip pageblocks when all forms of compaction will be known to
+	 * fail in the near future.
+	 */
+	if (set_unsuitable)
 		set_pageblock_skip(page);
 
-		/* Update where compaction should restart */
-		if (migrate_scanner) {
-			if (!cc->finished_update_migrate &&
-			    pfn > zone->compact_cached_migrate_pfn)
-				zone->compact_cached_migrate_pfn = pfn;
-		} else {
-			if (!cc->finished_update_free &&
-			    pfn < zone->compact_cached_free_pfn)
-				zone->compact_cached_free_pfn = pfn;
-		}
+	pfn = page_to_pfn(page);
+
+	/* Update where async and sync compaction should restart */
+	if (migrate_scanner) {
+		if (cc->finished_update_migrate)
+			return;
+		if (pfn > zone->compact_cached_migrate_pfn[0])
+			zone->compact_cached_migrate_pfn[0] = pfn;
+		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+			zone->compact_cached_migrate_pfn[1] = pfn;
+	} else {
+		if (cc->finished_update_free)
+			return;
+		if (pfn < zone->compact_cached_free_pfn)
+			zone->compact_cached_free_pfn = pfn;
 	}
 }
 #else
@@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
 
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 }
 #endif /* CONFIG_COMPACTION */
@@ -329,7 +342,8 @@ isolate_fail:
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (blockpfn == end_pfn)
-		update_pageblock_skip(cc, valid_page, total_isolated, false);
+		update_pageblock_skip(cc, valid_page, total_isolated, true,
+				      false);
 
 	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
 	if (total_isolated)
@@ -464,7 +478,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
-	bool skipped_async_unsuitable = false;
+	bool set_unsuitable = true;
 	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
@@ -541,8 +555,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 */
 			mt = get_pageblock_migratetype(page);
 			if (!cc->sync && !migrate_async_suitable(mt)) {
-				cc->finished_update_migrate = true;
-				skipped_async_unsuitable = true;
+				set_unsuitable = false;
 				goto next_pageblock;
 			}
 		}
@@ -646,11 +659,10 @@ next_pageblock:
 	/*
 	 * Update the pageblock-skip information and cached scanner pfn,
 	 * if the whole pageblock was scanned without isolating any page.
-	 * This is not done when pageblock was skipped due to being unsuitable
-	 * for async compaction, so that eventual sync compaction can try.
 	 */
-	if (low_pfn == end_pfn && !skipped_async_unsuitable)
-		update_pageblock_skip(cc, valid_page, nr_isolated, true);
+	if (low_pfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, nr_isolated,
+				      set_unsuitable, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -877,7 +889,8 @@ static int compact_finished(struct zone *zone,
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
 		/* Let the next compaction start anew. */
-		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 		zone->compact_cached_free_pfn = zone_end_pfn(zone);
 
 		/*
@@ -1002,7 +1015,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1010,7 +1023,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

* [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
@ 2014-05-07  2:22       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Each zone has a cached migration scanner pfn for memory compaction so that 
subsequent calls to memory compaction can start where the previous call left 
off.

Currently, the compaction migration scanner only updates the per-zone cached pfn
when pageblocks were not skipped for async compaction.  This creates a dependency
on calling sync compaction to prevent subsequent calls to async compaction from
rescanning an enormous number of non-MOVABLE pageblocks each time it is called.
On large machines, this can be very expensive.

This patch adds a per-zone cached migration scanner pfn that is used only for
async compaction.  It is updated every time a pageblock has been scanned in its
entirety and no pages from it were successfully isolated.  The cached migration
scanner pfn for sync compaction is updated only when compaction is actually
called in sync mode.
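
(Editor's illustration, not part of the patch: a cut-down userspace C model of
the dual cache, with made-up pfn values and only the fields that matter here.)

/* Standalone sketch: how the per-mode migrate-pfn cache is updated when a
 * whole pageblock was scanned and nothing could be isolated.
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	/* [0] = async, [1] = sync: where the migration scanner restarts */
	unsigned long compact_cached_migrate_pfn[2];
};

struct compact_control_model {
	bool sync;			/* async (false) or sync (true) pass */
	bool finished_update_migrate;
};

static void update_migrate_cache(struct zone_model *zone,
				 struct compact_control_model *cc,
				 unsigned long pfn)
{
	if (cc->finished_update_migrate)
		return;
	/* The async restart point may always move forward... */
	if (pfn > zone->compact_cached_migrate_pfn[0])
		zone->compact_cached_migrate_pfn[0] = pfn;
	/* ...but the sync cache only moves when sync compaction itself ran. */
	if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
		zone->compact_cached_migrate_pfn[1] = pfn;
}

int main(void)
{
	struct zone_model zone = { { 1024, 1024 } };
	struct compact_control_model async_cc = { .sync = false };
	struct compact_control_model sync_cc = { .sync = true };

	update_migrate_cache(&zone, &async_cc, 4096);	/* async pass */
	update_migrate_cache(&zone, &sync_cc, 2048);	/* later sync pass */

	/* compact_zone() would then restart at compact_cached_migrate_pfn[sync] */
	printf("async restart: %lu, sync restart: %lu\n",
	       zone.compact_cached_migrate_pfn[0],
	       zone.compact_cached_migrate_pfn[1]);
	return 0;
}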

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v3: do not update pageblock skip metadata when a pageblock is skipped as
     unsuitable for async compaction, per Vlastimil.

 include/linux/mmzone.h |  5 ++--
 mm/compaction.c        | 66 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -360,9 +360,10 @@ struct zone {
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
 
-	/* pfns where compaction scanners should start */
+	/* pfn where compaction free scanner should start */
 	unsigned long		compact_cached_free_pfn;
-	unsigned long		compact_cached_migrate_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long		compact_cached_migrate_pfn[2];
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
-	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_migrate_pfn[0] = start_pfn;
+	zone->compact_cached_migrate_pfn[1] = start_pfn;
 	zone->compact_cached_free_pfn = end_pfn;
 	zone->compact_blockskip_flush = false;
 
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+	unsigned long pfn;
 
 	if (cc->ignore_skip_hint)
 		return;
@@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (!page)
 		return;
 
-	if (!nr_isolated) {
-		unsigned long pfn = page_to_pfn(page);
+	if (nr_isolated)
+		return;
+
+	/*
+	 * Only skip pageblocks when all forms of compaction will be known to
+	 * fail in the near future.
+	 */
+	if (set_unsuitable)
 		set_pageblock_skip(page);
 
-		/* Update where compaction should restart */
-		if (migrate_scanner) {
-			if (!cc->finished_update_migrate &&
-			    pfn > zone->compact_cached_migrate_pfn)
-				zone->compact_cached_migrate_pfn = pfn;
-		} else {
-			if (!cc->finished_update_free &&
-			    pfn < zone->compact_cached_free_pfn)
-				zone->compact_cached_free_pfn = pfn;
-		}
+	pfn = page_to_pfn(page);
+
+	/* Update where async and sync compaction should restart */
+	if (migrate_scanner) {
+		if (cc->finished_update_migrate)
+			return;
+		if (pfn > zone->compact_cached_migrate_pfn[0])
+			zone->compact_cached_migrate_pfn[0] = pfn;
+		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+			zone->compact_cached_migrate_pfn[1] = pfn;
+	} else {
+		if (cc->finished_update_free)
+			return;
+		if (pfn < zone->compact_cached_free_pfn)
+			zone->compact_cached_free_pfn = pfn;
 	}
 }
 #else
@@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
 
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool migrate_scanner)
+			bool set_unsuitable, bool migrate_scanner)
 {
 }
 #endif /* CONFIG_COMPACTION */
@@ -329,7 +342,8 @@ isolate_fail:
 
 	/* Update the pageblock-skip if the whole pageblock was scanned */
 	if (blockpfn == end_pfn)
-		update_pageblock_skip(cc, valid_page, total_isolated, false);
+		update_pageblock_skip(cc, valid_page, total_isolated, true,
+				      false);
 
 	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
 	if (total_isolated)
@@ -464,7 +478,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
-	bool skipped_async_unsuitable = false;
+	bool set_unsuitable = true;
 	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
@@ -541,8 +555,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 */
 			mt = get_pageblock_migratetype(page);
 			if (!cc->sync && !migrate_async_suitable(mt)) {
-				cc->finished_update_migrate = true;
-				skipped_async_unsuitable = true;
+				set_unsuitable = false;
 				goto next_pageblock;
 			}
 		}
@@ -646,11 +659,10 @@ next_pageblock:
 	/*
 	 * Update the pageblock-skip information and cached scanner pfn,
 	 * if the whole pageblock was scanned without isolating any page.
-	 * This is not done when pageblock was skipped due to being unsuitable
-	 * for async compaction, so that eventual sync compaction can try.
 	 */
-	if (low_pfn == end_pfn && !skipped_async_unsuitable)
-		update_pageblock_skip(cc, valid_page, nr_isolated, true);
+	if (low_pfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, nr_isolated,
+				      set_unsuitable, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -877,7 +889,8 @@ static int compact_finished(struct zone *zone,
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
 		/* Let the next compaction start anew. */
-		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 		zone->compact_cached_free_pfn = zone_end_pfn(zone);
 
 		/*
@@ -1002,7 +1015,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1010,7 +1023,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

* [patch v3 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

We're going to want to manipulate the migration mode for compaction in the page
allocator, but compact_control's sync field is currently only a bool.

Currently, we do only MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction, depending
on the value of this bool.  Convert the bool to an enum migrate_mode and pass the
migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for
thp allocations during page faults (handled in a later patch in this series) to
avoid unnecessary latency.

This also alters compaction triggered from sysfs, either for the entire system 
or for a node, to force MIGRATE_SYNC.
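
(Editor's illustration, not part of the patch: a self-contained userspace sketch
of what the tristate mode expresses compared to the old bool; the per-mode
descriptions paraphrase how the kernel treats each mode.)

#include <stdio.h>

enum migrate_mode {
	MIGRATE_ASYNC,		/* never blocks */
	MIGRATE_SYNC_LIGHT,	/* may block briefly, but not on page writeback */
	MIGRATE_SYNC,		/* may block and write back pages */
};

struct compact_control_model {
	enum migrate_mode mode;	/* replaces the old "bool sync" */
};

static const char *describe(const struct compact_control_model *cc)
{
	/* Old code could only ask "cc->sync?"; now all three cases are distinct. */
	switch (cc->mode) {
	case MIGRATE_ASYNC:
		return "abort on contention, skip unsuitable pageblocks";
	case MIGRATE_SYNC_LIGHT:
		return "wait on locks, still avoid the most expensive work";
	default:
		return "full synchronous compaction (e.g. sysfs-triggered)";
	}
}

int main(void)
{
	struct compact_control_model pgdat_cc = { .mode = MIGRATE_ASYNC };
	struct compact_control_model node_cc  = { .mode = MIGRATE_SYNC };

	printf("compact_pgdat: %s\n", describe(&pgdat_cc));
	printf("compact_node:  %s\n", describe(&node_cc));
	return 0;
}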

Suggested-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/compaction.h |  4 ++--
 mm/compaction.c            | 36 +++++++++++++++++++-----------------
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 37 ++++++++++++++++---------------------
 4 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode sync, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc,
 			return;
 		if (pfn > zone->compact_cached_migrate_pfn[0])
 			zone->compact_cached_migrate_pfn[0] = pfn;
-		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+		if (cc->mode != MIGRATE_ASYNC &&
+		    pfn > zone->compact_cached_migrate_pfn[1])
 			zone->compact_cached_migrate_pfn[1] = pfn;
 	} else {
 		if (cc->finished_update_free)
@@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 		}
 
 		/* async aborts if taking too long or contended */
-		if (!cc->sync) {
+		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
 			return false;
 		}
@@ -479,7 +480,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
 	bool set_unsuitable = true;
-	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+					ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
 	/*
@@ -489,7 +491,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	 */
 	while (unlikely(too_many_isolated(zone))) {
 		/* async migration should just abort */
-		if (!cc->sync)
+		if (cc->mode == MIGRATE_ASYNC)
 			return 0;
 
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -554,7 +556,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 * the minimum amount of work satisfies the allocation
 			 */
 			mt = get_pageblock_migratetype(page);
-			if (!cc->sync && !migrate_async_suitable(mt)) {
+			if (cc->mode == MIGRATE_ASYNC &&
+			    !migrate_async_suitable(mt)) {
 				set_unsuitable = false;
 				goto next_pageblock;
 			}
@@ -990,6 +993,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -1015,7 +1019,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1049,8 +1053,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				compaction_free, (unsigned long)cc,
-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
@@ -1083,9 +1086,8 @@ out:
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
-				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone, int order,
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1094,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
-		.sync = sync,
+		.mode = mode,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1116,7 +1118,7 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
- * @sync: Whether migration is synchronous or not
+ * @sync: The migration mode for async, sync light, or sync migration
  * @contended: Return value that is true if compaction was aborted due to lock contention
  * @page: Optionally capture a free page of the requested order during compaction
  *
@@ -1124,7 +1126,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1189,7 +1191,7 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 						low_wmark_pages(zone), 0, 0))
 				compaction_defer_reset(zone, cc->order, false);
 			/* Currently async compaction is never deferred. */
-			else if (cc->sync)
+			else if (cc->mode != MIGRATE_ASYNC)
 				defer_compaction(zone, cc->order);
 		}
 
@@ -1202,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
-		.sync = false,
+		.mode = MIGRATE_ASYNC,
 	};
 
 	if (!order)
@@ -1215,7 +1217,7 @@ static void compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
-		.sync = true,
+		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 	};
 
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,7 +134,7 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
-	bool sync;			/* Synchronous migration */
+	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool finished_update_free;	/* True when the zone cached pfns are
 					 * no longer being updated
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2226,7 +2226,7 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
+	int migratetype, enum migrate_mode mode,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
@@ -2240,7 +2240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-						nodemask, sync_migration,
+						nodemask, mode,
 						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
@@ -2273,7 +2273,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * As async compaction considers a subset of pageblocks, only
 		 * defer if the failure was a sync compaction failure.
 		 */
-		if (sync_migration)
+		if (mode != MIGRATE_ASYNC)
 			defer_compaction(preferred_zone, order);
 
 		cond_resched();
@@ -2286,9 +2286,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
-	bool *contended_compaction, bool *deferred_compaction,
-	unsigned long *did_some_progress)
+	int migratetype, enum migrate_mode mode, bool *contended_compaction,
+	bool *deferred_compaction, unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2483,7 +2482,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	bool sync_migration = false;
+	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
 	bool contended_compaction = false;
 
@@ -2577,17 +2576,15 @@ rebalance:
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
 	 */
-	page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	sync_migration = true;
+	migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because
@@ -2662,12 +2659,10 @@ rebalance:
 		 * direct reclaim and reclaim/compaction depends on compaction
 		 * being called after reclaim so call directly if necessary
 		 */
-		page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 		if (page)
@@ -6254,7 +6249,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
+		.sync = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);

* [patch v3 5/6] mm, thp: avoid excessive compaction latency during fault
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Synchronous memory compaction can be very expensive: if a pageblock cannot be
defragmented, it can iterate over an enormous amount of memory without aborting,
constantly rescheduling, waiting on page locks and lru_lock, etc.

Unfortunately, that is far too expensive for transparent hugepage page faults,
where it is much better to simply fall back to regular pages.  On 128GB machines,
we find that synchronous memory compaction can take O(seconds) for a single thp
fault.

Now that async compaction remembers where it left off without strictly relying
on sync compaction, thp allocations become best-effort without causing
egregious latency during a fault.  We still need to retry async compaction after
reclaim, but that won't stall for seconds.
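
(Editor's illustration, not part of the patch: a userspace sketch of the mode
decision this patch makes in the allocator slowpath.  The flag bit values below
are placeholders, not the real kernel values; only the test mirrors the hunk.)

#include <stdio.h>

#define PF_KTHREAD	0x1	/* placeholder bits, for illustration only */
#define PF_KSWAPD	0x2
#define __GFP_NO_KSWAPD	0x4

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT };

/* What the slowpath picks for the retry after the first (async) compaction. */
static enum migrate_mode pick_retry_mode(unsigned int gfp_mask,
					 unsigned int task_flags)
{
	enum migrate_mode mode = MIGRATE_ASYNC;

	if (gfp_mask & __GFP_NO_KSWAPD) {
		/*
		 * Only a kernel thread that is not kswapd (i.e. khugepaged)
		 * may pay for MIGRATE_SYNC_LIGHT; a thp page fault from a
		 * user task stays async so it never stalls for seconds.
		 */
		if ((task_flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
			mode = MIGRATE_SYNC_LIGHT;
	}
	return mode;
}

int main(void)
{
	printf("thp fault:  mode %d\n", pick_retry_mode(__GFP_NO_KSWAPD, 0));
	printf("khugepaged: mode %d\n",
	       pick_retry_mode(__GFP_NO_KSWAPD, PF_KTHREAD));
	return 0;
}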

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2584,7 +2584,17 @@ rebalance:
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	migration_mode = MIGRATE_SYNC_LIGHT;
+
+	if (gfp_mask & __GFP_NO_KSWAPD) {
+		/*
+		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
+		 * of this allocation isn't critical.  Everything else, however,
+		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
+		 * stalls during fault.
+		 */
+		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
+			migration_mode = MIGRATE_SYNC_LIGHT;
+	}
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because

* [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22     ` David Rientjes
@ 2014-05-07  2:22       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07  2:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

Async compaction terminates prematurely when need_resched() is true; see
compact_checklock_irqsave().  That check can never trigger, however, if the
cond_resched() in isolate_migratepages_range() always services the reschedule
first.

If that cond_resched() actually reschedules, terminate the current pageblock
scan for async compaction as well.
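
(Editor's illustration, not part of the patch: a userspace sketch of the new
control flow, with a stub standing in for the kernel's cond_resched(), which
returns nonzero when it actually rescheduled.)

#include <stdio.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT };

static int cond_resched_stub(void)
{
	return 1;	/* pretend we had to give up the CPU */
}

/* Returns the number of pages "isolated"; 0 tells the caller to back off. */
static unsigned long isolate_migratepages_model(enum migrate_mode mode)
{
	if (cond_resched_stub()) {
		/* Async compaction gives up as soon as it had to reschedule. */
		if (mode == MIGRATE_ASYNC)
			return 0;
	}
	/* ...the pageblock scan would run here for the sync modes... */
	return 32;
}

int main(void)
{
	printf("async: %lu pages\n", isolate_migratepages_model(MIGRATE_ASYNC));
	printf("sync:  %lu pages\n",
	       isolate_migratepages_model(MIGRATE_SYNC_LIGHT));
	return 0;
}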

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			return 0;
 	}
 
+	if (cond_resched()) {
+		/* Async terminates prematurely on need_resched() */
+		if (cc->mode == MIGRATE_ASYNC)
+			return 0;
+	}
+
 	/* Time to isolate some pages for migration */
-	cond_resched();
 	for (; low_pfn < end_pfn; low_pfn++) {
 		/* give a chance to irqs before checking need_resched() */
 		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {

* Re: [PATCH 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
       [not found]         ` <1399414778-xakujfb3@n-horiguchi@ah.jp.nec.com>
@ 2014-05-07  9:22             ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:22 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, gthelen,
	linux-kernel, linux-mm, minchan, Mel Gorman, iamjoonsoo.kim, cl,
	Rik van Riel

On 05/07/2014 12:19 AM, Naoya Horiguchi wrote:
> On Fri, May 02, 2014 at 05:27:55PM +0200, Vlastimil Babka wrote:
>> The compaction free scanner in isolate_freepages() currently remembers PFN of
>> the highest pageblock where it successfully isolates, to be used as the
>> starting pageblock for the next invocation. The rationale behind this is that
>> page migration might return free pages to the allocator when migration fails
>> and we don't want to skip them if the compaction continues.
>>
>> Since migration now returns free pages back to compaction code where they can
>> be reused, this is no longer a concern. This patch changes isolate_freepages()
>> so that the PFN for restarting is updated with each pageblock where isolation
>> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
>> reduction of the pages scanned by the free scanner.
>>
>> Note that the somewhat similar functionality that records highest successful
>> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
>> used when the whole compaction is restarted, not for multiple invocations of
>> the free scanner during single compaction.
>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   mm/compaction.c | 18 ++++++------------
>>   1 file changed, 6 insertions(+), 12 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 873d7de..1967850 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -686,12 +686,6 @@ static void isolate_freepages(struct zone *zone,
>>   	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
>>
>>   	/*
>> -	 * If no pages are isolated, the block_start_pfn < low_pfn check
>> -	 * will kick in.
>> -	 */
>> -	next_free_pfn = 0;
>> -
>> -	/*
>>   	 * Isolate free pages until enough are available to migrate the
>>   	 * pages on cc->migratepages. We stop searching if the migrate
>>   	 * and free page scanners meet or enough free pages are isolated.
>> @@ -731,19 +725,19 @@ static void isolate_freepages(struct zone *zone,
>>   			continue;
>>
>>   		/* Found a block suitable for isolating free pages from */
>> +		next_free_pfn = block_start_pfn;
>>   		isolated = isolate_freepages_block(cc, block_start_pfn,
>>   					block_end_pfn, freelist, false);
>>   		nr_freepages += isolated;
>>
>>   		/*
>> -		 * Record the highest PFN we isolated pages from. When next
>> -		 * looking for free pages, the search will restart here as
>> -		 * page migration may have returned some pages to the allocator
>> +		 * Set a flag that we successfully isolated in this pageblock.
>> +		 * In the next loop iteration, zone->compact_cached_free_pfn
>> +		 * will not be updated and thus it will effectively contain the
>> +		 * highest pageblock we isolated pages from.
>>   		 */
>> -		if (isolated && next_free_pfn == 0) {
>> +		if (isolated)
>>   			cc->finished_update_free = true;
>> -			next_free_pfn = block_start_pfn;
>> -		}
>
> Why don't you completely remove next_free_pfn and update cc->free_pfn directly?

Hi,

well, you could ask the same about the nr_freepages variable.  To me, a local
variable (now with a comment explaining next_free_pfn) reads better.  The
function has recently received cleanups from several people (including me), so
I wouldn't want to churn it again unless others also think it would be better.
From the compiler's standpoint, both variants should generate the same code, I
guess.

Vlastimil

> Thanks,
> Naoya Horiguchi
>
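
(Editorial aside, not part of the thread: a small userspace sketch of the
restart-pfn bookkeeping discussed above.  The scanner now records every
pageblock it attempts, and whether that value lives in a local next_free_pfn or
is written straight to cc->free_pfn is purely a readability choice.)

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL

struct compact_control_model {
	unsigned long free_pfn;		/* where the free scanner restarts */
	unsigned long migrate_pfn;
};

static unsigned long isolate_block_stub(unsigned long block_start)
{
	return block_start % 1024 ? 0 : 16;	/* pretend some blocks yield pages */
}

static void isolate_freepages_model(struct compact_control_model *cc)
{
	unsigned long block_start = cc->free_pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	unsigned long next_free_pfn = cc->free_pfn;	/* the "local variable" variant */
	unsigned long nr_freepages = 0;

	for (; block_start > cc->migrate_pfn && nr_freepages < 32;
	     block_start -= PAGEBLOCK_NR_PAGES) {
		/* Updated for every attempted block, not only successful ones. */
		next_free_pfn = block_start;
		nr_freepages += isolate_block_stub(block_start);
	}
	cc->free_pfn = next_free_pfn;
}

int main(void)
{
	struct compact_control_model cc = { .free_pfn = 8192, .migrate_pfn = 1024 };

	isolate_freepages_model(&cc);
	printf("free scanner will restart at pfn %lu\n", cc.free_pfn);
	return 0;
}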


* Re: [PATCH] mm/compaction: do not count migratepages when unnecessary
       [not found]         ` <1399411134-k43fsr0p@n-horiguchi@ah.jp.nec.com>
@ 2014-05-07  9:33             ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:33 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Hugh Dickins, gthelen, linux-kernel, linux-mm,
	minchan, Mel Gorman, iamjoonsoo.kim, cl, Rik van Riel

On 05/06/2014 11:18 PM, Naoya Horiguchi wrote:
> On Fri, May 02, 2014 at 05:26:18PM +0200, Vlastimil Babka wrote:
>> During compaction, update_nr_listpages() has been used to count remaining
>> non-migrated and free pages after a call to migrate_pages(). The freepages
>> counting has become unnecessary, and it turns out that migratepages counting
>> is also unnecessary in most cases.
>>
>> The only situation when it's needed to count cc->migratepages is when
>> migrate_pages() returns with a negative error code. Otherwise, the non-negative
>> return value is the number of pages that were not migrated, which is exactly
>> the count of remaining pages in the cc->migratepages list.
>>
>> Furthermore, any non-zero count is only interesting for the tracepoint of
>> mm_compaction_migratepages events, because after that all remaining unmigrated
>> pages are put back and their count is set to 0.
>>
>> This patch therefore removes update_nr_listpages() completely, and changes the
>> tracepoint definition so that the manual counting is done only when the
>> tracepoint is enabled, and only when migrate_pages() returns a negative error
>> code.
>>
>> Furthermore, migrate_pages() and the tracepoints won't be called when there's
>> nothing to migrate. This potentially avoids some wasted cycles and reduces the
>> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
>> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
>> The mm_compaction_isolate_migratepages event is better for determining that
>> nothing was isolated for migration, and this one was just duplicating the info.
>
> I agree with this patch.
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

Thanks.

> A few nitpicks below...
>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>>   mm/compaction.c                   | 30 ++++++++----------------------
>>   2 files changed, 30 insertions(+), 26 deletions(-)
>>
>> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
>> index 06f544e..bbd5e1f 100644
>> --- a/include/trace/events/compaction.h
>> +++ b/include/trace/events/compaction.h
>> @@ -5,7 +5,9 @@
>>   #define _TRACE_COMPACTION_H
>>
>>   #include <linux/types.h>
>> +#include <linux/list.h>
>>   #include <linux/tracepoint.h>
>> +#include <linux/mm_types.h>
>>   #include <trace/events/gfpflags.h>
>>
>>   DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
>> @@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
>>
>>   TRACE_EVENT(mm_compaction_migratepages,
>>
>> -	TP_PROTO(unsigned long nr_migrated,
>> -		unsigned long nr_failed),
>> +	TP_PROTO(unsigned long nr_all,
>> +		int migrate_rc,
>> +		struct list_head * migratepages),
>
> checkpatch.pl shows a coding style violation for this line.
> (ERROR: "foo * bar" should be "foo *bar")
>
>>
>> -	TP_ARGS(nr_migrated, nr_failed),
>> +	TP_ARGS(nr_all, migrate_rc, migratepages),
>>
>>   	TP_STRUCT__entry(
>>   		__field(unsigned long, nr_migrated)
>> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>>   	),
>>
>>   	TP_fast_assign(
>> -		__entry->nr_migrated = nr_migrated;
>> +		unsigned long nr_failed = 0;
>> +		struct page *page;
>> +
>> +		/*
>> +		 * migrate_pages() returns either a non-negative number
>> +		 * with the number of pages that failed migration, or an
>> +		 * error code, in which case we need to count the remaining
>> +		 * pages manually
>> +		 */
>> +		if (migrate_rc >= 0)
>> +			nr_failed = migrate_rc;
>> +		else
>> +		        list_for_each_entry(page, migratepages, lru)
>
> This line contains a whitespace indent.

Oops, will fix those checkpatch complaints...

>> +				nr_failed++;
>> +
>> +		__entry->nr_migrated = nr_all - nr_failed;
>>   		__entry->nr_failed = nr_failed;
>>   	),
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index ae1d0ae..873d7de 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -799,22 +799,6 @@ static void compaction_free(struct page *page, unsigned long data)
>>   	cc->nr_freepages++;
>>   }
>>
>> -/*
>> - * We cannot control nr_migratepages fully when migration is running as
>> - * migrate_pages() has no knowledge of of compact_control.  When migration is
>> - * complete, we count the number of pages on the list by hand.
>> - */
>> -static void update_nr_listpages(struct compact_control *cc)
>> -{
>> -	int nr_migratepages = 0;
>> -	struct page *page;
>> -
>> -	list_for_each_entry(page, &cc->migratepages, lru)
>> -		nr_migratepages++;
>> -
>> -	cc->nr_migratepages = nr_migratepages;
>> -}
>> -
>>   /* possible outcome of isolate_migratepages */
>>   typedef enum {
>>   	ISOLATE_ABORT,		/* Abort compaction now */
>> @@ -1006,7 +990,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>>   	migrate_prep_local();
>>
>>   	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
>> -		unsigned long nr_migrate, nr_remaining;
>>   		int err;
>>
>>   		switch (isolate_migratepages(zone, cc)) {
>> @@ -1021,16 +1004,16 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>>   			;
>>   		}
>>
>> -		nr_migrate = cc->nr_migratepages;
>> +		if (!cc->nr_migratepages)
>> +			continue;
>> +
>>   		err = migrate_pages(&cc->migratepages, compaction_alloc,
>>   				compaction_free, (unsigned long)cc,
>>   				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
>>   				MR_COMPACTION);
>> -		update_nr_listpages(cc);
>> -		nr_remaining = cc->nr_migratepages;
>>
>> -		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
>> -						nr_remaining);
>> +		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
>> +							&cc->migratepages);
>>
>>   		/* Release isolated pages not migrated */
>>   		if (err) {
>> @@ -1044,6 +1027,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>>   				ret = COMPACT_PARTIAL;
>>   				goto out;
>>   			}
>> +		} else {
>> +			/* All pages were successfully migrated */
>> +			cc->nr_migratepages = 0;
>
> cc->nr_migratepages = 0 is also done when err != 0, so can it be done in a common path?

Well, it would have to be done before 'if (err)', and thus also before
putback_movable_pages(), which is a bit awkward.  But avoiding the 'else' is
also nice, I guess, so I'll do that in the next version.

> Thanks,
> Naoya Horiguchi
>
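
(Editorial aside, not part of the thread: a userspace sketch of the counting
rule reviewed above.  migrate_pages() returns the number of pages that failed
when its return value is non-negative; only a negative error code forces
walking the leftover list by hand, which the patch confines to the tracepoint.)

#include <stdio.h>

struct page_model {
	struct page_model *next;	/* stands in for the kernel list_head */
};

static unsigned long count_failed(int migrate_rc,
				  const struct page_model *leftover)
{
	unsigned long nr_failed = 0;

	if (migrate_rc >= 0) {
		nr_failed = (unsigned long)migrate_rc;
	} else {
		for (; leftover; leftover = leftover->next)
			nr_failed++;
	}
	/* The tracepoint would report nr_all - nr_failed and nr_failed. */
	return nr_failed;
}

int main(void)
{
	struct page_model p2 = { NULL }, p1 = { &p2 };

	printf("rc=3:  migrated %lu of 10\n", 10 - count_failed(3, NULL));
	printf("rc=-1: migrated %lu of 10\n", 10 - count_failed(-1, &p1));
	return 0;
}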


* Re: [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:34         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:34 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 04:22 AM, David Rientjes wrote:
> Each zone has a cached migration scanner pfn for memory compaction so that
> subsequent calls to memory compaction can start where the previous call left
> off.
>
> Currently, the compaction migration scanner only updates the per-zone cached pfn
> when pageblocks were not skipped for async compaction.  This creates a
> dependency on calling sync compaction to avoid having subsequent calls to async
> compaction scan an enormous number of non-MOVABLE pageblocks each time it is
> called.  On large machines, this can be very expensive.
>
> This patch adds a per-zone cached migration scanner pfn only for async
> compaction.  It is updated every time a pageblock has been scanned in its
> entirety and no pages from it were successfully isolated.  The cached
> migration scanner pfn for sync compaction is updated only when called for sync
> compaction.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   v3: do not update pageblock skip metadata when a pageblock is skipped due
>       to async compaction, per Vlastimil.

Great.

Acked-by: Vlastimil Babka <vbabka@suse.cz>


>   include/linux/mmzone.h |  5 ++--
>   mm/compaction.c        | 66 ++++++++++++++++++++++++++++++--------------------
>   2 files changed, 43 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -360,9 +360,10 @@ struct zone {
>   	/* Set to true when the PG_migrate_skip bits should be cleared */
>   	bool			compact_blockskip_flush;
>
> -	/* pfns where compaction scanners should start */
> +	/* pfn where compaction free scanner should start */
>   	unsigned long		compact_cached_free_pfn;
> -	unsigned long		compact_cached_migrate_pfn;
> +	/* pfn where async and sync compaction migration scanner should start */
> +	unsigned long		compact_cached_migrate_pfn[2];
>   #endif
>   #ifdef CONFIG_MEMORY_HOTPLUG
>   	/* see spanned/present_pages for more description */
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
>   	unsigned long end_pfn = zone_end_pfn(zone);
>   	unsigned long pfn;
>
> -	zone->compact_cached_migrate_pfn = start_pfn;
> +	zone->compact_cached_migrate_pfn[0] = start_pfn;
> +	zone->compact_cached_migrate_pfn[1] = start_pfn;
>   	zone->compact_cached_free_pfn = end_pfn;
>   	zone->compact_blockskip_flush = false;
>
> @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
>    */
>   static void update_pageblock_skip(struct compact_control *cc,
>   			struct page *page, unsigned long nr_isolated,
> -			bool migrate_scanner)
> +			bool set_unsuitable, bool migrate_scanner)
>   {
>   	struct zone *zone = cc->zone;
> +	unsigned long pfn;
>
>   	if (cc->ignore_skip_hint)
>   		return;
> @@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc,
>   	if (!page)
>   		return;
>
> -	if (!nr_isolated) {
> -		unsigned long pfn = page_to_pfn(page);
> +	if (nr_isolated)
> +		return;
> +
> +	/*
> +	 * Only skip pageblocks when all forms of compaction will be known to
> +	 * fail in the near future.
> +	 */
> +	if (set_unsuitable)
>   		set_pageblock_skip(page);
>
> -		/* Update where compaction should restart */
> -		if (migrate_scanner) {
> -			if (!cc->finished_update_migrate &&
> -			    pfn > zone->compact_cached_migrate_pfn)
> -				zone->compact_cached_migrate_pfn = pfn;
> -		} else {
> -			if (!cc->finished_update_free &&
> -			    pfn < zone->compact_cached_free_pfn)
> -				zone->compact_cached_free_pfn = pfn;
> -		}
> +	pfn = page_to_pfn(page);
> +
> +	/* Update where async and sync compaction should restart */
> +	if (migrate_scanner) {
> +		if (cc->finished_update_migrate)
> +			return;
> +		if (pfn > zone->compact_cached_migrate_pfn[0])
> +			zone->compact_cached_migrate_pfn[0] = pfn;
> +		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
> +			zone->compact_cached_migrate_pfn[1] = pfn;
> +	} else {
> +		if (cc->finished_update_free)
> +			return;
> +		if (pfn < zone->compact_cached_free_pfn)
> +			zone->compact_cached_free_pfn = pfn;
>   	}
>   }
>   #else
> @@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
>
>   static void update_pageblock_skip(struct compact_control *cc,
>   			struct page *page, unsigned long nr_isolated,
> -			bool migrate_scanner)
> +			bool set_unsuitable, bool migrate_scanner)
>   {
>   }
>   #endif /* CONFIG_COMPACTION */
> @@ -329,7 +342,8 @@ isolate_fail:
>
>   	/* Update the pageblock-skip if the whole pageblock was scanned */
>   	if (blockpfn == end_pfn)
> -		update_pageblock_skip(cc, valid_page, total_isolated, false);
> +		update_pageblock_skip(cc, valid_page, total_isolated, true,
> +				      false);
>
>   	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
>   	if (total_isolated)
> @@ -464,7 +478,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	unsigned long flags;
>   	bool locked = false;
>   	struct page *page = NULL, *valid_page = NULL;
> -	bool skipped_async_unsuitable = false;
> +	bool set_unsuitable = true;
>   	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
>   				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
>
> @@ -541,8 +555,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			 */
>   			mt = get_pageblock_migratetype(page);
>   			if (!cc->sync && !migrate_async_suitable(mt)) {
> -				cc->finished_update_migrate = true;
> -				skipped_async_unsuitable = true;
> +				set_unsuitable = false;
>   				goto next_pageblock;
>   			}
>   		}
> @@ -646,11 +659,10 @@ next_pageblock:
>   	/*
>   	 * Update the pageblock-skip information and cached scanner pfn,
>   	 * if the whole pageblock was scanned without isolating any page.
> -	 * This is not done when pageblock was skipped due to being unsuitable
> -	 * for async compaction, so that eventual sync compaction can try.
>   	 */
> -	if (low_pfn == end_pfn && !skipped_async_unsuitable)
> -		update_pageblock_skip(cc, valid_page, nr_isolated, true);
> +	if (low_pfn == end_pfn)
> +		update_pageblock_skip(cc, valid_page, nr_isolated,
> +				      set_unsuitable, true);
>
>   	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
>
> @@ -877,7 +889,8 @@ static int compact_finished(struct zone *zone,
>   	/* Compaction run completes if the migrate and free scanner meet */
>   	if (cc->free_pfn <= cc->migrate_pfn) {
>   		/* Let the next compaction start anew. */
> -		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
> +		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
>   		zone->compact_cached_free_pfn = zone_end_pfn(zone);
>
>   		/*
> @@ -1002,7 +1015,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	 * information on where the scanners should start but check that it
>   	 * is initialised by ensuring the values are within zone boundaries.
>   	 */
> -	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
> +	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
>   	cc->free_pfn = zone->compact_cached_free_pfn;
>   	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
>   		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> @@ -1010,7 +1023,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	}
>   	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
>   		cc->migrate_pfn = start_pfn;
> -		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
> +		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
>   	}
>
>   	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
>
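
To make the cached-pfn scheme above concrete, here is a small standalone
sketch in plain C (not the kernel code; struct toy_zone and the helper names
are invented): the async slot is always advanced, the sync slot only when the
scan runs in sync mode, and a new compaction run restarts from the slot that
matches its own mode.

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
        unsigned long cached_migrate_pfn[2];    /* [0] async, [1] sync */
};

/* Called when a whole pageblock was scanned and nothing was isolated. */
static void update_cached_migrate_pfn(struct toy_zone *z, bool sync,
                                      unsigned long pfn)
{
        if (pfn > z->cached_migrate_pfn[0])
                z->cached_migrate_pfn[0] = pfn;
        if (sync && pfn > z->cached_migrate_pfn[1])
                z->cached_migrate_pfn[1] = pfn;
}

/* Where a new compaction run of the given mode picks up its migrate scan. */
static unsigned long migrate_scan_start(const struct toy_zone *z, bool sync)
{
        return z->cached_migrate_pfn[sync];
}

int main(void)
{
        struct toy_zone z = { .cached_migrate_pfn = { 0, 0 } };

        /* Two async runs skip pageblocks; only the async slot moves. */
        update_cached_migrate_pfn(&z, false, 512);
        update_cached_migrate_pfn(&z, false, 1024);
        printf("after async runs: async=%lu sync=%lu\n",
               migrate_scan_start(&z, false), migrate_scan_start(&z, true));

        /* A later sync run gets past pfn 2048; both slots move. */
        update_cached_migrate_pfn(&z, true, 2048);
        printf("after a sync run: async=%lu sync=%lu\n",
               migrate_scan_start(&z, false), migrate_scan_start(&z, true));
        return 0;
}

This prints async=1024 sync=0 after the async runs and async=2048 sync=2048
after the sync run, which is the behaviour the patch describes: async
compaction no longer depends on a sync pass to remember where it left off.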


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 5/6] mm, thp: avoid excessive compaction latency during fault
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:39         ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-07  9:39 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:50PM -0700, David Rientjes wrote:
> Synchronous memory compaction can be very expensive: it can iterate an enormous 
> amount of memory without aborting, constantly rescheduling, waiting on page
> locks and lru_lock, etc., if a pageblock cannot be defragmented.
> 
> Unfortunately, it's too expensive for transparent hugepage page faults and 
> it's much better to simply fall back to pages.  On 128GB machines, we find that
> synchronous memory compaction can take O(seconds) for a single thp fault.
> 
> Now that async compaction remembers where it left off without strictly relying
> on sync compaction, this makes thp allocations best-effort without causing
> egregious latency during fault.  We still need to retry async compaction after
> reclaim, but this won't stall for seconds.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs
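
The patch body is not quoted in this reply, so the following is only a toy
userspace model of the behaviour the changelog describes (plain C; none of
these names come from the actual allocator code): a THP fault retries async
compaction after reclaim but never escalates to a sync mode, while other
high-order allocations still do.

#include <stdbool.h>
#include <stdio.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

/* Pretend compaction attempt: succeeds only when allowed to work sync. */
static bool try_compact(enum migrate_mode mode)
{
        return mode != MIGRATE_ASYNC;
}

/*
 * Toy allocator slow path: everyone starts with async compaction, but only
 * callers that can tolerate stalls escalate to sync-light on the retry.
 */
static const char *alloc_slowpath(bool thp_fault)
{
        enum migrate_mode mode = MIGRATE_ASYNC;
        int attempt;

        for (attempt = 0; attempt < 2; attempt++) {
                if (try_compact(mode))
                        return "allocated the huge page";
                /* ... direct reclaim would run here ... */
                if (!thp_fault)
                        mode = MIGRATE_SYNC_LIGHT;
        }
        return "fell back to small pages";
}

int main(void)
{
        printf("thp fault:         %s\n", alloc_slowpath(true));
        printf("other high order:  %s\n", alloc_slowpath(false));
        return 0;
}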

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:41         ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-05-07  9:41 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:52PM -0700, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs
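
As a hedged illustration of the idea (not the actual patch, which is not
quoted in this reply): if cond_resched() reports that the scanner really did
give up the CPU, an async scan bails out of the current pageblock the same
way it would on lock contention, while sync modes keep going.

#include <stdbool.h>
#include <stdio.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

/* Pretend cond_resched(): report that a reschedule happened at pfn 3. */
static bool toy_cond_resched(unsigned long pfn)
{
        return pfn == 3;
}

/*
 * Toy pageblock scan: async mode gives up as soon as the scan had to
 * reschedule, sync modes scan the pageblock to the end.
 */
static unsigned long scan_pageblock(enum migrate_mode mode,
                                    unsigned long start, unsigned long end)
{
        unsigned long pfn;

        for (pfn = start; pfn < end; pfn++) {
                if (toy_cond_resched(pfn) && mode == MIGRATE_ASYNC)
                        break;          /* treat it like contention */
                /* ... isolate the page at pfn here ... */
        }
        return pfn - start;             /* how far the scan got */
}

int main(void)
{
        printf("async scanned %lu pfns\n",
               scan_pageblock(MIGRATE_ASYNC, 0, 8));
        printf("sync  scanned %lu pfns\n",
               scan_pageblock(MIGRATE_SYNC_LIGHT, 0, 8));
        return 0;
}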

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07  9:55         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07  9:55 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 04:22 AM, David Rientjes wrote:
> We're going to want to manipulate the migration mode for compaction in the page
> allocator, and currently compact_control's sync field is only a bool.
>
> Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending
> on the value of this bool.  Convert the bool to enum migrate_mode and pass the
> migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for
> thp allocations in the pagefault patch to avoid unnecessary latency.
>
> This also alters compaction triggered from sysfs, either for the entire system
> or for a node, to force MIGRATE_SYNC.
>
> Suggested-by: Mel Gorman <mgorman@suse.de>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   include/linux/compaction.h |  4 ++--
>   mm/compaction.c            | 36 +++++++++++++++++++-----------------
>   mm/internal.h              |  2 +-
>   mm/page_alloc.c            | 37 ++++++++++++++++---------------------
>   4 files changed, 38 insertions(+), 41 deletions(-)
>
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
>   extern int fragmentation_index(struct zone *zone, unsigned int order);
>   extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
>   			int order, gfp_t gfp_mask, nodemask_t *mask,
> -			bool sync, bool *contended);
> +			enum migrate_mode sync, bool *contended);

Everywhere else it's 'mode' and only in this function it's still called 
'sync', which is confusing.
Afterwards:

Acked-by: Vlastimil Babka <vbabka@suse.cz>

>   extern void compact_pgdat(pg_data_t *pgdat, int order);
>   extern void reset_isolation_suitable(pg_data_t *pgdat);
>   extern unsigned long compaction_suitable(struct zone *zone, int order);
> @@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
>   #else
>   static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
>   			int order, gfp_t gfp_mask, nodemask_t *nodemask,
> -			bool sync, bool *contended)
> +			enum migrate_mode sync, bool *contended)
>   {
>   	return COMPACT_CONTINUE;
>   }
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc,
>   			return;
>   		if (pfn > zone->compact_cached_migrate_pfn[0])
>   			zone->compact_cached_migrate_pfn[0] = pfn;
> -		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
> +		if (cc->mode != MIGRATE_ASYNC &&
> +		    pfn > zone->compact_cached_migrate_pfn[1])
>   			zone->compact_cached_migrate_pfn[1] = pfn;
>   	} else {
>   		if (cc->finished_update_free)
> @@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>   		}
>
>   		/* async aborts if taking too long or contended */
> -		if (!cc->sync) {
> +		if (cc->mode == MIGRATE_ASYNC) {
>   			cc->contended = true;
>   			return false;
>   		}
> @@ -479,7 +480,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	bool locked = false;
>   	struct page *page = NULL, *valid_page = NULL;
>   	bool set_unsuitable = true;
> -	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
> +	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
> +					ISOLATE_ASYNC_MIGRATE : 0) |
>   				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
>
>   	/*
> @@ -489,7 +491,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   	 */
>   	while (unlikely(too_many_isolated(zone))) {
>   		/* async migration should just abort */
> -		if (!cc->sync)
> +		if (cc->mode == MIGRATE_ASYNC)
>   			return 0;
>
>   		congestion_wait(BLK_RW_ASYNC, HZ/10);
> @@ -554,7 +556,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			 * the minimum amount of work satisfies the allocation
>   			 */
>   			mt = get_pageblock_migratetype(page);
> -			if (!cc->sync && !migrate_async_suitable(mt)) {
> +			if (cc->mode == MIGRATE_ASYNC &&
> +			    !migrate_async_suitable(mt)) {
>   				set_unsuitable = false;
>   				goto next_pageblock;
>   			}
> @@ -990,6 +993,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	int ret;
>   	unsigned long start_pfn = zone->zone_start_pfn;
>   	unsigned long end_pfn = zone_end_pfn(zone);
> +	const bool sync = cc->mode != MIGRATE_ASYNC;
>
>   	ret = compaction_suitable(zone, cc->order);
>   	switch (ret) {
> @@ -1015,7 +1019,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   	 * information on where the scanners should start but check that it
>   	 * is initialised by ensuring the values are within zone boundaries.
>   	 */
> -	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
> +	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
>   	cc->free_pfn = zone->compact_cached_free_pfn;
>   	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
>   		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> @@ -1049,8 +1053,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>
>   		nr_migrate = cc->nr_migratepages;
>   		err = migrate_pages(&cc->migratepages, compaction_alloc,
> -				compaction_free, (unsigned long)cc,
> -				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
> +				compaction_free, (unsigned long)cc, cc->mode,
>   				MR_COMPACTION);
>   		update_nr_listpages(cc);
>   		nr_remaining = cc->nr_migratepages;
> @@ -1083,9 +1086,8 @@ out:
>   	return ret;
>   }
>
> -static unsigned long compact_zone_order(struct zone *zone,
> -				 int order, gfp_t gfp_mask,
> -				 bool sync, bool *contended)
> +static unsigned long compact_zone_order(struct zone *zone, int order,
> +		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
>   {
>   	unsigned long ret;
>   	struct compact_control cc = {
> @@ -1094,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone,
>   		.order = order,
>   		.migratetype = allocflags_to_migratetype(gfp_mask),
>   		.zone = zone,
> -		.sync = sync,
> +		.mode = mode,
>   	};
>   	INIT_LIST_HEAD(&cc.freepages);
>   	INIT_LIST_HEAD(&cc.migratepages);
> @@ -1116,7 +1118,7 @@ int sysctl_extfrag_threshold = 500;
>    * @order: The order of the current allocation
>    * @gfp_mask: The GFP mask of the current allocation
>    * @nodemask: The allowed nodes to allocate from
> - * @sync: Whether migration is synchronous or not
> + * @sync: The migration mode for async, sync light, or sync migration
>    * @contended: Return value that is true if compaction was aborted due to lock contention
>    * @page: Optionally capture a free page of the requested order during compaction
>    *
> @@ -1124,7 +1126,7 @@ int sysctl_extfrag_threshold = 500;
>    */
>   unsigned long try_to_compact_pages(struct zonelist *zonelist,
>   			int order, gfp_t gfp_mask, nodemask_t *nodemask,
> -			bool sync, bool *contended)
> +			enum migrate_mode sync, bool *contended)
>   {
>   	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
>   	int may_enter_fs = gfp_mask & __GFP_FS;
> @@ -1189,7 +1191,7 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
>   						low_wmark_pages(zone), 0, 0))
>   				compaction_defer_reset(zone, cc->order, false);
>   			/* Currently async compaction is never deferred. */
> -			else if (cc->sync)
> +			else if (cc->mode != MIGRATE_ASYNC)
>   				defer_compaction(zone, cc->order);
>   		}
>
> @@ -1202,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
>   {
>   	struct compact_control cc = {
>   		.order = order,
> -		.sync = false,
> +		.mode = MIGRATE_ASYNC,
>   	};
>
>   	if (!order)
> @@ -1215,7 +1217,7 @@ static void compact_node(int nid)
>   {
>   	struct compact_control cc = {
>   		.order = -1,
> -		.sync = true,
> +		.mode = MIGRATE_SYNC,
>   		.ignore_skip_hint = true,
>   	};
>
> diff --git a/mm/internal.h b/mm/internal.h
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -134,7 +134,7 @@ struct compact_control {
>   	unsigned long nr_migratepages;	/* Number of pages to migrate */
>   	unsigned long free_pfn;		/* isolate_freepages search base */
>   	unsigned long migrate_pfn;	/* isolate_migratepages search base */
> -	bool sync;			/* Synchronous migration */
> +	enum migrate_mode mode;		/* Async or sync migration mode */
>   	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
>   	bool finished_update_free;	/* True when the zone cached pfns are
>   					 * no longer being updated
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2226,7 +2226,7 @@ static struct page *
>   __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>   	struct zonelist *zonelist, enum zone_type high_zoneidx,
>   	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> -	int migratetype, bool sync_migration,
> +	int migratetype, enum migrate_mode mode,
>   	bool *contended_compaction, bool *deferred_compaction,
>   	unsigned long *did_some_progress)
>   {
> @@ -2240,7 +2240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>
>   	current->flags |= PF_MEMALLOC;
>   	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
> -						nodemask, sync_migration,
> +						nodemask, mode,
>   						contended_compaction);
>   	current->flags &= ~PF_MEMALLOC;
>
> @@ -2273,7 +2273,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>   		 * As async compaction considers a subset of pageblocks, only
>   		 * defer if the failure was a sync compaction failure.
>   		 */
> -		if (sync_migration)
> +		if (mode != MIGRATE_ASYNC)
>   			defer_compaction(preferred_zone, order);
>
>   		cond_resched();
> @@ -2286,9 +2286,8 @@ static inline struct page *
>   __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>   	struct zonelist *zonelist, enum zone_type high_zoneidx,
>   	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> -	int migratetype, bool sync_migration,
> -	bool *contended_compaction, bool *deferred_compaction,
> -	unsigned long *did_some_progress)
> +	int migratetype, enum migrate_mode mode, bool *contended_compaction,
> +	bool *deferred_compaction, unsigned long *did_some_progress)
>   {
>   	return NULL;
>   }
> @@ -2483,7 +2482,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>   	int alloc_flags;
>   	unsigned long pages_reclaimed = 0;
>   	unsigned long did_some_progress;
> -	bool sync_migration = false;
> +	enum migrate_mode migration_mode = MIGRATE_ASYNC;
>   	bool deferred_compaction = false;
>   	bool contended_compaction = false;
>
> @@ -2577,17 +2576,15 @@ rebalance:
>   	 * Try direct compaction. The first pass is asynchronous. Subsequent
>   	 * attempts after direct reclaim are synchronous
>   	 */
> -	page = __alloc_pages_direct_compact(gfp_mask, order,
> -					zonelist, high_zoneidx,
> -					nodemask,
> -					alloc_flags, preferred_zone,
> -					migratetype, sync_migration,
> -					&contended_compaction,
> +	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
> +					high_zoneidx, nodemask, alloc_flags,
> +					preferred_zone, migratetype,
> +					migration_mode, &contended_compaction,
>   					&deferred_compaction,
>   					&did_some_progress);
>   	if (page)
>   		goto got_pg;
> -	sync_migration = true;
> +	migration_mode = MIGRATE_SYNC_LIGHT;
>
>   	/*
>   	 * If compaction is deferred for high-order allocations, it is because
> @@ -2662,12 +2659,10 @@ rebalance:
>   		 * direct reclaim and reclaim/compaction depends on compaction
>   		 * being called after reclaim so call directly if necessary
>   		 */
> -		page = __alloc_pages_direct_compact(gfp_mask, order,
> -					zonelist, high_zoneidx,
> -					nodemask,
> -					alloc_flags, preferred_zone,
> -					migratetype, sync_migration,
> -					&contended_compaction,
> +		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
> +					high_zoneidx, nodemask, alloc_flags,
> +					preferred_zone, migratetype,
> +					migration_mode, &contended_compaction,
>   					&deferred_compaction,
>   					&did_some_progress);
>   		if (page)
> @@ -6254,7 +6249,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>   		.nr_migratepages = 0,
>   		.order = -1,
>   		.zone = page_zone(pfn_to_page(start)),
> -		.sync = true,
> +	.mode = MIGRATE_SYNC_LIGHT,
>   		.ignore_skip_hint = true,
>   	};
>   	INIT_LIST_HEAD(&cc.migratepages);
>
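
Since MIGRATE_ASYNC is the first enumerator of enum migrate_mode, the patch
can use "cc->mode != MIGRATE_ASYNC" as a 0/1 index into the cached migrate
pfn pair, exactly as compact_zone() does above. A minimal standalone sketch
of that trick (plain C; struct toy_compact_control is invented for the
example):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the kernel's enum ordering: async first, then the sync modes. */
enum migrate_mode {
        MIGRATE_ASYNC,
        MIGRATE_SYNC_LIGHT,
        MIGRATE_SYNC,
};

struct toy_compact_control {
        enum migrate_mode mode;
        unsigned long cached_migrate_pfn[2];    /* [0] async, [1] sync */
};

/* The bool-to-index trick: any non-async mode uses the sync slot. */
static unsigned long pick_start_pfn(const struct toy_compact_control *cc)
{
        const bool sync = cc->mode != MIGRATE_ASYNC;

        return cc->cached_migrate_pfn[sync];
}

int main(void)
{
        struct toy_compact_control cc = {
                .mode = MIGRATE_ASYNC,
                .cached_migrate_pfn = { 4096, 1024 },
        };

        printf("async starts at %lu\n", pick_start_pfn(&cc));      /* 4096 */
        cc.mode = MIGRATE_SYNC_LIGHT;
        printf("sync-light starts at %lu\n", pick_start_pfn(&cc)); /* 1024 */
        return 0;
}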


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch v4 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07  9:55         ` Vlastimil Babka
@ 2014-05-07 10:36           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 10:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

We're going to want to manipulate the migration mode for compaction in the page 
allocator, and currently compact_control's sync field is only a bool.  

Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
thp allocations in the pagefault patch to avoid unnecessary latency.

This also alters compaction triggered from sysfs, either for the entire system 
or for a node, to force MIGRATE_SYNC.

Suggested-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 v4 of this patch only: converted the name of the formal parameter from "sync"
                        to "mode" for try_to_compact_pages(), per Vlastimil

 include/linux/compaction.h |  4 ++--
 mm/compaction.c            | 38 ++++++++++++++++++++------------------
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 37 ++++++++++++++++---------------------
 4 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode mode, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode mode, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc,
 			return;
 		if (pfn > zone->compact_cached_migrate_pfn[0])
 			zone->compact_cached_migrate_pfn[0] = pfn;
-		if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1])
+		if (cc->mode != MIGRATE_ASYNC &&
+		    pfn > zone->compact_cached_migrate_pfn[1])
 			zone->compact_cached_migrate_pfn[1] = pfn;
 	} else {
 		if (cc->finished_update_free)
@@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 		}
 
 		/* async aborts if taking too long or contended */
-		if (!cc->sync) {
+		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
 			return false;
 		}
@@ -479,7 +480,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
 	bool set_unsuitable = true;
-	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+					ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
 
 	/*
@@ -489,7 +491,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	 */
 	while (unlikely(too_many_isolated(zone))) {
 		/* async migration should just abort */
-		if (!cc->sync)
+		if (cc->mode == MIGRATE_ASYNC)
 			return 0;
 
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -554,7 +556,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			 * the minimum amount of work satisfies the allocation
 			 */
 			mt = get_pageblock_migratetype(page);
-			if (!cc->sync && !migrate_async_suitable(mt)) {
+			if (cc->mode == MIGRATE_ASYNC &&
+			    !migrate_async_suitable(mt)) {
 				set_unsuitable = false;
 				goto next_pageblock;
 			}
@@ -990,6 +993,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -1015,7 +1019,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync];
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
 		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1049,8 +1053,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				compaction_free, (unsigned long)cc,
-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
@@ -1083,9 +1086,8 @@ out:
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
-				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone, int order,
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1094,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
-		.sync = sync,
+		.mode = mode,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1116,7 +1118,7 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
- * @sync: Whether migration is synchronous or not
+ * @mode: The migration mode for async, sync light, or sync migration
  * @contended: Return value that is true if compaction was aborted due to lock contention
  * @page: Optionally capture a free page of the requested order during compaction
  *
@@ -1124,7 +1126,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode mode, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1149,7 +1151,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 								nodemask) {
 		int status;
 
-		status = compact_zone_order(zone, order, gfp_mask, sync,
+		status = compact_zone_order(zone, order, gfp_mask, mode,
 						contended);
 		rc = max(status, rc);
 
@@ -1189,7 +1191,7 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 						low_wmark_pages(zone), 0, 0))
 				compaction_defer_reset(zone, cc->order, false);
 			/* Currently async compaction is never deferred. */
-			else if (cc->sync)
+			else if (cc->mode != MIGRATE_ASYNC)
 				defer_compaction(zone, cc->order);
 		}
 
@@ -1202,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
-		.sync = false,
+		.mode = MIGRATE_ASYNC,
 	};
 
 	if (!order)
@@ -1215,7 +1217,7 @@ static void compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
-		.sync = true,
+		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 	};
 
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,7 +134,7 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
-	bool sync;			/* Synchronous migration */
+	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool finished_update_free;	/* True when the zone cached pfns are
 					 * no longer being updated
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2226,7 +2226,7 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
+	int migratetype, enum migrate_mode mode,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
@@ -2240,7 +2240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-						nodemask, sync_migration,
+						nodemask, mode,
 						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
@@ -2273,7 +2273,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * As async compaction considers a subset of pageblocks, only
 		 * defer if the failure was a sync compaction failure.
 		 */
-		if (sync_migration)
+		if (mode != MIGRATE_ASYNC)
 			defer_compaction(preferred_zone, order);
 
 		cond_resched();
@@ -2286,9 +2286,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, bool sync_migration,
-	bool *contended_compaction, bool *deferred_compaction,
-	unsigned long *did_some_progress)
+	int migratetype, enum migrate_mode mode, bool *contended_compaction,
+	bool *deferred_compaction, unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2483,7 +2482,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	bool sync_migration = false;
+	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
 	bool contended_compaction = false;
 
@@ -2577,17 +2576,15 @@ rebalance:
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
 	 */
-	page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 	if (page)
 		goto got_pg;
-	sync_migration = true;
+	migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because
@@ -2662,12 +2659,10 @@ rebalance:
 		 * direct reclaim and reclaim/compaction depends on compaction
 		 * being called after reclaim so call directly if necessary
 		 */
-		page = __alloc_pages_direct_compact(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					migratetype, sync_migration,
-					&contended_compaction,
+		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+					high_zoneidx, nodemask, alloc_flags,
+					preferred_zone, migratetype,
+					migration_mode, &contended_compaction,
 					&deferred_compaction,
 					&did_some_progress);
 		if (page)
@@ -6254,7 +6249,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 12:09         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07 12:09 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

During compaction, update_nr_listpages() has been used to count remaining
non-migrated and free pages after a call to migrate_pages(). The freepages
counting has become unnecessary, and it turns out that migratepages counting
is also unnecessary in most cases.

The only situation in which cc->migratepages needs to be counted is when
migrate_pages() returns with a negative error code. Otherwise, the non-negative
return value is the number of pages that were not migrated, which is exactly
the count of remaining pages in the cc->migratepages list.

Furthermore, any non-zero count is only of interest to the
mm_compaction_migratepages tracepoint, because after that point all remaining
unmigrated pages are put back and their count is set to 0.

This patch therefore removes update_nr_listpages() completely, and changes the
tracepoint definition so that the manual counting is done only when the
tracepoint is enabled, and only when migrate_pages() returns a negative error
code.

Furthermore, migrate_pages() and the tracepoints won't be called when there's
nothing to migrate. This potentially avoids some wasted cycles and reduces the
volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
The mm_compaction_isolate_migratepages event is better for determining that
nothing was isolated for migration, and this one was just duplicating the info.
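
To spell out the return convention assumed here: migrate_pages() returns either
a non-negative count of pages that failed to migrate, or a negative error code,
in which case the pages remaining on cc->migratepages have to be counted by
hand.  A self-contained sketch of that accounting (pages_left_on_list stands in
for walking the list; the numbers in main() are made up):

#include <stdio.h>

static void account_migration(unsigned long nr_all, int migrate_rc,
			      unsigned long pages_left_on_list)
{
	unsigned long nr_failed;

	if (migrate_rc >= 0)		/* "this many pages failed" */
		nr_failed = migrate_rc;
	else				/* error: count the leftovers by hand */
		nr_failed = pages_left_on_list;

	printf("nr_migrated=%lu nr_failed=%lu\n", nr_all - nr_failed, nr_failed);
}

int main(void)
{
	account_migration(32, 3, 3);	/* 3 of 32 pages failed to migrate */
	account_migration(32, -12, 32);	/* -ENOMEM: nothing was migrated */
	return 0;
}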

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi

 include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
 mm/compaction.c                   | 31 +++++++------------------------
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 06f544e..aacaf0f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,7 +5,9 @@
 #define _TRACE_COMPACTION_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <linux/tracepoint.h>
+#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -47,10 +49,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
 
 TRACE_EVENT(mm_compaction_migratepages,
 
-	TP_PROTO(unsigned long nr_migrated,
-		unsigned long nr_failed),
+	TP_PROTO(unsigned long nr_all,
+		int migrate_rc,
+		struct list_head *migratepages),
 
-	TP_ARGS(nr_migrated, nr_failed),
+	TP_ARGS(nr_all, migrate_rc, migratepages),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, nr_migrated)
@@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
 	),
 
 	TP_fast_assign(
-		__entry->nr_migrated = nr_migrated;
+		unsigned long nr_failed = 0;
+		struct page *page;
+
+		/*
+		 * migrate_pages() returns either a non-negative number
+		 * with the number of pages that failed migration, or an
+		 * error code, in which case we need to count the remaining
+		 * pages manually
+		 */
+		if (migrate_rc >= 0)
+			nr_failed = migrate_rc;
+		else
+			list_for_each_entry(page, migratepages, lru)
+				nr_failed++;
+
+		__entry->nr_migrated = nr_all - nr_failed;
 		__entry->nr_failed = nr_failed;
 	),
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 6f4b218..383d562 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -819,22 +819,6 @@ static void compaction_free(struct page *page, unsigned long data)
 	cc->nr_freepages++;
 }
 
-/*
- * We cannot control nr_migratepages fully when migration is running as
- * migrate_pages() has no knowledge of of compact_control.  When migration is
- * complete, we count the number of pages on the list by hand.
- */
-static void update_nr_listpages(struct compact_control *cc)
-{
-	int nr_migratepages = 0;
-	struct page *page;
-
-	list_for_each_entry(page, &cc->migratepages, lru)
-		nr_migratepages++;
-
-	cc->nr_migratepages = nr_migratepages;
-}
-
 /* possible outcome of isolate_migratepages */
 typedef enum {
 	ISOLATE_ABORT,		/* Abort compaction now */
@@ -1029,7 +1013,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
-		unsigned long nr_migrate, nr_remaining;
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1044,20 +1027,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		nr_migrate = cc->nr_migratepages;
+		if (!cc->nr_migratepages)
+			continue;
+
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
-		update_nr_listpages(cc);
-		nr_remaining = cc->nr_migratepages;
 
-		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
-						nr_remaining);
+		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+							&cc->migratepages);
 
-		/* Release isolated pages not migrated */
+		/* All pages were either migrated or will be released */
+		cc->nr_migratepages = 0;
 		if (err) {
 			putback_movable_pages(&cc->migratepages);
-			cc->nr_migratepages = 0;
 			/*
 			 * migrate_pages() may return -ENOMEM when scanners meet
 			 * and we want compact_finished() to detect it
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09         ` Vlastimil Babka
@ 2014-05-07 12:09           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07 12:09 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

The compaction free scanner in isolate_freepages() currently remembers the PFN
of the highest pageblock where it successfully isolated pages, to be used as the
starting pageblock for the next invocation. The rationale behind this is that
page migration might return free pages to the allocator when migration fails
and we don't want to skip them if the compaction continues.

Since migration now returns free pages back to compaction code where they can
be reused, this is no longer a concern. This patch changes isolate_freepages()
so that the PFN for restarting is updated with each pageblock where isolation
is attempted. Using stress-highalloc from mmtests, this resulted in a 10%
reduction in the pages scanned by the free scanner.

Note that the somewhat similar functionality that records the highest successful
pageblock in zone->compact_cached_free_pfn remains unchanged. This cache is
used when the whole compaction is restarted, not for multiple invocations of
the free scanner during single compaction.
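
Reduced to its control flow, the free scanner now behaves roughly as below.
This is an illustrative sketch, not the real isolate_freepages(): the actual
function also handles zone boundaries, pageblock skip hints and locking, and
suitable_migration_target()/high_pfn here are shorthand for the real per-page
checks and the scanner's starting position.

	/*
	 * next_free_pfn is now advanced for every pageblock the scanner
	 * attempts, not only for pageblocks where isolation succeeded.
	 */
	for (block_start_pfn = high_pfn;	/* scan downward from zone end */
	     block_start_pfn > low_pfn && cc->nr_migratepages > nr_freepages;
	     block_start_pfn -= pageblock_nr_pages) {

		if (!suitable_migration_target(block_start_pfn))
			continue;

		next_free_pfn = block_start_pfn;	/* restart point for next call */
		isolated = isolate_freepages_block(cc, block_start_pfn,
						   block_end_pfn, freelist, false);
		nr_freepages += isolated;

		if (isolated)
			cc->finished_update_free = true;
	}
	/* Where the next invocation of the free scanner resumes. */
	cc->free_pfn = next_free_pfn;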

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 v2: no changes, just keep patches together

 mm/compaction.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 383d562..83ca6f9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -706,12 +706,6 @@ static void isolate_freepages(struct zone *zone,
 	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
 
 	/*
-	 * If no pages are isolated, the block_start_pfn < low_pfn check
-	 * will kick in.
-	 */
-	next_free_pfn = 0;
-
-	/*
 	 * Isolate free pages until enough are available to migrate the
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
@@ -751,19 +745,19 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Found a block suitable for isolating free pages from */
+		next_free_pfn = block_start_pfn;
 		isolated = isolate_freepages_block(cc, block_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
 
 		/*
-		 * Record the highest PFN we isolated pages from. When next
-		 * looking for free pages, the search will restart here as
-		 * page migration may have returned some pages to the allocator
+		 * Set a flag that we successfully isolated in this pageblock.
+		 * In the next loop iteration, zone->compact_cached_free_pfn
+		 * will not be updated and thus it will effectively contain the
+		 * highest pageblock we isolated pages from.
 		 */
-		if (isolated && next_free_pfn == 0) {
+		if (isolated)
 			cc->finished_update_free = true;
-			next_free_pfn = block_start_pfn;
-		}
 	}
 
 	/* split_free_page does not map the pages */
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 12:10         ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-07 12:10 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 04:22 AM, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the
> cond_resched() in isolate_migratepages_range() always takes care of the
> scheduling.
>
> If the cond_resched() actually triggers, then terminate this pageblock scan for
> async compaction as well.
>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>   mm/compaction.c | 7 ++++++-
>   1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			return 0;
>   	}
>
> +	if (cond_resched()) {
> +		/* Async terminates prematurely on need_resched() */
> +		if (cc->mode == MIGRATE_ASYNC)
> +			return 0;
> +	}
> +
>   	/* Time to isolate some pages for migration */
> -	cond_resched();
>   	for (; low_pfn < end_pfn; low_pfn++) {
>   		/* give a chance to irqs before checking need_resched() */
>   		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07  2:22       ` David Rientjes
  (?)
@ 2014-05-07 14:14       ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-07 14:14 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, vbabka, iamjoonsoo.kim,
	gthelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:43PM -0700, David Rientjes wrote:
> Memory compaction works by having a "freeing scanner" scan from one end of a 
> zone which isolates pages as migration targets while another "migrating scanner" 
> scans from the other end of the same zone which isolates pages for migration.
> 
> When page migration fails for an isolated page, the target page is returned to 
> the system rather than the freelist built by the freeing scanner.  This may 
> require the freeing scanner to continue scanning memory after suitable migration 
> targets have already been returned to the system needlessly.
> 
> This patch returns destination pages to the freeing scanner freelist when page 
> migration fails.  This prevents unnecessary work done by the freeing scanner but 
> also encourages memory to be as compacted as possible at the end of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>
> Acked-by: Mel Gorman <mgorman@suse.de>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: David Rientjes <rientjes@google.com>

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 3/6] mm, compaction: add per-zone migration pfn cache for async compaction
  2014-05-07  2:22       ` David Rientjes
  (?)
  (?)
@ 2014-05-07 20:56       ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-07 20:56 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Joonsoo Kim, Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:45PM -0700, David Rientjes wrote:
> Each zone has a cached migration scanner pfn for memory compaction so that 
> subsequent calls to memory compaction can start where the previous call left 
> off.
> 
> Currently, the compaction migration scanner only updates the per-zone cached pfn 
> when pageblocks were not skipped for async compaction.  This creates a 
> dependency on calling sync compaction to avoid having subsequent calls to async 
> compaction from scanning an enormous amount of non-MOVABLE pageblocks each time 
> it is called.  On large machines, this could be potentially very expensive.
> 
> This patch adds a per-zone cached migration scanner pfn only for async 
> compaction.  It is updated everytime a pageblock has been scanned in its 
> entirety and when no pages from it were successfully isolated.  The cached 
> migration scanner pfn for sync compaction is updated only when called for sync 
> compaction.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  v3: do not update pageblock skip metadata when skipped due to async per
>      Vlastimil.

Looks good to me.
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 21:15         ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-07 21:15 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> Memory compaction works by having a "freeing scanner" scan from one end of a 
> zone which isolates pages as migration targets while another "migrating scanner" 
> scans from the other end of the same zone which isolates pages for migration.
> 
> When page migration fails for an isolated page, the target page is returned to 
> the system rather than the freelist built by the freeing scanner.  This may 
> require the freeing scanner to continue scanning memory after suitable migration 
> targets have already been returned to the system needlessly.
> 
> This patch returns destination pages to the freeing scanner freelist when page 
> migration fails.  This prevents unnecessary work done by the freeing scanner but 
> also encourages memory to be as compacted as possible at the end of the zone.
> 
> Reported-by: Greg Thelen <gthelen@google.com>

What did Greg actually report?  IOW, what, if any, observable problem is
being fixed here?

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-07 21:20         ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-07 21:20 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 6 May 2014 19:22:52 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.
> 
> ..
>
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			return 0;
>  	}
>  
> +	if (cond_resched()) {
> +		/* Async terminates prematurely on need_resched() */
> +		if (cc->mode == MIGRATE_ASYNC)
> +			return 0;
> +	}

Comment comments the obvious.  What is less obvious is *why* we do this.


Someone please remind my why sync and async compaction use different
scanning cursors?


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
@ 2014-05-07 21:20         ` Andrew Morton
  0 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-07 21:20 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 6 May 2014 19:22:52 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.
> 
> ..
>
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			return 0;
>  	}
>  
> +	if (cond_resched()) {
> +		/* Async terminates prematurely on need_resched() */
> +		if (cc->mode == MIGRATE_ASYNC)
> +			return 0;
> +	}

Comment comments the obvious.  What is less obvious is *why* we do this.


Someone please remind me why sync and async compaction use different
scanning cursors?


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:15         ` Andrew Morton
@ 2014-05-07 21:21           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > Memory compaction works by having a "freeing scanner" scan from one end of a 
> > zone which isolates pages as migration targets while another "migrating scanner" 
> > scans from the other end of the same zone which isolates pages for migration.
> > 
> > When page migration fails for an isolated page, the target page is returned to 
> > the system rather than the freelist built by the freeing scanner.  This may 
> > require the freeing scanner to continue scanning memory after suitable migration 
> > targets have already been returned to the system needlessly.
> > 
> > This patch returns destination pages to the freeing scanner freelist when page 
> > migration fails.  This prevents unnecessary work done by the freeing scanner but 
> > also encourages memory to be as compacted as possible at the end of the zone.
> > 
> > Reported-by: Greg Thelen <gthelen@google.com>
> 
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?
> 

Greg reported, by code inspection, that isolated free pages were being 
returned to the VM rather than to the compaction freelist.  This leaves 
holes behind the free scanner and forces it to isolate additional memory 
later if more migration targets are needed.
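
For reference, a minimal sketch (mine, not the actual hunk from this
series) of a freeing callback that hands a failed migration target back
to the compaction freelist instead of the buddy allocator; it assumes
the callback receives the page and the compact_control pointer as its
private argument:

static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	/* Put the failed target back on cc->freepages for reuse. */
	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}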

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
@ 2014-05-07 21:21           ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > Memory compaction works by having a "freeing scanner" scan from one end of a 
> > zone which isolates pages as migration targets while another "migrating scanner" 
> > scans from the other end of the same zone which isolates pages for migration.
> > 
> > When page migration fails for an isolated page, the target page is returned to 
> > the system rather than the freelist built by the freeing scanner.  This may 
> > require the freeing scanner to continue scanning memory after suitable migration 
> > targets have already been returned to the system needlessly.
> > 
> > This patch returns destination pages to the freeing scanner freelist when page 
> > migration fails.  This prevents unnecessary work done by the freeing scanner but 
> > also encourages memory to be as compacted as possible at the end of the zone.
> > 
> > Reported-by: Greg Thelen <gthelen@google.com>
> 
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?
> 

Greg reported, by code inspection, that isolated free pages were being 
returned to the VM rather than to the compaction freelist.  This leaves 
holes behind the free scanner and forces it to isolate additional memory 
later if more migration targets are needed.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07 21:20         ` Andrew Morton
@ 2014-05-07 21:28           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> >  			return 0;
> >  	}
> >  
> > +	if (cond_resched()) {
> > +		/* Async terminates prematurely on need_resched() */
> > +		if (cc->mode == MIGRATE_ASYNC)
> > +			return 0;
> > +	}
> 
> Comment comments the obvious.  What is less obvious is *why* we do this.
> 

Async compaction is most prevalent for thp pagefaults, and without 
zone->lru_lock contention we have no other termination criterion.  Without 
this check, we would scan a potentially very long zone (zones 64GB in 
length in my testing), which is very expensive at pagefault time.  Async 
compaction is best effort, so if it is becoming too expensive it's better 
to just fall back to PAGE_SIZE pages and rely on khugepaged to collapse 
them later.

> Someone please remind me why sync and async compaction use different
> scanning cursors?
> 

It's introduced in this patchset.  Async compaction does not consider 
pageblocks unless they are MIGRATE_MOVABLE since it is best effort, while 
sync compaction considers all pageblocks.  In the past, we only updated 
the cursor for sync compaction since it would be wrong to update it for 
async compaction when it can skip certain pageblocks.  Unfortunately, if 
async compaction is relied upon solely for certain allocations (such as 
thp pagefaults), it is possible to pointlessly scan an enormous amount of 
a 64GB zone, for example, every time if none of the memory can be isolated.

The result is that sync compaction always updates both scanners and async 
compaction only updates its own scanner.  Either scanner is only updated 
if the new cursor is "beyond" the previous cursor.  ("Beyond" is _after_ 
the previous migration scanner pfn and _before_ the previous free scanner 
pfn.)
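
As an illustration only (a hypothetical helper, not code from the
series), the "beyond" rule amounts to letting the migration cursor move
only forward and the free cursor only backward:

static void update_cached_cursors(unsigned long *migrate_pfn,
				  unsigned long *free_pfn,
				  unsigned long new_migrate_pfn,
				  unsigned long new_free_pfn)
{
	/* The migration scanner walks toward higher pfns... */
	if (new_migrate_pfn > *migrate_pfn)
		*migrate_pfn = new_migrate_pfn;
	/* ...and the free scanner walks toward lower pfns. */
	if (new_free_pfn < *free_pfn)
		*free_pfn = new_free_pfn;
}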

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
@ 2014-05-07 21:28           ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014, Andrew Morton wrote:

> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -500,8 +500,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> >  			return 0;
> >  	}
> >  
> > +	if (cond_resched()) {
> > +		/* Async terminates prematurely on need_resched() */
> > +		if (cc->mode == MIGRATE_ASYNC)
> > +			return 0;
> > +	}
> 
> Comment comments the obvious.  What is less obvious is *why* we do this.
> 

Async compaction is most prevalent for thp pagefaults, and without 
zone->lru_lock contention we have no other termination criterion.  Without 
this check, we would scan a potentially very long zone (zones 64GB in 
length in my testing), which is very expensive at pagefault time.  Async 
compaction is best effort, so if it is becoming too expensive it's better 
to just fall back to PAGE_SIZE pages and rely on khugepaged to collapse 
them later.

> Someone please remind me why sync and async compaction use different
> scanning cursors?
> 

It's introduced in this patchset.  Async compaction does not consider 
pageblocks unless they are MIGRATE_MOVABLE since it is best effort, while 
sync compaction considers all pageblocks.  In the past, we only updated 
the cursor for sync compaction since it would be wrong to update it for 
async compaction when it can skip certain pageblocks.  Unfortunately, if 
async compaction is relied upon solely for certain allocations (such as 
thp pagefaults), it is possible to pointlessly scan an enormous amount of 
a 64GB zone, for example, every time if none of the memory can be isolated.

The result is that sync compaction always updates both scanners and async 
compaction only updates its own scanner.  Either scanner is only updated 
if the new cursor is "beyond" the previous cursor.  ("Beyond" is _after_ 
the previous migration scanner pfn and _before_ the previous free scanner 
pfn.)


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:15         ` Andrew Morton
@ 2014-05-07 21:39           ` Greg Thelen
  -1 siblings, 0 replies; 267+ messages in thread
From: Greg Thelen @ 2014-05-07 21:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Joonsoo Kim, Hugh Dickins, linux-kernel, linux-mm


On Wed, May 07 2014, Andrew Morton <akpm@linux-foundation.org> wrote:

> On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
>
>> Memory compaction works by having a "freeing scanner" scan from one end of a 
>> zone which isolates pages as migration targets while another "migrating scanner" 
>> scans from the other end of the same zone which isolates pages for migration.
>> 
>> When page migration fails for an isolated page, the target page is returned to 
>> the system rather than the freelist built by the freeing scanner.  This may 
>> require the freeing scanner to continue scanning memory after suitable migration 
>> targets have already been returned to the system needlessly.
>> 
>> This patch returns destination pages to the freeing scanner freelist when page 
>> migration fails.  This prevents unnecessary work done by the freeing scanner but 
>> also encourages memory to be as compacted as possible at the end of the zone.
>> 
>> Reported-by: Greg Thelen <gthelen@google.com>
>
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?

I detected the problem at runtime by seeing that ext4 metadata pages (esp.
the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") were
constantly revisited by compaction's calls to migrate_pages().  These pages
had a non-zero b_count, which caused fallback_migrate_page() ->
try_to_release_page() -> try_to_free_buffers() to fail.
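
A simplified sketch of why that fails (loosely modeled on the busy-buffer
test in fs/buffer.c, assuming <linux/buffer_head.h>; not the exact kernel
code): a page whose buffer_heads are still referenced cannot have its
buffers dropped, so the fallback migration path keeps failing for it.

static int buffer_is_busy(struct buffer_head *bh)
{
	/* sb_bread() leaves b_count elevated until brelse(). */
	return atomic_read(&bh->b_count) ||
	       buffer_dirty(bh) || buffer_locked(bh);
}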

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
@ 2014-05-07 21:39           ` Greg Thelen
  0 siblings, 0 replies; 267+ messages in thread
From: Greg Thelen @ 2014-05-07 21:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Joonsoo Kim, Hugh Dickins, linux-kernel, linux-mm


On Wed, May 07 2014, Andrew Morton <akpm@linux-foundation.org> wrote:

> On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
>
>> Memory compaction works by having a "freeing scanner" scan from one end of a 
>> zone which isolates pages as migration targets while another "migrating scanner" 
>> scans from the other end of the same zone which isolates pages for migration.
>> 
>> When page migration fails for an isolated page, the target page is returned to 
>> the system rather than the freelist built by the freeing scanner.  This may 
>> require the freeing scanner to continue scanning memory after suitable migration 
>> targets have already been returned to the system needlessly.
>> 
>> This patch returns destination pages to the freeing scanner freelist when page 
>> migration fails.  This prevents unnecessary work done by the freeing scanner but 
>> also encourages memory to be as compacted as possible at the end of the zone.
>> 
>> Reported-by: Greg Thelen <gthelen@google.com>
>
> What did Greg actually report?  IOW, what if any observable problem is
> being fixed here?

I detected the problem at runtime by seeing that ext4 metadata pages (esp.
the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") were
constantly revisited by compaction's calls to migrate_pages().  These pages
had a non-zero b_count, which caused fallback_migrate_page() ->
try_to_release_page() -> try_to_free_buffers() to fail.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-07 12:09         ` Vlastimil Babka
@ 2014-05-07 21:44           ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:44 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
> 
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
> 
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
> 
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
> 
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>

I like this; before our two patches, update_nr_listpages() was expensive 
when called for each pageblock, and being able to remove it is certainly 
a step in the right direction toward making compaction as fast as possible.
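
For readers skimming the thread, a rough sketch of the idea (mine; the
actual hunks are in Vlastimil's patch, and the callback/mode names are
those used elsewhere in this series):

	int err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc,
				cc->mode, MR_COMPACTION);

	/*
	 * err >= 0 is the number of pages that were not migrated, i.e.
	 * exactly what update_nr_listpages() used to recount by walking
	 * the list; only a negative error code requires walking the
	 * remaining list, and only when the tracepoint is enabled.
	 */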

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
@ 2014-05-07 21:44           ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:44 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
> 
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
> 
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
> 
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
> 
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>

I like this; before our two patches, update_nr_listpages() was expensive 
when called for each pageblock, and being able to remove it is certainly 
a step in the right direction toward making compaction as fast as possible.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-07 21:47             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:47 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
@ 2014-05-07 21:47             ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-07 21:47 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

On Wed, 7 May 2014, Vlastimil Babka wrote:

> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>

Acked-by: David Rientjes <rientjes@google.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
  (?)
  (?)
@ 2014-05-07 22:06           ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-07 22:06 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, minchan, Mel Gorman, Joonsoo Kim,
	b.zolnierkie, mina86, cl, Rik van Riel

On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
> 
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: no changes, just keep patches together

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-08  5:17         ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:17 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:52PM -0700, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.

Hello,

I think the same logic would be helpful for the cond_resched() in
isolate_freepages(). Also, isolate_freepages() doesn't have exit logic
when it finds zone->lock contention. I think fixing that would be
helpful as well.
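
For illustration only (my sketch, not a patch), the same early exit in
isolate_freepages()'s pageblock loop might look like:

	/*
	 * Hypothetical: mirror the isolate_migratepages_range() check and
	 * abort the free scanner's loop for async compaction once we had
	 * to reschedule.
	 */
	if (cond_resched() && cc->mode == MIGRATE_ASYNC)
		break;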

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 6/6] mm, compaction: terminate async compaction when rescheduling
@ 2014-05-08  5:17         ` Joonsoo Kim
  0 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:17 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Vlastimil Babka,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, May 06, 2014 at 07:22:52PM -0700, David Rientjes wrote:
> Async compaction terminates prematurely when need_resched(), see
> compact_checklock_irqsave().  This can never trigger, however, if the 
> cond_resched() in isolate_migratepages_range() always takes care of the 
> scheduling.
> 
> If the cond_resched() actually triggers, then terminate this pageblock scan for 
> async compaction as well.

Hello,

I think the same logic would be helpful for the cond_resched() in
isolate_freepages(). Also, isolate_freepages() doesn't have exit logic
when it finds zone->lock contention. I think fixing that would be
helpful as well.

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-08  5:28             ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:28 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.

Hello,

Although this patch could reduce the pages scanned, it makes it possible
to skip scanning a fresh pageblock. If there is zone->lock contention and
we are in async compaction, we stop scanning this pageblock immediately
and then continue with the next pageblock. With this patch, next_free_pfn
is updated in this case, so we never come back to this pageblock.
Possibly this makes the compaction success rate lower, doesn't it?

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
@ 2014-05-08  5:28             ` Joonsoo Kim
  0 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-08  5:28 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
> 
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.

Hello,

Although this patch could reduce the pages scanned, it makes it possible
to skip scanning a fresh pageblock. If there is zone->lock contention and
we are in async compaction, we stop scanning this pageblock immediately
and then continue with the next pageblock. With this patch, next_free_pfn
is updated in this case, so we never come back to this pageblock.
Possibly this makes the compaction success rate lower, doesn't it?

Thanks.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-07  2:22       ` David Rientjes
@ 2014-05-08  5:30         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-08  5:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

mm-thp-avoid-excessive-compaction-latency-during-fault.patch excludes sync
compaction for all high order allocations other than thp.  What we really
want to do is suppress sync compaction for thp, but only during the page
fault path.

Orders greater than PAGE_ALLOC_COSTLY_ORDER aren't necessarily going to
loop again, so this is the only way to exhaust our capabilities before
declaring that we can't allocate.

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2585,16 +2585,13 @@ rebalance:
 	if (page)
 		goto got_pg;
 
-	if (gfp_mask & __GFP_NO_KSWAPD) {
-		/*
-		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
-		 * of this allocation isn't critical.  Everything else, however,
-		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
-		 * stalls during fault.
-		 */
-		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
-			migration_mode = MIGRATE_SYNC_LIGHT;
-	}
+	/*
+	 * It can become very expensive to allocate transparent hugepages at
+	 * fault, so use asynchronous memory compaction for THP unless it is
+	 * khugepaged trying to collapse.
+	 */
+	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+		migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
@ 2014-05-08  5:30         ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-08  5:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Vlastimil Babka, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

mm-thp-avoid-excessive-compaction-latency-during-fault.patch excludes sync
compaction for all high order allocations other than thp.  What we really
want to do is suppress sync compaction for thp, but only during the page
fault path.

Orders greater than PAGE_ALLOC_COSTLY_ORDER aren't necessarily going to
loop again, so this is the only way to exhaust our capabilities before
declaring that we can't allocate.

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/page_alloc.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2585,16 +2585,13 @@ rebalance:
 	if (page)
 		goto got_pg;
 
-	if (gfp_mask & __GFP_NO_KSWAPD) {
-		/*
-		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
-		 * of this allocation isn't critical.  Everything else, however,
-		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
-		 * stalls during fault.
-		 */
-		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
-			migration_mode = MIGRATE_SYNC_LIGHT;
-	}
+	/*
+	 * It can become very expensive to allocate transparent hugepages at
+	 * fault, so use asynchronous memory compaction for THP unless it is
+	 * khugepaged trying to collapse.
+	 */
+	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+		migration_mode = MIGRATE_SYNC_LIGHT;
 
 	/*
 	 * If compaction is deferred for high-order allocations, it is because


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-07 12:09         ` Vlastimil Babka
@ 2014-05-09 15:48           ` Michal Nazarewicz
  -1 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:48 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Christoph Lameter, Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
>
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
>
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
>
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
>
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

One tiny comment below:

> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi
>
>  include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>  mm/compaction.c                   | 31 +++++++------------------------
>  2 files changed, 29 insertions(+), 28 deletions(-)
>
> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
> index 06f544e..aacaf0f 100644
> --- a/include/trace/events/compaction.h
> +++ b/include/trace/events/compaction.h
> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->nr_migrated = nr_migrated;
> +		unsigned long nr_failed = 0;
> +		struct page *page;
> +
> +		/*
> +		 * migrate_pages() returns either a non-negative number
> +		 * with the number of pages that failed migration, or an
> +		 * error code, in which case we need to count the remaining
> +		 * pages manually
> +		 */
> +		if (migrate_rc >= 0)
> +			nr_failed = migrate_rc;
> +		else
> +			list_for_each_entry(page, migratepages, lru)
> +				nr_failed++;

list_for_each would suffice here.
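
I.e., an untested sketch, reusing the migratepages/nr_failed names from
the hunk above:

	struct list_head *pos;

	list_for_each(pos, migratepages)
		nr_failed++;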

> +
> +		__entry->nr_migrated = nr_all - nr_failed;
>  		__entry->nr_failed = nr_failed;
>  	),
>  

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
@ 2014-05-09 15:48           ` Michal Nazarewicz
  0 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:48 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Christoph Lameter, Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> During compaction, update_nr_listpages() has been used to count remaining
> non-migrated and free pages after a call to migrate_pages(). The freepages
> counting has become unnecessary, and it turns out that migratepages counting
> is also unnecessary in most cases.
>
> The only situation when it's needed to count cc->migratepages is when
> migrate_pages() returns with a negative error code. Otherwise, the non-negative
> return value is the number of pages that were not migrated, which is exactly
> the count of remaining pages in the cc->migratepages list.
>
> Furthermore, any non-zero count is only interesting for the tracepoint of
> mm_compaction_migratepages events, because after that all remaining unmigrated
> pages are put back and their count is set to 0.
>
> This patch therefore removes update_nr_listpages() completely, and changes the
> tracepoint definition so that the manual counting is done only when the
> tracepoint is enabled, and only when migrate_pages() returns a negative error
> code.
>
> Furthermore, migrate_pages() and the tracepoints won't be called when there's
> nothing to migrate. This potentially avoids some wasted cycles and reduces the
> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
> The mm_compaction_isolate_migratepages event is better for determining that
> nothing was isolated for migration, and this one was just duplicating the info.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

One tiny comment below:

> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi
>
>  include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>  mm/compaction.c                   | 31 +++++++------------------------
>  2 files changed, 29 insertions(+), 28 deletions(-)
>
> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
> index 06f544e..aacaf0f 100644
> --- a/include/trace/events/compaction.h
> +++ b/include/trace/events/compaction.h
> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->nr_migrated = nr_migrated;
> +		unsigned long nr_failed = 0;
> +		struct page *page;
> +
> +		/*
> +		 * migrate_pages() returns either a non-negative number
> +		 * with the number of pages that failed migration, or an
> +		 * error code, in which case we need to count the remaining
> +		 * pages manually
> +		 */
> +		if (migrate_rc >= 0)
> +			nr_failed = migrate_rc;
> +		else
> +			list_for_each_entry(page, migratepages, lru)
> +				nr_failed++;

list_for_each would suffice here.

> +
> +		__entry->nr_migrated = nr_all - nr_failed;
>  		__entry->nr_failed = nr_failed;
>  	),
>  

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-09 15:49             ` Michal Nazarewicz
  -1 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:49 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman, Joonsoo Kim,
	Bartlomiej Zolnierkiewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
>
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
>
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: no changes, just keep patches together
>
>  mm/compaction.c | 18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)
>
-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
@ 2014-05-09 15:49             ` Michal Nazarewicz
  0 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-09 15:49 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel

On Wed, May 07 2014, Vlastimil Babka wrote:
> The compaction free scanner in isolate_freepages() currently remembers PFN of
> the highest pageblock where it successfully isolates, to be used as the
> starting pageblock for the next invocation. The rationale behind this is that
> page migration might return free pages to the allocator when migration fails
> and we don't want to skip them if the compaction continues.
>
> Since migration now returns free pages back to compaction code where they can
> be reused, this is no longer a concern. This patch changes isolate_freepages()
> so that the PFN for restarting is updated with each pageblock where isolation
> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> reduction of the pages scanned by the free scanner.
>
> Note that the somewhat similar functionality that records highest successful
> pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is
> used when the whole compaction is restarted, not for multiple invocations of
> the free scanner during single compaction.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  v2: no changes, just keep patches together
>
>  mm/compaction.c | 18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)
>
-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v4 4/6] mm, compaction: embed migration mode in compact_control
  2014-05-07 10:36           ` David Rientjes
@ 2014-05-09 22:03             ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-09 22:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014 03:36:46 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> We're going to want to manipulate the migration mode for compaction in the page 
> allocator, and currently compact_control's sync field is only a bool.  
> 
> Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
> on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
> migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
> thp allocations in the pagefault patch to avoid unnecessary latency.
> 
> This also alters compaction triggered from sysfs, either for the entire system 
> or for a node, to force MIGRATE_SYNC.

mm/page_alloc.c: In function 'alloc_contig_range':
mm/page_alloc.c:6255: error: unknown field 'sync' specified in initializer

--- a/mm/page_alloc.c~mm-compaction-embed-migration-mode-in-compact_control-fix
+++ a/mm/page_alloc.c
@@ -6252,7 +6252,7 @@ int alloc_contig_range(unsigned long sta
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = MIGRATE_SYNC_LIGHT,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);


Please check that you sent the correct version of this?

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v4 4/6] mm, compaction: embed migration mode in compact_control
@ 2014-05-09 22:03             ` Andrew Morton
  0 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-09 22:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Wed, 7 May 2014 03:36:46 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:

> We're going to want to manipulate the migration mode for compaction in the page 
> allocator, and currently compact_control's sync field is only a bool.  
> 
> Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending 
> on the value of this bool.  Convert the bool to enum migrate_mode and pass the 
> migration mode in directly.  Later, we'll want to avoid MIGRATE_SYNC_LIGHT for 
> thp allocations in the pagefault patch to avoid unnecessary latency.
> 
> This also alters compaction triggered from sysfs, either for the entire system 
> or for a node, to force MIGRATE_SYNC.

mm/page_alloc.c: In function 'alloc_contig_range':
mm/page_alloc.c:6255: error: unknown field 'sync' specified in initializer

--- a/mm/page_alloc.c~mm-compaction-embed-migration-mode-in-compact_control-fix
+++ a/mm/page_alloc.c
@@ -6252,7 +6252,7 @@ int alloc_contig_range(unsigned long sta
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = MIGRATE_SYNC_LIGHT,
+		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);


Please check that you sent the correct version of this?


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:21           ` David Rientjes
@ 2014-05-12  8:35             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  8:35 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/07/2014 11:21 PM, David Rientjes wrote:
> On Wed, 7 May 2014, Andrew Morton wrote:
>
>>> Memory compaction works by having a "freeing scanner" scan from one end of a
>>> zone which isolates pages as migration targets while another "migrating scanner"
>>> scans from the other end of the same zone which isolates pages for migration.
>>>
>>> When page migration fails for an isolated page, the target page is returned to
>>> the system rather than the freelist built by the freeing scanner.  This may
>>> require the freeing scanner to continue scanning memory after suitable migration
>>> targets have already been returned to the system needlessly.
>>>
>>> This patch returns destination pages to the freeing scanner freelist when page
>>> migration fails.  This prevents unnecessary work done by the freeing scanner but
>>> also encourages memory to be as compacted as possible at the end of the zone.
>>>
>>> Reported-by: Greg Thelen <gthelen@google.com>
>>
>> What did Greg actually report?  IOW, what if any observable problem is
>> being fixed here?
>>
>
> Greg reported by code inspection that he found isolated free pages were
> returned back to the VM rather than the compaction freelist.  This will
> cause holes behind the free scanner and cause it to reallocate additional
> memory if necessary later.

More precisely, there shouldn't be holes, as the free scanner restarts at 
the highest pageblock where it isolated something, exactly to avoid making 
holes due to returned pages. But that workaround can now be avoided, which 
is what my patch does.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch v3 2/6] mm, compaction: return failed migration target pages back to freelist
  2014-05-07 21:39           ` Greg Thelen
@ 2014-05-12  8:37             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  8:37 UTC (permalink / raw)
  To: Greg Thelen, Andrew Morton
  Cc: David Rientjes, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Hugh Dickins, linux-kernel, linux-mm

On 05/07/2014 11:39 PM, Greg Thelen wrote:
>
> On Wed, May 07 2014, Andrew Morton <akpm@linux-foundation.org> wrote:
>
>> On Tue, 6 May 2014 19:22:43 -0700 (PDT) David Rientjes <rientjes@google.com> wrote:
>>
>>> Memory compaction works by having a "freeing scanner" scan from one end of a
>>> zone which isolates pages as migration targets while another "migrating scanner"
>>> scans from the other end of the same zone which isolates pages for migration.
>>>
>>> When page migration fails for an isolated page, the target page is returned to
>>> the system rather than the freelist built by the freeing scanner.  This may
>>> require the freeing scanner to continue scanning memory after suitable migration
>>> targets have already been returned to the system needlessly.
>>>
>>> This patch returns destination pages to the freeing scanner freelist when page
>>> migration fails.  This prevents unnecessary work done by the freeing scanner but
>>> also encourages memory to be as compacted as possible at the end of the zone.
>>>
>>> Reported-by: Greg Thelen <gthelen@google.com>
>>
>> What did Greg actually report?  IOW, what if any observable problem is
>> being fixed here?
>
> I detected the problem at runtime seeing that ext4 metadata pages (esp
> the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") were
> constantly visited by compaction calls of migrate_pages().  These pages
> had a non-zero b_count which caused fallback_migrate_page() ->
> try_to_release_page() -> try_to_free_buffers() to fail.

That sounds like something the "mm, compaction: add per-zone migration 
pfn cache for async compaction" patch would fix, not this one, though.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-08  5:28             ` Joonsoo Kim
@ 2014-05-12  9:09               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  9:09 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 05/08/2014 07:28 AM, Joonsoo Kim wrote:
> On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
>> The compaction free scanner in isolate_freepages() currently remembers PFN of
>> the highest pageblock where it successfully isolates, to be used as the
>> starting pageblock for the next invocation. The rationale behind this is that
>> page migration might return free pages to the allocator when migration fails
>> and we don't want to skip them if the compaction continues.
>>
>> Since migration now returns free pages back to compaction code where they can
>> be reused, this is no longer a concern. This patch changes isolate_freepages()
>> so that the PFN for restarting is updated with each pageblock where isolation
>> is attempted. Using stress-highalloc from mmtests, this resulted in 10%
>> reduction of the pages scanned by the free scanner.
>
> Hello,
>
> Although this patch could reduce the pages scanned, it is possible to skip
> scanning a fresh pageblock. If there is zone lock contention and we are on
> async compaction, we stop scanning this pageblock immediately. And
> then, we will continue to scan the next pageblock. With this patch,
> next_free_pfn is updated in this case, so we never come back to this
> pageblock. Possibly this makes the compaction success rate lower, doesn't
> it?

Hm, you're right, and thanks for catching that, but I think this is a
sign of a worse and older issue than skipping a pageblock.
When isolate_freepages_block() breaks its loop due to lock contention,
isolate_freepages() (which called it) should also immediately quit its own
loop; trying another pageblock in the same zone, under the same zone->lock,
makes no sense here. If this is fixed, then the issue you're pointing
out will also be fixed, as next_free_pfn will still point to the
pageblock where the break occurred.
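
(For illustration only, a self-contained userspace C sketch of the control
flow I mean above; the names scan_state/scan_block are made up and this is
not kernel code. The outer per-pageblock loop quits as soon as the
per-block helper flags contention, so the restart cursor keeps pointing at
the interrupted pageblock instead of advancing past it.)

#include <stdbool.h>
#include <stdio.h>

struct scan_state {
        bool contended;         /* models cc->contended */
        int next_free_block;    /* models the next_free_pfn bookkeeping */
};

/* Stand-in for isolate_freepages_block(): pretend block 5 hits contention. */
static int scan_block(struct scan_state *st, int block)
{
        if (block == 5) {
                st->contended = true;
                return 0;               /* nothing isolated, aborted */
        }
        return 1;                       /* "isolated something" */
}

int main(void)
{
        struct scan_state st = { .contended = false, .next_free_block = 0 };
        int block;

        for (block = 10; block >= 0; block--) { /* free scanner walks downwards */
                scan_block(&st, block);
                st.next_free_block = block;     /* updated for every attempted block */
                if (st.contended)
                        break;                  /* quit immediately on contention */
        }
        /* prints: restart at block 5, contended=1 */
        printf("restart at block %d, contended=%d\n",
               st.next_free_block, st.contended);
        return 0;
}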

> Thanks.
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 1/2] mm/compaction: do not count migratepages when unnecessary
  2014-05-09 15:48           ` Michal Nazarewicz
@ 2014-05-12  9:51             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12  9:51 UTC (permalink / raw)
  To: Michal Nazarewicz, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Christoph Lameter, Rik van Riel

On 05/09/2014 05:48 PM, Michal Nazarewicz wrote:
> On Wed, May 07 2014, Vlastimil Babka wrote:
>> During compaction, update_nr_listpages() has been used to count remaining
>> non-migrated and free pages after a call to migrate_pages(). The freepages
>> counting has become unnecessary, and it turns out that migratepages counting
>> is also unnecessary in most cases.
>>
>> The only situation when it's needed to count cc->migratepages is when
>> migrate_pages() returns with a negative error code. Otherwise, the non-negative
>> return value is the number of pages that were not migrated, which is exactly
>> the count of remaining pages in the cc->migratepages list.
>>
>> Furthermore, any non-zero count is only interesting for the tracepoint of
>> mm_compaction_migratepages events, because after that all remaining unmigrated
>> pages are put back and their count is set to 0.
>>
>> This patch therefore removes update_nr_listpages() completely, and changes the
>> tracepoint definition so that the manual counting is done only when the
>> tracepoint is enabled, and only when migrate_pages() returns a negative error
>> code.
>>
>> Furthermore, migrate_pages() and the tracepoints won't be called when there's
>> nothing to migrate. This potentially avoids some wasted cycles and reduces the
>> volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0
>> nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events.
>> The mm_compaction_isolate_migratepages event is better for determining that
>> nothing was isolated for migration, and this one was just duplicating the info.
>>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
>> Cc: Michal Nazarewicz <mina86@mina86.com>
> 
> Acked-by: Michal Nazarewicz <mina86@mina86.com>
> 
> One tiny comment below:
> 
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   v2: checkpatch and other non-functional fixes suggested by Naoya Horiguchi
>>
>>   include/trace/events/compaction.h | 26 ++++++++++++++++++++++----
>>   mm/compaction.c                   | 31 +++++++------------------------
>>   2 files changed, 29 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
>> index 06f544e..aacaf0f 100644
>> --- a/include/trace/events/compaction.h
>> +++ b/include/trace/events/compaction.h
>> @@ -58,7 +61,22 @@ TRACE_EVENT(mm_compaction_migratepages,
>>   	),
>>   
>>   	TP_fast_assign(
>> -		__entry->nr_migrated = nr_migrated;
>> +		unsigned long nr_failed = 0;
>> +		struct page *page;
>> +
>> +		/*
>> +		 * migrate_pages() returns either a non-negative number
>> +		 * with the number of pages that failed migration, or an
>> +		 * error code, in which case we need to count the remaining
>> +		 * pages manually
>> +		 */
>> +		if (migrate_rc >= 0)
>> +			nr_failed = migrate_rc;
>> +		else
>> +			list_for_each_entry(page, migratepages, lru)
>> +				nr_failed++;
> 
> list_for_each would suffice here.
> 
>> +
>> +		__entry->nr_migrated = nr_all - nr_failed;
>>   		__entry->nr_failed = nr_failed;
>>   	),
>>   

Right, thanks!

-----8<-----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 12 May 2014 10:56:11 +0200
Subject: [PATCH] mm-compaction-do-not-count-migratepages-when-unnecessary-fix

list_for_each() is enough for counting the list length. We also avoid having to
include the struct page definition this way.

Suggested-by: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/trace/events/compaction.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index aacaf0f..c6814b9 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -7,7 +7,6 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/tracepoint.h>
-#include <linux/mm_types.h>
 #include <trace/events/gfpflags.h>
 
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
@@ -62,7 +61,7 @@ TRACE_EVENT(mm_compaction_migratepages,
 
 	TP_fast_assign(
 		unsigned long nr_failed = 0;
-		struct page *page;
+		struct list_head *page_lru;
 
 		/*
 		 * migrate_pages() returns either a non-negative number
@@ -73,7 +72,7 @@ TRACE_EVENT(mm_compaction_migratepages,
 		if (migrate_rc >= 0)
 			nr_failed = migrate_rc;
 		else
-			list_for_each_entry(page, migratepages, lru)
+			list_for_each(page_lru, migratepages)
 				nr_failed++;
 
 		__entry->nr_migrated = nr_all - nr_failed;
-- 
1.8.4.5
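
(As an aside, here is a self-contained userspace sketch of why the bare node
walk is enough; the minimal list_head/list_for_each below are re-implemented
just for the example and are not the kernel's <linux/list.h>. The loop never
needs to know the element type that embeds the node, so struct page can stay
out of the tracepoint header.)

#include <stdio.h>

struct list_head {
        struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name)    { &(name), &(name) }
#define list_for_each(pos, head) \
        for ((pos) = (head)->next; (pos) != (head); (pos) = (pos)->next)

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

int main(void)
{
        struct list_head migratepages = LIST_HEAD_INIT(migratepages);
        struct list_head nodes[3];      /* stand-ins for pages left on the list */
        struct list_head *pos;
        unsigned long nr_failed = 0;
        int i;

        for (i = 0; i < 3; i++)
                list_add_tail(&nodes[i], &migratepages);

        list_for_each(pos, &migratepages)
                nr_failed++;

        printf("nr_failed = %lu\n", nr_failed); /* prints: nr_failed = 3 */
        return 0;
}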




^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-08  5:17         ` Joonsoo Kim
@ 2014-05-12 14:15           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12 14:15 UTC (permalink / raw)
  To: Joonsoo Kim, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

Compaction uses the compact_checklock_irqsave() function to periodically check
for lock contention and need_resched(), in order to either abort async
compaction, or to free the lock, schedule and retake the lock. When aborting,
cc->contended is set to signal the contended state to the caller. Two problems
have been identified in this mechanism.

First, compaction also calls cond_resched() directly in both scanners when no
lock is yet taken. Depending on the call site, this call either does not abort
async compaction, or does not set cc->contended appropriately. This patch
introduces a new compact_check_resched() function to achieve both.

Second, isolate_freepages() does not check whether isolate_freepages_block()
aborted due to contention, and advances to the next pageblock. This violates
the principle of aborting on contention, and might result in pageblocks not
being scanned completely, since the scanning cursor is advanced. This patch
makes isolate_freepages() check the cc->contended flag (set by
isolate_freepages_block()) and abort.

Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
 mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 83ca6f9..b34ab7c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 	return true;
 }
 
+/*
+ * Similar to compact_checklock_irqsave() (see its comment) for places where
+ * a zone lock is not concerned.
+ *
+ * Returns false when compaction should abort.
+ */
+static inline bool compact_check_resched(struct compact_control *cc)
+{
+	/* async compaction aborts if contended */
+	if (need_resched()) {
+		if (cc->mode == MIGRATE_ASYNC) {
+			cc->contended = true;
+			return false;
+		}
+
+		cond_resched();
+	}
+
+	return true;
+}
+
 /* Returns true if the page is within a block suitable for migration to */
 static bool suitable_migration_target(struct page *page)
 {
@@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			return 0;
 	}
 
-	if (cond_resched()) {
-		/* Async terminates prematurely on need_resched() */
-		if (cc->mode == MIGRATE_ASYNC)
-			return 0;
-	}
+	if (!compact_check_resched(cc))
+		return 0;
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
@@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
 		/*
 		 * This can iterate a massively long zone without finding any
 		 * suitable migration targets, so periodically check if we need
-		 * to schedule.
+		 * to schedule, or even abort async compaction.
 		 */
-		cond_resched();
+		if (!compact_check_resched(cc))
+			break;
 
 		if (!pfn_valid(block_start_pfn))
 			continue;
@@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
 		 */
 		if (isolated)
 			cc->finished_update_free = true;
+
+		/*
+		 * isolate_freepages_block() might have aborted due to async
+		 * compaction being contended
+		 */
+		if (cc->contended)
+			break;
 	}
 
 	/* split_free_page does not map the pages */
-- 
1.8.4.5
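
(For readers following along, a standalone C model of the decision
compact_check_resched() is meant to make; the names cc_model,
check_resched_model and MODEL_* are invented for this sketch and it is not
the kernel function. When rescheduling is needed, the async case records
contention and asks to abort, while the sync case would simply yield and
carry on.)

#include <stdbool.h>
#include <stdio.h>

enum migrate_mode_model { MODEL_ASYNC, MODEL_SYNC_LIGHT, MODEL_SYNC };

struct cc_model {
        enum migrate_mode_model mode;
        bool contended;
};

/* Returns false when compaction should abort. */
static bool check_resched_model(struct cc_model *cc, bool resched_needed)
{
        if (resched_needed) {
                if (cc->mode == MODEL_ASYNC) {
                        cc->contended = true;
                        return false;
                }
                /* a sync caller would cond_resched() here and continue */
        }
        return true;
}

int main(void)
{
        struct cc_model async_cc = { .mode = MODEL_ASYNC, .contended = false };
        struct cc_model sync_cc  = { .mode = MODEL_SYNC,  .contended = false };
        bool async_continue, sync_continue;

        async_continue = check_resched_model(&async_cc, true);
        sync_continue  = check_resched_model(&sync_cc, true);

        /* prints: async continue=0 contended=1 / sync continue=1 contended=0 */
        printf("async: continue=%d contended=%d\n", async_continue, async_cc.contended);
        printf("sync:  continue=%d contended=%d\n", sync_continue, sync_cc.contended);
        return 0;
}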


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
  (?)
@ 2014-05-12 15:34           ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-12 15:34 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, minchan, Mel Gorman,
	b.zolnierkie, mina86, cl, Rik van Riel

On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls directly cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or set
> cc->contended appropriately. This patch introduces a new
> compact_check_resched() function to achieve both.
> 
> Second, isolate_freepages() does not check if isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages_block() check the cc->contended flag and abort.
> 
> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>  1 file changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 83ca6f9..b34ab7c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  	return true;
>  }
>  
> +/*
> + * Similar to compact_checklock_irqsave() (see its comment) for places where
> + * a zone lock is not concerned.
> + *
> + * Returns false when compaction should abort.
> + */
> +static inline bool compact_check_resched(struct compact_control *cc)
> +{
> +	/* async compaction aborts if contended */
> +	if (need_resched()) {
> +		if (cc->mode == MIGRATE_ASYNC) {
> +			cc->contended = true;

This changes the meaning of contended in struct compact_control (it no longer
indicates just lock contention), so please update the comment in mm/internal.h too.

Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

Thanks,
Naoya Horiguchi


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
       [not found]           ` <1399908847-ouuxeneo@n-horiguchi@ah.jp.nec.com>
@ 2014-05-12 15:45               ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-12 15:45 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, minchan, Mel Gorman,
	b.zolnierkie, mina86, cl, Rik van Riel

On 05/12/2014 05:34 PM, Naoya Horiguchi wrote:
> On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls directly cond_resched() in both scanners when no
>> lock is yet taken. This call either does not abort async compaction, or set
>> cc->contended appropriately. This patch introduces a new
>> compact_check_resched() function to achieve both.
>>
>> Second, isolate_freepages() does not check if isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages_block() check the cc->contended flag and abort.
>>
>> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
>> Cc: Michal Nazarewicz <mina86@mina86.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>>   1 file changed, 33 insertions(+), 7 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 83ca6f9..b34ab7c 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>>   	return true;
>>   }
>>
>> +/*
>> + * Similar to compact_checklock_irqsave() (see its comment) for places where
>> + * a zone lock is not concerned.
>> + *
>> + * Returns false when compaction should abort.
>> + */
>> +static inline bool compact_check_resched(struct compact_control *cc)
>> +{
>> +	/* async compaction aborts if contended */
>> +	if (need_resched()) {
>> +		if (cc->mode == MIGRATE_ASYNC) {
>> +			cc->contended = true;
>
> This changes the meaning of contended in struct compact_control (not just
> indicating lock contention,) so please update the comment in mm/internal.h too.

It doesn't change it, since compact_checklock_irqsave() already has this 
semantic:
if (should_release_lock(lock) && cc->mode == MIGRATE_ASYNC)
	cc->contended = true;

and should_release_lock() is:
	need_resched() || spin_is_contended(lock)

So the comment was already outdated; I will update it in v2.

> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>

Thanks.

> Thanks,
> Naoya Horiguchi
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 15:45               ` Vlastimil Babka
  (?)
@ 2014-05-12 15:53               ` Naoya Horiguchi
  -1 siblings, 0 replies; 267+ messages in thread
From: Naoya Horiguchi @ 2014-05-12 15:53 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, minchan, Mel Gorman,
	b.zolnierkie, mina86, cl, Rik van Riel

On Mon, May 12, 2014 at 05:45:31PM +0200, Vlastimil Babka wrote:
...
> >>+/*
> >>+ * Similar to compact_checklock_irqsave() (see its comment) for places where
> >>+ * a zone lock is not concerned.
> >>+ *
> >>+ * Returns false when compaction should abort.
> >>+ */
> >>+static inline bool compact_check_resched(struct compact_control *cc)
> >>+{
> >>+	/* async compaction aborts if contended */
> >>+	if (need_resched()) {
> >>+		if (cc->mode == MIGRATE_ASYNC) {
> >>+			cc->contended = true;
> >
> >This changes the meaning of contended in struct compact_control (not just
> >indicating lock contention,) so please update the comment in mm/internal.h too.
> 
> It doesn't change it, since compact_checklock_irqsave() already has
> this semantic:
> if (should_release_lock(lock) && cc->mode == MIGRATE_ASYNC)
> 	cc->contended = true;
> 
> and should_release_lock() is:
> 	need_resched() || spin_is_contended(lock)
> 
> So the comment was already outdated, I will update it in v2.

Ah OK. Sorry for falsely blaming you :)

Naoya


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
@ 2014-05-12 20:28             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-12 20:28 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Mon, 12 May 2014, Vlastimil Babka wrote:

> diff --git a/mm/compaction.c b/mm/compaction.c
> index 83ca6f9..b34ab7c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  	return true;
>  }
>  
> +/*
> + * Similar to compact_checklock_irqsave() (see its comment) for places where
> + * a zone lock is not concerned.
> + *
> + * Returns false when compaction should abort.
> + */

I think we should have sufficient commentary in the code that
describes why we do this.

> +static inline bool compact_check_resched(struct compact_control *cc)
> +{

I'm not sure that compact_check_resched() is the appropriate name.  Sure,
it describes what the current implementation does, but what it's really
doing is determining when compaction should abort prematurely.

Something like compact_should_abort()?

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
@ 2014-05-13  0:44             ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-13  0:44 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls directly cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or set
> cc->contended appropriately. This patch introduces a new
> compact_check_resched() function to achieve both.
> 
> Second, isolate_freepages() does not check if isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages_block() check the cc->contended flag and abort.
> 
> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
>  mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>  1 file changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 83ca6f9..b34ab7c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  	return true;
>  }
>  
> +/*
> + * Similar to compact_checklock_irqsave() (see its comment) for places where
> + * a zone lock is not concerned.
> + *
> + * Returns false when compaction should abort.
> + */
> +static inline bool compact_check_resched(struct compact_control *cc)
> +{
> +	/* async compaction aborts if contended */
> +	if (need_resched()) {
> +		if (cc->mode == MIGRATE_ASYNC) {
> +			cc->contended = true;
> +			return false;
> +		}
> +
> +		cond_resched();
> +	}
> +
> +	return true;
> +}
> +
>  /* Returns true if the page is within a block suitable for migration to */
>  static bool suitable_migration_target(struct page *page)
>  {
> @@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			return 0;
>  	}
>  
> -	if (cond_resched()) {
> -		/* Async terminates prematurely on need_resched() */
> -		if (cc->mode == MIGRATE_ASYNC)
> -			return 0;
> -	}
> +	if (!compact_check_resched(cc))
> +		return 0;
>  
>  	/* Time to isolate some pages for migration */
>  	for (; low_pfn < end_pfn; low_pfn++) {
> @@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
>  		/*
>  		 * This can iterate a massively long zone without finding any
>  		 * suitable migration targets, so periodically check if we need
> -		 * to schedule.
> +		 * to schedule, or even abort async compaction.
>  		 */
> -		cond_resched();
> +		if (!compact_check_resched(cc))
> +			break;
>  
>  		if (!pfn_valid(block_start_pfn))
>  			continue;
> @@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
>  		 */
>  		if (isolated)
>  			cc->finished_update_free = true;
> +
> +		/*
> +		 * isolate_freepages_block() might have aborted due to async
> +		 * compaction being contended
> +		 */
> +		if (cc->contended)
> +			break;
>  	}

Hello,

I think that we can go further.

The problem is that this cc->contended is checked only in
isolate_migratepages() to break out of the compaction. So if there are
free pages we have already taken, compaction wouldn't be stopped
immediately, and isolate_freepages() could be invoked again on the next
compaction_alloc(). If there is no contention at that time, we would try
to get free pages from one more pageblock, because the cc->contended check
is at the bottom of the loop in isolate_migratepages() and compaction will
continue to run. AFAIK, we want to stop the compaction in this case.

Moreover, if this isolate_freepages() doesn't stop the compaction, the
next isolate_migratepages() will be invoked, and it would only be stopped
by the cc->contended check after isolating some pages for migration.
This is useless overhead and should be removed.

Thanks.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-12  9:09               ` Vlastimil Babka
@ 2014-05-13  1:15                 ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-13  1:15 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Mon, May 12, 2014 at 11:09:25AM +0200, Vlastimil Babka wrote:
> On 05/08/2014 07:28 AM, Joonsoo Kim wrote:
> >On Wed, May 07, 2014 at 02:09:10PM +0200, Vlastimil Babka wrote:
> >>The compaction free scanner in isolate_freepages() currently remembers PFN of
> >>the highest pageblock where it successfully isolates, to be used as the
> >>starting pageblock for the next invocation. The rationale behind this is that
> >>page migration might return free pages to the allocator when migration fails
> >>and we don't want to skip them if the compaction continues.
> >>
> >>Since migration now returns free pages back to compaction code where they can
> >>be reused, this is no longer a concern. This patch changes isolate_freepages()
> >>so that the PFN for restarting is updated with each pageblock where isolation
> >>is attempted. Using stress-highalloc from mmtests, this resulted in 10%
> >>reduction of the pages scanned by the free scanner.
> >
> >Hello,
> >
> >Although this patch could reduce page scanned, it is possible to skip
> >scanning fresh pageblock. If there is zone lock contention and we are on
> >asyn compaction, we stop scanning this pageblock immediately. And
> >then, we will continue to scan next pageblock. With this patch,
> >next_free_pfn is updated in this case, so we never come back again to this
> >pageblock. Possibly this makes compaction success rate low, doesn't
> >it?
> 
> Hm, you're right and thanks for catching that, but I think this is a
> sign of a worse and older issue than skipping a pageblock?
> When isolate_freepages_block() breaks loop due to lock contention,
> then isolate_freepages() (which called it) should also immediately
> quit its loop. Trying another pageblock in the same zone with the
> same zone->lock makes no sense here? If this is fixed, then the
> issue you're pointing out will also be fixed as next_free_pfn will
> still point to the pageblock where the break occured.
> 

Yes. It can be fixed by your approach. :)

Thanks.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 20:28             ` David Rientjes
@ 2014-05-13  8:50               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-13  8:50 UTC (permalink / raw)
  To: David Rientjes
  Cc: Joonsoo Kim, Andrew Morton, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 05/12/2014 10:28 PM, David Rientjes wrote:
> On Mon, 12 May 2014, Vlastimil Babka wrote:
>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 83ca6f9..b34ab7c 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>>   	return true;
>>   }
>>
>> +/*
>> + * Similar to compact_checklock_irqsave() (see its comment) for places where
>> + * a zone lock is not concerned.
>> + *
>> + * Returns false when compaction should abort.
>> + */
>
> I think we should have some sufficient commentary in the code that
> describes why we do this.

Well I can of course mostly duplicate the comment of 
compact_checklock_irqsave() instead of referring to it, if you think 
that's better.

>> +static inline bool compact_check_resched(struct compact_control *cc)
>> +{
>
> I'm not sure that compact_check_resched() is the appropriate name.  Sure,
> it specifies what the current implementation is, but what it's really
> actually doing is determining when compaction should abort prematurely.
>
> Something like compact_should_abort()?

I tried to be somewhat analogous to the name of 
compact_checklock_irqsave(). compact_should_abort() doesn't indicate 
that there might be a resched().

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-13  0:44             ` Joonsoo Kim
@ 2014-05-13  8:54               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-13  8:54 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 05/13/2014 02:44 AM, Joonsoo Kim wrote:
> On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls directly cond_resched() in both scanners when no
>> lock is yet taken. This call either does not abort async compaction, or set
>> cc->contended appropriately. This patch introduces a new
>> compact_check_resched() function to achieve both.
>>
>> Second, isolate_freepages() does not check if isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages_block() check the cc->contended flag and abort.
>>
>> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
>> Cc: Michal Nazarewicz <mina86@mina86.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> ---
>>   mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
>>   1 file changed, 33 insertions(+), 7 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 83ca6f9..b34ab7c 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>>   	return true;
>>   }
>>
>> +/*
>> + * Similar to compact_checklock_irqsave() (see its comment) for places where
>> + * a zone lock is not concerned.
>> + *
>> + * Returns false when compaction should abort.
>> + */
>> +static inline bool compact_check_resched(struct compact_control *cc)
>> +{
>> +	/* async compaction aborts if contended */
>> +	if (need_resched()) {
>> +		if (cc->mode == MIGRATE_ASYNC) {
>> +			cc->contended = true;
>> +			return false;
>> +		}
>> +
>> +		cond_resched();
>> +	}
>> +
>> +	return true;
>> +}
>> +
>>   /* Returns true if the page is within a block suitable for migration to */
>>   static bool suitable_migration_target(struct page *page)
>>   {
>> @@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>>   			return 0;
>>   	}
>>
>> -	if (cond_resched()) {
>> -		/* Async terminates prematurely on need_resched() */
>> -		if (cc->mode == MIGRATE_ASYNC)
>> -			return 0;
>> -	}
>> +	if (!compact_check_resched(cc))
>> +		return 0;
>>
>>   	/* Time to isolate some pages for migration */
>>   	for (; low_pfn < end_pfn; low_pfn++) {
>> @@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
>>   		/*
>>   		 * This can iterate a massively long zone without finding any
>>   		 * suitable migration targets, so periodically check if we need
>> -		 * to schedule.
>> +		 * to schedule, or even abort async compaction.
>>   		 */
>> -		cond_resched();
>> +		if (!compact_check_resched(cc))
>> +			break;
>>
>>   		if (!pfn_valid(block_start_pfn))
>>   			continue;
>> @@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
>>   		 */
>>   		if (isolated)
>>   			cc->finished_update_free = true;
>> +
>> +		/*
>> +		 * isolate_freepages_block() might have aborted due to async
>> +		 * compaction being contended
>> +		 */
>> +		if (cc->contended)
>> +			break;
>>   	}
>
> Hello,
>
> I think that we can do further.
>
> The problem is that this cc->contended is checked only in
> isolate_migratepages() to break out the compaction. So if there are
> free pages we are already taken, compaction wouldn't stopped
> immediately and isolate_freepages() could be invoked again on next
> compaction_alloc(). If there is no contention at this time, we would try
> to get free pages from one pageblock because cc->contended checking is
> on bottom of the loop in isolate_migratepages() and will continue to
> run compaction. AFAIK, we want to stop the compaction in this case.
>
> Moreover, if this isolate_freepages() don't stop the compaction,
> next isolate_migratepages() will be invoked and it would be stopped
> by checking cc->contended after isolating some pages for migration.
> This is useless overhead so should be removed.

Good catch again, thanks! So that means checking the flag also in 
compaction_alloc(). But what to do if we managed to isolate something and 
then found out about being contended? Put all pages back and go home, or 
try to migrate what we have?

I'm becoming worried that all these changes will mean that async 
compaction will have near zero probability of finishing anything before 
hitting contention. And then everything it did until the contention 
would be wasted work.
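For completeness, a rough sketch of the 'put all pages back and go home'
option, reusing the existing release_freepages() helper; the helper name
below is made up for illustration and this is not proposed code:

/*
 * Sketch only: give back any free pages isolated before the contention
 * was noticed, so an aborting async compaction leaves nothing behind.
 */
static void putback_isolated_freepages(struct compact_control *cc)
{
	cc->nr_freepages -= release_freepages(&cc->freepages);
	VM_BUG_ON(cc->nr_freepages != 0);
}

The other option, migrating what was already isolated, is what the v2 patch
later in this thread ends up doing.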

> Thanks.
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-08  5:30         ` David Rientjes
@ 2014-05-13 10:00           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-13 10:00 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Rik van Riel, Joonsoo Kim, Greg Thelen, Hugh Dickins,
	linux-kernel, linux-mm

On 05/08/2014 07:30 AM, David Rientjes wrote:
> mm-thp-avoid-excessive-compaction-latency-during-fault.patch excludes sync
> compaction for all high order allocations other than thp.  What we really
> want to do is suppress sync compaction for thp, but only during the page
> fault path.
>
> Orders greater than PAGE_ALLOC_COSTLY_ORDER aren't necessarily going to
> loop again so this is the only way to exhaust our capabilities before
> declaring that we can't allocate.
>
> Reported-by: Hugh Dickins <hughd@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   mm/page_alloc.c | 17 +++++++----------
>   1 file changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2585,16 +2585,13 @@ rebalance:
>   	if (page)
>   		goto got_pg;
>
> -	if (gfp_mask & __GFP_NO_KSWAPD) {
> -		/*
> -		 * Khugepaged is allowed to try MIGRATE_SYNC_LIGHT, the latency
> -		 * of this allocation isn't critical.  Everything else, however,
> -		 * should only be allowed to do MIGRATE_ASYNC to avoid excessive
> -		 * stalls during fault.
> -		 */
> -		if ((current->flags & (PF_KTHREAD | PF_KSWAPD)) == PF_KTHREAD)
> -			migration_mode = MIGRATE_SYNC_LIGHT;
> -	}
> +	/*
> +	 * It can become very expensive to allocate transparent hugepages at
> +	 * fault, so use asynchronous memory compaction for THP unless it is
> +	 * khugepaged trying to collapse.
> +	 */
> +	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
> +		migration_mode = MIGRATE_SYNC_LIGHT;

I wonder about a process doing e.g. mmap() with MAP_POPULATE. It 
seems to me that it would get only MIGRATE_ASYNC here, right? Its 
gfp_mask would include __GFP_NO_KSWAPD and it won't have PF_KTHREAD.
I think that goes against the idea that with MAP_POPULATE you say you 
are willing to wait to have everything in place before you actually use 
the memory. So I guess you are also willing to wait for hugepages in 
that situation?
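To make the concern concrete, a tiny standalone illustration of how the
proposed condition classifies these cases; the flag values are made up for
the example and only the boolean expression mirrors the patch:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative values only; the real flags live in gfp.h and sched.h. */
#define __GFP_NO_KSWAPD	0x1u
#define PF_KTHREAD	0x2u

/* Mirrors the proposed test: sync-light for everything but a THP fault. */
static bool uses_sync_light(unsigned int gfp_mask, unsigned int task_flags)
{
	return !(gfp_mask & __GFP_NO_KSWAPD) || (task_flags & PF_KTHREAD);
}

int main(void)
{
	/* THP fault in a user process, e.g. prefaulted via MAP_POPULATE */
	printf("THP fault:     %d\n", uses_sync_light(__GFP_NO_KSWAPD, 0));
	/* khugepaged collapsing pages in the background */
	printf("khugepaged:    %d\n", uses_sync_light(__GFP_NO_KSWAPD, PF_KTHREAD));
	/* other high-order allocations without __GFP_NO_KSWAPD */
	printf("other callers: %d\n", uses_sync_light(0, 0));
	return 0;
}

So under the proposed check a MAP_POPULATE prefault would indeed stay at
MIGRATE_ASYNC, just like a normal THP fault.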

>
>   	/*
>   	 * If compaction is deferred for high-order allocations, it is because
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-13  8:54               ` Vlastimil Babka
@ 2014-05-15  2:21                 ` Joonsoo Kim
  -1 siblings, 0 replies; 267+ messages in thread
From: Joonsoo Kim @ 2014-05-15  2:21 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Tue, May 13, 2014 at 10:54:58AM +0200, Vlastimil Babka wrote:
> On 05/13/2014 02:44 AM, Joonsoo Kim wrote:
> >On Mon, May 12, 2014 at 04:15:11PM +0200, Vlastimil Babka wrote:
> >>Compaction uses compact_checklock_irqsave() function to periodically check for
> >>lock contention and need_resched() to either abort async compaction, or to
> >>free the lock, schedule and retake the lock. When aborting, cc->contended is
> >>set to signal the contended state to the caller. Two problems have been
> >>identified in this mechanism.
> >>
> >>First, compaction also calls directly cond_resched() in both scanners when no
> >>lock is yet taken. This call either does not abort async compaction, or set
> >>cc->contended appropriately. This patch introduces a new
> >>compact_check_resched() function to achieve both.
> >>
> >>Second, isolate_freepages() does not check if isolate_freepages_block()
> >>aborted due to contention, and advances to the next pageblock. This violates
> >>the principle of aborting on contention, and might result in pageblocks not
> >>being scanned completely, since the scanning cursor is advanced. This patch
> >>makes isolate_freepages_block() check the cc->contended flag and abort.
> >>
> >>Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> >>Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >>Cc: Minchan Kim <minchan@kernel.org>
> >>Cc: Mel Gorman <mgorman@suse.de>
> >>Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> >>Cc: Michal Nazarewicz <mina86@mina86.com>
> >>Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> >>Cc: Christoph Lameter <cl@linux.com>
> >>Cc: Rik van Riel <riel@redhat.com>
> >>---
> >>  mm/compaction.c | 40 +++++++++++++++++++++++++++++++++-------
> >>  1 file changed, 33 insertions(+), 7 deletions(-)
> >>
> >>diff --git a/mm/compaction.c b/mm/compaction.c
> >>index 83ca6f9..b34ab7c 100644
> >>--- a/mm/compaction.c
> >>+++ b/mm/compaction.c
> >>@@ -222,6 +222,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
> >>  	return true;
> >>  }
> >>
> >>+/*
> >>+ * Similar to compact_checklock_irqsave() (see its comment) for places where
> >>+ * a zone lock is not concerned.
> >>+ *
> >>+ * Returns false when compaction should abort.
> >>+ */
> >>+static inline bool compact_check_resched(struct compact_control *cc)
> >>+{
> >>+	/* async compaction aborts if contended */
> >>+	if (need_resched()) {
> >>+		if (cc->mode == MIGRATE_ASYNC) {
> >>+			cc->contended = true;
> >>+			return false;
> >>+		}
> >>+
> >>+		cond_resched();
> >>+	}
> >>+
> >>+	return true;
> >>+}
> >>+
> >>  /* Returns true if the page is within a block suitable for migration to */
> >>  static bool suitable_migration_target(struct page *page)
> >>  {
> >>@@ -491,11 +512,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> >>  			return 0;
> >>  	}
> >>
> >>-	if (cond_resched()) {
> >>-		/* Async terminates prematurely on need_resched() */
> >>-		if (cc->mode == MIGRATE_ASYNC)
> >>-			return 0;
> >>-	}
> >>+	if (!compact_check_resched(cc))
> >>+		return 0;
> >>
> >>  	/* Time to isolate some pages for migration */
> >>  	for (; low_pfn < end_pfn; low_pfn++) {
> >>@@ -718,9 +736,10 @@ static void isolate_freepages(struct zone *zone,
> >>  		/*
> >>  		 * This can iterate a massively long zone without finding any
> >>  		 * suitable migration targets, so periodically check if we need
> >>-		 * to schedule.
> >>+		 * to schedule, or even abort async compaction.
> >>  		 */
> >>-		cond_resched();
> >>+		if (!compact_check_resched(cc))
> >>+			break;
> >>
> >>  		if (!pfn_valid(block_start_pfn))
> >>  			continue;
> >>@@ -758,6 +777,13 @@ static void isolate_freepages(struct zone *zone,
> >>  		 */
> >>  		if (isolated)
> >>  			cc->finished_update_free = true;
> >>+
> >>+		/*
> >>+		 * isolate_freepages_block() might have aborted due to async
> >>+		 * compaction being contended
> >>+		 */
> >>+		if (cc->contended)
> >>+			break;
> >>  	}
> >
> >Hello,
> >
> >I think that we can do further.
> >
> >The problem is that this cc->contended is checked only in
> >isolate_migratepages() to break out the compaction. So if there are
> >free pages we are already taken, compaction wouldn't stopped
> >immediately and isolate_freepages() could be invoked again on next
> >compaction_alloc(). If there is no contention at this time, we would try
> >to get free pages from one pageblock because cc->contended checking is
> >on bottom of the loop in isolate_migratepages() and will continue to
> >run compaction. AFAIK, we want to stop the compaction in this case.
> >
> >Moreover, if this isolate_freepages() don't stop the compaction,
> >next isolate_migratepages() will be invoked and it would be stopped
> >by checking cc->contended after isolating some pages for migration.
> >This is useless overhead so should be removed.
> 
> Good catch again, thanks! So that means checking the flag also in
> compaction_alloc(). But what to do if we managed isolated something
> and then found out about being contended? Put all pages back and go
> home, or try to migrate what we have?

I think that 'try to migrate what we have' is better, because it
doesn't cause contention on the zone lock anymore until the freepages are
exhausted. If there is contention on other things such as the page lock,
migration will skip those pages, so continuing would not be a problem, I think.

> 
> I'm becoming worried that all these changes will mean that async
> compaction will have near zero probability of finishing anything
> before hitting a contention. And then everything it did until the
> contention would be a wasted work.

Yes, but I think this logic would not lower the success
rate much compared to the current logic, because, without this change,
compaction stops after the next isolate_migratepages() anyway.

Thanks.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-12 14:15           ` Vlastimil Babka
@ 2014-05-16  9:47             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-16  9:47 UTC (permalink / raw)
  To: Joonsoo Kim, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

Compaction uses compact_checklock_irqsave() function to periodically check for
lock contention and need_resched() to either abort async compaction, or to
free the lock, schedule and retake the lock. When aborting, cc->contended is
set to signal the contended state to the caller. Two problems have been
identified in this mechanism.

First, compaction also calls cond_resched() directly in both scanners when no
lock is yet taken. This call neither aborts async compaction nor sets
cc->contended appropriately. This patch introduces a new compact_should_abort()
function to achieve both. In isolate_freepages(), the check frequency is
reduced to once per SWAP_CLUSTER_MAX pageblocks to match what the migration
scanner does in the preliminary page checks. In case a pageblock is found
suitable for calling isolate_freepages_block(), the checks within it are
done at a higher frequency.

Second, isolate_freepages() does not check whether isolate_freepages_block()
aborted due to contention, and advances to the next pageblock. This violates
the principle of aborting on contention, and might result in pageblocks not
being scanned completely, since the scanning cursor is advanced. This patch
makes isolate_freepages() check the cc->contended flag and abort.

In case isolate_freepages() has already isolated some pages before aborting
due to contention, page migration will proceed, which is OK since we do not
want to waste the work that has been done, and page migration has its own
checks for contention. However, we do not want another isolation attempt by
either of the scanners, so a cc->contended flag check is also added to
compaction_alloc() and compact_finished() to make sure compaction is aborted
right after the migration.

Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
---
v2: update struct compact_control comment (per Naoya Horiguchi)
    rename to compact_should_abort() and add comments (per David Rientjes)
    add cc->contended checks in compaction_alloc() and compact_finished()
    (per Joonsoo Kim)
    reduce frequency of checks in isolate_freepages() 

 mm/compaction.c | 54 ++++++++++++++++++++++++++++++++++++++++++++----------
 mm/internal.h   |  5 ++++-
 2 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 83ca6f9..6fc9f18 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -222,6 +222,30 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 	return true;
 }
 
+/*
+ * Aside from avoiding lock contention, compaction also periodically checks
+ * need_resched() and either schedules in sync compaction, or aborts async
+ * compaction. This is similar to what compact_checklock_irqsave() does, but used
+ * where no lock is concerned.
+ *
+ * Returns false when no scheduling was needed, or sync compaction scheduled.
+ * Returns true when async compaction should abort.
+ */
+static inline bool compact_should_abort(struct compact_control *cc)
+{
+	/* async compaction aborts if contended */
+	if (need_resched()) {
+		if (cc->mode == MIGRATE_ASYNC) {
+			cc->contended = true;
+			return true;
+		}
+
+		cond_resched();
+	}
+
+	return false;
+}
+
 /* Returns true if the page is within a block suitable for migration to */
 static bool suitable_migration_target(struct page *page)
 {
@@ -491,11 +515,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			return 0;
 	}
 
-	if (cond_resched()) {
-		/* Async terminates prematurely on need_resched() */
-		if (cc->mode == MIGRATE_ASYNC)
-			return 0;
-	}
+	if (compact_should_abort(cc))
+		return 0;
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
@@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
 		/*
 		 * This can iterate a massively long zone without finding any
 		 * suitable migration targets, so periodically check if we need
-		 * to schedule.
+		 * to schedule, or even abort async compaction.
 		 */
-		cond_resched();
+		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+						&& compact_should_abort(cc))
+			break;
 
 		if (!pfn_valid(block_start_pfn))
 			continue;
@@ -758,6 +781,13 @@ static void isolate_freepages(struct zone *zone,
 		 */
 		if (isolated)
 			cc->finished_update_free = true;
+
+		/*
+		 * isolate_freepages_block() might have aborted due to async
+		 * compaction being contended
+		 */
+		if (cc->contended)
+			break;
 	}
 
 	/* split_free_page does not map the pages */
@@ -785,9 +815,13 @@ static struct page *compaction_alloc(struct page *migratepage,
 	struct compact_control *cc = (struct compact_control *)data;
 	struct page *freepage;
 
-	/* Isolate free pages if necessary */
+	/*
+	 * Isolate free pages if necessary, and if we are not aborting due to
+	 * contention.
+	 */
 	if (list_empty(&cc->freepages)) {
-		isolate_freepages(cc->zone, cc);
+		if (!cc->contended)
+			isolate_freepages(cc->zone, cc);
 
 		if (list_empty(&cc->freepages))
 			return NULL;
@@ -857,7 +891,7 @@ static int compact_finished(struct zone *zone,
 	unsigned int order;
 	unsigned long watermark;
 
-	if (fatal_signal_pending(current))
+	if (cc->contended || fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
 
 	/* Compaction run completes if the migrate and free scanner meet */
diff --git a/mm/internal.h b/mm/internal.h
index a25424a..ad844ab 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -144,7 +144,10 @@ struct compact_control {
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
-	bool contended;			/* True if a lock was contended */
+	bool contended;			/* True if a lock was contended, or
+					 * need_resched() true during async
+					 * compaction
+					 */
 };
 
 unsigned long
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-16 17:33               ` Michal Nazarewicz
  -1 siblings, 0 replies; 267+ messages in thread
From: Michal Nazarewicz @ 2014-05-16 17:33 UTC (permalink / raw)
  To: Vlastimil Babka, Joonsoo Kim, Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Vlastimil Babka, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Christoph Lameter, Rik van Riel

On Fri, May 16 2014, Vlastimil Babka wrote:
> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
>
> First, compaction also calls directly cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or set
> cc->contended appropriately. This patch introduces a new compact_should_abort()
> function to achieve both. In isolate_freepages(), the check frequency is
> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
> scanner does in the preliminary page checks. In case a pageblock is found
> suitable for calling isolate_freepages_block(), the checks within there are
> done on higher frequency.
>
> Second, isolate_freepages() does not check if isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages_block() check the cc->contended flag and abort.
>
> In case isolate_freepages() has already isolated some pages before aborting
> due to contention, page migration will proceed, which is OK since we do not
> want to waste the work that has been done, and page migration has own checks
> for contention. However, we do not want another isolation attempt by either
> of the scanners, so cc->contended flag check is added also to
> compaction_alloc() and compact_finished() to make sure compaction is aborted
> right after the migration.
>
> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
> Cc: Michal Nazarewicz <mina86@mina86.com>

Acked-by: Michal Nazarewicz <mina86@mina86.com>

> Cc: Christoph Lameter <cl@linux.com>
> Cc: Rik van Riel <riel@redhat.com>
> ---
> v2: update struct compact_control comment (per Naoya Horiguchi)
>     rename to compact_should_abort() and add comments (per David Rientjes)
>     add cc->contended checks in compaction_alloc() and compact_finished()
>     (per Joonsoo Kim)
>     reduce frequency of checks in isolate_freepages() 
>
-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +--<mpn@google.com>--<xmpp:mina86@jabber.org>--ooO--(_)--Ooo--


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-07 12:09           ` Vlastimil Babka
@ 2014-05-19 10:14             ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-19 10:14 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Hugh Dickins, Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

I wonder why nobody complained about the build warning... sorry.

----8<----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 19 May 2014 12:02:38 +0200
Subject: mm-compaction-avoid-rescanning-pageblocks-in-isolate_freepages-fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a (spurious) build warning:

mm/compaction.c:860:15: warning: ‘next_free_pfn’ may be used uninitialized in this function [-Wmaybe-uninitialized]

Seems like the compiler cannot prove that exiting the for loop without updating
next_free_pfn there will mean that the check for crossing the scanners will
trigger. So let's not confuse people who try to see why this warning occurs.

Instead of initializing next_free_pfn to zero with an explaining comment, just
drop the damned variable altogether and work with cc->free_pfn directly as
Nayoa originally suggested.

Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/compaction.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 8db9820..b0f939b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -760,7 +760,6 @@ static void isolate_freepages(struct zone *zone,
 	unsigned long block_start_pfn;	/* start of current pageblock */
 	unsigned long block_end_pfn;	/* end of current pageblock */
 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
-	unsigned long next_free_pfn; /* start pfn for scaning at next round */
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -822,7 +821,7 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Found a block suitable for isolating free pages from */
-		next_free_pfn = block_start_pfn;
+		cc->free_pfn = block_start_pfn;
 		isolated = isolate_freepages_block(cc, block_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
@@ -852,9 +851,8 @@ static void isolate_freepages(struct zone *zone,
 	 * so that compact_finished() may detect this
 	 */
 	if (block_start_pfn < low_pfn)
-		next_free_pfn = cc->migrate_pfn;
+		cc->free_pfn = cc->migrate_pfn;
 
-	cc->free_pfn = next_free_pfn;
 	cc->nr_freepages = nr_freepages;
 }
 
-- 
1.8.4.5



^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-19 23:37               ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-19 23:37 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On Fri, 16 May 2014 11:47:53 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:

> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls directly cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or set
> cc->contended appropriately. This patch introduces a new compact_should_abort()
> function to achieve both. In isolate_freepages(), the check frequency is
> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
> scanner does in the preliminary page checks. In case a pageblock is found
> suitable for calling isolate_freepages_block(), the checks within there are
> done on higher frequency.
> 
> Second, isolate_freepages() does not check if isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages_block() check the cc->contended flag and abort.
> 
> In case isolate_freepages() has already isolated some pages before aborting
> due to contention, page migration will proceed, which is OK since we do not
> want to waste the work that has been done, and page migration has own checks
> for contention. However, we do not want another isolation attempt by either
> of the scanners, so cc->contended flag check is added also to
> compaction_alloc() and compact_finished() to make sure compaction is aborted
> right after the migration.

What are the runtime effects of this change?

> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

What did Joonsoo report?  Perhaps this is the same thing..

>
> ...
>
> @@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
>  		/*
>  		 * This can iterate a massively long zone without finding any
>  		 * suitable migration targets, so periodically check if we need
> -		 * to schedule.
> +		 * to schedule, or even abort async compaction.
>  		 */
> -		cond_resched();
> +		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
> +						&& compact_should_abort(cc))

This seems rather gratuitously inefficient and isn't terribly clear. 
What's wrong with

	if ((++foo % SWAP_CLUSTER_MAX) == 0 && compact_should_abort(cc))

?

(Assumes that SWAP_CLUSTER_MAX is power-of-2 and that the compiler will
use &)


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-19 23:37               ` Andrew Morton
@ 2014-05-21 14:13                 ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-21 14:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On 05/20/2014 01:37 AM, Andrew Morton wrote:
> On Fri, 16 May 2014 11:47:53 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
> 
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls directly cond_resched() in both scanners when no
>> lock is yet taken. This call either does not abort async compaction, or set
>> cc->contended appropriately. This patch introduces a new compact_should_abort()
>> function to achieve both. In isolate_freepages(), the check frequency is
>> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
>> scanner does in the preliminary page checks. In case a pageblock is found
>> suitable for calling isolate_freepages_block(), the checks within there are
>> done on higher frequency.
>>
>> Second, isolate_freepages() does not check if isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages_block() check the cc->contended flag and abort.
>>
>> In case isolate_freepages() has already isolated some pages before aborting
>> due to contention, page migration will proceed, which is OK since we do not
>> want to waste the work that has been done, and page migration has own checks
>> for contention. However, we do not want another isolation attempt by either
>> of the scanners, so cc->contended flag check is added also to
>> compaction_alloc() and compact_finished() to make sure compaction is aborted
>> right after the migration.
> 
> What are the runtime effects of this change?
> 
>> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 
> What did Joonsoo report?  Perhaps this is the same thing..

Updated log message:

Compaction uses compact_checklock_irqsave() function to periodically check for
lock contention and need_resched() to either abort async compaction, or to
free the lock, schedule and retake the lock. When aborting, cc->contended is
set to signal the contended state to the caller. Two problems have been
identified in this mechanism.

First, compaction also directly calls cond_resched() in both scanners when no
lock is yet taken. This call neither aborts async compaction nor sets
cc->contended appropriately. This patch introduces a new compact_should_abort()
function to achieve both. In isolate_freepages(), the check frequency is
reduced to once per SWAP_CLUSTER_MAX pageblocks to match what the migration
scanner does in its preliminary page checks. When a pageblock is found
suitable for calling isolate_freepages_block(), the checks within it are
done at a higher frequency.

Second, isolate_freepages() does not check if isolate_freepages_block()
aborted due to contention, and advances to the next pageblock. This violates
the principle of aborting on contention, and might result in pageblocks not
being scanned completely, since the scanning cursor is advanced. This problem
was noticed in the code by Joonsoo Kim while reviewing related patches.
This patch makes isolate_freepages_block() check the cc->contended flag and
abort.

In case isolate_freepages() has already isolated some pages before aborting
due to contention, page migration will proceed, which is OK since we do not
want to waste the work that has been done, and page migration has its own checks
for contention. However, we do not want another isolation attempt by either
of the scanners, so a cc->contended flag check is also added to
compaction_alloc() and compact_finished() to make sure compaction is aborted
right after the migration.

The outcome of the patch should be reduced lock contention by async compaction
and lower latencies for higher-order allocations where direct compaction is
involved.

>>
>> ...
>>
>> @@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
>>   		/*
>>   		 * This can iterate a massively long zone without finding any
>>   		 * suitable migration targets, so periodically check if we need
>> -		 * to schedule.
>> +		 * to schedule, or even abort async compaction.
>>   		 */
>> -		cond_resched();
>> +		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
>> +						&& compact_should_abort(cc))
> 
> This seems rather gratuitously inefficient and isn't terribly clear.
> What's wrong with
> 
> 	if ((++foo % SWAP_CLUSTER_MAX) == 0 && compact_should_abort(cc))

It's a new variable and it differs from how isolate_migratepages_range() does this.
But yeah, I might change it later there as well. There it makes even more sense:
e.g. when skipping a whole pageblock there, the pfn stays pageblock-aligned and thus
a multiple of SWAP_CLUSTER_MAX, so pfn % SWAP_CLUSTER_MAX will always be zero and
the periodicity varies.
 
> ?
> 
> (Assumes that SWAP_CLUSTER_MAX is power-of-2 and that the compiler will
> use &)
 
I hoped that the compiler would be smart enough about SWAP_CLUSTER_MAX * pageblock_nr_pages
as well, as those are constants and also powers of 2. But I didn't check the assembly.

-----8<-----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 21 May 2014 16:07:21 +0200
Subject: [PATCH] 
 mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix

Use a separate counter variable (not pfn) to trigger abort checks.

Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/compaction.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index bbe6a26..23c7439 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -779,6 +779,7 @@ static void isolate_freepages(struct zone *zone,
 	unsigned long block_start_pfn;	/* start of current pageblock */
 	unsigned long block_end_pfn;	/* end of current pageblock */
 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
+	unsigned long nr_blocks_scanned = 0; /* for periodical abort checks */
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -813,7 +814,7 @@ static void isolate_freepages(struct zone *zone,
 		 * suitable migration targets, so periodically check if we need
 		 * to schedule, or even abort async compaction.
 		 */
-		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+		if ((++nr_blocks_scanned % SWAP_CLUSTER_MAX) == 0
 						&& compact_should_abort(cc))
 			break;
 
-- 
1.8.4.5




^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-21 14:13                 ` Vlastimil Babka
@ 2014-05-21 20:11                   ` Andrew Morton
  -1 siblings, 0 replies; 267+ messages in thread
From: Andrew Morton @ 2014-05-21 20:11 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen,
	linux-kernel, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On Wed, 21 May 2014 16:13:39 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:

> >>
> >> ...
> >>
> >> @@ -718,9 +739,11 @@ static void isolate_freepages(struct zone *zone,
> >>   		/*
> >>   		 * This can iterate a massively long zone without finding any
> >>   		 * suitable migration targets, so periodically check if we need
> >> -		 * to schedule.
> >> +		 * to schedule, or even abort async compaction.
> >>   		 */
> >> -		cond_resched();
> >> +		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
> >> +						&& compact_should_abort(cc))
> > 
> > This seems rather gratuitously inefficient and isn't terribly clear.
> > What's wrong with
> > 
> > 	if ((++foo % SWAP_CLUSTER_MAX) == 0 && compact_should_abort(cc))
> 
> It's a new variable and it differs from how isolate_migratepages_range() does this.
> But yeah, I might change it later there as well. There it makes even more sense.
> E.g. when skipping whole pageblock there, pfn % SWAP_CLUSTER_MAX will be always zero
> so the periodicity varies.
>  
> > ?
> > 
> > (Assumes that SWAP_CLUSTER_MAX is power-of-2 and that the compiler will
> > use &)
>  
> I hoped that compiler would be smart enough about SWAP_CLUSTER_MAX * pageblock_nr_pages
> as well, as those are constants and also power-of-2. But I didn't check the assembly.

Always check the assembly!  Just a quick `size mm/compaction.o' is
enough to tell if you're on the right track.

> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -779,6 +779,7 @@ static void isolate_freepages(struct zone *zone,
>  	unsigned long block_start_pfn;	/* start of current pageblock */
>  	unsigned long block_end_pfn;	/* end of current pageblock */
>  	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
> +	unsigned long nr_blocks_scanned = 0; /* for periodical abort checks */
>  	int nr_freepages = cc->nr_freepages;
>  	struct list_head *freelist = &cc->freepages;
>  
> @@ -813,7 +814,7 @@ static void isolate_freepages(struct zone *zone,
>  		 * suitable migration targets, so periodically check if we need
>  		 * to schedule, or even abort async compaction.
>  		 */
> -		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
> +		if ((++nr_blocks_scanned % SWAP_CLUSTER_MAX) == 0
>  						&& compact_should_abort(cc))
>  			break;

This change actually makes the code worse, and the .o file gets larger.

For some stupid reason we went and made pageblock_nr_pages all lower
case, but surprise surprise, it's actually a literal constant.  So the
compiler does the multiplication at compile time and converts the
modulus operation into a bitwise AND.  Duh.
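
Concretely, assuming the typical configuration where both are compile-time
constants (e.g. SWAP_CLUSTER_MAX == 32 and pageblock_nr_pages == 512), the
original test

	if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
					&& compact_should_abort(cc))
		break;

is compiled as if it were

	if (!(block_start_pfn & (32UL * 512UL - 1))	/* i.e. & 0x3fff */
					&& compact_should_abort(cc))
		break;

so no runtime multiply or divide is emitted.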


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-13 10:00           ` Vlastimil Babka
@ 2014-05-22  2:49             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  2:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On Tue, 13 May 2014, Vlastimil Babka wrote:

> I wonder what about a process doing e.g. mmap() with MAP_POPULATE. It seems to
> me that it would get only MIGRATE_ASYNC here, right? Since gfp_mask would
> include __GFP_NO_KSWAPD and it won't have PF_KTHREAD.
> I think that goes against the idea that with MAP_POPULATE you say you are
> willing to wait to have everything in place before you actually use the
> memory. So I guess you are also willing to wait for hugepages in that
> situation?
> 

I don't understand the distinction you're making between MAP_POPULATE and 
simply prefaulting the anon memory.  What is the difference in semantics 
between using MAP_POPULATE and touching a byte at every page-sized step along 
the range?  In the latter, you'd be faulting thp with MIGRATE_ASYNC, so I 
don't understand how MAP_POPULATE is any different or implies any 
preference for hugepages.
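
(For concreteness, a minimal user-space sketch of that "touch a byte per
page" prefault; the prefault() helper name is made up here and addr is
assumed to come from an anonymous mmap():)

	#include <unistd.h>

	static void prefault(char *addr, size_t len)
	{
		size_t page = (size_t)sysconf(_SC_PAGESIZE);
		size_t off;

		/* Write-fault one byte in every page of the range. */
		for (off = 0; off < len; off += page)
			((volatile char *)addr)[off] = 0;
	}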

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH v2 2/2] mm/compaction: avoid rescanning pageblocks in isolate_freepages
  2014-05-19 10:14             ` Vlastimil Babka
@ 2014-05-22  2:51               ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  2:51 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Naoya Horiguchi, Christoph Lameter,
	Rik van Riel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 814 bytes --]

On Mon, 19 May 2014, Vlastimil Babka wrote:

> Fix a (spurious) build warning:
> 
> mm/compaction.c:860:15: warning: ‘next_free_pfn’ may be used uninitialized in this function [-Wmaybe-uninitialized]
> 
> Seems like the compiler cannot prove that exiting the for loop without updating
> next_free_pfn there will mean that the check for crossing the scanners will
> trigger. So let's not confuse people who try to see why this warning occurs.
> 
> Instead of initializing next_free_pfn to zero with an explaining comment, just
> drop the damned variable altogether and work with cc->free_pfn directly as
> Nayoa originally suggested.
> 

s/Nayoa/Naoya/

> Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* compaction is still too expensive for thp (was: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention)
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-22  3:20               ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  3:20 UTC (permalink / raw)
  To: Vlastimil Babka, Mel Gorman, Andrew Morton
  Cc: Joonsoo Kim, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel

On Fri, 16 May 2014, Vlastimil Babka wrote:

> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.
> 
> First, compaction also calls directly cond_resched() in both scanners when no
> lock is yet taken. This call either does not abort async compaction, or set
> cc->contended appropriately. This patch introduces a new compact_should_abort()
> function to achieve both. In isolate_freepages(), the check frequency is
> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
> scanner does in the preliminary page checks. In case a pageblock is found
> suitable for calling isolate_freepages_block(), the checks within there are
> done on higher frequency.
> 
> Second, isolate_freepages() does not check if isolate_freepages_block()
> aborted due to contention, and advances to the next pageblock. This violates
> the principle of aborting on contention, and might result in pageblocks not
> being scanned completely, since the scanning cursor is advanced. This patch
> makes isolate_freepages_block() check the cc->contended flag and abort.
> 
> In case isolate_freepages() has already isolated some pages before aborting
> due to contention, page migration will proceed, which is OK since we do not
> want to waste the work that has been done, and page migration has own checks
> for contention. However, we do not want another isolation attempt by either
> of the scanners, so cc->contended flag check is added also to
> compaction_alloc() and compact_finished() to make sure compaction is aborted
> right after the migration.
> 

We have a pretty significant problem with async compaction related to thp 
faults and it's not limited to this patch but was intended to be addressed 
in my series as well.  Since this is the latest patch to be proposed for 
aborting async compaction when it's too expensive, it's probably a good 
idea to discuss it here.

With -mm, it turns out that while egregious thp fault latencies were 
reduced, faulting 64MB of memory backed by thp on a fragmented 128GB 
machine can result in latencies of 1-3s for the entire 64MB.  Collecting 
compaction stats from older kernels that give more insight into 
regressions, one such incident is as follows.

Baseline:
compact_blocks_moved 8181986
compact_pages_moved 6549560
compact_pagemigrate_failed 1612070
compact_stall 101959
compact_fail 100895
compact_success 1064

5s later:
compact_blocks_moved 8182447
compact_pages_moved 6550286
compact_pagemigrate_failed 1612092
compact_stall 102023
compact_fail 100959
compact_success 1064

This represents faulting two 64MB ranges of anonymous memory.  As you can 
see, it results in falling back to 4KB pages because all 64 hugepage faults 
end up triggering compaction and failing to allocate.  Over the 64 async 
compactions, we scan on average 7.2 pageblocks per call, successfully 
migrate 11.3 pages per call, and fail to migrate 0.34 pages per call.
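
(For reference, those per-call averages follow from the deltas between the
two snapshots above, assuming the 64 new stalls are the 64 hugepage faults:

	compact_stall:              102023 - 101959   =  64 calls
	compact_blocks_moved:       8182447 - 8181986 = 461  -> 461/64 ~= 7.2 blocks/call
	compact_pages_moved:        6550286 - 6549560 = 726  -> 726/64 ~= 11.3 pages/call
	compact_pagemigrate_failed: 1612092 - 1612070 =  22  ->  22/64 ~= 0.34 pages/call

and, with 2MB pageblocks, 128GB is 65536 pageblocks, so 65536 / 7.2 gives
the roughly 9103 calls mentioned below.)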

If each async compaction scans 7.2 pageblocks per call, it would have to 
be called 9103 times to scan all memory on this 128GB machine.  We're 
simply not scanning enough memory as a result of ISOLATE_ABORT due to 
need_resched().

So the net result is that -mm is much better than Linus's tree, where such 
faulting of 64MB ranges could stall 8-9s, but we're still very expensive.  
We may need to consider scanning more memory on a single call to async 
compaction even when need_resched(), and, if we are unsuccessful in 
allocating a hugepage, deferring async compaction in subsequent calls up to 
1 << COMPACT_MAX_DEFER_SHIFT times.  Today, we only defer for sync 
compaction, but that is now never done for thp faults since they rely 
solely on async compaction.

I have a few improvements in mind, but thought it would be better to 
get feedback on it first because it's a substantial rewrite of the 
pageblock migration:

 - For all async compaction, avoid migrating memory unless enough 
   contiguous memory is migrated to allow a cc->order allocation.  This
   would remove the COMPACT_CLUSTER_MAX restriction on pageblock
   compaction and keep pages on the cc->migratepages list between
   calls to isolate_migratepages_range().

   When an unmigratable page is encountered or memory hole is found,
   put all pages on cc->migratepages back on the lru lists unless
   cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
   enough contiguous memory has been isolated.

 - Remove the need_resched() checks entirely from compaction and
   consider only doing a set amount of scanning for each call, such
   as 1GB per call.

   If there is contention on zone->lru_lock, then we can still abort
   to avoid excessive stalls, but need_resched() is a poor heuristic
   to determine when async compaction is taking too long.

   The expense of calling async compaction if this is done is easily
   quantified since we're not migrating any memory unless it is
   sufficient for the page allocation: it would simply be the iteration
   over 1GB of memory and avoiding contention on zone->lru_lock.

We may also need to consider deferring async compaction for subsequent 
faults in the near future even though scanning the previous 1GB has no 
impact whatsoever on the success of defragmenting the next 1GB.

Any other suggestions that may be helpful?  
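
A minimal sketch of the first idea above (the helper names are hypothetical
and this is not an actual patch): keep isolated pages on cc->migratepages
across pageblocks and only hand them to migrate_pages() once a cc->order
allocation could be satisfied; when an unmigratable page or a hole is hit
with too few pages collected, put them back instead:

	/* Enough contiguous candidates collected for a cc->order allocation? */
	static bool batch_large_enough(struct compact_control *cc)
	{
		return cc->nr_migratepages >= (1UL << cc->order);
	}

	/* Called on hitting an unmigratable page or a memory hole. */
	static void putback_or_keep_batch(struct compact_control *cc)
	{
		if (!batch_large_enough(cc)) {
			/* Too few pages isolated: undo the isolation work. */
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
		}
		/* Otherwise the caller proceeds to migrate the batch as usual. */
	}

The second idea would similarly just bound the scan, e.g. by counting pfns
scanned and stopping once 1GB worth has been covered, aborting earlier only
on zone->lru_lock contention.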

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
  2014-05-22  3:20               ` David Rientjes
@ 2014-05-22  8:10                 ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22  8:10 UTC (permalink / raw)
  To: David Rientjes, Mel Gorman, Andrew Morton
  Cc: Joonsoo Kim, Hugh Dickins, Greg Thelen, linux-kernel, linux-mm,
	Minchan Kim, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel

On 05/22/2014 05:20 AM, David Rientjes wrote:
> On Fri, 16 May 2014, Vlastimil Babka wrote:
>
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>>
>> First, compaction also calls directly cond_resched() in both scanners when no
>> lock is yet taken. This call either does not abort async compaction, or set
>> cc->contended appropriately. This patch introduces a new compact_should_abort()
>> function to achieve both. In isolate_freepages(), the check frequency is
>> reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration
>> scanner does in the preliminary page checks. In case a pageblock is found
>> suitable for calling isolate_freepages_block(), the checks within there are
>> done on higher frequency.
>>
>> Second, isolate_freepages() does not check if isolate_freepages_block()
>> aborted due to contention, and advances to the next pageblock. This violates
>> the principle of aborting on contention, and might result in pageblocks not
>> being scanned completely, since the scanning cursor is advanced. This patch
>> makes isolate_freepages_block() check the cc->contended flag and abort.
>>
>> In case isolate_freepages() has already isolated some pages before aborting
>> due to contention, page migration will proceed, which is OK since we do not
>> want to waste the work that has been done, and page migration has own checks
>> for contention. However, we do not want another isolation attempt by either
>> of the scanners, so cc->contended flag check is added also to
>> compaction_alloc() and compact_finished() to make sure compaction is aborted
>> right after the migration.
>>
>
> We have a pretty significant problem with async compaction related to thp
> faults and it's not limited to this patch but was intended to be addressed
> in my series as well.  Since this is the latest patch to be proposed for
> aborting async compaction when it's too expensive, it's probably a good
> idea to discuss it here.

I already tried to call for some higher-level discussion earlier in your 
series; good to hear that you now also agree it might be useful :)
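
(For readers skimming the thread: per the description above, the new helper 
boils down to roughly the following shape. This is a paraphrase, so the 
actual patch may differ in detail, e.g. in how the migration mode and the 
contended state are represented in struct compact_control.)

static inline bool compact_should_abort(struct compact_control *cc)
{
        if (need_resched()) {
                /* async compaction aborts and reports why to the caller */
                if (cc->mode == MIGRATE_ASYNC) {
                        cc->contended = true;
                        return true;
                }
                /* sync modes just reschedule and keep going */
                cond_resched();
        }
        return false;
}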

> With -mm, it turns out that while egregious thp fault latencies were
> reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
> machine can result in latencies of 1-3s for the entire 64MB.  Collecting
> compaction stats from older kernels that give more insight into
> regressions, one such incident is as follows.
>
> Baseline:
> compact_blocks_moved 8181986
> compact_pages_moved 6549560
> compact_pagemigrate_failed 1612070
> compact_stall 101959
> compact_fail 100895
> compact_success 1064
>
> 5s later:
> compact_blocks_moved 8182447
> compact_pages_moved 6550286
> compact_pagemigrate_failed 1612092
> compact_stall 102023
> compact_fail 100959
> compact_success 1064
>
> This represents faulting two 64MB ranges of anonymous memory.  As you can
> see, it results in falling back to 4KB pages because all 64 faults of
> hugepages ends up triggering compaction and failing to allocate.  Over the
> 64 async compactions, we scan on average 7.2 pageblocks per call,
> successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
> per call.
>
> If each async compaction scans 7.2 pageblocks per call, it would have to
> be called 9103 times to scan all memory on this 128GB machine.  We're
> simply not scanning enough memory as a result of ISOLATE_ABORT due to
> need_resched().
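
(For scale, assuming the usual 2MB pageblocks on x86: 128GB is 65536 
pageblocks, and 65536 / 7.2 is roughly the 9103 calls quoted above.)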

Well, the two objectives of not being expensive and at the same time 
scanning "enough memory" (which is hard to define as well) are clearly 
quite opposite :/

> So the net result is that -mm is much better than Linus's tree, where such
> faulting of 64MB ranges could stall 8-9s, but we're still very expensive.

So I guess the difference here is mainly thanks to not doing sync 
compaction? Or do you have any insight which patch helped the most?

> We may need to consider scanning more memory on a single call to async
> compaction even when need_resched() and if we are unsuccessful in
> allocating a hugepage to defer async compaction in subsequent calls up to
> 1 << COMPACT_MAX_DEFER_SHIFT.  Today, we defer on sync compaction but that
> is now never done for thp faults since it is reliant solely on async
> compaction.

So if I understand correctly, your intention is to scan more in a single 
scan, but balance the increased latencies by introducing deferring for 
async compaction.

Offhand I can think of two issues with that.

1) the result will be that often the latency will be low thanks to 
defer, but then there will be a huge (?) spike by scanning whole 1GB (as 
you suggest further in the mail) at once. I think that's similar to what 
you had now with the sync compaction?

2) 1GB could have a good chance of being successful (?) so there would 
be no defer anyway.

I have some other suggestion at the end of my mail.

> I have a few improvements in mind, but thought it would be better to
> get feedback on it first because it's a substantial rewrite of the
> pageblock migration:
>
>   - For all async compaction, avoid migrating memory unless enough
>     contiguous memory is migrated to allow a cc->order allocation.

Yes I suggested something like this earlier. Also in the scanner, skip 
to the next cc->order aligned block as soon as any page fails the 
isolation and is not PageBuddy.
I would just distinguish kswapd and direct compaction, not "all async 
compaction". Or maybe kswapd could be switched to sync compaction.

>     This
>     would remove the COMPACT_CLUSTER_MAX restriction on pageblock
>     compaction

Yes.

>     and keep pages on the cc->migratepages list between
>     calls to isolate_migratepages_range().

This might not be needed. It's called within a single pageblock (except 
maybe CMA but that's quite a different thing) and I think we can ignore 
order > pageblock_nr_order here.

>     When an unmigratable page is encountered or memory hole is found,
>     put all pages on cc->migratepages back on the lru lists unless
>     cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
>     enough contiguous memory has been isolated.
>
>   - Remove the need_resched() checks entirely from compaction and
>     consider only doing a set amount of scanning for each call, such
>     as 1GB per call.
>
>     If there is contention on zone->lru_lock, then we can still abort
>     to avoid excessive stalls, but need_resched() is a poor heuristic
>     to determine when async compaction is taking too long.

I tend to agree. I've also realized that because need_resched() leads to 
cc->contended = true, the direct reclaim and second compaction that would 
normally follow (used to be sync, now only in khugepaged) are skipped. 
need_resched() indeed seems unsuitable for this.

>     The expense of calling async compaction if this is done is easily
>     quantified since we're not migrating any memory unless it is
>     sufficient for the page allocation: it would simply be the iteration
>     over 1GB of memory and avoiding contention on zone->lru_lock.
>
> We may also need to consider deferring async compaction for subsequent
> faults in the near future even though scanning the previous 1GB does not
> decrease or have any impact whatsoever in the success of defragmenting the
> next 1GB.
>
> Any other suggestions that may be helpful?

I already suggested the idea of turning deferred compaction into a 
live-tuned amount of how much to scan, instead of doing a whole zone 
(as before) or an arbitrary amount like 1GB, with the hope of reducing the 
latency spikes. But I realize this would be tricky.

On a less concrete note, it might be worth revisiting the rules we use for 
deciding whether compaction is worth trying, and the decisions whether to 
continue or whether we believe the allocation will succeed. This relies 
heavily on watermark checks that also consider the order, and I found it 
somewhat fragile. For example, in the alloc slowpath -> direct compaction 
path, a check in compaction concludes that the allocation should succeed 
and finishes the compaction, and a few moments later the same check in the 
allocation will conclude that everything is fine up to order 8, but there 
probably isn't a page of order 9. In between, nr_free_pages has *increased*. 
I suspect it's because the watermark checking is racy and the counters 
drift, but I'm not sure yet.
However, this particular problem should be gone when I finish my series 
that would capture the page that compaction has just freed. But still, 
the decisions regarding compaction could be suboptimal.

Vlastimil


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm] mm, thp: avoid excessive compaction latency during fault fix
  2014-05-22  2:49             ` David Rientjes
@ 2014-05-22  8:43               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22  8:43 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, Joonsoo Kim,
	Greg Thelen, Hugh Dickins, linux-kernel, linux-mm

On 05/22/2014 04:49 AM, David Rientjes wrote:
> On Tue, 13 May 2014, Vlastimil Babka wrote:
>
>> I wonder what about a process doing e.g. mmap() with MAP_POPULATE. It seems to
>> me that it would get only MIGRATE_ASYNC here, right? Since gfp_mask would
>> include __GFP_NO_KSWAPD and it won't have PF_KTHREAD.
>> I think that goes against the idea that with MAP_POPULATE you say you are
>> willing to wait to have everything in place before you actually use the
>> memory. So I guess you are also willing to wait for hugepages in that
>> situation?
>>
>
> I don't understand the distinction you're making between MAP_POPULATE and
> simply a prefault of the anon memory.  What is the difference in semantics
> between using MAP_POPULATE and touching a byte every page size along the
> range?  In the latter, you'd be faulting thp with MIGRATE_ASYNC, so I
> don't understand how MAP_POPULATE is any different or implies any
> preference for hugepages.

Hm, OK. You're right that we cannot distinguish this from populating by 
touching the pages manually. Never mind then.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
  2014-05-22  8:10                 ` Vlastimil Babka
@ 2014-05-22  8:55                   ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-05-22  8:55 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Mel Gorman, Andrew Morton, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On Thu, 22 May 2014, Vlastimil Babka wrote:

> > With -mm, it turns out that while egregious thp fault latencies were
> > reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
> > machine can result in latencies of 1-3s for the entire 64MB.  Collecting
> > compaction stats from older kernels that give more insight into
> > regressions, one such incident is as follows.
> > 
> > Baseline:
> > compact_blocks_moved 8181986
> > compact_pages_moved 6549560
> > compact_pagemigrate_failed 1612070
> > compact_stall 101959
> > compact_fail 100895
> > compact_success 1064
> > 
> > 5s later:
> > compact_blocks_moved 8182447
> > compact_pages_moved 6550286
> > compact_pagemigrate_failed 1612092
> > compact_stall 102023
> > compact_fail 100959
> > compact_success 1064
> > 
> > This represents faulting two 64MB ranges of anonymous memory.  As you can
> > see, it results in falling back to 4KB pages because all 64 faults of
> > hugepages ends up triggering compaction and failing to allocate.  Over the
> > 64 async compactions, we scan on average 7.2 pageblocks per call,
> > successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
> > per call.
> > 
> > If each async compaction scans 7.2 pageblocks per call, it would have to
> > be called 9103 times to scan all memory on this 128GB machine.  We're
> > simply not scanning enough memory as a result of ISOLATE_ABORT due to
> > need_resched().
> 
> Well, the two objectives of not being expensive and at the same time scanning
> "enough memory" (which is hard to define as well) are clearly quite opposite
> :/
> 

Agreed.

> > So the net result is that -mm is much better than Linus's tree, where such
> > faulting of 64MB ranges could stall 8-9s, but we're still very expensive.
> 
> So I guess the difference here is mainly thanks to not doing sync compaction?

Not doing sync compaction for thp and caching the migration pfn for async 
so that it doesn't iterate over a ton of memory that may not be eligible 
for async compaction every time it is called.  But when we avoid sync 
compaction, we also lose deferred compaction.

> So if I understand correctly, your intention is to scan more in a single scan,

More will be scanned than the ~7 pageblocks per call to async compaction 
in the data that I presented, but the idea is also to reduce how expensive 
every pageblock scan is by avoiding needlessly migrating memory (and 
dealing with rmap locks) when it will not result in 2MB of contiguous 
memory for thp faults.

> but balance the increased latencies by introducing deferring for async
> compaction.
> 
> Offhand I can think of two issues with that.
> 
> 1) the result will be that often the latency will be low thanks to defer, but
> then there will be a huge (?) spike by scanning whole 1GB (as you suggest
> further in the mail) at once. I think that's similar to what you had now with
> the sync compaction?
> 

Not at all: with MIGRATE_SYNC_LIGHT before, there was no termination other 
than an entire scan of memory, so we were potentially scanning 128GB and 
failing if a thp could not be allocated.

If we are to avoid migrating memory needlessly that will not result in 
cc->order memory being allocated, then the cost should be relatively 
constant for a span of memory.  My 32GB system can iterate all memory with 
MIGRATE_ASYNC and no need_resched() aborts in ~530ms.

> 2) 1GB could have a good chance of being successful (?) so there would be no
> defer anyway.
> 

If we terminate early because order-9 is allocatable or we end up scanning 
the entire 1GB and the hugepage is allocated, then we have prevented 511 
other pagefaults in my testcase where faulting 64MB of memory with thp 
enabled can currently take 1-3s on a 128GB machine with fragmentation.  I 
think the deferral is unnecessary in such a case.

Are you suggesting we should try without the deferral first?

> > I have a few improvements in mind, but thought it would be better to
> > get feedback on it first because it's a substantial rewrite of the
> > pageblock migration:
> > 
> >   - For all async compaction, avoid migrating memory unless enough
> >     contiguous memory is migrated to allow a cc->order allocation.
> 
> Yes I suggested something like this earlier. Also in the scanner, skip to the
> next cc->order aligned block as soon as any page fails the isolation and is
> not PageBuddy.

Agreed.

> I would just dinstinguish kswapd and direct compaction, not "all async
> compaction". Or maybe kswapd could be switched to sync compaction.
> 

To generalize this, I'm thinking that it is pointless for async compaction 
to migrate memory in a contiguous span if it will not cause a cc->order 
page allocation to succeed.
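
Something along these lines, as a toy model of that policy (names invented, 
not a patch):

#include <stdbool.h>

struct block_scan {
        unsigned int order;             /* cc->order, e.g. 9 for a thp fault */
        unsigned long nr_isolated;      /* pages isolated so far in the block */
        bool doomed;                    /* hit a page that can never be moved or freed */
};

/*
 * Only start migration once a full cc->order span could actually be
 * produced; otherwise the isolated pages should just be put back.
 */
static bool worth_migrating(const struct block_scan *bs)
{
        if (bs->doomed)
                return false;
        return bs->nr_isolated >= (1UL << bs->order);
}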

> >     This
> >     would remove the COMPACT_CLUSTER_MAX restriction on pageblock
> >     compaction
> 
> Yes.
> 
> >     and keep pages on the cc->migratepages list between
> >     calls to isolate_migratepages_range().
> 
> This might not be needed. It's called within a single pageblock (except maybe
> CMA but that's quite a different thing) and I think we can ignore order >
> pageblock_nr_order here.
> 

Ok, I guess pageblocks within a zone are always pageblock_order aligned 
for all platforms, so if we encounter any memory that is neither migratable 
nor PageBuddy in a block where pageblock_order == HPAGE_PMD_ORDER, then we 
can abort that block immediately for thp faults.

> >     When an unmigratable page is encountered or memory hole is found,
> >     put all pages on cc->migratepages back on the lru lists unless
> >     cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
> >     enough contiguous memory has been isolated.
> > 
> >   - Remove the need_resched() checks entirely from compaction and
> >     consider only doing a set amount of scanning for each call, such
> >     as 1GB per call.
> > 
> >     If there is contention on zone->lru_lock, then we can still abort
> >     to avoid excessive stalls, but need_resched() is a poor heuristic
> >     to determine when async compaction is taking too long.
> 
> I tend to agree. I've also realized that because need_resched() leads to
> cc->contended = true, direct reclaim and second compaction that would normally
> follow (used to be sync, now only in hugepaged) is skipped. need_resched()
> seems to be indeed unsuitable for this.
> 

It's hard to replace with an alternative, though, to determine when enough 
is enough :)  1GB might be a sane starting point, and then we could try 
reclaim and avoid the second call to async compaction on failure.  I'm not 
sure if the deferral would be needed in this case or not.

> >     The expense of calling async compaction if this is done is easily
> >     quantified since we're not migrating any memory unless it is
> >     sufficient for the page allocation: it would simply be the iteration
> >     over 1GB of memory and avoiding contention on zone->lru_lock.
> > 
> > We may also need to consider deferring async compaction for subsequent
> > faults in the near future even though scanning the previous 1GB does not
> > decrease or have any impact whatsoever in the success of defragmenting the
> > next 1GB.
> > 
> > Any other suggestions that may be helpful?
> 
> I suggested already the idea of turning the deferred compaction into a
> live-tuned amount of how much to scan, instead of doing a whole zone (before)
> or an arbitrary amount like 1GB, with the hope of reducing the latency spikes.
> But I realize this would be tricky.
> 

By live-tuned, do you mean something that is tuned by the kernel over time 
depending on how successful compaction is or do you mean something that 
the user would alter?  If we are to go this route, I agree that we can 
allow the user to tune the 1GB.

> Even less concrete, it might be worth revisiting the rules we use for deciding
> if compaction is worth trying, and the decisions if to continue or we believe
> the allocation will succeed.

Ah, tuning compaction_suitable() might be another opportunity.  I'm 
wondering if we should be considering thp specially here since it's in the 
fault path.

> It relies heavily on watermark checks that also
> consider the order, and I found it somewhat fragile. For example, in the alloc
> slowpath -> direct compaction path, a check in compaction concludes that
> allocation should succeed and finishes the compaction, and few moments later
> the same check in the allocation will conclude that everything is fine up to
> order 8, but there probably isn't a page of order 9. In between, the
> nr_free_pages has *increased*. I suspect it's because the watermark checking
> is racy and the counters drift, but I'm not sure yet.

The watermark checks both in compaction_suitable() and 
compact_finished() are indeed racy with respect to the actual page 
allocation.
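
To illustrate what "considers the order" means here, a simplified model of 
the order-aware watermark check (loosely following __zone_watermark_ok(); 
reserves and other details omitted):

#include <stdbool.h>

static bool watermark_ok(unsigned long free_pages, unsigned long min,
                         const unsigned long *nr_free, int order)
{
        int o;

        if (free_pages <= min)
                return false;
        for (o = 0; o < order; o++) {
                /* pages of too low an order cannot satisfy this request */
                free_pages -= nr_free[o] << o;
                min >>= 1;
                if (free_pages <= min)
                        return false;
        }
        return true;
}

Because free_pages and the per-order nr_free[] counters are sampled 
independently and keep moving, two back-to-back callers can easily reach 
different conclusions, which is the raciness being described.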

> However, this particular problem should be gone when I finish my series that
> would capture the page that compaction has just freed. But still, the
> decisions regarding compaction could be suboptimal.
> 

This should also avoid the race between COMPACT_PARTIAL, returning to the 
page allocator, and finding that the high-order memory you thought was now 
available has been allocated by someone else.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
  2014-05-22  8:55                   ` David Rientjes
@ 2014-05-22 12:03                     ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22 12:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Andrew Morton, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On 05/22/2014 10:55 AM, David Rientjes wrote:
> On Thu, 22 May 2014, Vlastimil Babka wrote:
>
>>> With -mm, it turns out that while egregious thp fault latencies were
>>> reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
>>> machine can result in latencies of 1-3s for the entire 64MB.  Collecting
>>> compaction stats from older kernels that give more insight into
>>> regressions, one such incident is as follows.
>>>
>>> Baseline:
>>> compact_blocks_moved 8181986
>>> compact_pages_moved 6549560
>>> compact_pagemigrate_failed 1612070
>>> compact_stall 101959
>>> compact_fail 100895
>>> compact_success 1064
>>>
>>> 5s later:
>>> compact_blocks_moved 8182447
>>> compact_pages_moved 6550286
>>> compact_pagemigrate_failed 1612092
>>> compact_stall 102023
>>> compact_fail 100959
>>> compact_success 1064
>>>
>>> This represents faulting two 64MB ranges of anonymous memory.  As you can
>>> see, it results in falling back to 4KB pages because all 64 faults of
>>> hugepages ends up triggering compaction and failing to allocate.  Over the
>>> 64 async compactions, we scan on average 7.2 pageblocks per call,
>>> successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
>>> per call.
>>>
>>> If each async compaction scans 7.2 pageblocks per call, it would have to
>>> be called 9103 times to scan all memory on this 128GB machine.  We're
>>> simply not scanning enough memory as a result of ISOLATE_ABORT due to
>>> need_resched().
>>
>> Well, the two objectives of not being expensive and at the same time scanning
>> "enough memory" (which is hard to define as well) are clearly quite opposite
>> :/
>>
>
> Agreed.
>
>>> So the net result is that -mm is much better than Linus's tree, where such
>>> faulting of 64MB ranges could stall 8-9s, but we're still very expensive.
>>
>> So I guess the difference here is mainly thanks to not doing sync compaction?
>
> Not doing sync compaction for thp and caching the migration pfn for async
> so that it doesn't iterate over a ton of memory that may not be eligible
> for async compaction every time it is called.  But when we avoid sync
> compaction, we also lose deferred compaction.
>
>> So if I understand correctly, your intention is to scan more in a single scan,
>
> More will be scanned instead of ~7 pageblocks for every call to async
> compaction with the data that I presented but also reduce how expensive
> every pageblock scan is by avoiding needlessly migrating memory (and
> dealing with rmap locks) when it will not result in 2MB of contiguous
> memory for thp faults.
>
>> but balance the increased latencies by introducing deferring for async
>> compaction.
>>
>> Offhand I can think of two issues with that.
>>
>> 1) the result will be that often the latency will be low thanks to defer, but
>> then there will be a huge (?) spike by scanning whole 1GB (as you suggest
>> further in the mail) at once. I think that's similar to what you had now with
>> the sync compaction?
>>
>
> Not at all, with MIGRATE_SYNC_LIGHT before there is no termination other
> than an entire scan of memory so we were potentially scanning 128GB and
> failing if thp cannot be allocated.

OK.

> If we are to avoid migrating memory needlessly that will not result in
> cc->order memory being allocated, then the cost should be relatively
> constant for a span of memory.  My 32GB system can iterate all memory with
> MIGRATE_ASYNC and no need_resched() aborts in ~530ms.

OK

>> 2) 1GB could have a good chance of being successful (?) so there would be no
>> defer anyway.
>>
>
> If we terminate early because order-9 is allocatable or we end up scanning
> the entire 1GB and the hugepage is allocated, then we have prevented 511
> other pagefaults in my testcase where faulting 64MB of memory with thp
> enabled can currently take 1-3s on a 128GB machine with fragmentation.  I
> think the deferral is unnecessary in such a case.
>
> Are you suggesting we should try without the deferral first?

Might be an option, or a less aggressive back-off than the sync deferral, 
as it's limited to 1GB.

>>> I have a few improvements in mind, but thought it would be better to
>>> get feedback on it first because it's a substantial rewrite of the
>>> pageblock migration:
>>>
>>>    - For all async compaction, avoid migrating memory unless enough
>>>      contiguous memory is migrated to allow a cc->order allocation.
>>
>> Yes I suggested something like this earlier. Also in the scanner, skip to the
>> next cc->order aligned block as soon as any page fails the isolation and is
>> not PageBuddy.
>
> Agreed.
>
>> I would just dinstinguish kswapd and direct compaction, not "all async
>> compaction". Or maybe kswapd could be switched to sync compaction.
>>
>
> To generalize this, I'm thinking that it is pointless for async compaction
> to migrate memory in a contiguous span if it will not cause a cc->order
> page allocation to succeed.

Well, I think it's pointless for page faults and maybe khugepaged. But 
there should still be some daemon trying to migrate even pages that do 
not immediately lead to a contiguous block of some order, as a general 
incremental defragmentation. For example, I believe (and hope to analyze 
and improve that eventually) that MOVABLE pages allocated in 
UNMOVABLE/RECLAIMABLE pageblocks as a fallback should be migrated to more 
appropriate pageblocks when possible, so that UNMOVABLE/RECLAIMABLE 
pageblocks have space for the pages they are intended for, and those 
allocations don't have to fall back and pollute MOVABLE pageblocks, which 
is much worse.

So if we say that "async compaction" now means it won't do anything that 
does not lead to contiguous space immediately, then kswapd should switch 
to sync compaction IMHO (and I'm still not sure about khugepaged).
Or maybe we shouldn't conflate the two things - one being the 
original migration mode, the other being "when to skip pages or give up" 
mode.
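
Roughly, keeping the two knobs separate might look like this (sketch only, 
names invented):

enum migrate_mode_sketch { ASYNC, SYNC_LIGHT, SYNC };

struct compact_control_sketch {
        enum migrate_mode_sketch mode;  /* how hard migration itself may try */
        bool contig_only;               /* may the scanners give up on blocks that
                                           cannot immediately form a cc->order page? */
};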

>>>      This
>>>      would remove the COMPACT_CLUSTER_MAX restriction on pageblock
>>>      compaction
>>
>> Yes.
>>
>>>      and keep pages on the cc->migratepages list between
>>>      calls to isolate_migratepages_range().
>>
>> This might not be needed. It's called within a single pageblock (except maybe
>> CMA but that's quite a different thing) and I think we can ignore order >
>> pageblock_nr_order here.
>>
>
> Ok, I guess pageblocks within a zone are always pageblock_order aligned
> for all platforms so if we encounter any non-migratable (or PageBuddy)
> memory in a block where pageblock_order == HPAGE_PMD_NR, then we can abort
> that block immediately for thp faults.

Yeah I believe they are aligned, otherwise it would be quite a mess :)

>>>      When an unmigratable page is encountered or memory hole is found,
>>>      put all pages on cc->migratepages back on the lru lists unless
>>>      cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
>>>      enough contiguous memory has been isolated.
>>>
>>>    - Remove the need_resched() checks entirely from compaction and
>>>      consider only doing a set amount of scanning for each call, such
>>>      as 1GB per call.
>>>
>>>      If there is contention on zone->lru_lock, then we can still abort
>>>      to avoid excessive stalls, but need_resched() is a poor heuristic
>>>      to determine when async compaction is taking too long.
>>
>> I tend to agree. I've also realized that because need_resched() leads to
>> cc->contended = true, direct reclaim and second compaction that would normally
>> follow (used to be sync, now only in hugepaged) is skipped. need_resched()
>> seems to be indeed unsuitable for this.
>>
>
> It's hard to replace with an alternative, though, to determine when enough
> is enough :)  1GB might be a sane starting point, though, and then try
> reclaim and avoid the second call to async compaction on failure.  I'm not
> sure if the deferral would be needed in this case or not.

I think the second call to compaction is there on the assumption that 
direct reclaim has freed some memory which will allow compaction to 
succeed. I doubt reclaim itself has a chance of freeing a THP-order 
page. But yeah the heuristic is weak since we might not scan in the same 
1GB area as where reclaim helped...

>>>      The expense of calling async compaction if this is done is easily
>>>      quantified since we're not migrating any memory unless it is
>>>      sufficient for the page allocation: it would simply be the iteration
>>>      over 1GB of memory and avoiding contention on zone->lru_lock.
>>>
>>> We may also need to consider deferring async compaction for subsequent
>>> faults in the near future even though scanning the previous 1GB does not
>>> decrease or have any impact whatsoever in the success of defragmenting the
>>> next 1GB.
>>>
>>> Any other suggestions that may be helpful?
>>
>> I suggested already the idea of turning the deferred compaction into a
>> live-tuned amount of how much to scan, instead of doing a whole zone (before)
>> or an arbitrary amount like 1GB, with the hope of reducing the latency spikes.
>> But I realize this would be tricky.
>>
>
> By live-tuned, do you mean something that is tuned by the kernel over time
> depending on how successful compaction is or do you mean something that
> the user would alter?  If we are to go this route, I agree that we can
> allow the user to tune the 1GB.

Actually I meant automatically tuned by the kernel depending on the 
success, but maybe we could allow the user to alter the aggressiveness.

But I guess something like 1GB and no deferral would be a first step, and 
we could see how it works. Also, maybe we could be more precise by limiting 
not by memory size but by the number of pages visited by the scanner(s). 
E.g. scanning 1GB a whole pageblock at a time (because each block is 
already a THP, or of the wrong migratetype) is obviously a much smaller 
cost than visiting all the pages.
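
A sketch of how such a "pages visited" budget could be accounted (numbers 
and names invented):

#define SCAN_BUDGET     (256UL * 1024)  /* page visits allowed per compaction call */

struct scan_budget {
        unsigned long visited;
};

/*
 * A pageblock skipped wholesale (already a THP, wrong migratetype) charges
 * a single visit; a pageblock that is actually walked charges one visit per
 * page (512 on x86), so the budget tracks real work rather than raw span.
 */
static int scan_charge(struct scan_budget *sb, unsigned long nr_visits)
{
        sb->visited += nr_visits;
        return sb->visited >= SCAN_BUDGET;      /* nonzero: stop this call */
}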

>> Even less concrete, it might be worth revisiting the rules we use for deciding
>> if compaction is worth trying, and the decisions if to continue or we believe
>> the allocation will succeed.
>
> Ah, tuning compaction_suitable() might be another opportunity.  I'm
> wondering if we should be considering thp specially here since it's in the
> fault path.

Maybe, but how?

>> It relies heavily on watermark checks that also
>> consider the order, and I found it somewhat fragile. For example, in the alloc
>> slowpath -> direct compaction path, a check in compaction concludes that
>> allocation should succeed and finishes the compaction, and few moments later
>> the same check in the allocation will conclude that everything is fine up to
>> order 8, but there probably isn't a page of order 9. In between, the
>> nr_free_pages has *increased*. I suspect it's because the watermark checking
>> is racy and the counters drift, but I'm not sure yet.
>
> The watermark checks both in compaction_suitable() and and
> compact_finished() are indeed racy with respect to the actual page
> allocation.
>
>> However, this particular problem should be gone when I finish my series that
>> would capture the page that compaction has just freed. But still, the
>> decisions regarding compaction could be suboptimal.
>>
>
> This should also avoid the race between COMPACT_PARTIAL, returning to the
> page allocator, and finding that the high-order memory you thought was now
> available has been allocated by someone else.

Yeah, I'll try to finish this attempt soon. The idea was also to avoid 
racing with parallel allocations during the compaction itself, by using 
memory isolation on the pageblock (similar to what CMA is doing). But 
that has to be done before even attempting to migrate-scan the 
pageblock, so it might add useless latency for a pageblock that will 
then be skipped by the async compaction. But capturing the page as soon 
as it becomes available and holding on to it all the way back from 
compaction to the alloc slowpath would be an improvement by itself. The 
isolation could then be added to the cases where latencies are not 
critical, such as khugepaged.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: compaction is still too expensive for thp
@ 2014-05-22 12:03                     ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-22 12:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Mel Gorman, Andrew Morton, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel

On 05/22/2014 10:55 AM, David Rientjes wrote:
> On Thu, 22 May 2014, Vlastimil Babka wrote:
>
>>> With -mm, it turns out that while egregious thp fault latencies were
>>> reduced, faulting 64MB of memory backed by thp on a fragmented 128GB
>>> machine can result in latencies of 1-3s for the entire 64MB.  Collecting
>>> compaction stats from older kernels that give more insight into
>>> regressions, one such incident is as follows.
>>>
>>> Baseline:
>>> compact_blocks_moved 8181986
>>> compact_pages_moved 6549560
>>> compact_pagemigrate_failed 1612070
>>> compact_stall 101959
>>> compact_fail 100895
>>> compact_success 1064
>>>
>>> 5s later:
>>> compact_blocks_moved 8182447
>>> compact_pages_moved 6550286
>>> compact_pagemigrate_failed 1612092
>>> compact_stall 102023
>>> compact_fail 100959
>>> compact_success 1064
>>>
>>> This represents faulting two 64MB ranges of anonymous memory.  As you can
>>> see, it results in falling back to 4KB pages because all 64 faults of
>>> hugepages ends up triggering compaction and failing to allocate.  Over the
>>> 64 async compactions, we scan on average 7.2 pageblocks per call,
>>> successfully migrate 11.3 pages per call, and fail migrating 0.34 pages
>>> per call.
>>>
>>> If each async compaction scans 7.2 pageblocks per call, it would have to
>>> be called 9103 times to scan all memory on this 128GB machine.  We're
>>> simply not scanning enough memory as a result of ISOLATE_ABORT due to
>>> need_resched().
>>
>> Well, the two objectives of not being expensive and at the same time scanning
>> "enough memory" (which is hard to define as well) are clearly quite opposite
>> :/
>>
>
> Agreed.
>
>>> So the net result is that -mm is much better than Linus's tree, where such
>>> faulting of 64MB ranges could stall 8-9s, but we're still very expensive.
>>
>> So I guess the difference here is mainly thanks to not doing sync compaction?
>
> Not doing sync compaction for thp and caching the migration pfn for async
> so that it doesn't iterate over a ton of memory that may not be eligible
> for async compaction every time it is called.  But when we avoid sync
> compaction, we also lose deferred compaction.
>
>> So if I understand correctly, your intention is to scan more in a single scan,
>
> More will be scanned instead of ~7 pageblocks for every call to async
> compaction with the data that I presented but also reduce how expensive
> every pageblock scan is by avoiding needlessly migrating memory (and
> dealing with rmap locks) when it will not result in 2MB of contiguous
> memory for thp faults.
>
>> but balance the increased latencies by introducing deferring for async
>> compaction.
>>
>> Offhand I can think of two issues with that.
>>
>> 1) the result will be that often the latency will be low thanks to defer, but
>> then there will be a huge (?) spike by scanning whole 1GB (as you suggest
>> further in the mail) at once. I think that's similar to what you had now with
>> the sync compaction?
>>
>
> Not at all, with MIGRATE_SYNC_LIGHT before there is no termination other
> than an entire scan of memory so we were potentially scanning 128GB and
> failing if thp cannot be allocated.

OK.

> If we are to avoid migrating memory needlessly that will not result in
> cc->order memory being allocated, then the cost should be relatively
> constant for a span of memory.  My 32GB system can iterate all memory with
> MIGRATE_ASYNC and no need_resched() aborts in ~530ms.

OK

>> 2) 1GB could have a good chance of being successful (?) so there would be no
>> defer anyway.
>>
>
> If we terminate early because order-9 is allocatable or we end up scanning
> the entire 1GB and the hugepage is allocated, then we have prevented 511
> other pagefaults in my testcase where faulting 64MB of memory with thp
> enabled can currently take 1-3s on a 128GB machine with fragmentation.  I
> think the deferral is unnecessary in such a case.
>
> Are you suggesting we should try without the deferral first?

Might be an option, or less aggressive back off than the sync deferral, 
as it's limited to 1GB.

>>> I have a few improvements in mind, but thought it would be better to
>>> get feedback on it first because it's a substantial rewrite of the
>>> pageblock migration:
>>>
>>>    - For all async compaction, avoid migrating memory unless enough
>>>      contiguous memory is migrated to allow a cc->order allocation.
>>
>> Yes I suggested something like this earlier. Also in the scanner, skip to the
>> next cc->order aligned block as soon as any page fails the isolation and is
>> not PageBuddy.
>
> Agreed.
>
>> I would just distinguish kswapd and direct compaction, not "all async
>> compaction". Or maybe kswapd could be switched to sync compaction.
>>
>
> To generalize this, I'm thinking that it is pointless for async compaction
> to migrate memory in a contiguous span if it will not cause a cc->order
> page allocation to succeed.

Well I think it's pointless for page faults and maybe khugepaged. But 
there should still be some daemon trying to migrate even pages that do 
not immediately lead to a contiguous block of some order, as a general 
incremental defragmentation. For example I believe (and hope to analyze 
and improve that eventually) that MOVABLE pages allocated in 
UNMOVABLE/RECLAIMABLE pageblocks as a fallback should be migrated to 
more appropriate pageblocks when possible, so that 
UNMOVABLE/RECLAIMABLE pageblocks have space for the pages they are 
intended for, and those allocations don't have to fall back and pollute 
MOVABLE pageblocks, which is much worse.

So if we say that "async compaction" now means it won't do anything that 
does not lead to contiguous space immediately, then kswapd should switch 
to sync compaction IMHO (and I'm still not sure about khugepaged).
Or maybe we shouldn't conflate the two things - one being the original 
migration mode, the other being the "when to skip pages or give up" 
mode.
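
To illustrate the "skip anything that cannot become a contiguous block" 
idea for the migrate scanner - a rough sketch only, with a made-up helper 
name, not the actual patch:

	/*
	 * Once any page in a cc->order aligned chunk fails isolation and is
	 * not a free (PageBuddy) page, nothing in that chunk can become part
	 * of a cc->order buddy page, so the migrate scanner may as well jump
	 * to the start of the next aligned chunk.
	 */
	static unsigned long next_order_aligned_pfn(unsigned long pfn, int order)
	{
		return ALIGN(pfn + 1, 1UL << order);
	}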

>>>      This
>>>      would remove the COMPACT_CLUSTER_MAX restriction on pageblock
>>>      compaction
>>
>> Yes.
>>
>>>      and keep pages on the cc->migratepages list between
>>>      calls to isolate_migratepages_range().
>>
>> This might not be needed. It's called within a single pageblock (except maybe
>> CMA, but that's quite a different thing) and I think we can ignore order >
>> pageblock_order here.
>>
>
> Ok, I guess pageblocks within a zone are always pageblock_order aligned
> on all platforms, so if we encounter any non-migratable (or PageBuddy)
> memory in a block where pageblock_order == HPAGE_PMD_ORDER, then we can
> abort that block immediately for thp faults.

Yeah I believe they are aligned, otherwise it would be quite a mess :)

>>>      When an unmigratable page is encountered or memory hole is found,
>>>      put all pages on cc->migratepages back on the lru lists unless
>>>      cc->nr_migratepages >= (1 << cc->order).  Otherwise, migrate when
>>>      enough contiguous memory has been isolated.
>>>
>>>    - Remove the need_resched() checks entirely from compaction and
>>>      consider only doing a set amount of scanning for each call, such
>>>      as 1GB per call.
>>>
>>>      If there is contention on zone->lru_lock, then we can still abort
>>>      to avoid excessive stalls, but need_resched() is a poor heuristic
>>>      to determine when async compaction is taking too long.
>>
>> I tend to agree. I've also realized that because need_resched() leads to
>> cc->contended = true, the direct reclaim and second compaction that would
>> normally follow (used to be sync, now only in khugepaged) are skipped.
>> need_resched() indeed seems to be unsuitable for this.
>>
>
> It's hard to find an alternative, though, that determines when enough
> is enough :)  1GB might be a sane starting point; then try
> reclaim and avoid the second call to async compaction on failure.  I'm not
> sure whether the deferral would be needed in this case or not.

I think the second call to compaction is there on the assumption that 
direct reclaim has freed some memory which will allow compaction to 
succeed. I doubt reclaim itself has a chance of freeing a THP-order 
page. But yeah the heuristic is weak since we might not scan in the same 
1GB area as where reclaim helped...
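
As a sketch of what a fixed scan budget could look like in place of the 
need_resched() heuristic (the constant and the pages_scanned counter are 
hypothetical, for illustration only):

	/* hypothetical: abort async compaction after ~1GB worth of pages visited */
	#define COMPACT_SCAN_BUDGET	(SZ_1G >> PAGE_SHIFT)

	static bool compact_scan_budget_exceeded(struct compact_control *cc)
	{
		/* cc->pages_scanned would be bumped by both scanners */
		return cc->mode == MIGRATE_ASYNC &&
		       cc->pages_scanned >= COMPACT_SCAN_BUDGET;
	}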

>>>      The expense of calling async compaction if this is done is easily
>>>      quantified since we're not migrating any memory unless it is
>>>      sufficient for the page allocation: it would simply be the iteration
>>>      over 1GB of memory and avoiding contention on zone->lru_lock.
>>>
>>> We may also need to consider deferring async compaction for subsequent
>>> faults in the near future even though scanning the previous 1GB does not
>>> decrease or have any impact whatsoever on the success of defragmenting the
>>> next 1GB.
>>>
>>> Any other suggestions that may be helpful?
>>
>> I already suggested the idea of turning deferred compaction into a
>> live-tuned amount of how much to scan, instead of doing a whole zone (as
>> before) or an arbitrary amount like 1GB, in the hope of reducing the latency
>> spikes.  But I realize this would be tricky.
>>
>
> By live-tuned, do you mean something that is tuned by the kernel over time
> depending on how successful compaction is or do you mean something that
> the user would alter?  If we are to go this route, I agree that we can
> allow the user to tune the 1GB.

Actually I meant automatically tuned by the kernel depending on the 
success, but maybe we could allow the user to alter the aggressiveness.

But I guess something like 1GB and no deferral would be a good first 
step, and we can see how it works. Also maybe we could be more precise 
by limiting not by memory size, but by the number of pages visited by 
the scanner(s), because e.g. scanning 1GB a whole pageblock per 
iteration (because it's already a THP, or of the wrong migratetype) is 
obviously a much smaller cost than visiting all the pages.

>> Even less concrete, it might be worth revisiting the rules we use for deciding
>> if compaction is worth trying, and the decisions if to continue or we believe
>> the allocation will succeed.
>
> Ah, tuning compaction_suitable() might be another opportunity.  I'm
> wondering if we should be considering thp specially here since it's in the
> fault path.

Maybe, but how?

>> It relies heavily on watermark checks that also
>> consider the order, and I found it somewhat fragile. For example, in the alloc
>> slowpath -> direct compaction path, a check in compaction concludes that
>> allocation should succeed and finishes the compaction, and a few moments later
>> the same check in the allocation will conclude that everything is fine up to
>> order 8, but there probably isn't a page of order 9. In between, the
>> nr_free_pages has *increased*. I suspect it's because the watermark checking
>> is racy and the counters drift, but I'm not sure yet.
>
> The watermark checks both in compaction_suitable() and
> compact_finished() are indeed racy with respect to the actual page
> allocation.
>
>> However, this particular problem should be gone when I finish my series that
>> would capture the page that compaction has just freed. But still, the
>> decisions regarding compaction could be suboptimal.
>>
>
> This should also avoid the race between COMPACT_PARTIAL, returning to the
> page allocator, and finding that the high-order memory you thought was now
> available has been allocated by someone else.

Yeah, I'll try to finish this attempt soon. The idea was also to avoid 
racing with parallel allocations during the compaction itself, by using 
memory isolation on the pageblock (similar to what CMA does). But 
that has to be done before even attempting to migrate-scan the 
pageblock, so it might add useless latency for a pageblock that will 
then be skipped by the async compaction. But capturing the page as soon 
as it becomes available and holding on to it all the way back from 
compaction to the alloc slowpath would be an improvement by itself. The 
isolation could then be added to the cases where latencies are not 
critical, such as khugepaged.
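
A very rough sketch of the capture idea described above; the field and 
helper are invented here purely for illustration and are not from any 
posted series:

	/* hypothetical field in struct compact_control: struct page *captured; */
	static void compact_keep_page(struct compact_control *cc, struct page *page)
	{
		/*
		 * Illustration only: instead of merging the assembled
		 * high-order page back into the free lists (and racing with
		 * other allocators), remember it so it can be handed straight
		 * back to the allocation slowpath.
		 */
		if (!cc->captured)
			cc->captured = page;
	}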


* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-16  9:47             ` Vlastimil Babka
@ 2014-05-22 23:49               ` Kevin Hilman
  -1 siblings, 0 replies; 267+ messages in thread
From: Kevin Hilman @ 2014-05-22 23:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Joonsoo Kim, Andrew Morton, David Rientjes, Hugh Dickins,
	Greg Thelen, LKML, linux-mm, Minchan Kim, Mel Gorman,
	Bartlomiej Zolnierkiewicz, Michal Nazarewicz, Christoph Lameter,
	Rik van Riel, Olof Johansson, Stephen Warren

On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
> Compaction uses compact_checklock_irqsave() function to periodically check for
> lock contention and need_resched() to either abort async compaction, or to
> free the lock, schedule and retake the lock. When aborting, cc->contended is
> set to signal the contended state to the caller. Two problems have been
> identified in this mechanism.

This patch (or later version) has hit next-20140522 (in the form
commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
bisect, appears to be the culprit of several boot failures on ARM
platforms.

Unfortunately, there isn't much useful in the logs of the boot
failures/hangs since they mostly silently hang.  However, on one
platform (Marvell Armada 370 Mirabox), it reports a failure to
allocate memory, and the RCU stall detection kicks in:

[    1.298234] xhci_hcd 0000:02:00.0: xHCI Host Controller
[    1.303485] xhci_hcd 0000:02:00.0: new USB bus registered, assigned
bus number 1
[    1.310966] xhci_hcd 0000:02:00.0: Couldn't initialize memory
[   22.245395] INFO: rcu_sched detected stalls on CPUs/tasks: {}
(detected by 0, t=2102 jiffies, g=-282, c=-283, q=16)
[   22.255886] INFO: Stall ended before state dump start
[   48.095396] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s!
[swapper/0:1]

Reverting this commit makes them all happy again.

Kevin


* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-22 23:49               ` Kevin Hilman
  (?)
@ 2014-05-23  2:48                 ` Shawn Guo
  -1 siblings, 0 replies; 267+ messages in thread
From: Shawn Guo @ 2014-05-23  2:48 UTC (permalink / raw)
  To: Kevin Hilman
  Cc: Vlastimil Babka, Joonsoo Kim, Andrew Morton, David Rientjes,
	Hugh Dickins, Greg Thelen, LKML, linux-mm, Minchan Kim,
	Mel Gorman, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel, Olof Johansson, Stephen Warren,
	linux-arm-kernel

On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>> Compaction uses compact_checklock_irqsave() function to periodically check for
>> lock contention and need_resched() to either abort async compaction, or to
>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>> set to signal the contended state to the caller. Two problems have been
>> identified in this mechanism.
>
> This patch (or later version) has hit next-20140522 (in the form
> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
> bisect, appears to be the culprit of several boot failures on ARM
> platforms.

On i.MX6 where CMA is enabled, the commit causes the drivers calling
dma_alloc_coherent() to fail to probe.  Tracing it a little bit, it seems
dma_alloc_from_contiguous() always returns a NULL page after this
commit.

Shawn


* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  2:48                 ` Shawn Guo
  (?)
@ 2014-05-23  8:34                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-05-23  8:34 UTC (permalink / raw)
  To: Shawn Guo, Kevin Hilman, Andrew Morton
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen, LKML,
	linux-mm, Minchan Kim, Mel Gorman, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel,
	Olof Johansson, Stephen Warren, linux-arm-kernel

On 05/23/2014 04:48 AM, Shawn Guo wrote:
> On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
>> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>>> Compaction uses compact_checklock_irqsave() function to periodically check for
>>> lock contention and need_resched() to either abort async compaction, or to
>>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>>> set to signal the contended state to the caller. Two problems have been
>>> identified in this mechanism.
>>
>> This patch (or later version) has hit next-20140522 (in the form
>> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
>> bisect, appears to be the culprit of several boot failures on ARM
>> platforms.
> 
> On i.MX6 where CMA is enabled, the commit causes the drivers calling
> dma_alloc_coherent() to fail to probe.  Tracing it a little bit, it seems
> dma_alloc_from_contiguous() always returns a NULL page after this
> commit.
> 
> Shawn
> 

Really sorry, guys :/

-----8<-----
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 23 May 2014 10:18:56 +0200
Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2

Step 1: Change function name and comment between v1 and v2 so that the return
        value signals the opposite thing.
Step 2: Change the call sites to reflect the opposite return value.
Step 3: ???
Step 4: Make a complete fool of yourself.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/compaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index a525cd4..5175019 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -237,13 +237,13 @@ static inline bool compact_should_abort(struct compact_control *cc)
 	if (need_resched()) {
 		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
-			return false;
+			return true;
 		}
 
 		cond_resched();
 	}
 
-	return true;
+	return false;
 }
 
 /* Returns true if the page is within a block suitable for migration to */
-- 
1.8.4.5




* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-05-23 10:49                     ` Shawn Guo
  -1 siblings, 0 replies; 267+ messages in thread
From: Shawn Guo @ 2014-05-23 10:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Kevin Hilman, Andrew Morton, Joonsoo Kim, David Rientjes,
	Hugh Dickins, Greg Thelen, LKML, linux-mm, Minchan Kim,
	Mel Gorman, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel, Olof Johansson, Stephen Warren,
	linux-arm-kernel

On Fri, May 23, 2014 at 10:34:55AM +0200, Vlastimil Babka wrote:
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
> 
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Tested-by: Shawn Guo <shawn.guo@linaro.org>

> ---
>  mm/compaction.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index a525cd4..5175019 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -237,13 +237,13 @@ static inline bool compact_should_abort(struct compact_control *cc)
>  	if (need_resched()) {
>  		if (cc->mode == MIGRATE_ASYNC) {
>  			cc->contended = true;
> -			return false;
> +			return true;
>  		}
>  
>  		cond_resched();
>  	}
>  
> -	return true;
> +	return false;
>  }
>  
>  /* Returns true if the page is within a block suitable for migration to */
> -- 
> 1.8.4.5
> 
> 


* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-05-23 15:07                     ` Kevin Hilman
  -1 siblings, 0 replies; 267+ messages in thread
From: Kevin Hilman @ 2014-05-23 15:07 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Shawn Guo, Andrew Morton, Joonsoo Kim, David Rientjes,
	Hugh Dickins, Greg Thelen, LKML, linux-mm, Minchan Kim,
	Mel Gorman, Bartlomiej Zolnierkiewicz, Michal Nazarewicz,
	Christoph Lameter, Rik van Riel, Olof Johansson, Stephen Warren,
	linux-arm-kernel

Vlastimil Babka <vbabka@suse.cz> writes:

> On 05/23/2014 04:48 AM, Shawn Guo wrote:
>> On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
>>> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>>>> Compaction uses compact_checklock_irqsave() function to periodically check for
>>>> lock contention and need_resched() to either abort async compaction, or to
>>>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>>>> set to signal the contended state to the caller. Two problems have been
>>>> identified in this mechanism.
>>>
>>> This patch (or later version) has hit next-20140522 (in the form
>>> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
>>> bisect, appears to be the culprit of several boot failures on ARM
>>> platforms.
>> 
>> On i.MX6 where CMA is enabled, the commit causes the drivers calling
>> dma_alloc_coherent() to fail to probe.  Tracing it a little bit, it seems
>> dma_alloc_from_contiguous() always returns a NULL page after this
>> commit.
>> 
>> Shawn
>> 
>
> Really sorry, guys :/
>
> -----8<-----
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
>
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Tested-by: Kevin Hilman <khilman@linaro.org>

I verified that this fixes the boot failures I've seen on ARM (i.MX6 and
Marvell Armada 370).

Thanks for the quick fix.

Kevin


* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-05-30 16:59                     ` Stephen Warren
  -1 siblings, 0 replies; 267+ messages in thread
From: Stephen Warren @ 2014-05-30 16:59 UTC (permalink / raw)
  To: Vlastimil Babka, Shawn Guo, Kevin Hilman, Andrew Morton
  Cc: Joonsoo Kim, David Rientjes, Hugh Dickins, Greg Thelen, LKML,
	linux-mm, Minchan Kim, Mel Gorman, Bartlomiej Zolnierkiewicz,
	Michal Nazarewicz, Christoph Lameter, Rik van Riel,
	Olof Johansson, linux-arm-kernel

On 05/23/2014 02:34 AM, Vlastimil Babka wrote:
> On 05/23/2014 04:48 AM, Shawn Guo wrote:
>> On 23 May 2014 07:49, Kevin Hilman <khilman@linaro.org> wrote:
>>> On Fri, May 16, 2014 at 2:47 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
>>>> Compaction uses compact_checklock_irqsave() function to periodically check for
>>>> lock contention and need_resched() to either abort async compaction, or to
>>>> free the lock, schedule and retake the lock. When aborting, cc->contended is
>>>> set to signal the contended state to the caller. Two problems have been
>>>> identified in this mechanism.
>>>
>>> This patch (or later version) has hit next-20140522 (in the form
>>> commit 645ceea9331bfd851bc21eea456dda27862a10f4) and according to my
>>> bisect, appears to be the culprit of several boot failures on ARM
>>> platforms.
>>
>> On i.MX6 where CMA is enabled, the commit causes the drivers calling
>> dma_alloc_coherent() to fail to probe.  Tracing it a little bit, it seems
>> dma_alloc_from_contiguous() always returns a NULL page after this
>> commit.
>>
>> Shawn
>>
> 
> Really sorry, guys :/
> 
> -----8<-----
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
> 
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.

Tested-by: Stephen Warren <swarren@nvidia.com>

This fix doesn't seem to be in linux-next yet :-(


* Re: [PATCH v2] mm, compaction: properly signal and act upon lock and need_sched() contention
  2014-05-23  8:34                   ` Vlastimil Babka
  (?)
@ 2014-06-02 13:35                     ` Fabio Estevam
  -1 siblings, 0 replies; 267+ messages in thread
From: Fabio Estevam @ 2014-06-02 13:35 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Shawn Guo, Kevin Hilman, Andrew Morton, Rik van Riel,
	Stephen Warren, Minchan Kim, Bartlomiej Zolnierkiewicz,
	Hugh Dickins, LKML, Michal Nazarewicz, linux-mm, Mel Gorman,
	David Rientjes, Olof Johansson, Greg Thelen, Joonsoo Kim,
	Christoph Lameter, linux-arm-kernel

Vlastimil,

On Fri, May 23, 2014 at 5:34 AM, Vlastimil Babka <vbabka@suse.cz> wrote:

> Really sorry, guys :/
>
> -----8<-----
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Fri, 23 May 2014 10:18:56 +0200
> Subject: mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix2
>
> Step 1: Change function name and comment between v1 and v2 so that the return
>         value signals the opposite thing.
> Step 2: Change the call sites to reflect the opposite return value.
> Step 3: ???
> Step 4: Make a complete fool of yourself.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/compaction.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> index a525cd4..5175019 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -237,13 +237,13 @@ static inline bool compact_should_abort(struct compact_control *cc)
>         if (need_resched()) {
>                 if (cc->mode == MIGRATE_ASYNC) {
>                         cc->contended = true;
> -                       return false;
> +                       return true;
>                 }
>
>                 cond_resched();
>         }
>
> -       return true;
> +       return false;
>  }

This patch is still not in linux-next.

Could you please submit it formally?


* [PATCH -mm] mm, compaction: properly signal and act upon lock and need_sched() contention - fix
  2014-06-02 13:35                     ` Fabio Estevam
  (?)
@ 2014-06-02 14:33                       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-02 14:33 UTC (permalink / raw)
  To: Fabio Estevam, Andrew Morton
  Cc: Shawn Guo, Kevin Hilman, Rik van Riel, Stephen Warren,
	Minchan Kim, Bartlomiej Zolnierkiewicz, Hugh Dickins, LKML,
	Michal Nazarewicz, linux-mm, Mel Gorman, David Rientjes,
	Olof Johansson, Greg Thelen, Joonsoo Kim, Christoph Lameter,
	linux-arm-kernel

compact_should_abort() returns true instead of false and vice versa
due to changes between v1 and v2 of the patch. This makes both async
and sync compaction abort with high probability, and has been reported
to cause e.g. soft lockups on some ARM boards, or drivers calling
dma_alloc_coherent() to fail to probe with CMA enabled on different boards.

This patch fixes the return values to match the comments and callers'
expectations.

Reported-and-tested-by: Kevin Hilman <khilman@linaro.org>
Reported-and-tested-by: Shawn Guo <shawn.guo@linaro.org>
Tested-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/compaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index a525cd4..5175019 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -237,13 +237,13 @@ static inline bool compact_should_abort(struct compact_control *cc)
 	if (need_resched()) {
 		if (cc->mode == MIGRATE_ASYNC) {
 			cc->contended = true;
-			return false;
+			return true;
 		}
 
 		cond_resched();
 	}
 
-	return true;
+	return false;
 }
 
 /* Returns true if the page is within a block suitable for migration to */
-- 
1.8.4.5


* Re: [PATCH -mm] mm, compaction: properly signal and act upon lock and need_sched() contention - fix
  2014-06-02 14:33                       ` Vlastimil Babka
  (?)
@ 2014-06-02 15:18                         ` Fabio Estevam
  -1 siblings, 0 replies; 267+ messages in thread
From: Fabio Estevam @ 2014-06-02 15:18 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Shawn Guo, Kevin Hilman, Rik van Riel,
	Stephen Warren, Minchan Kim, Bartlomiej Zolnierkiewicz,
	Hugh Dickins, LKML, Michal Nazarewicz, linux-mm, Mel Gorman,
	David Rientjes, Olof Johansson, Greg Thelen, Joonsoo Kim,
	Christoph Lameter, linux-arm-kernel

On Mon, Jun 2, 2014 at 11:33 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
> compact_should_abort() returns true instead of false and vice versa
> due to changes between v1 and v2 of the patch. This makes both async
> and sync compaction abort with high probability, and has been reported
> to cause e.g. soft lockups on some ARM boards, or drivers calling
> dma_alloc_coherent() to fail to probe with CMA enabled on different boards.
>
> This patch fixes the return values to match the comments and callers'
> expectations.
>
> Reported-and-tested-by: Kevin Hilman <khilman@linaro.org>
> Reported-and-tested-by: Shawn Guo <shawn.guo@linaro.org>
> Tested-by: Stephen Warren <swarren@nvidia.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Tested-by: Fabio Estevam <fabio.estevam@freescale.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH -mm] mm, compaction: properly signal and act upon lock and need_sched() contention - fix
@ 2014-06-02 15:18                         ` Fabio Estevam
  0 siblings, 0 replies; 267+ messages in thread
From: Fabio Estevam @ 2014-06-02 15:18 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Shawn Guo, Kevin Hilman, Rik van Riel,
	Stephen Warren, Minchan Kim, Bartlomiej Zolnierkiewicz,
	Hugh Dickins, LKML, Michal Nazarewicz, linux-mm, Mel Gorman,
	David Rientjes, Olof Johansson, Greg Thelen, Joonsoo Kim,
	Christoph Lameter, linux-arm-kernel

On Mon, Jun 2, 2014 at 11:33 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
> compact_should_abort() returns true instead of false and vice versa
> due to changes between v1 and v2 of the patch. This makes both async
> and sync compaction abort with high probability, and has been reported
> to cause e.g. soft lockups on some ARM boards, or drivers calling
> dma_alloc_coherent() to fail to probe with CMA enabled on different boards.
>
> This patch fixes the return value to match comments and callers' expectations.
>
> Reported-and-tested-by: Kevin Hilman <khilman@linaro.org>
> Reported-and-tested-by: Shawn Guo <shawn.guo@linaro.org>
> Tested-by: Stephen Warren <swarren@nvidia.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Tested-by: Fabio Estevam <fabio.estevam@freescale.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [PATCH -mm] mm, compaction: properly signal and act upon lock and need_sched() contention - fix
@ 2014-06-02 15:18                         ` Fabio Estevam
  0 siblings, 0 replies; 267+ messages in thread
From: Fabio Estevam @ 2014-06-02 15:18 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Jun 2, 2014 at 11:33 AM, Vlastimil Babka <vbabka@suse.cz> wrote:
> compact_should_abort() returns true instead of false and vice versa
> due to changes between v1 and v2 of the patch. This makes both async
> and sync compaction abort with high probability, and has been reported
> to cause e.g. soft lockups on some ARM boards, or drivers calling
> dma_alloc_coherent() to fail to probe with CMA enabled on different boards.
>
> This patch fixes the return value to match comments and callers' expectations.
>
> Reported-and-tested-by: Kevin Hilman <khilman@linaro.org>
> Reported-and-tested-by: Shawn Guo <shawn.guo@linaro.org>
> Tested-by: Stephen Warren <swarren@nvidia.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Tested-by: Fabio Estevam <fabio.estevam@freescale.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH -mm] mm, compaction: properly signal and act upon lock and need_sched() contention - fix
  2014-06-02 14:33                       ` Vlastimil Babka
  (?)
@ 2014-06-02 20:09                         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-02 20:09 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Fabio Estevam, Andrew Morton, Shawn Guo, Kevin Hilman,
	Rik van Riel, Stephen Warren, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Hugh Dickins, LKML, Michal Nazarewicz,
	linux-mm, Mel Gorman, Olof Johansson, Greg Thelen, Joonsoo Kim,
	Christoph Lameter, linux-arm-kernel

On Mon, 2 Jun 2014, Vlastimil Babka wrote:

> compact_should_abort() returns true instead of false and vice versa
> due to changes between v1 and v2 of the patch. This makes both async
> and sync compaction abort with high probability, and has been reported
> to cause e.g. soft lockups on some ARM boards, or drivers calling
> dma_alloc_coherent() to fail to probe with CMA enabled on different boards.
> 
> This patch fixes the return value to match comments and callers' expectations.
> 
> Reported-and-tested-by: Kevin Hilman <khilman@linaro.org>
> Reported-and-tested-by: Shawn Guo <shawn.guo@linaro.org>
> Tested-by: Stephen Warren <swarren@nvidia.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [PATCH -mm] mm, compaction: properly signal and act upon lock and need_sched() contention - fix
@ 2014-06-02 20:09                         ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-02 20:09 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Fabio Estevam, Andrew Morton, Shawn Guo, Kevin Hilman,
	Rik van Riel, Stephen Warren, Minchan Kim,
	Bartlomiej Zolnierkiewicz, Hugh Dickins, LKML, Michal Nazarewicz,
	linux-mm, Mel Gorman, Olof Johansson, Greg Thelen, Joonsoo Kim,
	Christoph Lameter, linux-arm-kernel

On Mon, 2 Jun 2014, Vlastimil Babka wrote:

> compact_should_abort() returns true instead of false and vice versa
> due to changes between v1 and v2 of the patch. This makes both async
> and sync compaction abort with high probability, and has been reported
> to cause e.g. soft lockups on some ARM boards, or drivers calling
> dma_alloc_coherent() to fail to probe with CMA enabled on different boards.
> 
> This patch fixes the return value to match comments and callers' expectations.
> 
> Reported-and-tested-by: Kevin Hilman <khilman@linaro.org>
> Reported-and-tested-by: Shawn Guo <shawn.guo@linaro.org>
> Tested-by: Stephen Warren <swarren@nvidia.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [PATCH -mm] mm, compaction: properly signal and act upon lock and need_sched() contention - fix
@ 2014-06-02 20:09                         ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-02 20:09 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 2 Jun 2014, Vlastimil Babka wrote:

> compact_should_abort() returns true instead of false and vice versa
> due to changes between v1 and v2 of the patch. This makes both async
> and sync compaction abort with high probability, and has been reported
> to cause e.g. soft lockups on some ARM boards, or drivers calling
> dma_alloc_coherent() to fail to probe with CMA enabled on different boards.
> 
> This patch fixes the return value to match comments and callers' expectations.
> 
> Reported-and-tested-by: Kevin Hilman <khilman@linaro.org>
> Reported-and-tested-by: Shawn Guo <shawn.guo@linaro.org>
> Tested-by: Stephen Warren <swarren@nvidia.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm 1/3] mm: rename allocflags_to_migratetype for clarity
  2014-05-22 12:03                     ` Vlastimil Babka
@ 2014-06-04  0:29                       ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04  0:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

The page allocator has gfp flags (like __GFP_WAIT) and alloc flags (like 
ALLOC_CPUSET) that have separate semantics.

The function allocflags_to_migratetype() actually takes gfp flags, not alloc 
flags, and returns a migratetype.  Rename it to gfpflags_to_migratetype().

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/gfp.h | 2 +-
 mm/compaction.c     | 4 ++--
 mm/page_alloc.c     | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -156,7 +156,7 @@ struct vm_area_struct;
 #define GFP_DMA32	__GFP_DMA32
 
 /* Convert GFP flags to their corresponding migrate type */
-static inline int allocflags_to_migratetype(gfp_t gfp_flags)
+static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
 {
 	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
 
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1096,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
 		.order = order,
-		.migratetype = allocflags_to_migratetype(gfp_mask),
+		.migratetype = gfpflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.mode = mode,
 	};
@@ -1145,7 +1145,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	count_compact_event(COMPACTSTALL);
 
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 	/* Compact each zone in the list */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2473,7 +2473,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 	return alloc_flags;
@@ -2716,7 +2716,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zone *preferred_zone;
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
-	int migratetype = allocflags_to_migratetype(gfp_mask);
+	int migratetype = gfpflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	int classzone_idx;
@@ -2750,7 +2750,7 @@ retry_cpuset:
 	classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 retry:
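
For reference, a minimal usage sketch of the distinction the rename is meant
to make obvious (a fragment, assuming mm-internal context for the ALLOC_*
names used elsewhere in this diff):

	/* gfp flags come from the allocation caller... */
	int migratetype = gfpflags_to_migratetype(GFP_HIGHUSER_MOVABLE);
						/* -> MIGRATE_MOVABLE */

	/* ...while ALLOC_* flags are the page allocator's own namespace */
	int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET;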

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm 1/3] mm: rename allocflags_to_migratetype for clarity
@ 2014-06-04  0:29                       ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04  0:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

The page allocator has gfp flags (like __GFP_WAIT) and alloc flags (like 
ALLOC_CPUSET) that have separate semantics.

The function allocflags_to_migratetype() actually takes gfp flags, not alloc 
flags, and returns a migratetype.  Rename it to gfpflags_to_migratetype().

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/gfp.h | 2 +-
 mm/compaction.c     | 4 ++--
 mm/page_alloc.c     | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -156,7 +156,7 @@ struct vm_area_struct;
 #define GFP_DMA32	__GFP_DMA32
 
 /* Convert GFP flags to their corresponding migrate type */
-static inline int allocflags_to_migratetype(gfp_t gfp_flags)
+static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
 {
 	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
 
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1096,7 +1096,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
 		.order = order,
-		.migratetype = allocflags_to_migratetype(gfp_mask),
+		.migratetype = gfpflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.mode = mode,
 	};
@@ -1145,7 +1145,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	count_compact_event(COMPACTSTALL);
 
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 	/* Compact each zone in the list */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2473,7 +2473,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 	return alloc_flags;
@@ -2716,7 +2716,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zone *preferred_zone;
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
-	int migratetype = allocflags_to_migratetype(gfp_mask);
+	int migratetype = gfpflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	int classzone_idx;
@@ -2750,7 +2750,7 @@ retry_cpuset:
 	classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 retry:

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm 2/3] mm, compaction: pass gfp mask to compact_control
  2014-06-04  0:29                       ` David Rientjes
@ 2014-06-04  0:29                         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04  0:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

struct compact_control currently converts the gfp mask to a migratetype, but we 
need the entire gfp mask in a follow-up patch.

Pass the entire gfp mask as part of struct compact_control.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 12 +++++++-----
 mm/internal.h   |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -883,8 +883,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	return ISOLATE_SUCCESS;
 }
 
-static int compact_finished(struct zone *zone,
-			    struct compact_control *cc)
+static int compact_finished(struct zone *zone, struct compact_control *cc,
+			    const int migratetype)
 {
 	unsigned int order;
 	unsigned long watermark;
@@ -930,7 +930,7 @@ static int compact_finished(struct zone *zone,
 		struct free_area *area = &zone->free_area[order];
 
 		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&area->free_list[cc->migratetype]))
+		if (!list_empty(&area->free_list[migratetype]))
 			return COMPACT_PARTIAL;
 
 		/* Job done if allocation would set block type */
@@ -996,6 +996,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
@@ -1038,7 +1039,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	migrate_prep_local();
 
-	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+	while ((ret = compact_finished(zone, cc, migratetype)) ==
+						COMPACT_CONTINUE) {
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1096,7 +1098,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
 		.order = order,
-		.migratetype = gfpflags_to_migratetype(gfp_mask),
+		.gfp_mask = gfp_mask,
 		.zone = zone,
 		.mode = mode,
 	};
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -142,7 +142,7 @@ struct compact_control {
 	bool finished_update_migrate;
 
 	int order;			/* order a direct compactor needs */
-	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
+	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended, or
 					 * need_resched() true during async
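
For reference, a condensed sketch of how the mask now travels (field and
variable names as in the hunks above; everything else elided):

	/* compact_zone_order() stores the caller's whole mask... */
	struct compact_control cc = {
		.order    = order,
		.gfp_mask = gfp_mask,	/* no longer pre-converted */
		.zone     = zone,
		.mode     = mode,
	};

	/* ...and compact_zone() derives the migratetype where it is used */
	const int migratetype = gfpflags_to_migratetype(cc.gfp_mask);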

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm 2/3] mm, compaction: pass gfp mask to compact_control
@ 2014-06-04  0:29                         ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04  0:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

struct compact_control currently converts the gfp mask to a migratetype, but we 
need the entire gfp mask in a follow-up patch.

Pass the entire gfp mask as part of struct compact_control.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 12 +++++++-----
 mm/internal.h   |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -883,8 +883,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	return ISOLATE_SUCCESS;
 }
 
-static int compact_finished(struct zone *zone,
-			    struct compact_control *cc)
+static int compact_finished(struct zone *zone, struct compact_control *cc,
+			    const int migratetype)
 {
 	unsigned int order;
 	unsigned long watermark;
@@ -930,7 +930,7 @@ static int compact_finished(struct zone *zone,
 		struct free_area *area = &zone->free_area[order];
 
 		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&area->free_list[cc->migratetype]))
+		if (!list_empty(&area->free_list[migratetype]))
 			return COMPACT_PARTIAL;
 
 		/* Job done if allocation would set block type */
@@ -996,6 +996,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
+	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order);
@@ -1038,7 +1039,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	migrate_prep_local();
 
-	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+	while ((ret = compact_finished(zone, cc, migratetype)) ==
+						COMPACT_CONTINUE) {
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
@@ -1096,7 +1098,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
 		.order = order,
-		.migratetype = gfpflags_to_migratetype(gfp_mask),
+		.gfp_mask = gfp_mask,
 		.zone = zone,
 		.mode = mode,
 	};
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -142,7 +142,7 @@ struct compact_control {
 	bool finished_update_migrate;
 
 	int order;			/* order a direct compactor needs */
-	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
+	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended, or
 					 * need_resched() true during async

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm 3/3] mm, compaction: avoid compacting memory for thp if pageblock cannot become free
  2014-06-04  0:29                       ` David Rientjes
@ 2014-06-04  0:30                         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04  0:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

It's pointless to migrate pages within a pageblock if the entire pageblock will 
not become free for a thp allocation.

If we encounter a page that cannot be migrated and a direct compactor other than 
khugepaged is trying to allocate a hugepage for thp, then skip the entire 
pageblock and avoid migrating pages needlessly.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -537,12 +537,12 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
 			if (!pfn_valid(low_pfn)) {
 				low_pfn += MAX_ORDER_NR_PAGES - 1;
-				continue;
+				goto next;
 			}
 		}
 
 		if (!pfn_valid_within(low_pfn))
-			continue;
+			goto next;
 		nr_scanned++;
 
 		/*
@@ -553,7 +553,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 */
 		page = pfn_to_page(low_pfn);
 		if (page_zone(page) != zone)
-			continue;
+			goto next;
 
 		if (!valid_page)
 			valid_page = page;
@@ -599,7 +599,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 					goto isolate_success;
 				}
 			}
-			continue;
+			goto next;
 		}
 
 		/*
@@ -616,7 +616,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			if (!locked)
 				goto next_pageblock;
 			low_pfn += (1 << compound_order(page)) - 1;
-			continue;
+			goto next;
 		}
 
 		/*
@@ -626,7 +626,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 */
 		if (!page_mapping(page) &&
 		    page_count(page) > page_mapcount(page))
-			continue;
+			goto next;
 
 		/* Check if it is ok to still hold the lock */
 		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
@@ -636,17 +636,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 		/* Recheck PageLRU and PageTransHuge under lock */
 		if (!PageLRU(page))
-			continue;
+			goto next;
 		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
-			continue;
+			goto next;
 		}
 
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, mode) != 0)
-			continue;
+			goto next;
 
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
 
@@ -669,6 +669,24 @@ isolate_success:
 
 next_pageblock:
 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+
+next:
+		/*
+		 * It is too expensive for compaction to migrate pages from a
+		 * pageblock for thp page faults unless the entire pageblock
+		 * will become free.
+		 */
+		if ((cc->gfp_mask & __GFP_NO_KSWAPD) &&
+		    !(current->flags & PF_KTHREAD)) {
+			if (locked) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				locked = false;
+			}
+			putback_movable_pages(migratelist);
+			cc->nr_migratepages = 0;
+			nr_isolated = 0;
+			low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+		}
 	}
 
 	acct_isolated(zone, locked, cc);
@@ -880,7 +898,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 
 	cc->migrate_pfn = low_pfn;
 
-	return ISOLATE_SUCCESS;
+	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
 
 static int compact_finished(struct zone *zone, struct compact_control *cc,
@@ -1055,9 +1073,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		if (!cc->nr_migratepages)
-			continue;
-
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);
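
For reference, the bail-out condition added above boils down to the following
predicate; the helper name is only for illustration, the patch open-codes it.
khugepaged also passes __GFP_NO_KSWAPD but runs as a kernel thread, which is
what the PF_KTHREAD test filters out:

/* hypothetical helper, equivalent to the open-coded check in the hunk above */
static inline bool thp_fault_compaction(const struct compact_control *cc)
{
	return (cc->gfp_mask & __GFP_NO_KSWAPD) &&	/* thp-style allocation */
	       !(current->flags & PF_KTHREAD);		/* but not khugepaged */
}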

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [patch -mm 3/3] mm, compaction: avoid compacting memory for thp if pageblock cannot become free
@ 2014-06-04  0:30                         ` David Rientjes
  0 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04  0:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vlastimil Babka, Mel Gorman, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

It's pointless to migrate pages within a pageblock if the entire pageblock will 
not become free for a thp allocation.

If we encounter a page that cannot be migrated and a direct compactor other than 
khugepaged is trying to allocate a hugepage for thp, then skip the entire 
pageblock and avoid migrating pages needlessly.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -537,12 +537,12 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
 			if (!pfn_valid(low_pfn)) {
 				low_pfn += MAX_ORDER_NR_PAGES - 1;
-				continue;
+				goto next;
 			}
 		}
 
 		if (!pfn_valid_within(low_pfn))
-			continue;
+			goto next;
 		nr_scanned++;
 
 		/*
@@ -553,7 +553,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 */
 		page = pfn_to_page(low_pfn);
 		if (page_zone(page) != zone)
-			continue;
+			goto next;
 
 		if (!valid_page)
 			valid_page = page;
@@ -599,7 +599,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 					goto isolate_success;
 				}
 			}
-			continue;
+			goto next;
 		}
 
 		/*
@@ -616,7 +616,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			if (!locked)
 				goto next_pageblock;
 			low_pfn += (1 << compound_order(page)) - 1;
-			continue;
+			goto next;
 		}
 
 		/*
@@ -626,7 +626,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 */
 		if (!page_mapping(page) &&
 		    page_count(page) > page_mapcount(page))
-			continue;
+			goto next;
 
 		/* Check if it is ok to still hold the lock */
 		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
@@ -636,17 +636,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 		/* Recheck PageLRU and PageTransHuge under lock */
 		if (!PageLRU(page))
-			continue;
+			goto next;
 		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
-			continue;
+			goto next;
 		}
 
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, mode) != 0)
-			continue;
+			goto next;
 
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
 
@@ -669,6 +669,24 @@ isolate_success:
 
 next_pageblock:
 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+
+next:
+		/*
+		 * It is too expensive for compaction to migrate pages from a
+		 * pageblock for thp page faults unless the entire pageblock
+		 * will become free.
+		 */
+		if ((cc->gfp_mask & __GFP_NO_KSWAPD) &&
+		    !(current->flags & PF_KTHREAD)) {
+			if (locked) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				locked = false;
+			}
+			putback_movable_pages(migratelist);
+			cc->nr_migratepages = 0;
+			nr_isolated = 0;
+			low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+		}
 	}
 
 	acct_isolated(zone, locked, cc);
@@ -880,7 +898,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 
 	cc->migrate_pfn = low_pfn;
 
-	return ISOLATE_SUCCESS;
+	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
 
 static int compact_finished(struct zone *zone, struct compact_control *cc,
@@ -1055,9 +1073,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 			;
 		}
 
-		if (!cc->nr_migratepages)
-			continue;
-
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION);

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm 3/3] mm, compaction: avoid compacting memory for thp if pageblock cannot become free
  2014-06-04  0:30                         ` David Rientjes
@ 2014-06-04 11:04                           ` Mel Gorman
  -1 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-06-04 11:04 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Vlastimil Babka, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

On Tue, Jun 03, 2014 at 05:30:01PM -0700, David Rientjes wrote:
> It's pointless to migrate pages within a pageblock if the entire pageblock will 
> not become free for a thp allocation.
> 
> If we encounter a page that cannot be migrated and a direct compactor other than 
> khugepaged is trying to allocate a hugepage for thp, then skip the entire 
> pageblock and avoid migrating pages needlessly.
> 

It's not completely pointless. A movable page may be placed within an
unmovable pageblock due to insufficient free memory or because the pageblock
changed type. When this happens, partial migration moves the movable page
out of the unmovable block. Future unmovable allocations can then be
placed with other unmovable pages instead of falling back to other blocks
and degrading fragmentation over time.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm 3/3] mm, compaction: avoid compacting memory for thp if pageblock cannot become free
@ 2014-06-04 11:04                           ` Mel Gorman
  0 siblings, 0 replies; 267+ messages in thread
From: Mel Gorman @ 2014-06-04 11:04 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Vlastimil Babka, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

On Tue, Jun 03, 2014 at 05:30:01PM -0700, David Rientjes wrote:
> It's pointless to migrate pages within a pageblock if the entire pageblock will 
> not become free for a thp allocation.
> 
> If we encounter a page that cannot be migrated and a direct compactor other than 
> khugepaged is trying to allocate a hugepage for thp, then skip the entire 
> pageblock and avoid migrating pages needlessly.
> 

It's not completely pointless. A movable page may be placed within an
unmovable pageblock due to insufficient free memory or because the pageblock
changed type. When this happens, partial migration moves the movable page
out of the unmovable block. Future unmovable allocations can then be
placed with other unmovable pages instead of falling back to other blocks
and degrading fragmentation over time.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm 3/3] mm, compaction: avoid compacting memory for thp if pageblock cannot become free
  2014-06-04  0:30                         ` David Rientjes
@ 2014-06-04 16:07                           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:07 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Joonsoo Kim, Hugh Dickins, Greg Thelen, linux-kernel,
	linux-mm, Minchan Kim, Michal Nazarewicz, Rik van Riel

On 06/04/2014 02:30 AM, David Rientjes wrote:
> It's pointless to migrate pages within a pageblock if the entire pageblock will
> not become free for a thp allocation.
>
> If we encounter a page that cannot be migrated and a direct compactor other than
> khugepaged is trying to allocate a hugepage for thp, then skip the entire
> pageblock and avoid migrating pages needlessly.

The problem here is that if you encounter a PageBuddy with order > 0, in 
the next iteration you will see a page that's neither PageBuddy nor 
PageLRU and skip the rest of the pageblock.

I was working on a slightly different approach, which is not properly
tested yet, so I will just post an RFC for discussion.
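
For illustration only, one possible shape of a fix for the PageBuddy case
described above (not necessarily the RFC mentioned here) is to skip ahead
over the whole free chunk; page_order() is racy without zone->lock, so its
value is only trusted as a bounded hint:

		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order(page);

			/* racy read; use only as a skip hint, and bound it */
			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}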

> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   mm/compaction.c | 41 ++++++++++++++++++++++++++++-------------
>   1 file changed, 28 insertions(+), 13 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -537,12 +537,12 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
>   			if (!pfn_valid(low_pfn)) {
>   				low_pfn += MAX_ORDER_NR_PAGES - 1;
> -				continue;
> +				goto next;
>   			}
>   		}
>
>   		if (!pfn_valid_within(low_pfn))
> -			continue;
> +			goto next;
>   		nr_scanned++;
>
>   		/*
> @@ -553,7 +553,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   		 */
>   		page = pfn_to_page(low_pfn);
>   		if (page_zone(page) != zone)
> -			continue;
> +			goto next;
>
>   		if (!valid_page)
>   			valid_page = page;
> @@ -599,7 +599,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   					goto isolate_success;
>   				}
>   			}
> -			continue;
> +			goto next;
>   		}
>
>   		/*
> @@ -616,7 +616,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			if (!locked)
>   				goto next_pageblock;
>   			low_pfn += (1 << compound_order(page)) - 1;
> -			continue;
> +			goto next;
>   		}
>
>   		/*
> @@ -626,7 +626,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   		 */
>   		if (!page_mapping(page) &&
>   		    page_count(page) > page_mapcount(page))
> -			continue;
> +			goto next;
>
>   		/* Check if it is ok to still hold the lock */
>   		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
> @@ -636,17 +636,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>
>   		/* Recheck PageLRU and PageTransHuge under lock */
>   		if (!PageLRU(page))
> -			continue;
> +			goto next;
>   		if (PageTransHuge(page)) {
>   			low_pfn += (1 << compound_order(page)) - 1;
> -			continue;
> +			goto next;
>   		}
>
>   		lruvec = mem_cgroup_page_lruvec(page, zone);
>
>   		/* Try isolate the page */
>   		if (__isolate_lru_page(page, mode) != 0)
> -			continue;
> +			goto next;
>
>   		VM_BUG_ON_PAGE(PageTransCompound(page), page);
>
> @@ -669,6 +669,24 @@ isolate_success:
>
>   next_pageblock:
>   		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
> +
> +next:
> +		/*
> +		 * It is too expensive for compaction to migrate pages from a
> +		 * pageblock for thp page faults unless the entire pageblock
> +		 * will become free.
> +		 */
> +		if ((cc->gfp_mask & __GFP_NO_KSWAPD) &&
> +		    !(current->flags & PF_KTHREAD)) {
> +			if (locked) {
> +				spin_unlock_irqrestore(&zone->lru_lock, flags);
> +				locked = false;
> +			}
> +			putback_movable_pages(migratelist);
> +			cc->nr_migratepages = 0;
> +			nr_isolated = 0;
> +			low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
> +		}
>   	}
>
>   	acct_isolated(zone, locked, cc);
> @@ -880,7 +898,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
>
>   	cc->migrate_pfn = low_pfn;
>
> -	return ISOLATE_SUCCESS;
> +	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
>   }
>
>   static int compact_finished(struct zone *zone, struct compact_control *cc,
> @@ -1055,9 +1073,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   			;
>   		}
>
> -		if (!cc->nr_migratepages)
> -			continue;
> -
>   		err = migrate_pages(&cc->migratepages, compaction_alloc,
>   				compaction_free, (unsigned long)cc, cc->mode,
>   				MR_COMPACTION);
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [patch -mm 3/3] mm, compaction: avoid compacting memory for thp if pageblock cannot become free
@ 2014-06-04 16:07                           ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:07 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Joonsoo Kim, Hugh Dickins, Greg Thelen, linux-kernel,
	linux-mm, Minchan Kim, Michal Nazarewicz, Rik van Riel

On 06/04/2014 02:30 AM, David Rientjes wrote:
> It's pointless to migrate pages within a pageblock if the entire pageblock will
> not become free for a thp allocation.
>
> If we encounter a page that cannot be migrated and a direct compactor other than
> khugepaged is trying to allocate a hugepage for thp, then skip the entire
> pageblock and avoid migrating pages needlessly.

The problem here is that if you encounter a PageBuddy with order > 0, in 
the next iteration you will see a page that's neither PageBuddy nor 
PageLRU and skip the rest of the pageblock.

I was working on a slightly different approach, which is not properly
tested yet, so I will just post an RFC for discussion.

> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>   mm/compaction.c | 41 ++++++++++++++++++++++++++++-------------
>   1 file changed, 28 insertions(+), 13 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -537,12 +537,12 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
>   			if (!pfn_valid(low_pfn)) {
>   				low_pfn += MAX_ORDER_NR_PAGES - 1;
> -				continue;
> +				goto next;
>   			}
>   		}
>
>   		if (!pfn_valid_within(low_pfn))
> -			continue;
> +			goto next;
>   		nr_scanned++;
>
>   		/*
> @@ -553,7 +553,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   		 */
>   		page = pfn_to_page(low_pfn);
>   		if (page_zone(page) != zone)
> -			continue;
> +			goto next;
>
>   		if (!valid_page)
>   			valid_page = page;
> @@ -599,7 +599,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   					goto isolate_success;
>   				}
>   			}
> -			continue;
> +			goto next;
>   		}
>
>   		/*
> @@ -616,7 +616,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   			if (!locked)
>   				goto next_pageblock;
>   			low_pfn += (1 << compound_order(page)) - 1;
> -			continue;
> +			goto next;
>   		}
>
>   		/*
> @@ -626,7 +626,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   		 */
>   		if (!page_mapping(page) &&
>   		    page_count(page) > page_mapcount(page))
> -			continue;
> +			goto next;
>
>   		/* Check if it is ok to still hold the lock */
>   		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
> @@ -636,17 +636,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>
>   		/* Recheck PageLRU and PageTransHuge under lock */
>   		if (!PageLRU(page))
> -			continue;
> +			goto next;
>   		if (PageTransHuge(page)) {
>   			low_pfn += (1 << compound_order(page)) - 1;
> -			continue;
> +			goto next;
>   		}
>
>   		lruvec = mem_cgroup_page_lruvec(page, zone);
>
>   		/* Try isolate the page */
>   		if (__isolate_lru_page(page, mode) != 0)
> -			continue;
> +			goto next;
>
>   		VM_BUG_ON_PAGE(PageTransCompound(page), page);
>
> @@ -669,6 +669,24 @@ isolate_success:
>
>   next_pageblock:
>   		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
> +
> +next:
> +		/*
> +		 * It is too expensive for compaction to migrate pages from a
> +		 * pageblock for thp page faults unless the entire pageblock
> +		 * will become free.
> +		 */
> +		if ((cc->gfp_mask & __GFP_NO_KSWAPD) &&
> +		    !(current->flags & PF_KTHREAD)) {
> +			if (locked) {
> +				spin_unlock_irqrestore(&zone->lru_lock, flags);
> +				locked = false;
> +			}
> +			putback_movable_pages(migratelist);
> +			cc->nr_migratepages = 0;
> +			nr_isolated = 0;
> +			low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
> +		}
>   	}
>
>   	acct_isolated(zone, locked, cc);
> @@ -880,7 +898,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
>
>   	cc->migrate_pfn = low_pfn;
>
> -	return ISOLATE_SUCCESS;
> +	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
>   }
>
>   static int compact_finished(struct zone *zone, struct compact_control *cc,
> @@ -1055,9 +1073,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>   			;
>   		}
>
> -		if (!cc->nr_migratepages)
> -			continue;
> -
>   		err = migrate_pages(&cc->migratepages, compaction_alloc,
>   				compaction_free, (unsigned long)cc, cc->mode,
>   				MR_COMPACTION);
>

^ permalink raw reply	[flat|nested] 267+ messages in thread

* [RFC PATCH 1/6] mm, compaction: periodically drop lock and restore IRQs in scanners
  2014-05-22  3:20               ` David Rientjes
@ 2014-06-04 16:11                 ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel, David Rientjes

Compaction scanners regularly check for lock contention and need_resched()
through the compact_checklock_irqsave() function. However, if there is no
contention, the lock can be held and IRQs disabled for a potentially long
time.

This has been addressed by commit b2eef8c0d0 ("mm: compaction: minimise the
time IRQs are disabled while isolating pages for migration") for the migration
scanner. However, the refactoring done by commit 748446bb6b ("mm: compaction:
acquire the zone->lru_lock as late as possible") has changed the conditions so
that the lock is dropped only when there's contention on the lock or
need_resched() is true. Also, need_resched() is checked only when the lock is
already held. The comment "give a chance to irqs before checking need_resched"
is therefore misleading, as IRQs remain disabled when the check is done.

This patch restores the behavior intended by commit b2eef8c0d0 and also tries
to better balance, and make more deterministic, the time spent checking for
contention vs the time the scanners might run between checks. It also avoids
situations where the checks were previously not done often enough. The
result should be avoiding both too frequent and too infrequent contention
checking, and especially the potentially long-running scans with IRQs disabled
and no checking of need_resched() or for fatal signal pending, which can happen
when many consecutive pages or pageblocks fail the preliminary tests and do not
reach the later call site to compact_checklock_irqsave(), as explained below.

Before the patch:

In the migration scanner, compact_checklock_irqsave() was called each loop, if
reached. If not reached, some lower-frequency checking could still be done if
the lock was already held, but this would not result in aborting contended
async compaction until reaching compact_checklock_irqsave() or end of
pageblock. In the free scanner, it was similar but completely without the
periodical checking, so the lock can potentially be held until the end of the
pageblock is reached.

After the patch, in both scanners:

The periodical check is done as the first thing in the loop on each
SWAP_CLUSTER_MAX aligned pfn, using the new compact_unlock_should_abort()
function, which always unlocks the lock (if locked) and aborts async compaction
if scheduling is needed or someone else holds the lock. It also aborts any type
of compaction when a fatal signal is pending.

The compact_checklock_irqsave() function is replaced with a slightly different
compact_trylock_irqsave(). The biggest difference is that the function is not
called at all if the lock is already held. The periodical contention checking
is left solely to compact_unlock_should_abort(). If the lock is not held, the
function still avoids a contended run by making async compaction abort when
need_resched() is true or the trylock fails. Sync compaction will conditionally
schedule, and then spin on the lock.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 113 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 77 insertions(+), 36 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index ed7102c..f0fd4b5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -185,47 +185,74 @@ static void update_pageblock_skip(struct compact_control *cc,
 }
 #endif /* CONFIG_COMPACTION */
 
-static inline bool should_release_lock(spinlock_t *lock)
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out if the process
+ * needs to be scheduled, or the lock cannot be taken immediately. For sync
+ * compaction, schedule and spin on the lock if needed.
+ *
+ * Returns true if the lock is held
+ * Returns false if the lock is not held and compaction should abort
+ */
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+			unsigned long *flags, struct compact_control *cc)
 {
-	return need_resched() || spin_is_contended(lock);
+	if (cc->mode == MIGRATE_ASYNC) {
+		if (need_resched() || !spin_trylock_irqsave(lock, *flags)) {
+			cc->contended = true;
+			return false;
+		}
+	} else {
+		cond_resched();
+		spin_lock_irqsave(lock, *flags);
+	}
+
+	return true;
 }
 
 /*
  * Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. Check if the process needs to be scheduled or
- * if the lock is contended. For async compaction, back out in the event
- * if contention is severe. For sync compaction, schedule.
+ * very heavily contended. The lock should be periodically unlocked to avoid
+ * having disabled IRQs for a long time, even when there is nobody waiting on
+ * the lock. It might also be that allowing the IRQs will result in
+ * need_resched() becoming true. If scheduling is needed, or somebody else
+ * has taken the lock, async compaction aborts. Sync compaction schedules.
+ * Either compaction type will also abort if a fatal signal is pending.
+ * In either case if the lock was locked, it is dropped and not regained.
  *
- * Returns true if the lock is held.
- * Returns false if the lock is released and compaction should abort
+ * Returns true if compaction should abort due to fatal signal pending, or
+ *		async compaction due to lock contention or need to schedule
+ * Returns false when compaction can continue (sync compaction might have
+ *		scheduled)
  */
-static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
-				      bool locked, struct compact_control *cc)
+static inline bool compact_unlock_should_abort(spinlock_t *lock,
+		unsigned long flags, bool *locked, struct compact_control *cc)
 {
-	if (should_release_lock(lock)) {
-		if (locked) {
-			spin_unlock_irqrestore(lock, *flags);
-			locked = false;
-		}
+	if (*locked) {
+		spin_unlock_irqrestore(lock, flags);
+		*locked = false;
+	}
 
-		/* async aborts if taking too long or contended */
-		if (cc->mode == MIGRATE_ASYNC) {
+	if (fatal_signal_pending(current))
+		return true;
+
+	if (cc->mode == MIGRATE_ASYNC) {
+		if (need_resched() || spin_is_locked(lock)) {
 			cc->contended = true;
-			return false;
+			return true;
 		}
-
+	} else {
 		cond_resched();
 	}
 
-	if (!locked)
-		spin_lock_irqsave(lock, *flags);
-	return true;
+	return false;
 }
 
 /*
- * Aside from avoiding lock contention, compaction also periodically checks
+ * Aside from avoiding lock contention, compaction should also periodically checks
  * need_resched() and either schedules in sync compaction, or aborts async
- * compaction. This is similar to compact_checklock_irqsave() does, but used
+ * compaction. This is similar to what compact_unlock_should_abort() does, but used
  * where no lock is concerned.
  *
  * Returns false when no scheduling was needed, or sync compaction scheduled.
@@ -285,6 +312,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		int isolated, i;
 		struct page *page = cursor;
 
+		/*
+		 * Periodically drop the lock (if held) regardless of its
+		 * contention, to give chance to IRQs. Abort async compaction
+		 * if contended.
+		 */
+		if (!(blockpfn % SWAP_CLUSTER_MAX)
+		    && compact_unlock_should_abort(&cc->zone->lock, flags,
+								&locked, cc))
+			break;
+
 		nr_scanned++;
 		if (!pfn_valid_within(blockpfn))
 			goto isolate_fail;
@@ -302,8 +339,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		 * spin on the lock and we acquire the lock as late as
 		 * possible.
 		 */
-		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
-								locked, cc);
+		if (!locked)
+			locked = compact_trylock_irqsave(&cc->zone->lock,
+								&flags, cc);
 		if (!locked)
 			break;
 
@@ -523,13 +561,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
-		/* give a chance to irqs before checking need_resched() */
-		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
-			if (should_release_lock(&zone->lru_lock)) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				locked = false;
-			}
-		}
+		/*
+		 * Periodically drop the lock (if held) regardless of its
+		 * contention, to give chance to IRQs. Abort async compaction
+		 * if contended.
+		 */
+		if (!(low_pfn % SWAP_CLUSTER_MAX)
+		    && compact_unlock_should_abort(&zone->lru_lock, flags,
+								&locked, cc))
+			break;
 
 		/*
 		 * migrate_pfn does not necessarily start aligned to a
@@ -631,10 +671,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		    page_count(page) > page_mapcount(page))
 			continue;
 
-		/* Check if it is ok to still hold the lock */
-		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-								locked, cc);
-		if (!locked || fatal_signal_pending(current))
+		/* If the lock is not held, try to take it */
+		if (!locked)
+			locked = compact_trylock_irqsave(&zone->lru_lock,
+								&flags, cc);
+		if (!locked)
 			break;
 
 		/* Recheck PageLRU and PageTransHuge under lock */
-- 
1.8.4.5
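
For reference, a condensed sketch of the resulting scanner pattern (both
scanners follow this shape; the pfn variable and the lock differ between
them, so the names here are placeholders):

	for (; pfn < end_pfn; pfn++) {
		/* unconditional periodic point: drop the lock, maybe abort */
		if (!(pfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(lock, flags, &locked, cc))
			break;

		/* ... cheap per-page checks that need no lock ... */

		/* take the lock only once a page looks worth isolating */
		if (!locked)
			locked = compact_trylock_irqsave(lock, &flags, cc);
		if (!locked)
			break;

		/* ... isolate under the lock ... */
	}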


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [RFC PATCH 1/6] mm, compaction: periodically drop lock and restore IRQs in scanners
@ 2014-06-04 16:11                 ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel, David Rientjes

Compaction scanners regularly check for lock contention and need_resched()
through the compact_checklock_irqsave() function. However, if there is no
contention, the lock can be held and IRQs disabled for a potentially long
time.

This has been addressed by commit b2eef8c0d0 ("mm: compaction: minimise the
time IRQs are disabled while isolating pages for migration") for the migration
scanner. However, the refactoring done by commit 748446bb6b ("mm: compaction:
acquire the zone->lru_lock as late as possible") has changed the conditions so
that the lock is dropped only when there's contention on the lock or
need_resched() is true. Also, need_resched() is checked only when the lock is
already held. The comment "give a chance to irqs before checking need_resched"
is therefore misleading, as IRQs remain disabled when the check is done.

This patch restores the behavior intended by commit b2eef8c0d0 and also tries
to better balance, and make more deterministic, the time spent checking for
contention vs the time the scanners might run between checks. It also avoids
situations where the checks were previously not done often enough. The
result should be avoiding both too frequent and too infrequent contention
checking, and especially the potentially long-running scans with IRQs disabled
and no checking of need_resched() or for fatal signal pending, which can happen
when many consecutive pages or pageblocks fail the preliminary tests and do not
reach the later call site to compact_checklock_irqsave(), as explained below.

Before the patch:

In the migration scanner, compact_checklock_irqsave() was called each loop, if
reached. If not reached, some lower-frequency checking could still be done if
the lock was already held, but this would not result in aborting contended
async compaction until reaching compact_checklock_irqsave() or end of
pageblock. In the free scanner, it was similar but completely without the
periodical checking, so the lock can potentially be held until the end of the
pageblock is reached.

After the patch, in both scanners:

The periodical check is done as the first thing in the loop on each
SWAP_CLUSTER_MAX aligned pfn, using the new compact_unlock_should_abort()
function, which always unlocks the lock (if locked) and aborts async compaction
if scheduling is needed or someone else holds the lock. It also aborts any type
of compaction when a fatal signal is pending.

The compact_checklock_irqsave() function is replaced with a slightly different
compact_trylock_irqsave(). The biggest difference is that the function is not
called at all if the lock is already held. The periodical contention checking
is left solely to compact_unlock_should_abort(). If the lock is not held, the
function still avoids a contended run by making async compaction abort when
need_resched() is true or the trylock fails. Sync compaction will conditionally
schedule, and then spin on the lock.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 113 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 77 insertions(+), 36 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index ed7102c..f0fd4b5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -185,47 +185,74 @@ static void update_pageblock_skip(struct compact_control *cc,
 }
 #endif /* CONFIG_COMPACTION */
 
-static inline bool should_release_lock(spinlock_t *lock)
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out if the process
+ * needs to be scheduled, or the lock cannot be taken immediately. For sync
+ * compaction, schedule and spin on the lock if needed.
+ *
+ * Returns true if the lock is held
+ * Returns false if the lock is not held and compaction should abort
+ */
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+			unsigned long *flags, struct compact_control *cc)
 {
-	return need_resched() || spin_is_contended(lock);
+	if (cc->mode == MIGRATE_ASYNC) {
+		if (need_resched() || !spin_trylock_irqsave(lock, *flags)) {
+			cc->contended = true;
+			return false;
+		}
+	} else {
+		cond_resched();
+		spin_lock_irqsave(lock, *flags);
+	}
+
+	return true;
 }
 
 /*
  * Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. Check if the process needs to be scheduled or
- * if the lock is contended. For async compaction, back out in the event
- * if contention is severe. For sync compaction, schedule.
+ * very heavily contended. The lock should be periodically unlocked to avoid
+ * having disabled IRQs for a long time, even when there is nobody waiting on
+ * the lock. It might also be that allowing the IRQs will result in
+ * need_resched() becoming true. If scheduling is needed, or somebody else
+ * has taken the lock, async compaction aborts. Sync compaction schedules.
+ * Either compaction type will also abort if a fatal signal is pending.
+ * In either case if the lock was locked, it is dropped and not regained.
  *
- * Returns true if the lock is held.
- * Returns false if the lock is released and compaction should abort
+ * Returns true if compaction should abort due to fatal signal pending, or
+ *		async compaction due to lock contention or need to schedule
+ * Returns false when compaction can continue (sync compaction might have
+ *		scheduled)
  */
-static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
-				      bool locked, struct compact_control *cc)
+static inline bool compact_unlock_should_abort(spinlock_t *lock,
+		unsigned long flags, bool *locked, struct compact_control *cc)
 {
-	if (should_release_lock(lock)) {
-		if (locked) {
-			spin_unlock_irqrestore(lock, *flags);
-			locked = false;
-		}
+	if (*locked) {
+		spin_unlock_irqrestore(lock, flags);
+		*locked = false;
+	}
 
-		/* async aborts if taking too long or contended */
-		if (cc->mode == MIGRATE_ASYNC) {
+	if (fatal_signal_pending(current))
+		return true;
+
+	if (cc->mode == MIGRATE_ASYNC) {
+		if (need_resched() || spin_is_locked(lock)) {
 			cc->contended = true;
-			return false;
+			return true;
 		}
-
+	} else {
 		cond_resched();
 	}
 
-	if (!locked)
-		spin_lock_irqsave(lock, *flags);
-	return true;
+	return false;
 }
 
 /*
- * Aside from avoiding lock contention, compaction also periodically checks
+ * Aside from avoiding lock contention, compaction should also periodically check
  * need_resched() and either schedules in sync compaction, or aborts async
- * compaction. This is similar to compact_checklock_irqsave() does, but used
+ * compaction. This is similar to what compact_unlock_should_abort() does, but used
  * where no lock is concerned.
  *
  * Returns false when no scheduling was needed, or sync compaction scheduled.
@@ -285,6 +312,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		int isolated, i;
 		struct page *page = cursor;
 
+		/*
+		 * Periodically drop the lock (if held) regardless of its
+		 * contention, to give chance to IRQs. Abort async compaction
+		 * if contended.
+		 */
+		if (!(blockpfn % SWAP_CLUSTER_MAX)
+		    && compact_unlock_should_abort(&cc->zone->lock, flags,
+								&locked, cc))
+			break;
+
 		nr_scanned++;
 		if (!pfn_valid_within(blockpfn))
 			goto isolate_fail;
@@ -302,8 +339,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		 * spin on the lock and we acquire the lock as late as
 		 * possible.
 		 */
-		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
-								locked, cc);
+		if (!locked)
+			locked = compact_trylock_irqsave(&cc->zone->lock,
+								&flags, cc);
 		if (!locked)
 			break;
 
@@ -523,13 +561,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
-		/* give a chance to irqs before checking need_resched() */
-		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
-			if (should_release_lock(&zone->lru_lock)) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				locked = false;
-			}
-		}
+		/*
+		 * Periodically drop the lock (if held) regardless of its
+		 * contention, to give chance to IRQs. Abort async compaction
+		 * if contended.
+		 */
+		if (!(low_pfn % SWAP_CLUSTER_MAX)
+		    && compact_unlock_should_abort(&zone->lru_lock, flags,
+								&locked, cc))
+			break;
 
 		/*
 		 * migrate_pfn does not necessarily start aligned to a
@@ -631,10 +671,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		    page_count(page) > page_mapcount(page))
 			continue;
 
-		/* Check if it is ok to still hold the lock */
-		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-								locked, cc);
-		if (!locked || fatal_signal_pending(current))
+		/* If the lock is not held, try to take it */
+		if (!locked)
+			locked = compact_trylock_irqsave(&zone->lru_lock,
+								&flags, cc);
+		if (!locked)
 			break;
 
 		/* Recheck PageLRU and PageTransHuge under lock */
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [RFC PATCH 2/6] mm, compaction: skip rechecks when lock was already held
  2014-06-04 16:11                 ` Vlastimil Babka
@ 2014-06-04 16:11                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel, David Rientjes

Compaction scanners try to take the zone locks as late as possible, by checking
many page or pageblock properties opportunistically without the lock and
skipping pages that are found unsuitable. For pages that pass the initial
checks, some properties have to be checked again safely under the lock.
However, if the lock was already held from a previous iteration when the
initial checks were done, the rechecks are unnecessary.

This patch therefore skips the rechecks when the lock was already held. This is
now possible, since we no longer (potentially) drop and reacquire the lock
between the initial checks and the safe rechecks.
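
A compact sketch of the resulting control flow, as plain single-threaded C
rather than kernel code (page_free[] and try_lock()/unlock() are toy stand-ins;
in the kernel the recheck exists because the state can change whenever
zone->lock or lru_lock is not held):

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 32

static bool page_free[NPAGES];          /* toy "is this page still free?" state */

static bool try_lock(void) { return true; }     /* always succeeds in this model */
static void unlock(void)   { }

static int scan(void)
{
        bool locked = false;
        int isolated = 0;

        for (int pfn = 0; pfn < NPAGES; pfn++) {
                if (!page_free[pfn])            /* opportunistic, lockless check */
                        continue;

                /*
                 * If the lock is still held from a previous iteration, the
                 * state cannot have changed since the lockless check above,
                 * so the under-lock recheck is redundant.
                 */
                if (locked)
                        goto skip_recheck;

                locked = try_lock();
                if (!locked)
                        break;

                if (!page_free[pfn])            /* recheck now that we hold the lock */
                        continue;
skip_recheck:
                page_free[pfn] = false;         /* "isolate" the page */
                isolated++;
        }
        if (locked)
                unlock();
        return isolated;
}

int main(void)
{
        for (int i = 0; i < NPAGES; i += 2)
                page_free[i] = true;

        printf("isolated %d pages\n", scan());
        return 0;
}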

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index f0fd4b5..27c73d7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -332,6 +332,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 			goto isolate_fail;
 
 		/*
+		 * If we already hold the lock, we can skip some rechecking.
+		 * Note that if we hold the lock now, checked_pageblock was
+		 * already set in some previous iteration (or strict is true),
+		 * so it is correct to skip the suitable migration target
+		 * recheck as well.
+		 */
+		if (locked)
+			goto skip_recheck;
+
+		/*
 		 * The zone lock must be held to isolate freepages.
 		 * Unfortunately this is a very coarse lock and can be
 		 * heavily contended if there are parallel allocations
@@ -339,9 +349,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		 * spin on the lock and we acquire the lock as late as
 		 * possible.
 		 */
-		if (!locked)
-			locked = compact_trylock_irqsave(&cc->zone->lock,
-								&flags, cc);
+		locked = compact_trylock_irqsave(&cc->zone->lock, &flags, cc);
 		if (!locked)
 			break;
 
@@ -361,6 +369,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		if (!PageBuddy(page))
 			goto isolate_fail;
 
+skip_recheck:
 		/* Found a free page, break it into order-0 pages */
 		isolated = split_free_page(page);
 		total_isolated += isolated;
@@ -671,10 +680,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		    page_count(page) > page_mapcount(page))
 			continue;
 
-		/* If the lock is not held, try to take it */
-		if (!locked)
-			locked = compact_trylock_irqsave(&zone->lru_lock,
-								&flags, cc);
+		/* If we already hold the lock, we can skip some rechecking */
+		if (locked)
+			goto skip_recheck;
+
+		locked = compact_trylock_irqsave(&zone->lru_lock, &flags, cc);
 		if (!locked)
 			break;
 
@@ -686,6 +696,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			continue;
 		}
 
+skip_recheck:
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		/* Try isolate the page */
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [RFC PATCH 3/6] mm, compaction: remember position within pageblock in free pages scanner
  2014-06-04 16:11                 ` Vlastimil Babka
@ 2014-06-04 16:11                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Michal Nazarewicz,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel, David Rientjes

Unlike the migration scanner, the free scanner remembers only the beginning of
the last scanned pageblock in cc->free_pfn. It might therefore rescan pages
uselessly when called several times during a single compaction. This might have
been useful when pages were returned to the buddy allocator after a failed
migration, but that is no longer the case.

This patch changes the meaning of cc->free_pfn so that if it points to the
middle of a pageblock, that pageblock is scanned only from cc->free_pfn to its
end. isolate_freepages_block() will record the pfn of the last page it looked
at, which is then used to update cc->free_pfn.

In the mmtests stress-highalloc benchmark, this lowered the ratio of pages
scanned by the two scanners from 2.5 free pages per migrate page to 2.25,
without affecting success rates.
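
The in/out parameter convention can be sketched in plain C as follows;
scan_block() is a toy stand-in for isolate_freepages_block(), and the caller
shows how the recorded position is reused on the next call (this is an
illustration, not the kernel code):

#include <stdio.h>

/*
 * Scan from *start_pfn up to end_pfn, stop early once "budget" pages were
 * found, and leave *start_pfn pointing at the last pfn that was looked at,
 * so that a later call can resume from roughly the same place.
 */
static int scan_block(unsigned long *start_pfn, unsigned long end_pfn,
                      int budget)
{
        unsigned long pfn = *start_pfn;
        int found = 0;

        for (; pfn < end_pfn && found < budget; pfn++) {
                *start_pfn = pfn;               /* record how far we have got */
                if (pfn % 3 == 0)               /* pretend this pfn was isolated */
                        found++;
        }
        return found;
}

int main(void)
{
        unsigned long block_end = 512 + 64;
        unsigned long resume = 512;             /* start of the pageblock */

        /* First call stops early; the second call resumes mid-block. */
        printf("first:  %d isolated, resume at pfn %lu\n",
               scan_block(&resume, block_end, 4), resume);
        printf("second: %d isolated, resume at pfn %lu\n",
               scan_block(&resume, block_end, 100), resume);
        return 0;
}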

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 27c73d7..ae7db5f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -294,7 +294,7 @@ static bool suitable_migration_target(struct page *page)
  * (even though it may still end up isolating some pages).
  */
 static unsigned long isolate_freepages_block(struct compact_control *cc,
-				unsigned long blockpfn,
+				unsigned long *start_pfn,
 				unsigned long end_pfn,
 				struct list_head *freelist,
 				bool strict)
@@ -304,6 +304,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	bool checked_pageblock = false;
+	unsigned long blockpfn = *start_pfn;
 
 	cursor = pfn_to_page(blockpfn);
 
@@ -312,6 +313,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		int isolated, i;
 		struct page *page = cursor;
 
+		/* Record how far we have got within the block */
+		*start_pfn = blockpfn;
+
 		/*
 		 * Periodically drop the lock (if held) regardless of its
 		 * contention, to give chance to IRQs. Abort async compaction
@@ -438,6 +442,9 @@ isolate_freepages_range(struct compact_control *cc,
 	LIST_HEAD(freelist);
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
+		/* Protect pfn from changing by isolate_freepages_block */
+		unsigned long isolate_start_pfn = pfn;
+
 		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
 			break;
 
@@ -448,8 +455,8 @@ isolate_freepages_range(struct compact_control *cc,
 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
-						   &freelist, true);
+		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+						block_end_pfn, &freelist, true);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -789,6 +796,7 @@ static void isolate_freepages(struct zone *zone,
 				block_end_pfn = block_start_pfn,
 				block_start_pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
+		unsigned long isolate_start_pfn;
 
 		/*
 		 * This can iterate a massively long zone without finding any
@@ -822,11 +830,26 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Found a block suitable for isolating free pages from */
-		cc->free_pfn = block_start_pfn;
-		isolated = isolate_freepages_block(cc, block_start_pfn,
+		isolate_start_pfn = block_start_pfn;
+
+		/* 
+		 * If we are restarting the free scanner in this block, do not
+		 * rescan the beginning of the block
+		 */
+		if (cc->free_pfn < block_end_pfn)
+			isolate_start_pfn = cc->free_pfn;
+
+		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
 					block_end_pfn, freelist, false);
 		nr_freepages += isolated;
 
+		/* 
+		 * Remember where the free scanner should restart next time.
+		 * This will point to the last page of pageblock we just
+		 * scanned, if we scanned it fully.
+		 */
+		cc->free_pfn = isolate_start_pfn;
+
 		/*
 		 * Set a flag that we successfully isolated in this pageblock.
 		 * In the next loop iteration, zone->compact_cached_free_pfn
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-04 16:11                 ` Vlastimil Babka
@ 2014-06-04 16:11                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Michal Nazarewicz,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel, David Rientjes

The migration scanner skips PageBuddy pages, but does not consider their order,
as checking page_order() is generally unsafe without holding the zone->lock,
and acquiring the lock just for the check wouldn't be a good tradeoff.

Still, knowing the order could avoid some iterations over the rest of the buddy
page. If we are careful, the race window between the PageBuddy() check and
page_order() is small, and the worst that can happen is that we skip too much
and miss some isolation candidates. This is not that bad, as compaction can
already fail for many other reasons, such as parallel allocations, and those
have a much larger race window.

This patch therefore makes the migration scanner obtain the buddy page order
and use it to skip the whole buddy page, if the order appears to be in the
valid range.

It's important that page_order() is read only once, so that the value used in
the range check and in the pfn calculation is the same. In theory, however, the
compiler can replace the local variable with multiple inlined reads of
page_order(). The patch therefore introduces page_order_unsafe(), which uses
ACCESS_ONCE to prevent this.

Preliminary results with stress-highalloc from mmtests show a 10% reduction in
the number of pages scanned by the migration scanner. This change will also be
important later, to allow detecting when a cc->order block of pages cannot be
compacted, so that the scanner can skip to the next block instead of wasting
time.
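
The reload hazard that ACCESS_ONCE guards against can be shown with a small
standalone C example. READ_ONCE_UL() below is just a volatile cast written out
by hand, since ACCESS_ONCE itself is kernel-internal; shared_order plays the
role of page->private, which another CPU may change at any time:

#include <stdio.h>

#define READ_ONCE_UL(p) (*(volatile unsigned long *)(p))
#define MAX_ORDER       11

static unsigned long shared_order = 5;  /* may change concurrently in reality */

static unsigned long skip_ahead(unsigned long pfn)
{
        /*
         * Without the volatile read, the compiler may legally emit two
         * separate loads of shared_order: one for the range check and one
         * for the pfn arithmetic. If the value changes in between, the check
         * passes on a sane value but the arithmetic uses garbage. Reading it
         * once into a local through READ_ONCE_UL() prevents the reload.
         */
        unsigned long order = READ_ONCE_UL(&shared_order);

        if (order > 0 && order < MAX_ORDER)
                return pfn + (1UL << order) - 1;
        return pfn;
}

int main(void)
{
        printf("skip to pfn %lu\n", skip_ahead(1024));
        return 0;
}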

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 20 +++++++++++++++++---
 mm/internal.h   | 20 +++++++++++++++++++-
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index ae7db5f..3dce5a7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -640,11 +640,18 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		}
 
 		/*
-		 * Skip if free. page_order cannot be used without zone->lock
-		 * as nothing prevents parallel allocations or buddy merging.
+		 * Skip if free. We read page order here without zone lock
+		 * which is generally unsafe, but the race window is small and
+		 * the worst thing that can happen is that we skip some
+		 * potential isolation targets.
 		 */
-		if (PageBuddy(page))
+		if (PageBuddy(page)) {
+			unsigned long freepage_order = page_order_unsafe(page);
+
+			if (freepage_order > 0 && freepage_order < MAX_ORDER)
+				low_pfn += (1UL << freepage_order) - 1;
 			continue;
+		}
 
 		/*
 		 * Check may be lockless but that's ok as we recheck later.
@@ -733,6 +740,13 @@ next_pageblock:
 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
 	}
 
+	/*
+	 * The PageBuddy() check could have potentially brought us outside
+	 * the range to be scanned.
+	 */
+	if (unlikely(low_pfn > end_pfn))
+		end_pfn = low_pfn;
+
 	acct_isolated(zone, locked, cc);
 
 	if (locked)
diff --git a/mm/internal.h b/mm/internal.h
index 1a8a0d4..6aa1f74 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
  * general, page_zone(page)->lock must be held by the caller to prevent the
  * page from being allocated in parallel and returning garbage as the order.
  * If a caller does not hold page_zone(page)->lock, it must guarantee that the
- * page cannot be allocated or merged in parallel.
+ * page cannot be allocated or merged in parallel. Alternatively, it must
+ * handle invalid values gracefully, and use page_order_unsafe() below.
  */
 static inline unsigned long page_order(struct page *page)
 {
@@ -172,6 +173,23 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+/*
+ * Like page_order(), but for callers who cannot afford to hold the zone lock,
+ * and handle invalid values gracefully. ACCESS_ONCE is used so that if the
+ * caller assigns the result into a local variable and e.g. tests it for valid
+ * range  before using, the compiler cannot decide to remove the variable and
+ * inline the function multiple times, potentially observing different values
+ * in the tests and the actual use of the result.
+ */
+static inline unsigned long page_order_unsafe(struct page *page)
+{
+	/*
+	 * PageBuddy() should be checked by the caller to minimize race window,
+	 * and invalid values must be handled gracefully.
+	 */
+	return ACCESS_ONCE(page_private(page));
+}
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct rb_node *rb_parent);
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [RFC PATCH 5/6] mm, compaction: try to capture the just-created high-order freepage
  2014-06-04 16:11                 ` Vlastimil Babka
@ 2014-06-04 16:11                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Michal Nazarewicz,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel, David Rientjes

Compaction uses watermark checking to determine if it succeeded in creating
a high-order free page. My testing has shown that this is quite racy and it
can happen that watermark checking in compaction succeeds, and moments later
the watermark checking in page allocation fails, even though the number of
free pages has increased meanwhile.

It should be more reliable if direct compaction captured the high-order free
page as soon as it is detected, and passed it back to the allocation. This
would also reduce the window for somebody else to allocate the free page.

This has already been implemented by commit 1fb3f8ca0e92 ("mm: compaction:
capture a suitable high-order page immediately when it is made available"), but
was later reverted by commit 8fb74b9f ("mm: compaction: partially revert
capture of suitable high-order page") due to flaws.

This patch differs from the previous attempt in two aspects:

1) The previous patch scanned free lists to capture the page. In this patch,
   only the cc->order aligned block that the migration scanner just finished
   is considered, but only if pages were actually isolated for migration in
   that block. Tracking cc->order aligned blocks also has benefits for the
   following patch that skips blocks where non-migratable pages were found.

2) In this patch, the isolated free page is allocated by extending
   get_page_from_freelist() and buffered_rmqueue(). This ensures that it gets
   all the operations, such as prep_new_page() and the page->pfmemalloc
   setting, that were missing in the previous attempt, that zone statistics
   are updated, and so on.

Evaluation is pending.
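
The cc->order aligned block bookkeeping from point 1 can be illustrated with a
small standalone C walk (simplified: the real scanner records the candidate in
cc->capture_page and stops at the first one, and does not track isolations per
block):

#include <stdio.h>

#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int order = 4;                 /* capture block = 16 pages */
        unsigned long nr = 1UL << order;
        unsigned long low_pfn = 1000, end_pfn = 1064;
        unsigned long capture_pfn = low_pfn & ~(nr - 1);
        unsigned long next_capture_pfn = ALIGN(low_pfn + 1, nr);
        int isolated_in_block = 0;

        for (; low_pfn < end_pfn; low_pfn++) {
                /* Crossed into the next aligned block: did the previous one
                 * yield any isolated pages? If so, it is a capture candidate. */
                if (low_pfn == next_capture_pfn) {
                        if (isolated_in_block)
                                printf("candidate block starts at pfn %lu\n",
                                       capture_pfn);
                        capture_pfn = next_capture_pfn;
                        next_capture_pfn += nr;
                        isolated_in_block = 0;
                }
                if (low_pfn % 5 == 0)           /* pretend this pfn was isolated */
                        isolated_in_block++;
        }
        return 0;
}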

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 include/linux/compaction.h |  5 ++-
 mm/compaction.c            | 85 ++++++++++++++++++++++++++++++++++++++++++++--
 mm/internal.h              |  2 ++
 mm/page_alloc.c            | 70 ++++++++++++++++++++++++++++++--------
 4 files changed, 144 insertions(+), 18 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 01e3132..69579f5 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -10,6 +10,8 @@
 #define COMPACT_PARTIAL		2
 /* The full zone was compacted */
 #define COMPACT_COMPLETE	3
+/* Captured a high-order free page in direct compaction */
+#define COMPACT_CAPTURED	4
 
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
@@ -22,7 +24,8 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			enum migrate_mode mode, bool *contended);
+			enum migrate_mode mode, bool *contended,
+			struct page **captured_page);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
diff --git a/mm/compaction.c b/mm/compaction.c
index 3dce5a7..5909a88 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -555,6 +555,16 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
 					ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
+	unsigned long capture_pfn = 0;   /* current candidate for capturing */
+	unsigned long next_capture_pfn = 0; /* next candidate for capturing */
+
+	if (cc->order > PAGE_ALLOC_COSTLY_ORDER
+			&& cc->migratetype == MIGRATE_MOVABLE
+			&& cc->order <= pageblock_order) {
+		/* This may be outside the zone, but we check that later */
+		capture_pfn = low_pfn & ~((1UL << cc->order) - 1);
+		next_capture_pfn = ALIGN(low_pfn + 1, (1UL << cc->order));
+	}
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -577,6 +587,19 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
+		if (low_pfn == next_capture_pfn) {
+			/*
+			 * We have a capture candidate if we isolated something
+			 * during the last cc->order aligned block of pages
+			 */
+			if (nr_isolated && capture_pfn >= zone->zone_start_pfn) {
+				cc->capture_page = pfn_to_page(capture_pfn);
+				break;
+			}
+			capture_pfn = next_capture_pfn;
+			next_capture_pfn += (1UL << cc->order);
+		}
+
 		/*
 		 * Periodically drop the lock (if held) regardless of its
 		 * contention, to give chance to IRQs. Abort async compaction
@@ -596,6 +619,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
 			if (!pfn_valid(low_pfn)) {
 				low_pfn += MAX_ORDER_NR_PAGES - 1;
+				if (next_capture_pfn)
+					next_capture_pfn = low_pfn + 1;
 				continue;
 			}
 		}
@@ -682,6 +707,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			if (!locked)
 				goto next_pageblock;
 			low_pfn += (1 << compound_order(page)) - 1;
+			if (next_capture_pfn)
+				next_capture_pfn =
+					ALIGN(low_pfn + 1, (1UL << cc->order));
 			continue;
 		}
 
@@ -707,6 +735,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			continue;
 		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
+			next_capture_pfn = low_pfn + 1;
 			continue;
 		}
 
@@ -738,6 +767,8 @@ isolate_success:
 
 next_pageblock:
 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+		if (next_capture_pfn)
+			next_capture_pfn = low_pfn + 1;
 	}
 
 	/*
@@ -975,6 +1006,41 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	return ISOLATE_SUCCESS;
 }
 
+/*
+ * When called, cc->capture_page is just a candidate. This function will either
+ * successfully capture the page, or reset it to NULL.
+ */
+static bool compact_capture_page(struct compact_control *cc)
+{
+	struct page *page = cc->capture_page;
+
+	/* Unsafe check if it's worth to try acquiring the zone->lock at all */
+	if (PageBuddy(page) && page_order_unsafe(page) >= cc->order)
+		goto try_capture;
+
+	/*
+	 * There's a good chance that we have just put free pages on this CPU's
+	 * pcplists after the page migration. Drain them to allow merging.
+	 */
+	get_cpu();
+	drain_local_pages(NULL);
+	put_cpu();
+
+	/* Did the draining help? */
+	if (PageBuddy(page) && page_order_unsafe(page) >= cc->order)
+		goto try_capture;
+
+	goto fail;
+
+try_capture:
+	if (capture_free_page(page, cc->order))
+		return true;
+
+fail:
+	cc->capture_page = NULL;
+	return false;
+}
+
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
@@ -1003,6 +1069,10 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_COMPLETE;
 	}
 
+	/* Did we just finish a pageblock that was capture candidate? */
+	if (cc->capture_page && compact_capture_page(cc))
+		return COMPACT_CAPTURED;
+
 	/*
 	 * order == -1 is expected when compacting via
 	 * /proc/sys/vm/compact_memory
@@ -1181,7 +1251,8 @@ out:
 }
 
 static unsigned long compact_zone_order(struct zone *zone, int order,
-		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended,
+						struct page **captured_page)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1197,6 +1268,9 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 
 	ret = compact_zone(zone, &cc);
 
+	if (ret == COMPACT_CAPTURED)
+		*captured_page = cc.capture_page;
+
 	VM_BUG_ON(!list_empty(&cc.freepages));
 	VM_BUG_ON(!list_empty(&cc.migratepages));
 
@@ -1220,7 +1294,8 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			enum migrate_mode mode, bool *contended)
+			enum migrate_mode mode, bool *contended,
+			struct page **captured_page)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1246,9 +1321,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, mode,
-						contended);
+						contended, captured_page);
 		rc = max(status, rc);
 
+		/* If we captured a page, stop compacting */
+		if (*captured_page)
+			break;
+
 		/* If a normal allocation would succeed, stop compacting */
 		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
 				      alloc_flags))
diff --git a/mm/internal.h b/mm/internal.h
index 6aa1f74..5f223d3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -110,6 +110,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
  */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned long order);
+extern bool capture_free_page(struct page *page, unsigned int order);
 #ifdef CONFIG_MEMORY_FAILURE
 extern bool is_free_buddy_page(struct page *page);
 #endif
@@ -148,6 +149,7 @@ struct compact_control {
 					 * need_resched() true during async
 					 * compaction
 					 */
+	struct page *capture_page;	/* Free page captured by compaction */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b3dd64..3788162 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -937,7 +937,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	return NULL;
 }
 
-
 /*
  * This array describes the order lists are fallen back to when
  * the free lists for the desirable migrate type are depleted
@@ -1453,9 +1452,11 @@ static int __isolate_free_page(struct page *page, unsigned int order)
 {
 	unsigned long watermark;
 	struct zone *zone;
+	struct free_area *area;
 	int mt;
+	unsigned int freepage_order = page_order(page);
 
-	BUG_ON(!PageBuddy(page));
+	VM_BUG_ON_PAGE((!PageBuddy(page) || freepage_order < order), page);
 
 	zone = page_zone(page);
 	mt = get_pageblock_migratetype(page);
@@ -1470,9 +1471,12 @@ static int __isolate_free_page(struct page *page, unsigned int order)
 	}
 
 	/* Remove page from free list */
+	area = &zone->free_area[freepage_order];
 	list_del(&page->lru);
-	zone->free_area[order].nr_free--;
+	area->nr_free--;
 	rmv_page_order(page);
+	if (freepage_order != order)
+		expand(zone, page, order, freepage_order, area, mt);
 
 	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
@@ -1515,6 +1519,26 @@ int split_free_page(struct page *page)
 	return nr_pages;
 }
 
+bool capture_free_page(struct page *page, unsigned int order)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	if (!PageBuddy(page) || page_order(page) < order) {
+		ret = false;
+		goto out;
+	}
+
+	ret = __isolate_free_page(page, order);
+
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
+}
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
@@ -1523,7 +1547,7 @@ int split_free_page(struct page *page)
 static inline
 struct page *buffered_rmqueue(struct zone *preferred_zone,
 			struct zone *zone, int order, gfp_t gfp_flags,
-			int migratetype)
+			int migratetype, struct page *isolated_freepage)
 {
 	unsigned long flags;
 	struct page *page;
@@ -1552,6 +1576,9 @@ again:
 
 		list_del(&page->lru);
 		pcp->count--;
+	} else if (unlikely(isolated_freepage)) {
+		page = isolated_freepage;
+		local_irq_save(flags);
 	} else {
 		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
 			/*
@@ -1567,7 +1594,9 @@ again:
 			WARN_ON_ONCE(order > 1);
 		}
 		spin_lock_irqsave(&zone->lock, flags);
+
 		page = __rmqueue(zone, order, migratetype);
+
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
@@ -1907,7 +1936,7 @@ static inline void init_zone_allows_reclaim(int nid)
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
-		struct zone *preferred_zone, int migratetype)
+		struct zone *preferred_zone, int migratetype, struct page *isolated_freepage)
 {
 	struct zoneref *z;
 	struct page *page = NULL;
@@ -1916,8 +1945,17 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
+	unsigned long mark;
 
 	classzone_idx = zone_idx(preferred_zone);
+
+	if (isolated_freepage) {
+		zone = page_zone(isolated_freepage);
+		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask,
+						migratetype, isolated_freepage);
+		goto got_page;
+	}
+
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
@@ -1925,7 +1963,6 @@ zonelist_scan:
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						high_zoneidx, nodemask) {
-		unsigned long mark;
 
 		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
@@ -2040,7 +2077,7 @@ zonelist_scan:
 
 try_this_zone:
 		page = buffered_rmqueue(preferred_zone, zone, order,
-						gfp_mask, migratetype);
+						gfp_mask, migratetype, NULL);
 		if (page)
 			break;
 this_zone_full:
@@ -2054,6 +2091,7 @@ this_zone_full:
 		goto zonelist_scan;
 	}
 
+got_page:
 	if (page)
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
@@ -2191,7 +2229,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
 		order, zonelist, high_zoneidx,
 		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
-		preferred_zone, migratetype);
+		preferred_zone, migratetype, NULL);
 	if (page)
 		goto out;
 
@@ -2230,6 +2268,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
+	struct page *captured_page;
+
 	if (!order)
 		return NULL;
 
@@ -2241,7 +2281,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, mode,
-						contended_compaction);
+						contended_compaction,
+						&captured_page);
 	current->flags &= ~PF_MEMALLOC;
 
 	if (*did_some_progress != COMPACT_SKIPPED) {
@@ -2254,7 +2295,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		page = get_page_from_freelist(gfp_mask, nodemask,
 				order, zonelist, high_zoneidx,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
-				preferred_zone, migratetype);
+				preferred_zone, migratetype, captured_page);
+
 		if (page) {
 			preferred_zone->compact_blockskip_flush = false;
 			compaction_defer_reset(preferred_zone, order, true);
@@ -2344,7 +2386,7 @@ retry:
 	page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx,
 					alloc_flags & ~ALLOC_NO_WATERMARKS,
-					preferred_zone, migratetype);
+					preferred_zone, migratetype, NULL);
 
 	/*
 	 * If an allocation failed after direct reclaim, it could be because
@@ -2374,7 +2416,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	do {
 		page = get_page_from_freelist(gfp_mask, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
-			preferred_zone, migratetype);
+			preferred_zone, migratetype, NULL);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
 			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2532,7 +2574,7 @@ rebalance:
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
-			preferred_zone, migratetype);
+			preferred_zone, migratetype, NULL);
 	if (page)
 		goto got_pg;
 
@@ -2736,7 +2778,7 @@ retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
-			preferred_zone, migratetype);
+			preferred_zone, migratetype, NULL);
 	if (unlikely(!page)) {
 		/*
 		 * The first pass makes sure allocations are spread
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [RFC PATCH 5/6] mm, compaction: try to capture the just-created high-order freepage
@ 2014-06-04 16:11                   ` Vlastimil Babka
  0 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Michal Nazarewicz,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel, David Rientjes

Compaction uses watermark checking to determine if it succeeded in creating
a high-order free page. My testing has shown that this is quite racy and it
can happen that watermark checking in compaction succeeds, and moments later
the watermark checking in page allocation fails, even though the number of
free pages has increased meanwhile.

It should be more reliable if direct compaction captured the high-order free
page as soon as it is detected, and passed it back to the allocation. This
would also reduce the window for somebody else to allocate the free page.

This has already been implemented by commit 1fb3f8ca0e92 ("mm: compaction:
capture a suitable high-order page immediately when it is made available"), but
was later reverted by commit 8fb74b9f ("mm: compaction: partially revert
capture of suitable high-order page") due to flaws.

This patch differs from the previous attempt in two aspects:

1) The previous patch scanned free lists to capture the page. In this patch,
   only the cc->order aligned block that the migration scanner just finished
   is considered, but only if pages were actually isolated for migration in
   that block. Tracking cc->order aligned blocks also has benefits for the
   following patch that skips blocks where non-migratable pages were found.

2) In this patch, the isolated free page is allocated by extending
   get_page_from_freelist() and buffered_rmqueue(). This ensures that it gets
   all the operations, such as prep_new_page() and the page->pfmemalloc
   setting, that were missing in the previous attempt, that zone statistics
   are updated, and so on.

Evaluation is pending.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 include/linux/compaction.h |  5 ++-
 mm/compaction.c            | 85 ++++++++++++++++++++++++++++++++++++++++++++--
 mm/internal.h              |  2 ++
 mm/page_alloc.c            | 70 ++++++++++++++++++++++++++++++--------
 4 files changed, 144 insertions(+), 18 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 01e3132..69579f5 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -10,6 +10,8 @@
 #define COMPACT_PARTIAL		2
 /* The full zone was compacted */
 #define COMPACT_COMPLETE	3
+/* Captured a high-order free page in direct compaction */
+#define COMPACT_CAPTURED	4
 
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
@@ -22,7 +24,8 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			enum migrate_mode mode, bool *contended);
+			enum migrate_mode mode, bool *contended,
+			struct page **captured_page);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
diff --git a/mm/compaction.c b/mm/compaction.c
index 3dce5a7..5909a88 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -555,6 +555,16 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
 					ISOLATE_ASYNC_MIGRATE : 0) |
 				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
+	unsigned long capture_pfn = 0;   /* current candidate for capturing */
+	unsigned long next_capture_pfn = 0; /* next candidate for capturing */
+
+	if (cc->order > PAGE_ALLOC_COSTLY_ORDER
+			&& cc->migratetype == MIGRATE_MOVABLE
+			&& cc->order <= pageblock_order) {
+		/* This may be outside the zone, but we check that later */
+		capture_pfn = low_pfn & ~((1UL << cc->order) - 1);
+		next_capture_pfn = ALIGN(low_pfn + 1, (1UL << cc->order));
+	}
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -577,6 +587,19 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
+		if (low_pfn == next_capture_pfn) {
+			/*
+			 * We have a capture candidate if we isolated something
+			 * during the last cc->order aligned block of pages
+			 */
+			if (nr_isolated && capture_pfn >= zone->zone_start_pfn) {
+				cc->capture_page = pfn_to_page(capture_pfn);
+				break;
+			}
+			capture_pfn = next_capture_pfn;
+			next_capture_pfn += (1UL << cc->order);
+		}
+
 		/*
 		 * Periodically drop the lock (if held) regardless of its
 		 * contention, to give chance to IRQs. Abort async compaction
@@ -596,6 +619,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
 			if (!pfn_valid(low_pfn)) {
 				low_pfn += MAX_ORDER_NR_PAGES - 1;
+				if (next_capture_pfn)
+					next_capture_pfn = low_pfn + 1;
 				continue;
 			}
 		}
@@ -682,6 +707,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			if (!locked)
 				goto next_pageblock;
 			low_pfn += (1 << compound_order(page)) - 1;
+			if (next_capture_pfn)
+				next_capture_pfn =
+					ALIGN(low_pfn + 1, (1UL << cc->order));
 			continue;
 		}
 
@@ -707,6 +735,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			continue;
 		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
+			next_capture_pfn = low_pfn + 1;
 			continue;
 		}
 
@@ -738,6 +767,8 @@ isolate_success:
 
 next_pageblock:
 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+		if (next_capture_pfn)
+			next_capture_pfn = low_pfn + 1;
 	}
 
 	/*
@@ -975,6 +1006,41 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	return ISOLATE_SUCCESS;
 }
 
+/*
+ * When called, cc->capture_page is just a candidate. This function will either
+ * successfully capture the page, or reset it to NULL.
+ */
+static bool compact_capture_page(struct compact_control *cc)
+{
+	struct page *page = cc->capture_page;
+
+	/* Unsafe check if it's worth to try acquiring the zone->lock at all */
+	if (PageBuddy(page) && page_order_unsafe(page) >= cc->order)
+		goto try_capture;
+
+	/*
+	 * There's a good chance that we have just put free pages on this CPU's
+	 * pcplists after the page migration. Drain them to allow merging.
+	 */
+	get_cpu();
+	drain_local_pages(NULL);
+	put_cpu();
+
+	/* Did the draining help? */
+	if (PageBuddy(page) && page_order_unsafe(page) >= cc->order)
+		goto try_capture;
+
+	goto fail;
+
+try_capture:
+	if (capture_free_page(page, cc->order))
+		return true;
+
+fail:
+	cc->capture_page = NULL;
+	return false;
+}
+
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
@@ -1003,6 +1069,10 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_COMPLETE;
 	}
 
+	/* Did we just finish a pageblock that was capture candidate? */
+	if (cc->capture_page && compact_capture_page(cc))
+		return COMPACT_CAPTURED;
+
 	/*
 	 * order == -1 is expected when compacting via
 	 * /proc/sys/vm/compact_memory
@@ -1181,7 +1251,8 @@ out:
 }
 
 static unsigned long compact_zone_order(struct zone *zone, int order,
-		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended,
+						struct page **captured_page)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1197,6 +1268,9 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 
 	ret = compact_zone(zone, &cc);
 
+	if (ret == COMPACT_CAPTURED)
+		*captured_page = cc.capture_page;
+
 	VM_BUG_ON(!list_empty(&cc.freepages));
 	VM_BUG_ON(!list_empty(&cc.migratepages));
 
@@ -1220,7 +1294,8 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			enum migrate_mode mode, bool *contended)
+			enum migrate_mode mode, bool *contended,
+			struct page **captured_page)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1246,9 +1321,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, mode,
-						contended);
+						contended, captured_page);
 		rc = max(status, rc);
 
+		/* If we captured a page, stop compacting */
+		if (*captured_page)
+			break;
+
 		/* If a normal allocation would succeed, stop compacting */
 		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
 				      alloc_flags))
diff --git a/mm/internal.h b/mm/internal.h
index 6aa1f74..5f223d3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -110,6 +110,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
  */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned long order);
+extern bool capture_free_page(struct page *page, unsigned int order);
 #ifdef CONFIG_MEMORY_FAILURE
 extern bool is_free_buddy_page(struct page *page);
 #endif
@@ -148,6 +149,7 @@ struct compact_control {
 					 * need_resched() true during async
 					 * compaction
 					 */
+	struct page *capture_page;	/* Free page captured by compaction */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b3dd64..3788162 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -937,7 +937,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	return NULL;
 }
 
-
 /*
  * This array describes the order lists are fallen back to when
  * the free lists for the desirable migrate type are depleted
@@ -1453,9 +1452,11 @@ static int __isolate_free_page(struct page *page, unsigned int order)
 {
 	unsigned long watermark;
 	struct zone *zone;
+	struct free_area *area;
 	int mt;
+	unsigned int freepage_order = page_order(page);
 
-	BUG_ON(!PageBuddy(page));
+	VM_BUG_ON_PAGE((!PageBuddy(page) || freepage_order < order), page);
 
 	zone = page_zone(page);
 	mt = get_pageblock_migratetype(page);
@@ -1470,9 +1471,12 @@ static int __isolate_free_page(struct page *page, unsigned int order)
 	}
 
 	/* Remove page from free list */
+	area = &zone->free_area[freepage_order];
 	list_del(&page->lru);
-	zone->free_area[order].nr_free--;
+	area->nr_free--;
 	rmv_page_order(page);
+	if (freepage_order != order)
+		expand(zone, page, order, freepage_order, area, mt);
 
 	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
@@ -1515,6 +1519,26 @@ int split_free_page(struct page *page)
 	return nr_pages;
 }
 
+bool capture_free_page(struct page *page, unsigned int order)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	if (!PageBuddy(page) || page_order(page) < order) {
+		ret = false;
+		goto out;
+	}
+
+	ret = __isolate_free_page(page, order);
+
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
+}
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
@@ -1523,7 +1547,7 @@ int split_free_page(struct page *page)
 static inline
 struct page *buffered_rmqueue(struct zone *preferred_zone,
 			struct zone *zone, int order, gfp_t gfp_flags,
-			int migratetype)
+			int migratetype, struct page *isolated_freepage)
 {
 	unsigned long flags;
 	struct page *page;
@@ -1552,6 +1576,9 @@ again:
 
 		list_del(&page->lru);
 		pcp->count--;
+	} else if (unlikely(isolated_freepage)) {
+		page = isolated_freepage;
+		local_irq_save(flags);
 	} else {
 		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
 			/*
@@ -1567,7 +1594,9 @@ again:
 			WARN_ON_ONCE(order > 1);
 		}
 		spin_lock_irqsave(&zone->lock, flags);
+
 		page = __rmqueue(zone, order, migratetype);
+
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
@@ -1907,7 +1936,7 @@ static inline void init_zone_allows_reclaim(int nid)
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
-		struct zone *preferred_zone, int migratetype)
+		struct zone *preferred_zone, int migratetype, struct page *isolated_freepage)
 {
 	struct zoneref *z;
 	struct page *page = NULL;
@@ -1916,8 +1945,17 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
+	unsigned long mark;
 
 	classzone_idx = zone_idx(preferred_zone);
+
+	if (isolated_freepage) {
+		zone = page_zone(isolated_freepage);
+		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask,
+						migratetype, isolated_freepage);
+		goto got_page;
+	}
+
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
@@ -1925,7 +1963,6 @@ zonelist_scan:
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						high_zoneidx, nodemask) {
-		unsigned long mark;
 
 		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
@@ -2040,7 +2077,7 @@ zonelist_scan:
 
 try_this_zone:
 		page = buffered_rmqueue(preferred_zone, zone, order,
-						gfp_mask, migratetype);
+						gfp_mask, migratetype, NULL);
 		if (page)
 			break;
 this_zone_full:
@@ -2054,6 +2091,7 @@ this_zone_full:
 		goto zonelist_scan;
 	}
 
+got_page:
 	if (page)
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
@@ -2191,7 +2229,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
 		order, zonelist, high_zoneidx,
 		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
-		preferred_zone, migratetype);
+		preferred_zone, migratetype, NULL);
 	if (page)
 		goto out;
 
@@ -2230,6 +2268,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
+	struct page *captured_page;
+
 	if (!order)
 		return NULL;
 
@@ -2241,7 +2281,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, mode,
-						contended_compaction);
+						contended_compaction,
+						&captured_page);
 	current->flags &= ~PF_MEMALLOC;
 
 	if (*did_some_progress != COMPACT_SKIPPED) {
@@ -2254,7 +2295,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		page = get_page_from_freelist(gfp_mask, nodemask,
 				order, zonelist, high_zoneidx,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
-				preferred_zone, migratetype);
+				preferred_zone, migratetype, captured_page);
+
 		if (page) {
 			preferred_zone->compact_blockskip_flush = false;
 			compaction_defer_reset(preferred_zone, order, true);
@@ -2344,7 +2386,7 @@ retry:
 	page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx,
 					alloc_flags & ~ALLOC_NO_WATERMARKS,
-					preferred_zone, migratetype);
+					preferred_zone, migratetype, NULL);
 
 	/*
 	 * If an allocation failed after direct reclaim, it could be because
@@ -2374,7 +2416,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	do {
 		page = get_page_from_freelist(gfp_mask, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
-			preferred_zone, migratetype);
+			preferred_zone, migratetype, NULL);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
 			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2532,7 +2574,7 @@ rebalance:
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
-			preferred_zone, migratetype);
+			preferred_zone, migratetype, NULL);
 	if (page)
 		goto got_pg;
 
@@ -2736,7 +2778,7 @@ retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
-			preferred_zone, migratetype);
+			preferred_zone, migratetype, NULL);
 	if (unlikely(!page)) {
 		/*
 		 * The first pass makes sure allocations are spread
-- 
1.8.4.5

^ permalink raw reply related	[flat|nested] 267+ messages in thread

* [RFC PATCH 6/6] mm, compaction: don't migrate in blocks that cannot be fully compacted in async direct compaction
  2014-06-04 16:11                 ` Vlastimil Babka
@ 2014-06-04 16:11                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-04 16:11 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Greg Thelen, Vlastimil Babka,
	Minchan Kim, Mel Gorman, Joonsoo Kim, Michal Nazarewicz,
	Naoya Horiguchi, Christoph Lameter, Rik van Riel, David Rientjes

In direct compaction, we want to allocate the high-order page as soon as
possible, so migrating from a block of pages that also contains unmigratable
pages just adds to allocation latency.

This patch therefore makes the migration scanner skip to the next cc->order
aligned block of pages as soon as it encounters a non-free page that cannot be
isolated. Everything isolated up to that point is put back.
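For example, in async direct compaction with cc->order == 9 (a 2MB THP on
x86), failing to isolate a page at pfn 0x1234 makes the scanner put back
everything isolated so far and resume at pfn 0x1400, the next 512-aligned pfn.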

In this mode, the nr_isolated limit of COMPACT_CLUSTER_MAX is not observed,
allowing the scanner to scan the whole block at once, instead of migrating
COMPACT_CLUSTER_MAX pages and then finding an unmigratable page in the next
call. This might, however, have some implications for too_many_isolated().

Also in this RFC PATCH, the "skipping mode" is tied to async migration mode,
which is not optimal. What we most probably want is skipping in direct
compaction, but not in compaction from kswapd and khugepaged.

In very preliminary tests, this has reduced migrate_scanned, isolations and
migrations by about 10%, while the success rate of stress-highalloc mmtests
actually improved a bit.

Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
---
 mm/compaction.c | 48 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 5909a88..c648ade 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -626,7 +626,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		}
 
 		if (!pfn_valid_within(low_pfn))
-			continue;
+			goto isolation_failed;
 		nr_scanned++;
 
 		/*
@@ -637,7 +637,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 */
 		page = pfn_to_page(low_pfn);
 		if (page_zone(page) != zone)
-			continue;
+			goto isolation_failed;
 
 		if (!valid_page)
 			valid_page = page;
@@ -673,8 +673,12 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (PageBuddy(page)) {
 			unsigned long freepage_order = page_order_unsafe(page);
 
-			if (freepage_order > 0 && freepage_order < MAX_ORDER)
-				low_pfn += (1UL << freepage_order) - 1;
+			if (freepage_order > 0 && freepage_order < MAX_ORDER) {
+				low_pfn += (1UL << freepage_order) - 1;
+				if (next_capture_pfn)
+					next_capture_pfn = ALIGN(low_pfn + 1,
+							(1UL << cc->order));
+			}
 			continue;
 		}
 
@@ -690,7 +694,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 					goto isolate_success;
 				}
 			}
-			continue;
+			goto isolation_failed;
 		}
 
 		/*
@@ -710,7 +714,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			if (next_capture_pfn)
 				next_capture_pfn =
 					ALIGN(low_pfn + 1, (1UL << cc->order));
-			continue;
+			goto isolation_failed;
 		}
 
 		/*
@@ -720,7 +724,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 */
 		if (!page_mapping(page) &&
 		    page_count(page) > page_mapcount(page))
-			continue;
+			goto isolation_failed;
 
 		/* If we already hold the lock, we can skip some rechecking */
 		if (locked)
@@ -732,11 +736,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 		/* Recheck PageLRU and PageTransHuge under lock */
 		if (!PageLRU(page))
-			continue;
+			goto isolation_failed;
 		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
 			next_capture_pfn = low_pfn + 1;
-			continue;
+			goto isolation_failed;
 		}
 
 skip_recheck:
@@ -744,7 +748,7 @@ skip_recheck:
 
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, mode) != 0)
-			continue;
+			goto isolation_failed;
 
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
 
@@ -754,11 +758,14 @@ skip_recheck:
 isolate_success:
 		cc->finished_update_migrate = true;
 		list_add(&page->lru, migratelist);
-		cc->nr_migratepages++;
 		nr_isolated++;
 
-		/* Avoid isolating too much */
-		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+		/*
+		 * Avoid isolating too much, except if we try to capture a
+		 * free page and want to find out at once if it can be done
+		 * or we should skip to the next block.
+		 */
+		if (!next_capture_pfn && nr_isolated == COMPACT_CLUSTER_MAX) {
 			++low_pfn;
 			break;
 		}
@@ -769,6 +776,20 @@ next_pageblock:
 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
 		if (next_capture_pfn)
 			next_capture_pfn = low_pfn + 1;
+
+isolation_failed:
+		if (cc->mode == MIGRATE_ASYNC && next_capture_pfn) {
+			if (nr_isolated) {
+				if (locked) {
+					spin_unlock_irqrestore(&zone->lru_lock,
+									flags);
+					locked = false;
+				}
+				putback_movable_pages(migratelist);
+				nr_isolated = 0;
+			}
+			low_pfn = next_capture_pfn - 1;
+		}
 	}
 
 	/*
@@ -778,6 +799,7 @@ next_pageblock:
 	if (unlikely(low_pfn > end_pfn))
 		end_pfn = low_pfn;
 
+	cc->nr_migratepages = nr_isolated;
 	acct_isolated(zone, locked, cc);
 
 	if (locked)
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 267+ messages in thread

* Re: [patch -mm 3/3] mm, compaction: avoid compacting memory for thp if pageblock cannot become free
  2014-06-04 11:04                           ` Mel Gorman
@ 2014-06-04 22:02                             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04 22:02 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Vlastimil Babka, Joonsoo Kim, Hugh Dickins,
	Greg Thelen, linux-kernel, linux-mm, Minchan Kim,
	Michal Nazarewicz, Rik van Riel

On Wed, 4 Jun 2014, Mel Gorman wrote:

> > It's pointless to migrate pages within a pageblock if the entire pageblock will 
> > not become free for a thp allocation.
> > 
> > If we encounter a page that cannot be migrated and a direct compactor other than 
> > khugepaged is trying to allocate a hugepage for thp, then skip the entire 
> > pageblock and avoid migrating pages needlessly.
> > 
> 
> It's not completely pointless. A movable page may be placed within an
> unmovable pageblock due to insufficient free memory or a pageblock changed
> type. When this happens then partial migration moves the movable page
> out of the unmovable block. Future unmovable allocations can then be
> placed with other unmovable pages instead of falling back to other blocks
> and degrading fragmentation over time.
> 

Sorry, this should say that it's pointless when doing an HPAGE_PMD_ORDER 
allocation and we're calling direct compaction for thp.  While the result 
may be that there will be less external fragmentation in the long run, I 
don't think it's appropriate to do this at fault.

We keep a running tracker of how long it takes to fault 64MB of anonymous 
memory with thp enabled in one of our production cells.  For an instance 
that took 1.58225s in fault (not including the mmap() time or munmap() 
time), here are the compaction stats:

Before:
compact_blocks_moved 508932592
compact_pages_moved 93068626
compact_pagemigrate_failed 199708939
compact_stall 7014989
compact_fail 6977371
compact_success 37617

After:
compact_blocks_moved 508938635
compact_pages_moved 93068667
compact_pagemigrate_failed 199712677
compact_stall 7015029
compact_fail 6977411
compact_success 37617

Not one of the compaction stalls resulted in a thp page being allocated, 
probably because the number of pages actually migrated is very low.  The 
delta here is 6043 pageblocks scanned over 40 compaction calls, 41 pages 
_total_ being successfully migrated and 3738 pages total being isolated 
but unsuccessfully migrated.

Those statistics are horrible.  We scan approximately 151 pageblocks per 
compaction stall needlessly in this case and, on average, migrate a single 
page but isolate and fail to migrate 93 pages.
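
For reference, those figures come straight from the counter deltas above:

  compact_blocks_moved:       508938635 - 508932592 = 6043
  compact_stall:              7015029 - 7014989     = 40
  compact_pages_moved:        93068667 - 93068626   = 41
  compact_pagemigrate_failed: 199712677 - 199708939 = 3738

i.e. per stall roughly 6043 / 40 ~= 151 pageblocks scanned, 41 / 40 ~= 1 page
migrated, and 3738 / 40 ~= 93 pages isolated but not migrated.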

I believe my patch would reduce this pointless migration when an entire 
pageblock will not be freed in the thp fault path.  I do need to factor in 
Vlastimil's feedback concerning the PageBuddy order, but I think this is 
generally the right approach for thp fault.

Additionally, I need to figure out why those 3738 pages are isolated but 
fail to migrate and it doesn't seem to be because of any race.  Perhaps 
there will be a chance to do something similar to what I did in commit 
119d6d59dcc0 ("mm, compaction: avoid isolating pinned pages") to avoid 
even considering certain checks, but I'll have to identify the source(s) 
of these failures first.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 1/6] mm, compaction: periodically drop lock and restore IRQs in scanners
  2014-06-04 16:11                 ` Vlastimil Babka
@ 2014-06-04 23:39                   ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04 23:39 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, 4 Jun 2014, Vlastimil Babka wrote:

> diff --git a/mm/compaction.c b/mm/compaction.c
> index ed7102c..f0fd4b5 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -185,47 +185,74 @@ static void update_pageblock_skip(struct compact_control *cc,
>  }
>  #endif /* CONFIG_COMPACTION */
>  
> -static inline bool should_release_lock(spinlock_t *lock)
> +/*
> + * Compaction requires the taking of some coarse locks that are potentially
> + * very heavily contended. Check if the process needs to be scheduled or
> + * if the lock is contended. For async compaction, back out if the process
> + * needs to be scheduled, or the lock cannot be taken immediately. For sync
> + * compaction, schedule and spin on the lock if needed.
> + *
> + * Returns true if the lock is held
> + * Returns false if the lock is not held and compaction should abort
> + */
> +static inline bool compact_trylock_irqsave(spinlock_t *lock,
> +			unsigned long *flags, struct compact_control *cc)

Hmm, what tree is this series based on?  It doesn't apply cleanly to 
linux-next; I think you're missing
mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix.patch
in your tree.

Is there a performance benefit to doing the inlining here?

>  {
> -	return need_resched() || spin_is_contended(lock);
> +	if (cc->mode == MIGRATE_ASYNC) {
> +		if (need_resched() || !spin_trylock_irqsave(lock, *flags)) {
> +			cc->contended = true;
> +			return false;
> +		}
> +	} else {
> +		cond_resched();

Why do we need this cond_resched() here if there is already a 
cond_resched() in compact_unlock_should_abort() for non-async compaction?

If this is trying to reschedule right before disabling irqs because 
otherwise spinning on the lock causes irq starvation, then that's a very 
delicate balance and I think we're going to get in trouble later.

> +		spin_lock_irqsave(lock, *flags);
> +	}
> +
> +	return true;
>  }
>  
>  /*
>   * Compaction requires the taking of some coarse locks that are potentially
> - * very heavily contended. Check if the process needs to be scheduled or
> - * if the lock is contended. For async compaction, back out in the event
> - * if contention is severe. For sync compaction, schedule.
> + * very heavily contended. The lock should be periodically unlocked to avoid
> + * having disabled IRQs for a long time, even when there is nobody waiting on
> + * the lock. It might also be that allowing the IRQs will result in
> + * need_resched() becoming true. If scheduling is needed, or somebody else
> + * has taken the lock, async compaction aborts. Sync compaction schedules.
> + * Either compaction type will also abort if a fatal signal is pending.
> + * In either case if the lock was locked, it is dropped and not regained.
>   *
> - * Returns true if the lock is held.
> - * Returns false if the lock is released and compaction should abort
> + * Returns true if compaction should abort due to fatal signal pending, or
> + *		async compaction due to lock contention or need to schedule
> + * Returns false when compaction can continue (sync compaction might have
> + *		scheduled)
>   */
> -static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
> -				      bool locked, struct compact_control *cc)
> +static inline bool compact_unlock_should_abort(spinlock_t *lock,
> +		unsigned long flags, bool *locked, struct compact_control *cc)

This inlining is also suspicious and I think keeping both of them 
out-of-line for the freeing and migration scanners is going to be the best 
route unless there's some measurable performance benefit I'm not seeing.

>  {
> -	if (should_release_lock(lock)) {
> -		if (locked) {
> -			spin_unlock_irqrestore(lock, *flags);
> -			locked = false;
> -		}
> +	if (*locked) {
> +		spin_unlock_irqrestore(lock, flags);
> +		*locked = false;
> +	}
>  
> -		/* async aborts if taking too long or contended */
> -		if (cc->mode == MIGRATE_ASYNC) {
> +	if (fatal_signal_pending(current))
> +		return true;
> +
> +	if (cc->mode == MIGRATE_ASYNC) {
> +		if (need_resched() || spin_is_locked(lock)) {
>  			cc->contended = true;
> -			return false;
> +			return true;
>  		}
> -
> +	} else {
>  		cond_resched();
>  	}
>  
> -	if (!locked)
> -		spin_lock_irqsave(lock, *flags);
> -	return true;
> +	return false;
>  }
>  
>  /*
> - * Aside from avoiding lock contention, compaction also periodically checks
> + * Aside from avoiding lock contention, compaction should also periodically checks

Not sure what the purpose of this comment change is; it's grammatically 
incorrect now.

>   * need_resched() and either schedules in sync compaction, or aborts async
> - * compaction. This is similar to compact_checklock_irqsave() does, but used
> + * compaction. This is similar to compact_unlock_should_abort() does, but used

This was and still is grammatically incorrect :)

>   * where no lock is concerned.
>   *
>   * Returns false when no scheduling was needed, or sync compaction scheduled.
> @@ -285,6 +312,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>  		int isolated, i;
>  		struct page *page = cursor;
>  
> +		/*
> +		 * Periodically drop the lock (if held) regardless of its
> +		 * contention, to give chance to IRQs. Abort async compaction
> +		 * if contended.
> +		 */
> +		if (!(blockpfn % SWAP_CLUSTER_MAX)
> +		    && compact_unlock_should_abort(&cc->zone->lock, flags,
> +								&locked, cc))
> +			break;
> +
>  		nr_scanned++;
>  		if (!pfn_valid_within(blockpfn))
>  			goto isolate_fail;
> @@ -302,8 +339,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>  		 * spin on the lock and we acquire the lock as late as
>  		 * possible.
>  		 */
> -		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
> -								locked, cc);
> +		if (!locked)
> +			locked = compact_trylock_irqsave(&cc->zone->lock,
> +								&flags, cc);
>  		if (!locked)
>  			break;
>  
> @@ -523,13 +561,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  
>  	/* Time to isolate some pages for migration */
>  	for (; low_pfn < end_pfn; low_pfn++) {
> -		/* give a chance to irqs before checking need_resched() */
> -		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
> -			if (should_release_lock(&zone->lru_lock)) {
> -				spin_unlock_irqrestore(&zone->lru_lock, flags);
> -				locked = false;
> -			}
> -		}
> +		/*
> +		 * Periodically drop the lock (if held) regardless of its
> +		 * contention, to give chance to IRQs. Abort async compaction
> +		 * if contended.
> +		 */
> +		if (!(low_pfn % SWAP_CLUSTER_MAX)
> +		    && compact_unlock_should_abort(&zone->lru_lock, flags,
> +								&locked, cc))
> +			break;
>  
>  		/*
>  		 * migrate_pfn does not necessarily start aligned to a
> @@ -631,10 +671,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  		    page_count(page) > page_mapcount(page))
>  			continue;
>  
> -		/* Check if it is ok to still hold the lock */
> -		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
> -								locked, cc);
> -		if (!locked || fatal_signal_pending(current))
> +		/* If the lock is not held, try to take it */
> +		if (!locked)
> +			locked = compact_trylock_irqsave(&zone->lru_lock,
> +								&flags, cc);
> +		if (!locked)
>  			break;
>  
>  		/* Recheck PageLRU and PageTransHuge under lock */

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 2/6] mm, compaction: skip rechecks when lock was already held
  2014-06-04 16:11                   ` Vlastimil Babka
@ 2014-06-04 23:46                     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-04 23:46 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, 4 Jun 2014, Vlastimil Babka wrote:

> diff --git a/mm/compaction.c b/mm/compaction.c
> index f0fd4b5..27c73d7 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -332,6 +332,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>  			goto isolate_fail;
>  
>  		/*
> +		 * If we already hold the lock, we can skip some rechecking.
> +		 * Note that if we hold the lock now, checked_pageblock was
> +		 * already set in some previous iteration (or strict is true),
> +		 * so it is correct to skip the suitable migration target
> +		 * recheck as well.
> +		 */
> +		if (locked)
> +			goto skip_recheck;
> +
> +		/*
>  		 * The zone lock must be held to isolate freepages.
>  		 * Unfortunately this is a very coarse lock and can be
>  		 * heavily contended if there are parallel allocations
> @@ -339,9 +349,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>  		 * spin on the lock and we acquire the lock as late as
>  		 * possible.
>  		 */
> -		if (!locked)
> -			locked = compact_trylock_irqsave(&cc->zone->lock,
> -								&flags, cc);
> +		locked = compact_trylock_irqsave(&cc->zone->lock, &flags, cc);
>  		if (!locked)
>  			break;
>  
> @@ -361,6 +369,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>  		if (!PageBuddy(page))
>  			goto isolate_fail;
>  
> +skip_recheck:
>  		/* Found a free page, break it into order-0 pages */
>  		isolated = split_free_page(page);
>  		total_isolated += isolated;

This doesn't apply cleanly; you probably need Andrew's 
"mm/compaction.c:isolate_freepages_block(): small tuneup"?  Rebasing the 
series on -mm would probably be best.

> @@ -671,10 +680,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  		    page_count(page) > page_mapcount(page))
>  			continue;
>  
> -		/* If the lock is not held, try to take it */
> -		if (!locked)
> -			locked = compact_trylock_irqsave(&zone->lru_lock,
> -								&flags, cc);
> +		/* If we already hold the lock, we can skip some rechecking */
> +		if (locked)
> +			goto skip_recheck;
> +
> +		locked = compact_trylock_irqsave(&zone->lru_lock, &flags, cc);
>  		if (!locked)
>  			break;
>  
> @@ -686,6 +696,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			continue;
>  		}
>  
> +skip_recheck:
>  		lruvec = mem_cgroup_page_lruvec(page, zone);
>  
>  		/* Try isolate the page */

Looks good.  I wonder how the lock-taking and the rechecks would look 
nested in an "if (!locked)" clause and whether that would be cleaner (and 
avoid the gotos), but I assume you already looked at that.
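
Just to illustrate the shape I have in mind (untested sketch, only using the
code already in the hunks above; the freepage scanner side would look
analogous):

		if (!locked) {
			locked = compact_trylock_irqsave(&zone->lru_lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck PageLRU and PageTransHuge under lock */
			if (!PageLRU(page))
				continue;
			if (PageTransHuge(page)) {
				low_pfn += (1 << compound_order(page)) - 1;
				continue;
			}
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);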

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-04 16:11                   ` Vlastimil Babka
@ 2014-06-05  0:02                     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-05  0:02 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, 4 Jun 2014, Vlastimil Babka wrote:

> diff --git a/mm/compaction.c b/mm/compaction.c
> index ae7db5f..3dce5a7 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -640,11 +640,18 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  		}
>  
>  		/*
> -		 * Skip if free. page_order cannot be used without zone->lock
> -		 * as nothing prevents parallel allocations or buddy merging.
> +		 * Skip if free. We read page order here without zone lock
> +		 * which is generally unsafe, but the race window is small and
> +		 * the worst thing that can happen is that we skip some
> +		 * potential isolation targets.

Should we only be doing the low_pfn adjustment based on the order for 
MIGRATE_ASYNC?  It seems like sync compaction, including compaction that 
is triggered from the command line, would prefer to scan over the 
following pages.
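
I.e., purely to illustrate the question (untested, reusing the helpers from
this series), something like:

		if (PageBuddy(page)) {
			if (cc->mode == MIGRATE_ASYNC) {
				unsigned long freepage_order =
						page_order_unsafe(page);

				if (freepage_order > 0 &&
				    freepage_order < MAX_ORDER)
					low_pfn += (1UL << freepage_order) - 1;
			}
			continue;
		}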

>  		 */
> -		if (PageBuddy(page))
> +		if (PageBuddy(page)) {
> +			unsigned long freepage_order = page_order_unsafe(page);

I assume we don't want a smp_wmb() in set_page_order() just for this 
little race, nor to recheck PageBuddy() here after smp_rmb().

I think this is fine for MIGRATE_ASYNC.

> +
> +			if (freepage_order > 0 && freepage_order < MAX_ORDER)
> +				low_pfn += (1UL << freepage_order) - 1;
>  			continue;
> +		}
>  
>  		/*
>  		 * Check may be lockless but that's ok as we recheck later.
> @@ -733,6 +740,13 @@ next_pageblock:
>  		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
>  	}
>  
> +	/*
> +	 * The PageBuddy() check could have potentially brought us outside
> +	 * the range to be scanned.
> +	 */
> +	if (unlikely(low_pfn > end_pfn))
> +		end_pfn = low_pfn;
> +
>  	acct_isolated(zone, locked, cc);
>  
>  	if (locked)
> diff --git a/mm/internal.h b/mm/internal.h
> index 1a8a0d4..6aa1f74 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>   * general, page_zone(page)->lock must be held by the caller to prevent the
>   * page from being allocated in parallel and returning garbage as the order.
>   * If a caller does not hold page_zone(page)->lock, it must guarantee that the
> - * page cannot be allocated or merged in parallel.
> + * page cannot be allocated or merged in parallel. Alternatively, it must
> + * handle invalid values gracefully, and use page_order_unsafe() below.
>   */
>  static inline unsigned long page_order(struct page *page)
>  {
> @@ -172,6 +173,23 @@ static inline unsigned long page_order(struct page *page)
>  	return page_private(page);
>  }
>  
> +/*
> + * Like page_order(), but for callers who cannot afford to hold the zone lock,
> + * and handle invalid values gracefully. ACCESS_ONCE is used so that if the
> + * caller assigns the result into a local variable and e.g. tests it for valid
> + * range  before using, the compiler cannot decide to remove the variable and
> + * inline the function multiple times, potentially observing different values
> + * in the tests and the actual use of the result.
> + */
> +static inline unsigned long page_order_unsafe(struct page *page)
> +{
> +	/*
> +	 * PageBuddy() should be checked by the caller to minimize race window,
> +	 * and invalid values must be handled gracefully.
> +	 */
> +	return ACCESS_ONCE(page_private(page));
> +}
> +
>  /* mm/util.c */
>  void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
>  		struct vm_area_struct *prev, struct rb_node *rb_parent);

I don't like this change at all, I don't think we should have header 
functions that imply the context in which the function will be called.  I 
think it would make much more sense to just do 
ACCESS_ONCE(page_order(page)) in the migration scanner with a comment.  
These are __attribute__((pure)) semantics for page_order().
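
To spell out the compiler behaviour being referred to here and in the quoted 
page_order_unsafe() comment, a minimal sketch of the failure mode if the read 
were not forced through ACCESS_ONCE (illustration only, not kernel code):

	unsigned long freepage_order = page_order(page);	/* plain load */

	/*
	 * Without ACCESS_ONCE the compiler is free to re-load
	 * page_private(page) at each use, so the range check may see a
	 * sane value ...
	 */
	if (freepage_order > 0 && freepage_order < MAX_ORDER)
		/* ... while this use re-reads and acts on garbage. */
		low_pfn += (1UL << freepage_order) - 1;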

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 6/6] mm, compaction: don't migrate in blocks that cannot be fully compacted in async direct compaction
  2014-06-04 16:11                   ` Vlastimil Babka
@ 2014-06-05  0:08                     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-05  0:08 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, 4 Jun 2014, Vlastimil Babka wrote:

> In direct compaction, we want to allocate the high-order page as soon as
> possible, so migrating from a block of pages that contains also unmigratable
> pages just adds to allocation latency.
> 

The title of the patch in the subject line should probably be reworded 
since it implies we never isolate from blocks that cannot become 
completely free and what you're really doing is skipping cc->order aligned 
pages.

> This patch therefore makes the migration scanner skip to the next cc->order
> aligned block of pages as soon as it cannot isolate a non-free page. Everything
> isolated up to that point is put back.
> 
> In this mode, the nr_isolated limit to COMPACT_CLUSTER_MAX is not observed,
> allowing the scanner to scan the whole block at once, instead of migrating
> COMPACT_CLUSTER_MAX pages and then finding an unmigratable page in the next
> call. This might however have some implications on too_many_isolated.
> 
> Also in this RFC PATCH, the "skipping mode" is tied to async migration mode,
> which is not optimal. What we most probably want is skipping in direct
> compactions, but not from kswapd and hugepaged.
> 
> In very preliminary tests, this has reduced migrate_scanned, isolations and
> migrations by about 10%, while the success rate of stress-highalloc mmtests
> actually improved a bit.
> 

Ok, so this obsoletes my patchseries that did something similar.  I hope 
you can rebase this set on top of linux-next and then propose it formally 
without the RFC tag.

We also need to discuss the scheduling heuristics, the reliance on 
need_resched(), to abort async compaction.  In testing, we actually 
sometimes see only 2-3 pageblocks scanned before terminating, and thp has 
very little chance of being allocated.  At the same time, if we try to 
fault 64MB of anon memory in and each of the 32 calls to compaction is 
expensive but doesn't result in an order-9 page, we see very lengthy fault 
latency.

I think it would be interesting to consider doing async compaction 
deferral up to 1 << COMPACT_MAX_DEFER_SHIFT after a sysctl-configurable 
amount of memory is scanned, at least for thp, and remove the scheduling 
heuristic entirely.
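
A rough sketch of the kind of deferral being suggested, using a hypothetical 
sysctl and helper (neither exists in the tree; this only illustrates counting 
scanned memory instead of checking need_resched()):

	/*
	 * Hypothetical: once a sysctl-configurable amount of memory has been
	 * scanned without success, abort and defer further async compaction
	 * up to 1 << COMPACT_MAX_DEFER_SHIFT attempts.
	 */
	static bool compact_should_defer_async(struct zone *zone,
					       unsigned long pages_scanned)
	{
		/* sysctl_compact_async_scan_limit is made up for this sketch */
		if (pages_scanned < sysctl_compact_async_scan_limit)
			return false;

		if (zone->compact_defer_shift < COMPACT_MAX_DEFER_SHIFT)
			zone->compact_defer_shift++;
		return true;
	}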

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 1/6] mm, compaction: periodically drop lock and restore IRQs in scanners
  2014-06-04 23:39                   ` David Rientjes
@ 2014-06-05  9:05                     ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-05  9:05 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/05/2014 01:39 AM, David Rientjes wrote:
> On Wed, 4 Jun 2014, Vlastimil Babka wrote:
>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index ed7102c..f0fd4b5 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -185,47 +185,74 @@ static void update_pageblock_skip(struct compact_control *cc,
>>   }
>>   #endif /* CONFIG_COMPACTION */
>>
>> -static inline bool should_release_lock(spinlock_t *lock)
>> +/*
>> + * Compaction requires the taking of some coarse locks that are potentially
>> + * very heavily contended. Check if the process needs to be scheduled or
>> + * if the lock is contended. For async compaction, back out if the process
>> + * needs to be scheduled, or the lock cannot be taken immediately. For sync
>> + * compaction, schedule and spin on the lock if needed.
>> + *
>> + * Returns true if the lock is held
>> + * Returns false if the lock is not held and compaction should abort
>> + */
>> +static inline bool compact_trylock_irqsave(spinlock_t *lock,
>> +			unsigned long *flags, struct compact_control *cc)
>
> Hmm, what tree is this series based on?  It doesn't apply cleanly to
> linux-next, I think you're missing
> mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention-fix.patch
> in your tree.

It was 3.15-rc5 with all the compaction stuff in -mm applied. I do have 
the fix, but perhaps I forgot something else, or linux-next did not have 
the fix yet (Andrew picked it up on Tuesday, I think).
Anyway, I saw the pending compaction stuff was just merged, so things will 
get easier the next round :)

> Is there a performance benefit to doing the inlining here?

I admit I didn't check; I just thought it might make sense since there 
are just two call sites. I'll reconsider, but I doubt I will be able to 
see any runtime difference in this path. Maybe just based on the object 
size.

>>   {
>> -	return need_resched() || spin_is_contended(lock);
>> +	if (cc->mode == MIGRATE_ASYNC) {
>> +		if (need_resched() || !spin_trylock_irqsave(lock, *flags)) {
>> +			cc->contended = true;
>> +			return false;
>> +		}
>> +	} else {
>> +		cond_resched();
>
> Why do we need this cond_resched() here if there is already a
> cond_resched() in compact_unlock_should_abort() for non-async compaction?
>
> If this is trying to reschedule right before disabling irqs because
> otherwise spinning on the lock causes irq starvation, then that's a very
> delicate balance and I think we're going to get in trouble later.

I wasn't thinking about real starvation. I guess it's already not so bad, 
given that nobody noticed the periodic unlock and IRQ enable was broken 
for so long, and the original patch added it because profiles looked 
better, not to solve any particular problem that was encountered.
So here I just thought it wouldn't be too costly to check before taking 
the lock, since it might have been up to 32 iterations since the last 
check. But it's true it might be pointless, since when we hold the lock 
we check once per 32 iterations (with this patch) anyway.

>> +		spin_lock_irqsave(lock, *flags);
>> +	}
>> +
>> +	return true;
>>   }
>>
>>   /*
>>    * Compaction requires the taking of some coarse locks that are potentially
>> - * very heavily contended. Check if the process needs to be scheduled or
>> - * if the lock is contended. For async compaction, back out in the event
>> - * if contention is severe. For sync compaction, schedule.
>> + * very heavily contended. The lock should be periodically unlocked to avoid
>> + * having disabled IRQs for a long time, even when there is nobody waiting on
>> + * the lock. It might also be that allowing the IRQs will result in
>> + * need_resched() becoming true. If scheduling is needed, or somebody else
>> + * has taken the lock, async compaction aborts. Sync compaction schedules.
>> + * Either compaction type will also abort if a fatal signal is pending.
>> + * In either case if the lock was locked, it is dropped and not regained.
>>    *
>> - * Returns true if the lock is held.
>> - * Returns false if the lock is released and compaction should abort
>> + * Returns true if compaction should abort due to fatal signal pending, or
>> + *		async compaction due to lock contention or need to schedule
>> + * Returns false when compaction can continue (sync compaction might have
>> + *		scheduled)
>>    */
>> -static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>> -				      bool locked, struct compact_control *cc)
>> +static inline bool compact_unlock_should_abort(spinlock_t *lock,
>> +		unsigned long flags, bool *locked, struct compact_control *cc)
>
> This inlining is also suspicious and I think keeping both of them
> out-of-line for the freeing and migration scanners is going to be the best
> route unless there's some measurable performance benefit I'm not seeing.

OK.

>>   {
>> -	if (should_release_lock(lock)) {
>> -		if (locked) {
>> -			spin_unlock_irqrestore(lock, *flags);
>> -			locked = false;
>> -		}
>> +	if (*locked) {
>> +		spin_unlock_irqrestore(lock, flags);
>> +		*locked = false;
>> +	}
>>
>> -		/* async aborts if taking too long or contended */
>> -		if (cc->mode == MIGRATE_ASYNC) {
>> +	if (fatal_signal_pending(current))
>> +		return true;
>> +
>> +	if (cc->mode == MIGRATE_ASYNC) {
>> +		if (need_resched() || spin_is_locked(lock)) {
>>   			cc->contended = true;
>> -			return false;
>> +			return true;
>>   		}
>> -
>> +	} else {
>>   		cond_resched();
>>   	}
>>
>> -	if (!locked)
>> -		spin_lock_irqsave(lock, *flags);
>> -	return true;
>> +	return false;
>>   }
>>
>>   /*
>> - * Aside from avoiding lock contention, compaction also periodically checks
>> + * Aside from avoiding lock contention, compaction should also periodically checks
>
> Not sure what the purpose of this commentary change is, it's gramatically
> incorrect now.
>
>>    * need_resched() and either schedules in sync compaction, or aborts async
>> - * compaction. This is similar to compact_checklock_irqsave() does, but used
>> + * compaction. This is similar to compact_unlock_should_abort() does, but used
>
> This was and still is gramatically incorrect :)

Thanks, will fix.

>
>>    * where no lock is concerned.
>>    *
>>    * Returns false when no scheduling was needed, or sync compaction scheduled.
>> @@ -285,6 +312,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>>   		int isolated, i;
>>   		struct page *page = cursor;
>>
>> +		/*
>> +		 * Periodically drop the lock (if held) regardless of its
>> +		 * contention, to give chance to IRQs. Abort async compaction
>> +		 * if contended.
>> +		 */
>> +		if (!(blockpfn % SWAP_CLUSTER_MAX)
>> +		    && compact_unlock_should_abort(&cc->zone->lock, flags,
>> +								&locked, cc))
>> +			break;
>> +
>>   		nr_scanned++;
>>   		if (!pfn_valid_within(blockpfn))
>>   			goto isolate_fail;
>> @@ -302,8 +339,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>>   		 * spin on the lock and we acquire the lock as late as
>>   		 * possible.
>>   		 */
>> -		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
>> -								locked, cc);
>> +		if (!locked)
>> +			locked = compact_trylock_irqsave(&cc->zone->lock,
>> +								&flags, cc);
>>   		if (!locked)
>>   			break;
>>
>> @@ -523,13 +561,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>>
>>   	/* Time to isolate some pages for migration */
>>   	for (; low_pfn < end_pfn; low_pfn++) {
>> -		/* give a chance to irqs before checking need_resched() */
>> -		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
>> -			if (should_release_lock(&zone->lru_lock)) {
>> -				spin_unlock_irqrestore(&zone->lru_lock, flags);
>> -				locked = false;
>> -			}
>> -		}
>> +		/*
>> +		 * Periodically drop the lock (if held) regardless of its
>> +		 * contention, to give chance to IRQs. Abort async compaction
>> +		 * if contended.
>> +		 */
>> +		if (!(low_pfn % SWAP_CLUSTER_MAX)
>> +		    && compact_unlock_should_abort(&zone->lru_lock, flags,
>> +								&locked, cc))
>> +			break;
>>
>>   		/*
>>   		 * migrate_pfn does not necessarily start aligned to a
>> @@ -631,10 +671,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>>   		    page_count(page) > page_mapcount(page))
>>   			continue;
>>
>> -		/* Check if it is ok to still hold the lock */
>> -		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
>> -								locked, cc);
>> -		if (!locked || fatal_signal_pending(current))
>> +		/* If the lock is not held, try to take it */
>> +		if (!locked)
>> +			locked = compact_trylock_irqsave(&zone->lru_lock,
>> +								&flags, cc);
>> +		if (!locked)
>>   			break;
>>
>>   		/* Recheck PageLRU and PageTransHuge under lock */


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-05  0:02                     ` David Rientjes
@ 2014-06-05  9:24                       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-05  9:24 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/05/2014 02:02 AM, David Rientjes wrote:
> On Wed, 4 Jun 2014, Vlastimil Babka wrote:
>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index ae7db5f..3dce5a7 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -640,11 +640,18 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>>   		}
>>
>>   		/*
>> -		 * Skip if free. page_order cannot be used without zone->lock
>> -		 * as nothing prevents parallel allocations or buddy merging.
>> +		 * Skip if free. We read page order here without zone lock
>> +		 * which is generally unsafe, but the race window is small and
>> +		 * the worst thing that can happen is that we skip some
>> +		 * potential isolation targets.
>
> Should we only be doing the low_pfn adjustment based on the order for
> MIGRATE_ASYNC?  It seems like sync compaction, including compaction that
> is triggered from the command line, would prefer to scan over the
> following pages.

I thought even sync compaction would benefit from the skipped 
iterations. I'd say the probability of this race is smaller than the 
probability of somebody allocating what compaction just freed.
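
As a concrete example of the iterations being skipped (assuming the racily 
read order is valid), take an order-9 free buddy starting at a 512-page 
aligned pfn:

	low_pfn = 0x1000;			/* start of the free buddy block */
	freepage_order = 9;			/* read without zone->lock */
	low_pfn += (1UL << freepage_order) - 1;	/* low_pfn is now 0x11ff */
	/*
	 * The for loop's low_pfn++ then continues at 0x1200, so the
	 * migration scanner avoids 511 pointless per-page iterations.
	 */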

>>   		 */
>> -		if (PageBuddy(page))
>> +		if (PageBuddy(page)) {
>> +			unsigned long freepage_order = page_order_unsafe(page);
>
> I assume we don't want a smp_wmb() in set_page_order() just for this
> little race, nor to recheck PageBuddy() here after an smp_rmb().

Hm right, barriers didn't come up last time a patch like this was 
posted. Rechecking PageBuddy() did come up, but I thought the range 
checks on the order are enough for this case.

> I think this is fine for MIGRATE_ASYNC.
>
>> +
>> +			if (freepage_order > 0 && freepage_order < MAX_ORDER)
>> +				low_pfn += (1UL << freepage_order) - 1;
>>   			continue;
>> +		}
>>
>>   		/*
>>   		 * Check may be lockless but that's ok as we recheck later.
>> @@ -733,6 +740,13 @@ next_pageblock:
>>   		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
>>   	}
>>
>> +	/*
>> +	 * The PageBuddy() check could have potentially brought us outside
>> +	 * the range to be scanned.
>> +	 */
>> +	if (unlikely(low_pfn > end_pfn))
>> +		end_pfn = low_pfn;
>> +
>>   	acct_isolated(zone, locked, cc);
>>
>>   	if (locked)
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 1a8a0d4..6aa1f74 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>>    * general, page_zone(page)->lock must be held by the caller to prevent the
>>    * page from being allocated in parallel and returning garbage as the order.
>>    * If a caller does not hold page_zone(page)->lock, it must guarantee that the
>> - * page cannot be allocated or merged in parallel.
>> + * page cannot be allocated or merged in parallel. Alternatively, it must
>> + * handle invalid values gracefully, and use page_order_unsafe() below.
>>    */
>>   static inline unsigned long page_order(struct page *page)
>>   {
>> @@ -172,6 +173,23 @@ static inline unsigned long page_order(struct page *page)
>>   	return page_private(page);
>>   }
>>
>> +/*
>> + * Like page_order(), but for callers who cannot afford to hold the zone lock,
>> + * and handle invalid values gracefully. ACCESS_ONCE is used so that if the
>> + * caller assigns the result into a local variable and e.g. tests it for valid
>> + * range  before using, the compiler cannot decide to remove the variable and
>> + * inline the function multiple times, potentially observing different values
>> + * in the tests and the actual use of the result.
>> + */
>> +static inline unsigned long page_order_unsafe(struct page *page)
>> +{
>> +	/*
>> +	 * PageBuddy() should be checked by the caller to minimize race window,
>> +	 * and invalid values must be handled gracefully.
>> +	 */
>> +	return ACCESS_ONCE(page_private(page));
>> +}
>> +
>>   /* mm/util.c */
>>   void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
>>   		struct vm_area_struct *prev, struct rb_node *rb_parent);
>
> I don't like this change at all, I don't think we should have header
> functions that imply the context in which the function will be called.  I
> think it would make much more sense to just do
> ACCESS_ONCE(page_order(page)) in the migration scanner with a comment.

But that won't compile. It would have to be converted to a #define, 
unless there's some trick I don't know. Sure I would hope this could be 
done cleaner somehow.
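
For reference, the compile failure comes from ACCESS_ONCE() needing an lvalue; 
it is defined roughly as taking the address of its argument, so wrapping a 
function call does not build. A sketch of the definitions involved and the 
#define conversion alluded to (illustration only, not a proposed patch):

	/* include/linux/compiler.h, roughly: */
	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	/*
	 * ACCESS_ONCE(page_order(page)) would take the address of a function
	 * call's result and fail to compile; a macro form works instead:
	 */
	#define page_order_unsafe(page)	ACCESS_ONCE(page_private(page))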

> These are __attribute__((pure)) semantics for page_order().

Not sure I understand what you mean here. Would adding that attribute 
change anything?



^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 6/6] mm, compaction: don't migrate in blocks that cannot be fully compacted in async direct compaction
  2014-06-05  0:08                     ` David Rientjes
@ 2014-06-05 15:38                       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-05 15:38 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/05/2014 02:08 AM, David Rientjes wrote:
> On Wed, 4 Jun 2014, Vlastimil Babka wrote:
>
>> In direct compaction, we want to allocate the high-order page as soon as
>> possible, so migrating from a block of pages that contains also unmigratable
>> pages just adds to allocation latency.
>>
>
> The title of the patch in the subject line should probably be reworded
> since it implies we never isolate from blocks that cannot become
> completely free and what you're really doing is skipping cc->order aligned
> pages.
>
>> This patch therefore makes the migration scanner skip to the next cc->order
>> aligned block of pages as soon as it cannot isolate a non-free page. Everything
>> isolated up to that point is put back.
>>
>> In this mode, the nr_isolated limit to COMPACT_CLUSTER_MAX is not observed,
>> allowing the scanner to scan the whole block at once, instead of migrating
>> COMPACT_CLUSTER_MAX pages and then finding an unmigratable page in the next
>> call. This might however have some implications on too_many_isolated.
>>
>> Also in this RFC PATCH, the "skipping mode" is tied to async migration mode,
>> which is not optimal. What we most probably want is skipping in direct
>> compactions, but not from kswapd and hugepaged.
>>
>> In very preliminary tests, this has reduced migrate_scanned, isolations and
>> migrations by about 10%, while the success rate of stress-highalloc mmtests
>> actually improved a bit.
>>
>
> Ok, so this obsoletes my patchseries that did something similar.  I hope

Your patches 1/3 and 2/3 would still make sense. Checking alloc flags is 
IMHO better than checking async here. That way, hugepaged and kswapd 
would still try to migrate stuff which is important as Mel described in 
the reply to your 3/3.

> you can rebase this set on top of linux-next and then propose it formally
> without the RFC tag.

I posted this early to facilitate discussion, but if you want to test on 
linux-next then sure.

> We also need to discuss the scheduling heuristics, the reliance on
> need_resched(), to abort async compaction.  In testing, we actually
> sometimes see only 2-3 pageblocks scanned before terminating, and thp has
> very little chance of being allocated.  At the same time, if we try to
> fault 64MB of anon memory in and each of the 32 calls to compaction is
> expensive but doesn't result in an order-9 page, we see very lengthy fault
> latency.

Yes, I thought you were about to try the 1GB per call setting. I don't 
currently have a test setup like yours. My patch 1/6 still relies on 
need_resched(), but that could be replaced by a later patch.

> I think it would be interesting to consider doing async compaction
> deferral up to 1 << COMPACT_MAX_DEFER_SHIFT after a sysctl-configurable
> amount of memory is scanned, at least for thp, and remove the scheduling
> heuristic entirely.

That could work. How about the lock contention heuristic? Is it possible 
on a large and/or busy system to compact anything substantial without 
hitting lock contention? Are your observations about aborting too early 
based on need_resched() or on lock contention?

Vlastimil

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-05  9:24                       ` Vlastimil Babka
@ 2014-06-05 21:30                         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-05 21:30 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Thu, 5 Jun 2014, Vlastimil Babka wrote:

> > > diff --git a/mm/compaction.c b/mm/compaction.c
> > > index ae7db5f..3dce5a7 100644
> > > --- a/mm/compaction.c
> > > +++ b/mm/compaction.c
> > > @@ -640,11 +640,18 @@ isolate_migratepages_range(struct zone *zone, struct
> > > compact_control *cc,
> > >   		}
> > > 
> > >   		/*
> > > -		 * Skip if free. page_order cannot be used without zone->lock
> > > -		 * as nothing prevents parallel allocations or buddy merging.
> > > +		 * Skip if free. We read page order here without zone lock
> > > +		 * which is generally unsafe, but the race window is small and
> > > +		 * the worst thing that can happen is that we skip some
> > > +		 * potential isolation targets.
> > 
> > Should we only be doing the low_pfn adjustment based on the order for
> > MIGRATE_ASYNC?  It seems like sync compaction, including compaction that
> > is triggered from the command line, would prefer to scan over the
> > following pages.
> 
> I thought even sync compaction would benefit from the skipped iterations. I'd
> say the probability of this race is smaller than the probability of somebody
> allocating what compaction just freed.
> 

Ok.

> > > diff --git a/mm/internal.h b/mm/internal.h
> > > index 1a8a0d4..6aa1f74 100644
> > > --- a/mm/internal.h
> > > +++ b/mm/internal.h
> > > @@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct
> > > compact_control *cc,
> > >    * general, page_zone(page)->lock must be held by the caller to prevent
> > > the
> > >    * page from being allocated in parallel and returning garbage as the
> > > order.
> > >    * If a caller does not hold page_zone(page)->lock, it must guarantee
> > > that the
> > > - * page cannot be allocated or merged in parallel.
> > > + * page cannot be allocated or merged in parallel. Alternatively, it must
> > > + * handle invalid values gracefully, and use page_order_unsafe() below.
> > >    */
> > >   static inline unsigned long page_order(struct page *page)
> > >   {
> > > @@ -172,6 +173,23 @@ static inline unsigned long page_order(struct page
> > > *page)
> > >   	return page_private(page);
> > >   }
> > > 
> > > +/*
> > > + * Like page_order(), but for callers who cannot afford to hold the zone
> > > lock,
> > > + * and handle invalid values gracefully. ACCESS_ONCE is used so that if
> > > the
> > > + * caller assigns the result into a local variable and e.g. tests it for
> > > valid
> > > + * range  before using, the compiler cannot decide to remove the variable
> > > and
> > > + * inline the function multiple times, potentially observing different
> > > values
> > > + * in the tests and the actual use of the result.
> > > + */
> > > +static inline unsigned long page_order_unsafe(struct page *page)
> > > +{
> > > +	/*
> > > +	 * PageBuddy() should be checked by the caller to minimize race
> > > window,
> > > +	 * and invalid values must be handled gracefully.
> > > +	 */
> > > +	return ACCESS_ONCE(page_private(page));
> > > +}
> > > +
> > >   /* mm/util.c */
> > >   void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
> > >   		struct vm_area_struct *prev, struct rb_node *rb_parent);
> > 
> > I don't like this change at all, I don't think we should have header
> > functions that imply the context in which the function will be called.  I
> > think it would make much more sense to just do
> > ACCESS_ONCE(page_order(page)) in the migration scanner with a comment.
> 
> But that won't compile. It would have to be converted to a #define, unless
> there's some trick I don't know. Sure I would hope this could be done cleaner
> somehow.
> 

Sorry, I meant ACCESS_ONCE(page_private(page)) in the migration scanner 
with a comment about it being racy.  It also helps to understand why 
you're testing for order < MAX_ORDER before skipping low_pfn there, which 
is a little subtle right now.
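
A sketch of what that open-coded form in the migration scanner could look like 
(it simply restates the quoted hunk with the suggestion applied; it is not an 
actual patch):

		if (PageBuddy(page)) {
			/*
			 * Racy read without zone->lock: the value may be
			 * garbage, so only a sane range is acted upon.
			 */
			unsigned long freepage_order =
					ACCESS_ONCE(page_private(page));

			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}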

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 6/6] mm, compaction: don't migrate in blocks that cannot be fully compacted in async direct compaction
  2014-06-05 15:38                       ` Vlastimil Babka
@ 2014-06-05 21:38                         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-05 21:38 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Thu, 5 Jun 2014, Vlastimil Babka wrote:

> > Ok, so this obsoletes my patchseries that did something similar.  I hope
> 
> Your patches 1/3 and 2/3 would still make sense. Checking alloc flags is IMHO
> better than checking async here. That way, hugepaged and kswapd would still
> try to migrate stuff which is important as Mel described in the reply to your
> 3/3.
> 

Would you mind folding those two patches into your series since you'll be 
requiring the gfp_mask in struct compact_control and your pageblock skip 
is better than mine?

> > you can rebase this set on top of linux-next and then propose it formally
> > without the RFC tag.
> 
> I posted this early to facilitate discussion, but if you want to test on
> linux-next then sure.
> 

I'd love to test these.

> > We also need to discuss the scheduling heuristics, the reliance on
> > need_resched(), to abort async compaction.  In testing, we actually
> > sometimes see 2-3 pageblocks scanned before terminating and thp has very
> > little chance of being allocated.  At the same time, if we try to fault
> > 64MB of anon memory in and each of the 32 calls to compaction are
> > expensive but don't result in an order-9 page, we see very lengthy fault
> > latency.
> 
> Yes, I thought you were about to try the 1GB per call setting. I don't
> currently have a test setup like you. My patch 1/6 still relies on
> need_resched() but that could be replaced with a later patch.
> 

Agreed.  I was thinking higher than 1GB would be possible once we have 
your series that does the pageblock skip for thp; I think the expense 
would be constant because we won't needlessly be migrating pages unless 
there's a good chance of succeeding.  I'm slightly concerned, though, 
about the COMPACT_CLUSTER_MAX termination kicking in before we find 
unmigratable memory, but I think that will be very low probability.

> > I think it would be interesting to consider doing async compaction
> > deferral up to 1 << COMPACT_MAX_DEFER_SHIFT after a sysctl-configurable
> > amount of memory is scanned, at least for thp, and remove the scheduling
> > heuristic entirely.
> 
> That could work. How about the lock contention heuristic? Is it possible on a
> large and/or busy system to compact anything substantial without hitting the
> lock contention? Are your observations about too early abort based on
> need_resched() or lock contention?
> 

Eek, it's mostly need_resched() because we don't use zone->lru_lock; we 
have the memcg lruvec locks for lru locking.  We end up dropping and 
reacquiring different locks quite a bit, based on the memcg of the page 
being isolated.

This does beg the question about parallel direct compactors, though, which 
will be contending on the same coarse zone->lru_lock and immediately 
aborting, falling back to PAGE_SIZE pages for thp faults; that will be 
more likely if your patch to grab the high-order page and return it to the 
page allocator is merged.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-05 21:30                         ` David Rientjes
@ 2014-06-06  7:20                           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-06  7:20 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/05/2014 11:30 PM, David Rientjes wrote:
> On Thu, 5 Jun 2014, Vlastimil Babka wrote:
> 
>> > > diff --git a/mm/compaction.c b/mm/compaction.c
>> > > index ae7db5f..3dce5a7 100644
>> > > --- a/mm/compaction.c
>> > > +++ b/mm/compaction.c
>> > > @@ -640,11 +640,18 @@ isolate_migratepages_range(struct zone *zone, struct
>> > > compact_control *cc,
>> > >   		}
>> > > 
>> > >   		/*
>> > > -		 * Skip if free. page_order cannot be used without zone->lock
>> > > -		 * as nothing prevents parallel allocations or buddy merging.
>> > > +		 * Skip if free. We read page order here without zone lock
>> > > +		 * which is generally unsafe, but the race window is small and
>> > > +		 * the worst thing that can happen is that we skip some
>> > > +		 * potential isolation targets.
>> > 
>> > Should we only be doing the low_pfn adjustment based on the order for
>> > MIGRATE_ASYNC?  It seems like sync compaction, including compaction that
>> > is triggered from the command line, would prefer to scan over the
>> > following pages.
>> 
>> I thought even sync compaction would benefit from the skipped iterations. I'd
>> say the probability of this race is smaller than the probability of somebody
>> allocating what compaction just freed.
>> 
> 
> Ok.
> 
>> > > diff --git a/mm/internal.h b/mm/internal.h
>> > > index 1a8a0d4..6aa1f74 100644
>> > > --- a/mm/internal.h
>> > > +++ b/mm/internal.h
>> > > @@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct
>> > > compact_control *cc,
>> > >    * general, page_zone(page)->lock must be held by the caller to prevent
>> > > the
>> > >    * page from being allocated in parallel and returning garbage as the
>> > > order.
>> > >    * If a caller does not hold page_zone(page)->lock, it must guarantee
>> > > that the
>> > > - * page cannot be allocated or merged in parallel.
>> > > + * page cannot be allocated or merged in parallel. Alternatively, it must
>> > > + * handle invalid values gracefully, and use page_order_unsafe() below.
>> > >    */
>> > >   static inline unsigned long page_order(struct page *page)
>> > >   {
>> > > @@ -172,6 +173,23 @@ static inline unsigned long page_order(struct page
>> > > *page)
>> > >   	return page_private(page);
>> > >   }
>> > > 
>> > > +/*
>> > > + * Like page_order(), but for callers who cannot afford to hold the zone
>> > > lock,
>> > > + * and handle invalid values gracefully. ACCESS_ONCE is used so that if
>> > > the
>> > > + * caller assigns the result into a local variable and e.g. tests it for
>> > > valid
>> > > + * range  before using, the compiler cannot decide to remove the variable
>> > > and
>> > > + * inline the function multiple times, potentially observing different
>> > > values
>> > > + * in the tests and the actual use of the result.
>> > > + */
>> > > +static inline unsigned long page_order_unsafe(struct page *page)
>> > > +{
>> > > +	/*
>> > > +	 * PageBuddy() should be checked by the caller to minimize race
>> > > window,
>> > > +	 * and invalid values must be handled gracefully.
>> > > +	 */
>> > > +	return ACCESS_ONCE(page_private(page));
>> > > +}
>> > > +
>> > >   /* mm/util.c */
>> > >   void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
>> > >   		struct vm_area_struct *prev, struct rb_node *rb_parent);
>> > 
>> > I don't like this change at all, I don't think we should have header
>> > functions that imply the context in which the function will be called.  I
>> > think it would make much more sense to just do
>> > ACCESS_ONCE(page_order(page)) in the migration scanner with a comment.
>> 
>> But that won't compile. It would have to be converted to a #define, unless
>> there's some trick I don't know. Sure I would hope this could be done cleaner
>> somehow.
>> 
> 
> Sorry, I meant ACCESS_ONCE(page_private(page)) in the migration scanner 

Hm, but that's breaking the abstraction of page_order(). I don't know whether it's
worse to create a new variant of page_order() or to do this. BTW, it seems like
next_active_pageblock() in memory-hotplug.c should use this variant too (see the
usage sketch below).

> with a comment about it being racy.  It also helps to understand why 
> you're testing for order < MAX_ORDER before skipping low_pfn there, which 
> is a little subtle right now.
> 
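
To make the pattern we're discussing concrete, here is a minimal sketch of how 
such a lockless reader is meant to use the order (based on the hunk in this 
patch; the shape of the actual memory-hotplug code may differ):

	if (PageBuddy(page)) {
		/*
		 * Racy without zone->lock, but tolerable: the bounds check
		 * below means a garbage order can only make us skip a
		 * suboptimal number of pfns, never crash.
		 */
		unsigned long freepage_order = page_order_unsafe(page);

		if (freepage_order > 0 && freepage_order < MAX_ORDER)
			low_pfn += (1UL << freepage_order) - 1;
		continue;
	}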


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 6/6] mm, compaction: don't migrate in blocks that cannot be fully compacted in async direct compaction
  2014-06-05 21:38                         ` David Rientjes
@ 2014-06-06  7:33                           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-06  7:33 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/05/2014 11:38 PM, David Rientjes wrote:
> On Thu, 5 Jun 2014, Vlastimil Babka wrote:
> 
>> > Ok, so this obsoletes my patchseries that did something similar.  I hope
>> 
>> Your patches 1/3 and 2/3 would still make sense. Checking alloc flags is IMHO
>> better than checking async here. That way, hugepaged and kswapd would still
>> try to migrate stuff which is important as Mel described in the reply to your
>> 3/3.
>> 
> 
> Would you mind folding those two patches into your series since you'll be 
> requiring the gfp_mask in struct compact_control and your pageblock skip 
> is better than mine?

Sure!

>> > you can rebase this set on top of linux-next and then propose it formally
>> > without the RFC tag.
>> 
>> I posted this early to facilitate discussion, but if you want to test on
>> linux-next then sure.
>> 
> 
> I'd love to test these.

OK, I'll repost it based on -next.

>> > We also need to discuss the scheduling heuristics, the reliance on
>> > need_resched(), to abort async compaction.  In testing, we actually
>> > sometimes see 2-3 pageblocks scanned before terminating and thp has very
>> > little chance of being allocated.  At the same time, if we try to fault
>> > 64MB of anon memory in and each of the 32 calls to compaction are
>> > expensive but don't result in an order-9 page, we see very lengthy fault
>> > latency.
>> 
>> Yes, I thought you were about to try the 1GB per call setting. I don't
>> currently have a test setup like you. My patch 1/6 still relies on
>> need_resched() but that could be replaced with a later patch.
>> 
> 
> Agreed.  I was thinking higher than 1GB would be possible once we have 
> your series that does the pageblock skip for thp, I think the expense 
> would be constant because we won't needlessly be migrating pages unless it 
> has a good chance at succeeding.

It looks like a counter of iterations actually done in the scanners, maintained in
compact_control, would work better than any memory-size-based limit? It would
better reflect the actual work done, and thus the latency. Maybe also increase the
counter for migrations, with a higher cost than for a scanner iteration.
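
Just to illustrate what I mean (a sketch only; the field, helper and weight 
below are made up here, they are not from any posted patch):

	struct compact_control {
		/* ... existing fields ... */
		unsigned long work_done;	/* scanner iterations + weighted migrations */
		unsigned long work_limit;	/* per-invocation budget */
	};

	/* a migration is charged as this many scanner iterations (made-up weight) */
	#define COMPACT_MIGRATION_COST	16

	static bool compact_work_exceeded(struct compact_control *cc,
					  unsigned long iterations,
					  unsigned long migrations)
	{
		cc->work_done += iterations + migrations * COMPACT_MIGRATION_COST;
		return cc->work_done > cc->work_limit;
	}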


> I'm slightly concerned about the 
> COMPACT_CLUSTER_MAX termination, though, before we find unmigratable 
> memory but I think that will be very low probability.

Well, I have removed the COMPACT_CLUSTER_MAX termination for this case. But
that could perhaps become an issue, with compactors starving reclaimers through
too_many_isolated().

>> > I think it would be interesting to consider doing async compaction
>> > deferral up to 1 << COMPACT_MAX_DEFER_SHIFT after a sysctl-configurable
>> > amount of memory is scanned, at least for thp, and remove the scheduling
>> > heuristic entirely.
>> 
>> That could work. How about the lock contention heuristic? Is it possible on a
>> large and/or busy system to compact anything substantial without hitting the
>> lock contention? Are your observations about too early abort based on
>> need_resched() or lock contention?
>> 
> 
> Eek, it's mostly need_resched() because we don't use zone->lru_lock, we 
> have the memcg lruvec locks for lru locking.  We end up dropping and 
> reacquiring different locks based on the memcg of the page being isolated 
> quite a bit.

Hm, as a first thing I will probably remove setting cc->contended on need_resched().
That looks like a bad decision if there's no actual lock contention. Setting
cc->contended to true means that there is no second direct compaction attempt (sync
for khugepaged), which is not good. And you basically say that this happens almost
always. But then it makes me wonder how much difference your patch "mm, thp: avoid
excessive compaction latency during fault" could actually make? It makes the second
attempt async instead of sync, but if the second attempt never happens...
Ah, I get it, that was probably before I put the cc->contended setting everywhere...
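
Roughly this kind of thing, i.e. yield on need_resched() but only record 
contention when the lock is really contended (a sketch of the idea, not a 
tested patch):

	/* sketch: in the place where we currently decide to abort */
	if (need_resched())
		cond_resched();			/* yield, but don't give up */

	if (spin_is_contended(&zone->lru_lock)) {
		cc->contended = true;		/* real contention, skip the sync retry */
		return false;			/* abort this async attempt */
	}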

> This does beg the question about parallel direct compactors, though, that 
> will be contending on the same coarse zone->lru_lock locks and immediately 
> aborting and falling back to PAGE_SIZE pages for thp faults that will be 
> more likely if your patch to grab the high-order page and return it to the 
> page allocator is merged.

Hm can you explain how the page capturing makes this worse? I don't see it.


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 6/6] mm, compaction: don't migrate in blocks that cannot be fully compacted in async direct compaction
  2014-06-06  7:33                           ` Vlastimil Babka
@ 2014-06-09  9:06                             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-09  9:06 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Fri, 6 Jun 2014, Vlastimil Babka wrote:

> > Agreed.  I was thinking higher than 1GB would be possible once we have 
> > your series that does the pageblock skip for thp, I think the expense 
> > would be constant because we won't needlessly be migrating pages unless it 
> > has a good chance at succeeding.
> 
> Looks like a counter of iterations actually done in scanners, maintained in
> compact_control, would work better than any memory size based limit? It could
> better reflect the actual work done and thus latency. Maybe increase the counter
> also for migrations, with a higher cost than for a scanner iteration.
> 

I'm not sure we can expose that to be configurable by userspace in any 
meaningful way.  We'll want to be able to tune this depending on the size 
of the machine if we are to truly remove the need_resched() heuristic and 
give it a sane default.  I was thinking it would be similar to 
khugepaged's pages_to_scan value that it uses on each wakeup.
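
Something as simple as the following is what I have in mind (every name and the 
default here are hypothetical, this is not in any posted patch), exposed to 
userspace in the same spirit as khugepaged's pages_to_scan:

	/* hypothetical: pages the migration scanner may visit per async attempt */
	static unsigned long compact_async_scan_limit __read_mostly =
			(1UL << 30) >> PAGE_SHIFT;	/* ~1GB worth of pages */

	static bool compact_scan_budget_exceeded(struct compact_control *cc)
	{
		/* assumes a (hypothetical) per-attempt cc->total_scanned counter */
		return cc->total_scanned > compact_async_scan_limit;
	}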

> > This does beg the question about parallel direct compactors, though, that 
> > will be contending on the same coarse zone->lru_lock locks and immediately 
> > aborting and falling back to PAGE_SIZE pages for thp faults that will be 
> > more likely if your patch to grab the high-order page and return it to the 
> > page allocator is merged.
> 
> Hm can you explain how the page capturing makes this worse? I don't see it.
> 

I was expecting that your patch to capture the high-order page made a 
difference because the zone watermark check doesn't imply the high-order 
page will be allocatable after we return to the page allocator to allocate 
it.  In that case, we terminated compaction prematurely.  If that's true, 
then it seems like no parallel thp allocator will be able to allocate 
memory that another direct compactor has freed without entering compaction 
itself on a fragmented machine, and thus we get an increase in 
zone->lru_lock contention if there's migratable memory.

Having 32 cpus fault thp memory and all entering compaction and contending 
(and aborting because of contention, currently) on zone->lru_lock is a 
really bad situation.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-06  7:20                           ` Vlastimil Babka
@ 2014-06-09  9:09                             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-09  9:09 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Fri, 6 Jun 2014, Vlastimil Babka wrote:

> >> > > diff --git a/mm/internal.h b/mm/internal.h
> >> > > index 1a8a0d4..6aa1f74 100644
> >> > > --- a/mm/internal.h
> >> > > +++ b/mm/internal.h
> >> > > @@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct
> >> > > compact_control *cc,
> >> > >    * general, page_zone(page)->lock must be held by the caller to prevent
> >> > > the
> >> > >    * page from being allocated in parallel and returning garbage as the
> >> > > order.
> >> > >    * If a caller does not hold page_zone(page)->lock, it must guarantee
> >> > > that the
> >> > > - * page cannot be allocated or merged in parallel.
> >> > > + * page cannot be allocated or merged in parallel. Alternatively, it must
> >> > > + * handle invalid values gracefully, and use page_order_unsafe() below.
> >> > >    */
> >> > >   static inline unsigned long page_order(struct page *page)
> >> > >   {
> >> > > @@ -172,6 +173,23 @@ static inline unsigned long page_order(struct page
> >> > > *page)
> >> > >   	return page_private(page);
> >> > >   }
> >> > > 
> >> > > +/*
> >> > > + * Like page_order(), but for callers who cannot afford to hold the zone
> >> > > lock,
> >> > > + * and handle invalid values gracefully. ACCESS_ONCE is used so that if
> >> > > the
> >> > > + * caller assigns the result into a local variable and e.g. tests it for
> >> > > valid
> >> > > + * range  before using, the compiler cannot decide to remove the variable
> >> > > and
> >> > > + * inline the function multiple times, potentially observing different
> >> > > values
> >> > > + * in the tests and the actual use of the result.
> >> > > + */
> >> > > +static inline unsigned long page_order_unsafe(struct page *page)
> >> > > +{
> >> > > +	/*
> >> > > +	 * PageBuddy() should be checked by the caller to minimize race
> >> > > window,
> >> > > +	 * and invalid values must be handled gracefully.
> >> > > +	 */
> >> > > +	return ACCESS_ONCE(page_private(page));
> >> > > +}
> >> > > +
> >> > >   /* mm/util.c */
> >> > >   void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
> >> > >   		struct vm_area_struct *prev, struct rb_node *rb_parent);
> >> > 
> >> > I don't like this change at all, I don't think we should have header
> >> > functions that imply the context in which the function will be called.  I
> >> > think it would make much more sense to just do
> >> > ACCESS_ONCE(page_order(page)) in the migration scanner with a comment.
> >> 
> >> But that won't compile. It would have to be converted to a #define, unless
> >> there's some trick I don't know. Sure I would hope this could be done cleaner
> >> somehow.
> >> 
> > 
> > Sorry, I meant ACCESS_ONCE(page_private(page)) in the migration scanner 
> 
> Hm but that's breaking the abstraction of page_order(). I don't know if it's
> worse to create a new variant of page_order() or to do this. BTW, seems like
> next_active_pageblock() in memory-hotplug.c should use this variant too.
> 

The compiler seems free to disregard the access of a volatile object above 
because the return value of the inline function is a plain unsigned long.  
What's the difference between unsigned long order = page_order_unsafe(page) 
and unsigned long order = (unsigned long)ACCESS_ONCE(page_private(page)), 
if in both cases the compiler is able to reaccess page_private() because 
the result is no longer volatile-qualified?

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-09  9:09                             ` David Rientjes
@ 2014-06-09 11:35                               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-09 11:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/09/2014 11:09 AM, David Rientjes wrote:
> On Fri, 6 Jun 2014, Vlastimil Babka wrote:
>
>>>>>> diff --git a/mm/internal.h b/mm/internal.h
>>>>>> index 1a8a0d4..6aa1f74 100644
>>>>>> --- a/mm/internal.h
>>>>>> +++ b/mm/internal.h
>>>>>> @@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct
>>>>>> compact_control *cc,
>>>>>>     * general, page_zone(page)->lock must be held by the caller to prevent
>>>>>> the
>>>>>>     * page from being allocated in parallel and returning garbage as the
>>>>>> order.
>>>>>>     * If a caller does not hold page_zone(page)->lock, it must guarantee
>>>>>> that the
>>>>>> - * page cannot be allocated or merged in parallel.
>>>>>> + * page cannot be allocated or merged in parallel. Alternatively, it must
>>>>>> + * handle invalid values gracefully, and use page_order_unsafe() below.
>>>>>>     */
>>>>>>    static inline unsigned long page_order(struct page *page)
>>>>>>    {
>>>>>> @@ -172,6 +173,23 @@ static inline unsigned long page_order(struct page
>>>>>> *page)
>>>>>>    	return page_private(page);
>>>>>>    }
>>>>>>
>>>>>> +/*
>>>>>> + * Like page_order(), but for callers who cannot afford to hold the zone
>>>>>> lock,
>>>>>> + * and handle invalid values gracefully. ACCESS_ONCE is used so that if
>>>>>> the
>>>>>> + * caller assigns the result into a local variable and e.g. tests it for
>>>>>> valid
>>>>>> + * range  before using, the compiler cannot decide to remove the variable
>>>>>> and
>>>>>> + * inline the function multiple times, potentially observing different
>>>>>> values
>>>>>> + * in the tests and the actual use of the result.
>>>>>> + */
>>>>>> +static inline unsigned long page_order_unsafe(struct page *page)
>>>>>> +{
>>>>>> +	/*
>>>>>> +	 * PageBuddy() should be checked by the caller to minimize race
>>>>>> window,
>>>>>> +	 * and invalid values must be handled gracefully.
>>>>>> +	 */
>>>>>> +	return ACCESS_ONCE(page_private(page));
>>>>>> +}
>>>>>> +
>>>>>>    /* mm/util.c */
>>>>>>    void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
>>>>>>    		struct vm_area_struct *prev, struct rb_node *rb_parent);
>>>>>
>>>>> I don't like this change at all, I don't think we should have header
>>>>> functions that imply the context in which the function will be called.  I
>>>>> think it would make much more sense to just do
>>>>> ACCESS_ONCE(page_order(page)) in the migration scanner with a comment.
>>>>
>>>> But that won't compile. It would have to be converted to a #define, unless
>>>> there's some trick I don't know. Sure I would hope this could be done cleaner
>>>> somehow.
>>>>
>>>
>>> Sorry, I meant ACCESS_ONCE(page_private(page)) in the migration scanner
>>
>> Hm but that's breaking the abstraction of page_order(). I don't know if it's
>> worse to create a new variant of page_order() or to do this. BTW, seems like
>> next_active_pageblock() in memory-hotplug.c should use this variant too.
>>
>
> The compiler seems free to disregard the access of a volatile object above
> because the return value of the inline function is unsigned long.  What's
> the difference between unsigned long order = page_order_unsafe(page) and
> unsigned long order = (unsigned long)ACCESS_ONCE(page_private(page)) and

I think there's none functionally, but one is an abstraction layer 
violation and the other implies the context of usage, as you say (but is 
that so uncommon?).

> the compiler being able to reaccess page_private() because the result is
> no longer volatile qualified?

You think it will reaccess? That would defeat all current ACCESS_ONCE 
usages, no?

What I'm trying to prevent is that this code:

unsigned long freepage_order = page_order(page);

if (freepage_order > 0 && freepage_order < MAX_ORDER)
	low_pfn += (1UL << freepage_order) - 1;

could be effectively changed (AFAIK legal for the compiler to do) to:

if (page_order(page) > 0 && page_order(page) < MAX_ORDER)
	low_pfn += (1UL << page_order(page)) - 1;

And thus check a different value than the one that is in the end used to bump low_pfn.

I believe that even though freepage_order itself is not volatile, the 
fact that it was assigned through a volatile access means the compiler 
won't be able to do this anymore.
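
(For reference, ACCESS_ONCE() at this point is essentially just a cast to a 
volatile lvalue, roughly as below from include/linux/compiler.h, and it is that 
volatile access the compiler may neither repeat nor merge:)

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))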

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-09 11:35                               ` Vlastimil Babka
@ 2014-06-09 22:25                                 ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-09 22:25 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Mon, 9 Jun 2014, Vlastimil Babka wrote:

> > > > Sorry, I meant ACCESS_ONCE(page_private(page)) in the migration scanner
> > > 
> > > Hm but that's breaking the abstraction of page_order(). I don't know if
> > > it's
> > > worse to create a new variant of page_order() or to do this. BTW, seems
> > > like
> > > next_active_pageblock() in memory-hotplug.c should use this variant too.
> > > 
> > 
> > The compiler seems free to disregard the access of a volatile object above
> > because the return value of the inline function is unsigned long.  What's
> > the difference between unsigned long order = page_order_unsafe(page) and
> > unsigned long order = (unsigned long)ACCESS_ONCE(page_private(page)) and
> 
> I think there's none functionally, but one is abstraction layer violation and
> the other imply the context of usage as you say (but is that so uncommon?).
> 
> > the compiler being able to reaccess page_private() because the result is
> > no longer volatile qualified?
> 
> You think it will reaccess? That would defeat all current ACCESS_ONCE usages,
> no?
> 

I think the compiler is allowed to turn this into

	if (ACCESS_ONCE(page_private(page)) > 0 &&
	    ACCESS_ONCE(page_private(page)) < MAX_ORDER)
		low_pfn += (1UL << ACCESS_ONCE(page_private(page))) - 1;

since the inline function has a return value of unsigned long but gcc may 
not do this.  I think

	/*
	 * Big fat comment describing why we're using ACCESS_ONCE(), that 
	 * we're ok to race, and that this is meaningful only because of
	 * the previous PageBuddy() check.
	 */
	unsigned long pageblock_order = ACCESS_ONCE(page_private(page));

is better.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-09 22:25                                 ` David Rientjes
@ 2014-06-10  7:26                                   ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-10  7:26 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/10/2014 12:25 AM, David Rientjes wrote:
> On Mon, 9 Jun 2014, Vlastimil Babka wrote:
>
>>>>> Sorry, I meant ACCESS_ONCE(page_private(page)) in the migration scanner
>>>>
>>>> Hm but that's breaking the abstraction of page_order(). I don't know if
>>>> it's
>>>> worse to create a new variant of page_order() or to do this. BTW, seems
>>>> like
>>>> next_active_pageblock() in memory-hotplug.c should use this variant too.
>>>>
>>>
>>> The compiler seems free to disregard the access of a volatile object above
>>> because the return value of the inline function is unsigned long.  What's
>>> the difference between unsigned long order = page_order_unsafe(page) and
>>> unsigned long order = (unsigned long)ACCESS_ONCE(page_private(page)) and
>>
>> I think there's none functionally, but one is abstraction layer violation and
>> the other imply the context of usage as you say (but is that so uncommon?).
>>
>>> the compiler being able to reaccess page_private() because the result is
>>> no longer volatile qualified?
>>
>> You think it will reaccess? That would defeat all current ACCESS_ONCE usages,
>> no?
>>
>
> I think the compiler is allowed to turn this into
>
> 	if (ACCESS_ONCE(page_private(page)) > 0 &&
> 	    ACCESS_ONCE(page_private(page)) < MAX_ORDER)
> 		low_pfn += (1UL << ACCESS_ONCE(page_private(page))) - 1;
>
> since the inline function has a return value of unsigned long but gcc may
> not do this.  I think
>
> 	/*
> 	 * Big fat comment describing why we're using ACCESS_ONCE(), that
> 	 * we're ok to race, and that this is meaningful only because of
> 	 * the previous PageBuddy() check.
> 	 */
> 	unsigned long pageblock_order = ACCESS_ONCE(page_private(page));
>
> is better.

I've talked about it with a gcc guy (although he didn't actually see 
the code, so it might be due to me not explaining it perfectly), and the 
compiler will inline page_order_unsafe() so that there's effectively

unsigned long freepage_order = ACCESS_ONCE(page_private(page));

and now it cannot just replace all freepage_order occurrences with new 
page_private() accesses. So thanks to the inlining, the volatile 
qualification propagates to where it matters. It makes sense to me, but 
whether that's per the C standard or gcc-specific, I don't know.
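
For illustration, here is a minimal userspace sketch of the accessor 
pattern being discussed. ACCESS_ONCE() mirrors the kernel's historical 
definition, while struct page_like, its private field and the MAX_ORDER 
stand-in are made up for this sketch and are not the kernel code:

	#include <stdio.h>

	/* historical kernel definition (later superseded by READ_ONCE()) */
	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	struct page_like {
		unsigned long flags;
		unsigned long private;	/* stands in for page_private(page) */
	};

	static inline unsigned long page_order_unsafe(struct page_like *page)
	{
		/* the volatile access happens here, inside the inline */
		return ACCESS_ONCE(page->private);
	}

	int main(void)
	{
		struct page_like page = { .flags = 0, .private = 3 };
		/* after inlining: one volatile load, stored in a local */
		unsigned long freepage_order = page_order_unsafe(&page);

		/* later uses reuse that snapshot rather than rereading */
		if (freepage_order > 0 && freepage_order < 11 /* ~MAX_ORDER */)
			printf("skip %lu more pfns\n",
			       (1UL << freepage_order) - 1);
		return 0;
	}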



^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-10  7:26                                   ` Vlastimil Babka
@ 2014-06-10 23:54                                     ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-10 23:54 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Tue, 10 Jun 2014, Vlastimil Babka wrote:

> > I think the compiler is allowed to turn this into
> > 
> > 	if (ACCESS_ONCE(page_private(page)) > 0 &&
> > 	    ACCESS_ONCE(page_private(page)) < MAX_ORDER)
> > 		low_pfn += (1UL << ACCESS_ONCE(page_private(page))) - 1;
> > 
> > since the inline function has a return value of unsigned long but gcc may
> > not do this.  I think
> > 
> > 	/*
> > 	 * Big fat comment describing why we're using ACCESS_ONCE(), that
> > 	 * we're ok to race, and that this is meaningful only because of
> > 	 * the previous PageBuddy() check.
> > 	 */
> > 	unsigned long pageblock_order = ACCESS_ONCE(page_private(page));
> > 
> > is better.
> 
> I've talked about it with a gcc guy and (although he didn't actually see the
> code so it might be due to me not explaining it perfectly), the compiler will
> inline page_order_unsafe() so that there's effectively.
> 
> unsigned long freepage_order = ACCESS_ONCE(page_private(page));
> 
> and now it cannot just replace all freepage_order occurences with new
> page_private() accesses. So thanks to the inlining, the volatile qualification
> propagates to where it matters. It makes sense to me, but if it's according to
> standard or gcc specific, I don't know.
> 

I hate to belabor this point, but I think gcc does treat it differently.  
If you compare the assembly for your patch against doing

	unsigned long freepage_order = ACCESS_ONCE(page_private(page));

instead, then if you enable annotation you'll see that gcc treats the 
store as page_x->D.y.private in your patch vs. MEM[(volatile long unsigned 
int *)page_x + 48B] with the above.

I don't have the ability to prove that all versions of gcc optimization 
will not choose to reaccess page_private(page) here, but it does show that 
at least gcc 4.6.3 does not consider them to be equivalents.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-10 23:54                                     ` David Rientjes
@ 2014-06-11 12:18                                       ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-11 12:18 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/11/2014 01:54 AM, David Rientjes wrote:
> On Tue, 10 Jun 2014, Vlastimil Babka wrote:
>
>>> I think the compiler is allowed to turn this into
>>>
>>> 	if (ACCESS_ONCE(page_private(page)) > 0 &&
>>> 	    ACCESS_ONCE(page_private(page)) < MAX_ORDER)
>>> 		low_pfn += (1UL << ACCESS_ONCE(page_private(page))) - 1;
>>>
>>> since the inline function has a return value of unsigned long but gcc may
>>> not do this.  I think
>>>
>>> 	/*
>>> 	 * Big fat comment describing why we're using ACCESS_ONCE(), that
>>> 	 * we're ok to race, and that this is meaningful only because of
>>> 	 * the previous PageBuddy() check.
>>> 	 */
>>> 	unsigned long pageblock_order = ACCESS_ONCE(page_private(page));
>>>
>>> is better.
>>
>> I've talked about it with a gcc guy and (although he didn't actually see the
>> code so it might be due to me not explaining it perfectly), the compiler will
>> inline page_order_unsafe() so that there's effectively.
>>
>> unsigned long freepage_order = ACCESS_ONCE(page_private(page));
>>
>> and now it cannot just replace all freepage_order occurences with new
>> page_private() accesses. So thanks to the inlining, the volatile qualification
>> propagates to where it matters. It makes sense to me, but if it's according to
>> standard or gcc specific, I don't know.
>>
>
> I hate to belabor this point, but I think gcc does treat it differently.
> If you look at the assembly comparing your patch to if you do
>
> 	unsigned long freepage_order = ACCESS_ONCE(page_private(page));
>
> instead, then if you enable annotation you'll see that gcc treats the
> store as page_x->D.y.private in your patch vs. MEM[(volatile long unsigned
> int *)page_x + 48B] with the above.

Hm, are you sure you compiled a version that used page_order_unsafe() and 
not page_order()? Because I do see:

MEM[(volatile long unsigned int *)valid_page_114 + 48B];

That's gcc 4.8.1, but our gcc guy said he tried 4.5+ and they all behaved 
like this, and that it would be a gcc bug if not.
He also did a test where page_order() was called twice in one function and 
page_order_unsafe() twice in another function. page_order() was reduced to 
a single access in the assembly, while page_order_unsafe() produced two 
accesses.
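
For anyone who wants to repeat that experiment, a rough userspace 
approximation could look like the sketch below (struct page_like and the 
helper names are made up for illustration, not kernel code). Building it 
with gcc -O2 -S and reading the assembly should show the plain variant 
collapsed to a single load while the ACCESS_ONCE() variant keeps both:

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	struct page_like {
		unsigned long private;	/* stands in for page_private(page) */
	};

	static inline unsigned long order_plain(struct page_like *p)
	{
		return p->private;		/* like page_order() */
	}

	static inline unsigned long order_once(struct page_like *p)
	{
		return ACCESS_ONCE(p->private);	/* like page_order_unsafe() */
	}

	/* at -O2 the two plain reads are typically merged into one load */
	unsigned long twice_plain(struct page_like *p)
	{
		return order_plain(p) + order_plain(p);
	}

	/* both volatile reads must remain in the generated assembly */
	unsigned long twice_once(struct page_like *p)
	{
		return order_once(p) + order_once(p);
	}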

> I don't have the ability to prove that all versions of gcc optimization
> will not choose to reaccess page_private(page) here, but it does show that
> at least gcc 4.6.3 does not consider them to be equivalents.
>


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-11 12:18                                       ` Vlastimil Babka
@ 2014-06-12  0:21                                         ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-12  0:21 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Wed, 11 Jun 2014, Vlastimil Babka wrote:

> > I hate to belabor this point, but I think gcc does treat it differently.
> > If you look at the assembly comparing your patch to if you do
> > 
> > 	unsigned long freepage_order = ACCESS_ONCE(page_private(page));
> > 
> > instead, then if you enable annotation you'll see that gcc treats the
> > store as page_x->D.y.private in your patch vs. MEM[(volatile long unsigned
> > int *)page_x + 48B] with the above.
> 
> Hm sure you compiled a version that used page_order_unsafe() and not
> page_order()? Because I do see:
> 
> MEM[(volatile long unsigned int *)valid_page_114 + 48B];
> 
> That's gcc 4.8.1, but our gcc guy said he tried 4.5+ and all was like this.
> And that it would be a gcc bug if not.
> He also did a test where page_order was called twice in one function and
> page_order_unsafe twice in another function. page_order() was reduced to a
> single access in the assembly, page_order_unsafe were two accesses.
> 

Ok, and I won't continue to push the point.  I think the lockless 
suitable_migration_target() call that looks at page_order() is fine in the 
free scanner since we use it as a racy check, but it might benefit from 
either a comment describing the behavior or a sanity check for 
page_order(page) <= MAX_ORDER as you've done before.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-12  0:21                                         ` David Rientjes
@ 2014-06-12 11:56                                           ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-12 11:56 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/12/2014 02:21 AM, David Rientjes wrote:
> On Wed, 11 Jun 2014, Vlastimil Babka wrote:
>
>>> I hate to belabor this point, but I think gcc does treat it differently.
>>> If you look at the assembly comparing your patch to if you do
>>>
>>> 	unsigned long freepage_order = ACCESS_ONCE(page_private(page));
>>>
>>> instead, then if you enable annotation you'll see that gcc treats the
>>> store as page_x->D.y.private in your patch vs. MEM[(volatile long unsigned
>>> int *)page_x + 48B] with the above.
>>
>> Hm sure you compiled a version that used page_order_unsafe() and not
>> page_order()? Because I do see:
>>
>> MEM[(volatile long unsigned int *)valid_page_114 + 48B];
>>
>> That's gcc 4.8.1, but our gcc guy said he tried 4.5+ and all was like this.
>> And that it would be a gcc bug if not.
>> He also did a test where page_order was called twice in one function and
>> page_order_unsafe twice in another function. page_order() was reduced to a
>> single access in the assembly, page_order_unsafe were two accesses.
>>
>
> Ok, and I won't continue to push the point.

I'd rather know I'm correct and not just persistent enough :) If you 
confirm that your compiler behaves differently, then maybe making 
page_order_unsafe a #define instead of an inline function would prevent 
this issue?

> I think the lockless
> suitable_migration_target() call that looks at page_order() is fine in the
> free scanner since we use it as a racy check, but it might benefit from
> either a comment describing the behavior or a sanity check for
> page_order(page) <= MAX_ORDER as you've done before.

OK, I'll add that.

^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 6/6] mm, compaction: don't migrate in blocks that cannot be fully compacted in async direct compaction
  2014-06-09  9:06                             ` David Rientjes
@ 2014-06-12 12:18                               ` Vlastimil Babka
  -1 siblings, 0 replies; 267+ messages in thread
From: Vlastimil Babka @ 2014-06-12 12:18 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On 06/09/2014 11:06 AM, David Rientjes wrote:
> On Fri, 6 Jun 2014, Vlastimil Babka wrote:
>
>>> Agreed.  I was thinking higher than 1GB would be possible once we have
>>> your series that does the pageblock skip for thp, I think the expense
>>> would be constant because we won't needlessly be migrating pages unless it
>>> has a good chance at succeeding.
>>
>> Looks like a counter of iterations actually done in scanners, maintained in
>> compact_control, would work better than any memory size based limit? It could
>> better reflect the actual work done and thus latency. Maybe increase the counter
>> also for migrations, with a higher cost than for a scanner iteration.
>>
>
> I'm not sure we can expose that to be configurable by userspace in any
> meaningful way.  We'll want to be able to tune this depending on the size
> of the machine if we are to truly remove the need_resched() heuristic and
> give it a sane default.  I was thinking it would be similar to
> khugepaged's pages_to_scan value that it uses on each wakeup.

Perhaps userspace could see the value in memory-size units, which would 
be translated to pages_to_scan assuming the worst case, i.e. scanning 
each page. That value would then limit the iterations, so if we end up 
skipping blocks of pages instead of single pages for whatever reason, 
we can effectively scan a bigger memory size with the same effort.
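
A rough sketch of that translation, with made-up names and a 4 KiB page 
size assumed purely for illustration:

	#include <stdio.h>

	#define PAGE_SHIFT	12	/* 4 KiB pages assumed */

	/* a byte limit becomes a worst-case per-attempt iteration budget */
	static unsigned long limit_to_pages_to_scan(unsigned long limit_bytes)
	{
		return limit_bytes >> PAGE_SHIFT;
	}

	int main(void)
	{
		unsigned long limit = 1UL << 30;	/* e.g. a 1GB tunable */

		printf("iteration budget: %lu pages\n",
		       limit_to_pages_to_scan(limit));
		return 0;
	}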

>>> This does beg the question about parallel direct compactors, though, that
>>> will be contending on the same coarse zone->lru_lock locks and immediately
>>> aborting and falling back to PAGE_SIZE pages for thp faults that will be
>>> more likely if your patch to grab the high-order page and return it to the
>>> page allocator is merged.
>>
>> Hm can you explain how the page capturing makes this worse? I don't see it.
>>
>
> I was expecting that your patch to capture the high-order page made a
> difference because the zone watermark check doesn't imply the high-order
> page will be allocatable after we return to the page allocator to allocate
> it.  In that case, we terminated compaction prematurely.

In fact compact_finished() does both a watermark check and then a 
free_list check, and only if both pass does it exit. But page allocation 
then does another watermark check, which may fail (due to its raciness 
and drift) even though the page is still available on the free_list.

> If that's true,
> then it seems like no parallel thp allocator will be able to allocate
> memory that another direct compactor has freed without entering compaction
> itself on a fragmented machine, and thus an increase in zone->lru_lock
> contention if there's migratable memory.

I think it's only fair if the one who did the compaction work can 
allocate the page. Another compactor then has to do its own work, so in 
the end it's 2 units of work for 2 allocations (assuming success). 
Without that fairness, it might be 2 units of work by a single allocator 
for 2 successful allocations by two allocators. Or, as you seem to 
imply, 1 unit of work for 1 successful allocation, because the one doing 
the work will terminate prematurely and end up without an allocation.
If we really rely on this premature termination for contention 
prevention, then it seems quite unfair and fragile to me.

> Having 32 cpus fault thp memory and all entering compaction and contending
> (and aborting because of contention, currently) on zone->lru_lock is a
> really bad situation.

I'm not sure the premature termination could prevent this reliably; I 
rather doubt it. The lock contention checks should work just fine in 
this case. And I don't think it's that bad if they abort due to 
contention, as long as it happens quickly. It means that in such a 
situation it's simply a better performance tradeoff to give up on THP 
and fall back to 4k allocation. Also, you say "currently", but we are 
not going to change that for lock contention, are we?


^ permalink raw reply	[flat|nested] 267+ messages in thread

* Re: [RFC PATCH 4/6] mm, compaction: skip buddy pages by their order in the migrate scanner
  2014-06-12 11:56                                           ` Vlastimil Babka
@ 2014-06-12 21:48                                             ` David Rientjes
  -1 siblings, 0 replies; 267+ messages in thread
From: David Rientjes @ 2014-06-12 21:48 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Andrew Morton, Greg Thelen, Minchan Kim,
	Mel Gorman, Joonsoo Kim, Michal Nazarewicz, Naoya Horiguchi,
	Christoph Lameter, Rik van Riel

On Thu, 12 Jun 2014, Vlastimil Babka wrote:

> > Ok, and I won't continue to push the point.
> 
> I'd rather know I'm correct and not just persistent enough :) If you confirm
> that your compiler behaves differently, then maybe making page_order_unsafe a
> #define instead of inline function would prevent this issue?
> 

The reason I was hesitant is that there's no way I can prove, for all 
possible circumstances in which page_order_unsafe() could be used, that 
gcc won't decide to reaccess.  I personally didn't think that doing

	if (PageBuddy(page)) {
		/*
		 * Racy check since we know PageBuddy() is true and we do
		 * some sanity checking on this scan to ensure it is an
		 * appropriate order.
		 */
		unsigned long order = ACCESS_ONCE(page_private(page));
		...
	}

was too much of a problem, and it actually puts the ACCESS_ONCE() in the 
context where it matters rather than hiding it behind an inline function.

> > I think the lockless
> > suitable_migration_target() call that looks at page_order() is fine in the
> > free scanner since we use it as a racy check, but it might benefit from
> > either a comment describing the behavior or a sanity check for
> > page_order(page) <= MAX_ORDER as you've done before.
> 
> OK, I'll add that.
> 

Thanks!

^ permalink raw reply	[flat|nested] 267+ messages in thread
