All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3 v3] mm: Factor out memory isolate functions
@ 2012-07-12  2:50 ` Minchan Kim
  0 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  2:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar, Minchan Kim, Andi Kleen,
	Marek Szyprowski

Now mm/page_alloc.c has some memory isolation functions but they
are used only when we enable CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}.
So let's make it configurable by new CONFIG_MEMORY_ISOLATION so that it
can reduce binary size and we can check it simply by CONFIG_MEMORY_ISOLATION,
not if defined CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}.

* from v2
 - rebase on mmotm-2012-07-10-16-59

* from v1
 - rebase on next-20120626

Cc: Andi Kleen <andi@firstfloor.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/base/Kconfig           |    1 +
 include/linux/page-isolation.h |   13 +++++--
 mm/Kconfig                     |    5 +++
 mm/Makefile                    |    4 +-
 mm/page_alloc.c                |   80 ++--------------------------------------
 mm/page_isolation.c            |   71 +++++++++++++++++++++++++++++++++++
 6 files changed, 92 insertions(+), 82 deletions(-)

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9b21469..08b4c52 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -196,6 +196,7 @@ config CMA
 	bool "Contiguous Memory Allocator (EXPERIMENTAL)"
 	depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
 	select MIGRATION
+	select MEMORY_ISOLATION
 	help
 	  This enables the Contiguous Memory Allocator which allows drivers
 	  to allocate big physically-contiguous blocks of memory for use with
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 3bdcab3..105077a 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -1,6 +1,11 @@
 #ifndef __LINUX_PAGEISOLATION_H
 #define __LINUX_PAGEISOLATION_H
 
+
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
+void set_pageblock_migratetype(struct page *page, int migratetype);
+int move_freepages_block(struct zone *zone, struct page *page,
+				int migratetype);
 /*
  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
  * If specified range includes migrate types other than MOVABLE or CMA,
@@ -10,7 +15,7 @@
  * free all pages in the range. test_page_isolated() can be used for
  * test it.
  */
-extern int
+int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			 unsigned migratetype);
 
@@ -18,7 +23,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
  * target range is [start_pfn, end_pfn)
  */
-extern int
+int
 undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			unsigned migratetype);
 
@@ -30,8 +35,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
 /*
  * Internal functions. Changes pageblock's migrate type.
  */
-extern int set_migratetype_isolate(struct page *page);
-extern void unset_migratetype_isolate(struct page *page, unsigned migratetype);
+int set_migratetype_isolate(struct page *page);
+void unset_migratetype_isolate(struct page *page, unsigned migratetype);
 
 
 #endif
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4e..d5c8019 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
 config NO_BOOTMEM
 	boolean
 
+config MEMORY_ISOLATION
+	boolean
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
+	select MEMORY_ISOLATION
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
 	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
 	depends on MMU
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
+	select MEMORY_ISOLATION
 	help
 	  Enables code to recover from some memory failures on systems
 	  with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 262360a..7deaa29 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o mm_init.o mmu_context.o percpu.o \
-			   compaction.o $(mmu-y)
+			   mm_init.o mmu_context.o percpu.o compaction.o $(mmu-y)
 obj-y += init-mm.o
 
 ifdef CONFIG_NO_BOOTMEM
@@ -55,3 +54,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd94467..f17e6e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
-#include <linux/memory.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
@@ -219,7 +218,7 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
 	if (unlikely(page_group_by_mobility_disabled))
@@ -953,7 +952,7 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }
 
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
@@ -5468,8 +5467,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
  * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
  * expect this function should be exact.
  */
-static bool
-__has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 {
 	unsigned long pfn, iter, found;
 	int mt;
@@ -5546,77 +5544,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 			zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;
 
-	return !__has_unmovable_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
-	struct zone *zone;
-	unsigned long flags, pfn;
-	struct memory_isolate_notify arg;
-	int notifier_ret;
-	int ret = -EBUSY;
-
-	zone = page_zone(page);
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	pfn = page_to_pfn(page);
-	arg.start_pfn = pfn;
-	arg.nr_pages = pageblock_nr_pages;
-	arg.pages_found = 0;
-
-	/*
-	 * It may be possible to isolate a pageblock even if the
-	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
-	 * notifier chain is used by balloon drivers to return the
-	 * number of pages in a range that are held by the balloon
-	 * driver to shrink memory. If all the pages are accounted for
-	 * by balloons, are free, or on the LRU, isolation can continue.
-	 * Later, for example, when memory hotplug notifier runs, these
-	 * pages reported as "can be isolated" should be isolated(freed)
-	 * by the balloon driver through the memory notifier chain.
-	 */
-	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-	notifier_ret = notifier_to_errno(notifier_ret);
-	if (notifier_ret)
-		goto out;
-	/*
-	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
-	 * We just check MOVABLE pages.
-	 */
-	if (!__has_unmovable_pages(zone, page, arg.pages_found))
-		ret = 0;
-	/*
-	 * Unmovable means "not-on-lru" pages. If Unmovable pages are
-	 * larger than removable-by-driver pages reported by notifier,
-	 * we'll fail.
-	 */
-
-out:
-	if (!ret) {
-		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-		move_freepages_block(zone, page, MIGRATE_ISOLATE);
-	}
-
-	spin_unlock_irqrestore(&zone->lock, flags);
-	if (!ret)
-		drain_all_pages();
-	return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
-	struct zone *zone;
-	unsigned long flags;
-	zone = page_zone(page);
-	spin_lock_irqsave(&zone->lock, flags);
-	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-		goto out;
-	set_pageblock_migratetype(page, migratetype);
-	move_freepages_block(zone, page, migratetype);
-out:
-	spin_unlock_irqrestore(&zone->lock, flags);
+	return !has_unmovable_pages(zone, page, 0);
 }
 
 #ifdef CONFIG_CMA
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f0477..fb482cf 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,79 @@
 #include <linux/mm.h>
 #include <linux/page-isolation.h>
 #include <linux/pageblock-flags.h>
+#include <linux/memory.h>
 #include "internal.h"
 
+int set_migratetype_isolate(struct page *page)
+{
+	struct zone *zone;
+	unsigned long flags, pfn;
+	struct memory_isolate_notify arg;
+	int notifier_ret;
+	int ret = -EBUSY;
+
+	zone = page_zone(page);
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	pfn = page_to_pfn(page);
+	arg.start_pfn = pfn;
+	arg.nr_pages = pageblock_nr_pages;
+	arg.pages_found = 0;
+
+	/*
+	 * It may be possible to isolate a pageblock even if the
+	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
+	 * notifier chain is used by balloon drivers to return the
+	 * number of pages in a range that are held by the balloon
+	 * driver to shrink memory. If all the pages are accounted for
+	 * by balloons, are free, or on the LRU, isolation can continue.
+	 * Later, for example, when memory hotplug notifier runs, these
+	 * pages reported as "can be isolated" should be isolated(freed)
+	 * by the balloon driver through the memory notifier chain.
+	 */
+	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+	notifier_ret = notifier_to_errno(notifier_ret);
+	if (notifier_ret)
+		goto out;
+	/*
+	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+	 * We just check MOVABLE pages.
+	 */
+	if (!has_unmovable_pages(zone, page, arg.pages_found))
+		ret = 0;
+
+	/*
+	 * immobile means "not-on-lru" paes. If immobile is larger than
+	 * removable-by-driver pages reported by notifier, we'll fail.
+	 */
+
+out:
+	if (!ret) {
+		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+		move_freepages_block(zone, page, MIGRATE_ISOLATE);
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+	if (!ret)
+		drain_all_pages();
+	return ret;
+}
+
+void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+{
+	struct zone *zone;
+	unsigned long flags;
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lock, flags);
+	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		goto out;
+	set_pageblock_migratetype(page, migratetype);
+	move_freepages_block(zone, page, migratetype);
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
 static inline struct page *
 __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 {
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 1/3 v3] mm: Factor out memory isolate functions
@ 2012-07-12  2:50 ` Minchan Kim
  0 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  2:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar, Minchan Kim, Andi Kleen,
	Marek Szyprowski

Now mm/page_alloc.c has some memory isolation functions but they
are used only when we enable CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}.
So let's make it configurable by new CONFIG_MEMORY_ISOLATION so that it
can reduce binary size and we can check it simply by CONFIG_MEMORY_ISOLATION,
not if defined CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}.

* from v2
 - rebase on mmotm-2012-07-10-16-59

* from v1
 - rebase on next-20120626

Cc: Andi Kleen <andi@firstfloor.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/base/Kconfig           |    1 +
 include/linux/page-isolation.h |   13 +++++--
 mm/Kconfig                     |    5 +++
 mm/Makefile                    |    4 +-
 mm/page_alloc.c                |   80 ++--------------------------------------
 mm/page_isolation.c            |   71 +++++++++++++++++++++++++++++++++++
 6 files changed, 92 insertions(+), 82 deletions(-)

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9b21469..08b4c52 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -196,6 +196,7 @@ config CMA
 	bool "Contiguous Memory Allocator (EXPERIMENTAL)"
 	depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
 	select MIGRATION
+	select MEMORY_ISOLATION
 	help
 	  This enables the Contiguous Memory Allocator which allows drivers
 	  to allocate big physically-contiguous blocks of memory for use with
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 3bdcab3..105077a 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -1,6 +1,11 @@
 #ifndef __LINUX_PAGEISOLATION_H
 #define __LINUX_PAGEISOLATION_H
 
+
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
+void set_pageblock_migratetype(struct page *page, int migratetype);
+int move_freepages_block(struct zone *zone, struct page *page,
+				int migratetype);
 /*
  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
  * If specified range includes migrate types other than MOVABLE or CMA,
@@ -10,7 +15,7 @@
  * free all pages in the range. test_page_isolated() can be used for
  * test it.
  */
-extern int
+int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			 unsigned migratetype);
 
@@ -18,7 +23,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
  * target range is [start_pfn, end_pfn)
  */
-extern int
+int
 undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			unsigned migratetype);
 
@@ -30,8 +35,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
 /*
  * Internal functions. Changes pageblock's migrate type.
  */
-extern int set_migratetype_isolate(struct page *page);
-extern void unset_migratetype_isolate(struct page *page, unsigned migratetype);
+int set_migratetype_isolate(struct page *page);
+void unset_migratetype_isolate(struct page *page, unsigned migratetype);
 
 
 #endif
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4e..d5c8019 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
 config NO_BOOTMEM
 	boolean
 
+config MEMORY_ISOLATION
+	boolean
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
+	select MEMORY_ISOLATION
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
 	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
 	depends on MMU
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
+	select MEMORY_ISOLATION
 	help
 	  Enables code to recover from some memory failures on systems
 	  with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 262360a..7deaa29 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o mm_init.o mmu_context.o percpu.o \
-			   compaction.o $(mmu-y)
+			   mm_init.o mmu_context.o percpu.o compaction.o $(mmu-y)
 obj-y += init-mm.o
 
 ifdef CONFIG_NO_BOOTMEM
@@ -55,3 +54,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd94467..f17e6e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
-#include <linux/memory.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
@@ -219,7 +218,7 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
 	if (unlikely(page_group_by_mobility_disabled))
@@ -953,7 +952,7 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }
 
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
@@ -5468,8 +5467,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
  * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
  * expect this function should be exact.
  */
-static bool
-__has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 {
 	unsigned long pfn, iter, found;
 	int mt;
@@ -5546,77 +5544,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 			zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;
 
-	return !__has_unmovable_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
-	struct zone *zone;
-	unsigned long flags, pfn;
-	struct memory_isolate_notify arg;
-	int notifier_ret;
-	int ret = -EBUSY;
-
-	zone = page_zone(page);
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	pfn = page_to_pfn(page);
-	arg.start_pfn = pfn;
-	arg.nr_pages = pageblock_nr_pages;
-	arg.pages_found = 0;
-
-	/*
-	 * It may be possible to isolate a pageblock even if the
-	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
-	 * notifier chain is used by balloon drivers to return the
-	 * number of pages in a range that are held by the balloon
-	 * driver to shrink memory. If all the pages are accounted for
-	 * by balloons, are free, or on the LRU, isolation can continue.
-	 * Later, for example, when memory hotplug notifier runs, these
-	 * pages reported as "can be isolated" should be isolated(freed)
-	 * by the balloon driver through the memory notifier chain.
-	 */
-	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-	notifier_ret = notifier_to_errno(notifier_ret);
-	if (notifier_ret)
-		goto out;
-	/*
-	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
-	 * We just check MOVABLE pages.
-	 */
-	if (!__has_unmovable_pages(zone, page, arg.pages_found))
-		ret = 0;
-	/*
-	 * Unmovable means "not-on-lru" pages. If Unmovable pages are
-	 * larger than removable-by-driver pages reported by notifier,
-	 * we'll fail.
-	 */
-
-out:
-	if (!ret) {
-		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-		move_freepages_block(zone, page, MIGRATE_ISOLATE);
-	}
-
-	spin_unlock_irqrestore(&zone->lock, flags);
-	if (!ret)
-		drain_all_pages();
-	return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
-	struct zone *zone;
-	unsigned long flags;
-	zone = page_zone(page);
-	spin_lock_irqsave(&zone->lock, flags);
-	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-		goto out;
-	set_pageblock_migratetype(page, migratetype);
-	move_freepages_block(zone, page, migratetype);
-out:
-	spin_unlock_irqrestore(&zone->lock, flags);
+	return !has_unmovable_pages(zone, page, 0);
 }
 
 #ifdef CONFIG_CMA
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f0477..fb482cf 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,79 @@
 #include <linux/mm.h>
 #include <linux/page-isolation.h>
 #include <linux/pageblock-flags.h>
+#include <linux/memory.h>
 #include "internal.h"
 
+int set_migratetype_isolate(struct page *page)
+{
+	struct zone *zone;
+	unsigned long flags, pfn;
+	struct memory_isolate_notify arg;
+	int notifier_ret;
+	int ret = -EBUSY;
+
+	zone = page_zone(page);
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	pfn = page_to_pfn(page);
+	arg.start_pfn = pfn;
+	arg.nr_pages = pageblock_nr_pages;
+	arg.pages_found = 0;
+
+	/*
+	 * It may be possible to isolate a pageblock even if the
+	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
+	 * notifier chain is used by balloon drivers to return the
+	 * number of pages in a range that are held by the balloon
+	 * driver to shrink memory. If all the pages are accounted for
+	 * by balloons, are free, or on the LRU, isolation can continue.
+	 * Later, for example, when memory hotplug notifier runs, these
+	 * pages reported as "can be isolated" should be isolated(freed)
+	 * by the balloon driver through the memory notifier chain.
+	 */
+	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+	notifier_ret = notifier_to_errno(notifier_ret);
+	if (notifier_ret)
+		goto out;
+	/*
+	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+	 * We just check MOVABLE pages.
+	 */
+	if (!has_unmovable_pages(zone, page, arg.pages_found))
+		ret = 0;
+
+	/*
+	 * immobile means "not-on-lru" paes. If immobile is larger than
+	 * removable-by-driver pages reported by notifier, we'll fail.
+	 */
+
+out:
+	if (!ret) {
+		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+		move_freepages_block(zone, page, MIGRATE_ISOLATE);
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+	if (!ret)
+		drain_all_pages();
+	return ret;
+}
+
+void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+{
+	struct zone *zone;
+	unsigned long flags;
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lock, flags);
+	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		goto out;
+	set_pageblock_migratetype(page, migratetype);
+	move_freepages_block(zone, page, migratetype);
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
 static inline struct page *
 __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 {
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
  2012-07-12  2:50 ` Minchan Kim
@ 2012-07-12  2:50   ` Minchan Kim
  -1 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  2:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar, Minchan Kim

In __zone_watermark_ok, free and min are signed long type
while z->lowmem_reserve[classzone_idx] is unsigned long type.
So comparison of them could be wrong due to type conversion
to unsigned although free_pages is a negative value.

It could return true instead of false in case of order-0 check
so that kswapd could sleep forever. It means livelock because
direct reclaimer loops forever until kswapd set
zone->all_unreclaimable.

Aaditya reported this problem when he tested my hotplug patch.

Reported-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
This patch isn't dependent with this series.
It seems to be candidate for -stable but I'm not sure because of this part.
So, pass the decision to akpm.

" - It must fix a real bug that bothers people (not a, "This could be a
   problem..." type thing)."

 mm/page_alloc.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f17e6e4..627653c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
+	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 
 	free_pages -= (1 << order) - 1;
@@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
-	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+	if (free_pages <= min + lowmem_reserve)
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
@ 2012-07-12  2:50   ` Minchan Kim
  0 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  2:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar, Minchan Kim

In __zone_watermark_ok, free and min are signed long type
while z->lowmem_reserve[classzone_idx] is unsigned long type.
So comparison of them could be wrong due to type conversion
to unsigned although free_pages is a negative value.

It could return true instead of false in case of order-0 check
so that kswapd could sleep forever. It means livelock because
direct reclaimer loops forever until kswapd set
zone->all_unreclaimable.

Aaditya reported this problem when he tested my hotplug patch.

Reported-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
This patch isn't dependent with this series.
It seems to be candidate for -stable but I'm not sure because of this part.
So, pass the decision to akpm.

" - It must fix a real bug that bothers people (not a, "This could be a
   problem..." type thing)."

 mm/page_alloc.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f17e6e4..627653c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
+	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 
 	free_pages -= (1 << order) - 1;
@@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
-	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+	if (free_pages <= min + lowmem_reserve)
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
  2012-07-12  2:50 ` Minchan Kim
@ 2012-07-12  2:50   ` Minchan Kim
  -1 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  2:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar, Minchan Kim

When hotplug offlining happens on zone A, it starts to mark freed page
as MIGRATE_ISOLATE type in buddy for preventing further allocation.
(MIGRATE_ISOLATE is a very ironic type because it's apparently on buddy
but we can't allocate them).
When the memory shortage happens during hotplug offlining,
current task starts to reclaim, then wake up kswapd.
Kswapd checks watermark, then go sleep because current zone_watermark_ok_safe
doesn't consider MIGRATE_ISOLATE freed page count.
The current task continues to reclaim in the direct reclaim path without kswapd's help.
The problem is that zone->all_unreclaimable is set by only kswapd
so that current task would be looping forever like below.

__alloc_pages_slowpath
restart:
	wake_all_kswapd
rebalance:
	__alloc_pages_direct_reclaim
		do_try_to_free_pages
			if global_reclaim && !all_unreclaimable
				return 1; /* It means we did did_some_progress */
	skip __alloc_pages_may_oom
	should_alloc_retry
		goto rebalance;

If we apply KOSAKI's patch[1] which doesn't depend on kswapd
about setting zone->all_unreclaimable, we can solve this problem
by killing some task in the direct reclaim path. But it still doesn't wake up kswapd.
It could be a problem still if other subsystem needs GFP_ATOMIC request.
So kswapd should consider MIGRATE_ISOLATE when it calculates free pages
BEFORE going sleep.

This patch counts the number of MIGRATE_ISOLATE page block and
zone_watermark_ok_safe will consider it if the system has such blocks
(fortunately, it's very rare so no problem in POV overhead and kswapd is never
hotpath).

Copy/modify from Mel's quote
"
Ideal solution would be "allocating" the pageblock.
It would keep the free space accounting as it is but historically,
memory hotplug didn't allocate pages because it would be difficult to
detect if a pageblock was isolated or if part of some balloon.
Allocating just full pageblocks would work around this, However,
it would play very badly with CMA.
"

[1] http://lkml.org/lkml/2012/6/14/74

* from v2
 - rebased on mmotm-2012-07-10-16-59
 - Add Tested-by

* from v1
 - add changelog
 - make functions simple
 - remove atomic variable
 - discard exact isolated free page accounting.
 - rebased on next-20120626

Suggested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
---
 include/linux/mmzone.h |    8 ++++++++
 mm/page_alloc.c        |   31 +++++++++++++++++++++++++++++++
 mm/page_isolation.c    |   29 +++++++++++++++++++++++++++--
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3219014..3bd253e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -477,6 +477,14 @@ struct zone {
 	 * rarely used fields:
 	 */
 	const char		*name;
+#ifdef CONFIG_MEMORY_ISOLATION
+	/*
+	 * the number of MIGRATE_ISOLATE *pageblock*.
+	 * We need this for free page counting. Look at zone_watermark_ok_safe.
+	 * It's protected by zone->lock
+	 */
+	int		nr_pageblock_isolate;
+#endif
 } ____cacheline_internodealigned_in_smp;
 
 typedef enum {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 627653c..980c75b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -218,6 +218,11 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
@@ -1618,6 +1623,23 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return true;
 }
 
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	unsigned long nr_pages = 0;
+
+	if (unlikely(zone->nr_pageblock_isolate)) {
+		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
+	}
+	return nr_pages;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	return 0;
+}
+#endif
+
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
@@ -1633,6 +1655,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
+	/*
+	 * If the zone has MIGRATE_ISOLATE type free page,
+	 * we should consider it. nr_zone_isolate_freepages is never
+	 * accurate so kswapd might not sleep although she can.
+	 * But it's more desirable for memory hotplug rather than
+	 * forever sleep which cause livelock in direct reclaim path.
+	 */
+	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 								free_pages);
 }
@@ -4397,6 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		lruvec_init(&zone->lruvec, zone);
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
+		zone->nr_pageblock_isolate = 0;
 		if (!size)
 			continue;
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index fb482cf..64abb33 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,6 +8,31 @@
 #include <linux/memory.h>
 #include "internal.h"
 
+/* called by holding zone->lock */
+static void set_pageblock_isolate(struct zone *zone, struct page *page)
+{
+	BUG_ON(page_zone(page) != zone);
+
+	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
+		return;
+
+	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+	zone->nr_pageblock_isolate++;
+}
+
+/* called by holding zone->lock */
+static void restore_pageblock_isolate(struct zone *zone, struct page *page,
+		int migratetype)
+{
+	BUG_ON(page_zone(page) != zone);
+	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
+		return;
+
+	BUG_ON(zone->nr_pageblock_isolate <= 0);
+	set_pageblock_migratetype(page, migratetype);
+	zone->nr_pageblock_isolate--;
+}
+
 int set_migratetype_isolate(struct page *page)
 {
 	struct zone *zone;
@@ -54,7 +79,7 @@ int set_migratetype_isolate(struct page *page)
 
 out:
 	if (!ret) {
-		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+		set_pageblock_isolate(zone, page);
 		move_freepages_block(zone, page, MIGRATE_ISOLATE);
 	}
 
@@ -72,8 +97,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	spin_lock_irqsave(&zone->lock, flags);
 	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 		goto out;
-	set_pageblock_migratetype(page, migratetype);
 	move_freepages_block(zone, page, migratetype);
+	restore_pageblock_isolate(zone, page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
@ 2012-07-12  2:50   ` Minchan Kim
  0 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  2:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar, Minchan Kim

When hotplug offlining happens on zone A, it starts to mark freed page
as MIGRATE_ISOLATE type in buddy for preventing further allocation.
(MIGRATE_ISOLATE is a very ironic type because such pages are apparently
in the buddy allocator, yet we can't allocate them).
When a memory shortage happens during hotplug offlining,
the current task starts to reclaim and then wakes up kswapd.
Kswapd checks the watermark and then goes to sleep, because the current
zone_watermark_ok_safe doesn't consider the MIGRATE_ISOLATE free page count.
The current task continues to reclaim in the direct reclaim path without
kswapd's help. The problem is that zone->all_unreclaimable is set only by
kswapd, so the current task would loop forever as shown below.

__alloc_pages_slowpath
restart:
	wake_all_kswapd
rebalance:
	__alloc_pages_direct_reclaim
		do_try_to_free_pages
			if global_reclaim && !all_unreclaimable
				return 1; /* It means we did did_some_progress */
	skip __alloc_pages_may_oom
	should_alloc_retry
		goto rebalance;

If we apply KOSAKI's patch[1], which doesn't depend on kswapd
for setting zone->all_unreclaimable, we can solve this problem
by killing some task in the direct reclaim path. But it still doesn't wake up
kswapd, which could remain a problem if another subsystem needs a GFP_ATOMIC
request. So kswapd should consider MIGRATE_ISOLATE when it calculates free
pages BEFORE going to sleep.

This patch counts the number of MIGRATE_ISOLATE pageblocks, and
zone_watermark_ok_safe will take them into account if the system has such
blocks (fortunately, this is very rare, so there is no problem from an
overhead point of view, and kswapd is never a hot path).

Copy/modify from Mel's quote
"
Ideal solution would be "allocating" the pageblock.
It would keep the free space accounting as it is but historically,
memory hotplug didn't allocate pages because it would be difficult to
detect if a pageblock was isolated or if part of some balloon.
Allocating just full pageblocks would work around this. However,
it would play very badly with CMA.
"

[1] http://lkml.org/lkml/2012/6/14/74

* from v2
 - rebased on mmotm-2012-07-10-16-59
 - Add Tested-by

* from v1
 - add changelog
 - make functions simple
 - remove atomic variable
 - discard exact isolated free page accounting.
 - rebased on next-20120626

Suggested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
---
 include/linux/mmzone.h |    8 ++++++++
 mm/page_alloc.c        |   31 +++++++++++++++++++++++++++++++
 mm/page_isolation.c    |   29 +++++++++++++++++++++++++++--
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3219014..3bd253e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -477,6 +477,14 @@ struct zone {
 	 * rarely used fields:
 	 */
 	const char		*name;
+#ifdef CONFIG_MEMORY_ISOLATION
+	/*
+	 * the number of MIGRATE_ISOLATE *pageblock*.
+	 * We need this for free page counting. Look at zone_watermark_ok_safe.
+	 * It's protected by zone->lock
+	 */
+	int		nr_pageblock_isolate;
+#endif
 } ____cacheline_internodealigned_in_smp;
 
 typedef enum {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 627653c..980c75b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -218,6 +218,11 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
@@ -1618,6 +1623,23 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return true;
 }
 
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	unsigned long nr_pages = 0;
+
+	if (unlikely(zone->nr_pageblock_isolate)) {
+		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
+	}
+	return nr_pages;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	return 0;
+}
+#endif
+
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
@@ -1633,6 +1655,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
+	/*
+	 * If the zone has MIGRATE_ISOLATE type free page,
+	 * we should consider it. nr_zone_isolate_freepages is never
+	 * accurate so kswapd might not sleep although she can.
+	 * But it's more desirable for memory hotplug rather than
+	 * forever sleep which cause livelock in direct reclaim path.
+	 */
+	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 								free_pages);
 }
@@ -4397,6 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		lruvec_init(&zone->lruvec, zone);
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
+		zone->nr_pageblock_isolate = 0;
 		if (!size)
 			continue;
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index fb482cf..64abb33 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,6 +8,31 @@
 #include <linux/memory.h>
 #include "internal.h"
 
+/* called by holding zone->lock */
+static void set_pageblock_isolate(struct zone *zone, struct page *page)
+{
+	BUG_ON(page_zone(page) != zone);
+
+	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
+		return;
+
+	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+	zone->nr_pageblock_isolate++;
+}
+
+/* called by holding zone->lock */
+static void restore_pageblock_isolate(struct zone *zone, struct page *page,
+		int migratetype)
+{
+	BUG_ON(page_zone(page) != zone);
+	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
+		return;
+
+	BUG_ON(zone->nr_pageblock_isolate <= 0);
+	set_pageblock_migratetype(page, migratetype);
+	zone->nr_pageblock_isolate--;
+}
+
 int set_migratetype_isolate(struct page *page)
 {
 	struct zone *zone;
@@ -54,7 +79,7 @@ int set_migratetype_isolate(struct page *page)
 
 out:
 	if (!ret) {
-		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+		set_pageblock_isolate(zone, page);
 		move_freepages_block(zone, page, MIGRATE_ISOLATE);
 	}
 
@@ -72,8 +97,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	spin_lock_irqsave(&zone->lock, flags);
 	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 		goto out;
-	set_pageblock_migratetype(page, migratetype);
 	move_freepages_block(zone, page, migratetype);
+	restore_pageblock_isolate(zone, page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
  2012-07-12  2:50   ` Minchan Kim
@ 2012-07-12  8:19     ` Michal Hocko
  -1 siblings, 0 replies; 18+ messages in thread
From: Michal Hocko @ 2012-07-12  8:19 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
	KOSAKI Motohiro, Kamezawa Hiroyuki, Aaditya Kumar

On Thu 12-07-12 11:50:48, Minchan Kim wrote:
> In __zone_watermark_ok, free and min are of signed long type
> while z->lowmem_reserve[classzone_idx] is of unsigned long type.
> So a comparison of them could be wrong due to type conversion
> to unsigned although free_pages is a negative value.

Agreed on that
but
> 
> It could return true instead of false in case of order-0 check
> so that kswapd could sleep forever. 

I am kind of lost here. How can we have negative free_pages with
order-0? It would need to come with a negative value because 
free_pages -= (1 << order) - 1;
won't make it negative.

> It means livelock because direct reclaimer loops forever until kswapd
> set zone->all_unreclaimable.
> 
> Aaditya reported this problem when he test my hotplug patch.
> 
> Reported-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> Signed-off-by: Minchan Kim <minchan@kernel.org>

So you can add my Reviewed-by: Michal Hocko <mhocko@suse.cz>
but the changelog could be more clear.

> ---
> This patch isn't dependent with this series.
> It seems to be candidate for -stable but I'm not sure because of this part.
> So, pass the decision to akpm.
> 
> " - It must fix a real bug that bothers people (not a, "This could be a
>    problem..." type thing)."

I am wondering what Tested-by means if "This could be a problem..."
type thing)."

> 
>  mm/page_alloc.c |    3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f17e6e4..627653c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  {
>  	/* free_pages my go negative - that's OK */
>  	long min = mark;
> +	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
>  	int o;
>  
>  	free_pages -= (1 << order) - 1;
> @@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  	if (alloc_flags & ALLOC_HARDER)
>  		min -= min / 4;
>  
> -	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> +	if (free_pages <= min + lowmem_reserve)
>  		return false;
>  	for (o = 0; o < order; o++) {
>  		/* At the next order, this order's pages become unavailable */
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Michal Hocko
SUSE Labs
SUSE LINUX s.r.o.
Lihovarska 1060/12
190 00 Praha 9    
Czech Republic

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
@ 2012-07-12  8:19     ` Michal Hocko
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Hocko @ 2012-07-12  8:19 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
	KOSAKI Motohiro, Kamezawa Hiroyuki, Aaditya Kumar

On Thu 12-07-12 11:50:48, Minchan Kim wrote:
> In __zone_watermark_ok, free and min are of signed long type
> while z->lowmem_reserve[classzone_idx] is of unsigned long type.
> So a comparison of them could be wrong due to type conversion
> to unsigned although free_pages is a negative value.

Agreed on that
but
> 
> It could return true instead of false in case of order-0 check
> so that kswapd could sleep forever. 

I am kind of lost here. How can we have negative free_pages with
order-0? It would need to come with a negative value because 
free_pages -= (1 << order) - 1;
won't make it negative.

> It means livelock because direct reclaimer loops forever until kswapd
> set zone->all_unreclaimable.
> 
> Aaditya reported this problem when he test my hotplug patch.
> 
> Reported-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> Signed-off-by: Minchan Kim <minchan@kernel.org>

So you can add my Reviewed-by: Michal Hocko <mhocko@suse.cz>
but the changelog could be more clear.

> ---
> This patch isn't dependent with this series.
> It seems to be candidate for -stable but I'm not sure because of this part.
> So, pass the decision to akpm.
> 
> " - It must fix a real bug that bothers people (not a, "This could be a
>    problem..." type thing)."

I am wondering what Tested-by means if "This could be a problem..."
type thing)."

> 
>  mm/page_alloc.c |    3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f17e6e4..627653c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  {
>  	/* free_pages my go negative - that's OK */
>  	long min = mark;
> +	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
>  	int o;
>  
>  	free_pages -= (1 << order) - 1;
> @@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  	if (alloc_flags & ALLOC_HARDER)
>  		min -= min / 4;
>  
> -	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> +	if (free_pages <= min + lowmem_reserve)
>  		return false;
>  	for (o = 0; o < order; o++) {
>  		/* At the next order, this order's pages become unavailable */
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Michal Hocko
SUSE Labs
SUSE LINUX s.r.o.
Lihovarska 1060/12
190 00 Praha 9    
Czech Republic

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
  2012-07-12  8:19     ` Michal Hocko
@ 2012-07-12  9:05       ` Minchan Kim
  -1 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  9:05 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
	KOSAKI Motohiro, Kamezawa Hiroyuki, Aaditya Kumar

Hi Michal,

On Thu, Jul 12, 2012 at 10:19:23AM +0200, Michal Hocko wrote:
> On Thu 12-07-12 11:50:48, Minchan Kim wrote:
> > In __zone_watermark_ok, free and min are signed long type
> > while z->lowmem_reserve[classzone_idx] is unsigned long type.
> > So comparision of them could be wrong due to type conversion
> > to unsigned although free_pages is minus value.
> 
> Agreed on that
> but
> > 
> > It could return true instead of false in case of order-0 check
> > so that kswapd could sleep forever. 
> 
> I am kind of lost here. How can we have negative free_pages with
> order-0? It would need to come with a negative value because 
> free_pages -= (1 << order) - 1;
> won't make it negative.

Right you are. I missed that part.

The problem Aaditya reported is caused by my patch[1] in
this series: zone_watermark_ok_safe didn't reset free_pages to zero
although free_pages becomes a negative value (please look at [1]).

[1] memory-hotplug: fix kswapd looping forever problem

I think we have no problem in the current code, because if order isn't zero
and free_pages is a negative value, it can exit in the middle of the loop
with false.

So I think it was totally my patch's problem. :(
But I agree the implicit type conversion is subtle and error-prone, so it's
valuable to fix, too. As you mentioned, I should rewrite the description and
subject for it.

Thanks for the review!

> 
> > It means livelock because direct reclaimer loops forever until kswapd
> > set zone->all_unreclaimable.
> > 
> > Aaditya reported this problem when he test my hotplug patch.
> > 
> > Reported-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> 
> So you can add my Reviewed-by: Michal Hocko <mhocko@suse.cz>
> but the changelog could be more clear.
> 
> > ---
> > This patch isn't dependent with this series.
> > It seems to be candidate for -stable but I'm not sure because of this part.
> > So, pass the decision to akpm.
> > 
> > " - It must fix a real bug that bothers people (not a, "This could be a
> >    problem..." type thing)."
> 
> I am wondering what Testted-by means if "This could be a problem..."
> type thing)."

He reported it while testing this series, so I don't think it will happen
in the current code, because zone_page_state and zone_page_state_snapshot
can't make free_pages negative.

> 
> > 
> >  mm/page_alloc.c |    3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index f17e6e4..627653c 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> >  {
> >  	/* free_pages my go negative - that's OK */
> >  	long min = mark;
> > +	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
> >  	int o;
> >  
> >  	free_pages -= (1 << order) - 1;
> > @@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> >  	if (alloc_flags & ALLOC_HARDER)
> >  		min -= min / 4;
> >  
> > -	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> > +	if (free_pages <= min + lowmem_reserve)
> >  		return false;
> >  	for (o = 0; o < order; o++) {
> >  		/* At the next order, this order's pages become unavailable */
> > -- 
> > 1.7.9.5
> > 
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to majordomo@kvack.org.  For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 
> -- 
> Michal Hocko
> SUSE Labs
> SUSE LINUX s.r.o.
> Lihovarska 1060/12
> 190 00 Praha 9    
> Czech Republic
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
@ 2012-07-12  9:05       ` Minchan Kim
  0 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-12  9:05 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
	KOSAKI Motohiro, Kamezawa Hiroyuki, Aaditya Kumar

Hi Michal,

On Thu, Jul 12, 2012 at 10:19:23AM +0200, Michal Hocko wrote:
> On Thu 12-07-12 11:50:48, Minchan Kim wrote:
> > In __zone_watermark_ok, free and min are signed long type
> > while z->lowmem_reserve[classzone_idx] is unsigned long type.
> > So comparision of them could be wrong due to type conversion
> > to unsigned although free_pages is minus value.
> 
> Agreed on that
> but
> > 
> > It could return true instead of false in case of order-0 check
> > so that kswapd could sleep forever. 
> 
> I am kind of lost here. How can we have negative free_pages with
> order-0? It would need to come with a negative value because 
> free_pages -= (1 << order) - 1;
> won't make it negative.

Right you are. I missed that part.

The problem Aaditya reported is caused by my patch[1] in
this series: zone_watermark_ok_safe didn't reset free_pages to zero
although free_pages becomes a negative value (please look at [1]).

[1] memory-hotplug: fix kswapd looping forever problem

I think we have no problem in the current code, because if order isn't zero
and free_pages is a negative value, it can exit in the middle of the loop
with false.

So I think it was totally my patch's problem. :(
But I agree the implicit type conversion is subtle and error-prone, so it's
valuable to fix, too. As you mentioned, I should rewrite the description and
subject for it.

Thanks for the review!

> 
> > It means livelock because direct reclaimer loops forever until kswapd
> > set zone->all_unreclaimable.
> > 
> > Aaditya reported this problem when he test my hotplug patch.
> > 
> > Reported-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> 
> So you can add my Reviewed-by: Michal Hocko <mhocko@suse.cz>
> but the changelog could be more clear.
> 
> > ---
> > This patch isn't dependent with this series.
> > It seems to be candidate for -stable but I'm not sure because of this part.
> > So, pass the decision to akpm.
> > 
> > " - It must fix a real bug that bothers people (not a, "This could be a
> >    problem..." type thing)."
> 
> I am wondering what Testted-by means if "This could be a problem..."
> type thing)."

He reported it while testing this series, so I don't think it will happen
in the current code, because zone_page_state and zone_page_state_snapshot
can't make free_pages negative.

> 
> > 
> >  mm/page_alloc.c |    3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index f17e6e4..627653c 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> >  {
> >  	/* free_pages my go negative - that's OK */
> >  	long min = mark;
> > +	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
> >  	int o;
> >  
> >  	free_pages -= (1 << order) - 1;
> > @@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> >  	if (alloc_flags & ALLOC_HARDER)
> >  		min -= min / 4;
> >  
> > -	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> > +	if (free_pages <= min + lowmem_reserve)
> >  		return false;
> >  	for (o = 0; o < order; o++) {
> >  		/* At the next order, this order's pages become unavailable */
> > -- 
> > 1.7.9.5
> > 
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to majordomo@kvack.org.  For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 
> -- 
> Michal Hocko
> SUSE Labs
> SUSE LINUX s.r.o.
> Lihovarska 1060/12
> 190 00 Praha 9    
> Czech Republic
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
  2012-07-12  9:05       ` Minchan Kim
@ 2012-07-12 10:16         ` Michal Hocko
  -1 siblings, 0 replies; 18+ messages in thread
From: Michal Hocko @ 2012-07-12 10:16 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
	KOSAKI Motohiro, Kamezawa Hiroyuki, Aaditya Kumar

On Thu 12-07-12 18:05:19, Minchan Kim wrote:
> Hi Michal,
> 
> On Thu, Jul 12, 2012 at 10:19:23AM +0200, Michal Hocko wrote:
> > On Thu 12-07-12 11:50:48, Minchan Kim wrote:
> > > In __zone_watermark_ok, free and min are signed long type
> > > while z->lowmem_reserve[classzone_idx] is unsigned long type.
> > > So comparision of them could be wrong due to type conversion
> > > to unsigned although free_pages is minus value.
> > 
> > Agreed on that
> > but
> > > 
> > > It could return true instead of false in case of order-0 check
> > > so that kswapd could sleep forever. 
> > 
> > I am kind of lost here. How can we have negative free_pages with
> > order-0? It would need to come with a negative value because 
> > free_pages -= (1 << order) - 1;
> > won't make it negative.
> 
> Right you are. I missed that part.
> 
> The reason Aaditya reported this problem is caused by my patch[1] in
> this series. zone_watermark_ok_safe didn't reset free_pages to zero
> although free_pages becomes minus value(Please, look at [1])
> 
> [1] memory-hotplug: fix kswapd looping forever problem
> 
> I think we have no problem in current code because if order isn't zero 
> and free_pages is minus value, it could exit in the middle of loop with
> false.

Yes, but it would be still nice to not rely on the loop and make the
first test effective. So I think the patch still makes sense.

> 
> So I think it was totally patch's problem. :(
> But I agree auto type casting problem is subtle error-prone so it's valuable
> to fix, too. As you mentiond, I should rewrite description and subject for it.
> 
> Thanks for the review!
> 
> > 
> > > It means livelock because direct reclaimer loops forever until kswapd
> > > set zone->all_unreclaimable.
> > > 
> > > Aaditya reported this problem when he test my hotplug patch.
> > > 
> > > Reported-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > > Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > > Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > 
> > So you can add my Reviewed-by: Michal Hocko <mhocko@suse.cz>
> > but the changelog could be more clear.
> > 
> > > ---
> > > This patch isn't dependent with this series.
> > > It seems to be candidate for -stable but I'm not sure because of this part.
> > > So, pass the decision to akpm.
> > > 
> > > " - It must fix a real bug that bothers people (not a, "This could be a
> > >    problem..." type thing)."
> > 
> > I am wondering what Testted-by means if "This could be a problem..."
> > type thing)."
> 
> He reported it during testing this series so I don't think it will happen
> in current code because zone_page_state and zone_page_state_snapshot
> can't set free_pages to minus.
> 
> > 
> > > 
> > >  mm/page_alloc.c |    3 ++-
> > >  1 file changed, 2 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index f17e6e4..627653c 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> > >  {
> > >  	/* free_pages my go negative - that's OK */
> > >  	long min = mark;
> > > +	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
> > >  	int o;
> > >  
> > >  	free_pages -= (1 << order) - 1;
> > > @@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> > >  	if (alloc_flags & ALLOC_HARDER)
> > >  		min -= min / 4;
> > >  
> > > -	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> > > +	if (free_pages <= min + lowmem_reserve)
> > >  		return false;
> > >  	for (o = 0; o < order; o++) {
> > >  		/* At the next order, this order's pages become unavailable */
> > > -- 
> > > 1.7.9.5
> > > 
> > > --
> > > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > > the body to majordomo@kvack.org.  For more info on Linux MM,
> > > see: http://www.linux-mm.org/ .
> > > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> > 
> > -- 
> > Michal Hocko
> > SUSE Labs
> > SUSE LINUX s.r.o.
> > Lihovarska 1060/12
> > 190 00 Praha 9    
> > Czech Republic
> > 
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to majordomo@kvack.org.  For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 
> -- 
> Kind regards,
> Minchan Kim

-- 
Michal Hocko
SUSE Labs
SUSE LINUX s.r.o.
Lihovarska 1060/12
190 00 Praha 9    
Czech Republic

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok
@ 2012-07-12 10:16         ` Michal Hocko
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Hocko @ 2012-07-12 10:16 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
	KOSAKI Motohiro, Kamezawa Hiroyuki, Aaditya Kumar

On Thu 12-07-12 18:05:19, Minchan Kim wrote:
> Hi Michal,
> 
> On Thu, Jul 12, 2012 at 10:19:23AM +0200, Michal Hocko wrote:
> > On Thu 12-07-12 11:50:48, Minchan Kim wrote:
> > > In __zone_watermark_ok, free and min are signed long type
> > > while z->lowmem_reserve[classzone_idx] is unsigned long type.
> > > So a comparison of them could be wrong due to type conversion
> > > to unsigned even though free_pages has a negative value.
> > 
> > Agreed on that
> > but
> > > 
> > > It could return true instead of false in case of order-0 check
> > > so that kswapd could sleep forever. 
> > 
> > I am kind of lost here. How can we have negative free_pages with
> > order-0? It would need to come with a negative value because 
> > free_pages -= (1 << order) - 1;
> > won't make it negative.
> 
> Right you are. I missed that part.
> 
> The reason Aaditya reported this problem is caused by my patch[1] in
> this series. zone_watermark_ok_safe didn't reset free_pages to zero
> although free_pages becomes minus value(Please, look at [1])
> 
> [1] memory-hotplug: fix kswapd looping forever problem
> 
> I think we have no problem in the current code, because if order isn't zero
> and free_pages is a negative value, it can exit in the middle of the loop
> with false.

Yes, but it would be still nice to not rely on the loop and make the
first test effective. So I think the patch still makes sense.

> 
> So I think it was totally patch's problem. :(
> But I agree the automatic type-casting problem is subtle and error-prone,
> so it's valuable to fix, too. As you mentioned, I should rewrite the
> description and subject for it.
> 
> Thanks for the review!
> 
> > 
> > > It means livelock because direct reclaimer loops forever until kswapd
> > > set zone->all_unreclaimable.
> > > 
> > > Aaditya reported this problem when he test my hotplug patch.
> > > 
> > > Reported-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > > Tested-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > > Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
> > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > 
> > So you can add my Reviewed-by: Michal Hocko <mhocko@suse.cz>
> > but the changelog could be more clear.
> > 
> > > ---
> > > This patch isn't dependent with this series.
> > > It seems to be candidate for -stable but I'm not sure because of this part.
> > > So, pass the decision to akpm.
> > > 
> > > " - It must fix a real bug that bothers people (not a, "This could be a
> > >    problem..." type thing)."
> > 
> > I am wondering what Tested-by means if "This could be a problem..."
> > type thing)."
> 
> He reported it while testing this series, so I don't think it will happen
> in the current code because zone_page_state and zone_page_state_snapshot
> can't set free_pages to a negative value.
> 
> > 
> > > 
> > >  mm/page_alloc.c |    3 ++-
> > >  1 file changed, 2 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index f17e6e4..627653c 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -1594,6 +1594,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> > >  {
> > >  	/* free_pages my go negative - that's OK */
> > >  	long min = mark;
> > > +	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
> > >  	int o;
> > >  
> > >  	free_pages -= (1 << order) - 1;
> > > @@ -1602,7 +1603,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> > >  	if (alloc_flags & ALLOC_HARDER)
> > >  		min -= min / 4;
> > >  
> > > -	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> > > +	if (free_pages <= min + lowmem_reserve)
> > >  		return false;
> > >  	for (o = 0; o < order; o++) {
> > >  		/* At the next order, this order's pages become unavailable */
> > > -- 
> > > 1.7.9.5
> > > 
> > > --
> > > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > > the body to majordomo@kvack.org.  For more info on Linux MM,
> > > see: http://www.linux-mm.org/ .
> > > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> > 
> > -- 
> > Michal Hocko
> > SUSE Labs
> > SUSE LINUX s.r.o.
> > Lihovarska 1060/12
> > 190 00 Praha 9    
> > Czech Republic
> > 
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to majordomo@kvack.org.  For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 
> -- 
> Kind regards,
> Minchan Kim

-- 
Michal Hocko
SUSE Labs
SUSE LINUX s.r.o.
Lihovarska 1060/12
190 00 Praha 9    
Czech Republic

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
  2012-07-12  2:50   ` Minchan Kim
@ 2012-07-12 21:01     ` Andrew Morton
  -1 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2012-07-12 21:01 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar

On Thu, 12 Jul 2012 11:50:49 +0900
Minchan Kim <minchan@kernel.org> wrote:

> When hotplug offlining happens on zone A, it starts to mark freed pages
> with the MIGRATE_ISOLATE type in the buddy allocator to prevent further
> allocation. (MIGRATE_ISOLATE is a very ironic type because the pages are
> apparently in the buddy allocator but we can't allocate them.)
> When a memory shortage happens during hotplug offlining, the current
> task starts to reclaim and then wakes up kswapd.
> Kswapd checks the watermark and then goes to sleep, because the current
> zone_watermark_ok_safe doesn't consider the MIGRATE_ISOLATE free page count.
> The current task continues to reclaim in the direct reclaim path without
> kswapd's help.
> The problem is that zone->all_unreclaimable is set only by kswapd,
> so the current task would loop forever as below.
> 
> __alloc_pages_slowpath
> restart:
> 	wake_all_kswapd
> rebalance:
> 	__alloc_pages_direct_reclaim
> 		do_try_to_free_pages
> 			if global_reclaim && !all_unreclaimable
> 				return 1; /* It means we did did_some_progress */
> 	skip __alloc_pages_may_oom
> 	should_alloc_retry
> 		goto rebalance;
> 
> If we apply KOSAKI's patch[1], which doesn't depend on kswapd
> for setting zone->all_unreclaimable, we can solve this problem
> by killing some task in the direct reclaim path. But it still doesn't
> wake up kswapd. It could still be a problem if another subsystem needs
> a GFP_ATOMIC request.
> So kswapd should consider MIGRATE_ISOLATE when it calculate free pages
> BEFORE going sleep.
> 
> This patch counts the number of MIGRATE_ISOLATE pageblocks, and
> zone_watermark_ok_safe will consider it if the system has such blocks
> (fortunately, this is very rare, so there is no problem from an overhead
> point of view, and kswapd is never a hotpath).
> 
> Copy/modify from Mel's quote
> "
> Ideal solution would be "allocating" the pageblock.
> It would keep the free space accounting as it is but historically,
> memory hotplug didn't allocate pages because it would be difficult to
> detect if a pageblock was isolated or if part of some balloon.
> Allocating just full pageblocks would work around this, However,
> it would play very badly with CMA.
> "
> 
> [1] http://lkml.org/lkml/2012/6/14/74
>
> ...
>
> +#ifdef CONFIG_MEMORY_ISOLATION
> +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> +{
> +	unsigned long nr_pages = 0;
> +
> +	if (unlikely(zone->nr_pageblock_isolate)) {
> +		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
> +	}
> +	return nr_pages;
> +}

That's pretty verbose.  I couldn't resist fiddling with it ;)

> +#else
> +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> +{
> +	return 0;
> +}
> +#endif
> +
>  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  		      int classzone_idx, int alloc_flags)
>  {
> @@ -1633,6 +1655,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
>  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
>  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
>  
> +	/*
> +	 * If the zone has MIGRATE_ISOLATE type free page,
> +	 * we should consider it. nr_zone_isolate_freepages is never
> +	 * accurate so kswapd might not sleep although she can.
> +	 * But it's more desirable for memory hotplug rather than
> +	 * forever sleep which cause livelock in direct reclaim path.
> +	 */

I had a go at clarifying this comment.  Please review the result.

> +	free_pages -= nr_zone_isolate_freepages(z);
>  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
>  								free_pages);
>  }
> @@ -4397,6 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>  		lruvec_init(&zone->lruvec, zone);
>  		zap_zone_vm_stats(zone);
>  		zone->flags = 0;
> +		zone->nr_pageblock_isolate = 0;
>  		if (!size)
>  			continue;
>  
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index fb482cf..64abb33 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -8,6 +8,31 @@
>  #include <linux/memory.h>
>  #include "internal.h"
>  
> +/* called by holding zone->lock */
> +static void set_pageblock_isolate(struct zone *zone, struct page *page)
> +{
> +	BUG_ON(page_zone(page) != zone);

Well.  If this is the case then why not eliminate this function's
`zone' argument?

> +	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
> +		return;
> +
> +	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> +	zone->nr_pageblock_isolate++;
> +}
> +
> +/* called by holding zone->lock */
> +static void restore_pageblock_isolate(struct zone *zone, struct page *page,
> +		int migratetype)
> +{
> +	BUG_ON(page_zone(page) != zone);

ditto

> +	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
> +		return;
> +
> +	BUG_ON(zone->nr_pageblock_isolate <= 0);
> +	set_pageblock_migratetype(page, migratetype);
> +	zone->nr_pageblock_isolate--;
> +}

From: Andrew Morton <akpm@linux-foundation.org>
Subject: memory-hotplug-fix-kswapd-looping-forever-problem-fix

simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate().

Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/page_alloc.c     |   19 ++++++++-----------
 mm/page_isolation.c |   19 ++++++++-----------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff -puN include/linux/mmzone.h~memory-hotplug-fix-kswapd-looping-forever-problem-fix include/linux/mmzone.h
diff -puN mm/page_alloc.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix mm/page_alloc.c
--- a/mm/page_alloc.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix
+++ a/mm/page_alloc.c
@@ -1626,12 +1626,9 @@ static bool __zone_watermark_ok(struct z
 #ifdef CONFIG_MEMORY_ISOLATION
 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
 {
-	unsigned long nr_pages = 0;
-
-	if (unlikely(zone->nr_pageblock_isolate)) {
-		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
-	}
-	return nr_pages;
+	if (unlikely(zone->nr_pageblock_isolate))
+		return zone->nr_pageblock_isolate * pageblock_nr_pages;
+	return 0;
 }
 #else
 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
@@ -1656,11 +1653,11 @@ bool zone_watermark_ok_safe(struct zone 
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
 	/*
-	 * If the zone has MIGRATE_ISOLATE type free page,
-	 * we should consider it. nr_zone_isolate_freepages is never
-	 * accurate so kswapd might not sleep although she can.
-	 * But it's more desirable for memory hotplug rather than
-	 * forever sleep which cause livelock in direct reclaim path.
+	 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+	 * it.  nr_zone_isolate_freepages is never accurate so kswapd might not
+	 * sleep although it could do so.  But this is more desirable for memory
+	 * hotplug than sleeping which can cause a livelock in the direct
+	 * reclaim path.
 	 */
 	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
diff -puN mm/page_isolation.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix mm/page_isolation.c
--- a/mm/page_isolation.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix
+++ a/mm/page_isolation.c
@@ -8,23 +8,20 @@
 #include <linux/memory.h>
 #include "internal.h"
 
-/* called by holding zone->lock */
-static void set_pageblock_isolate(struct zone *zone, struct page *page)
+/* called while holding zone->lock */
+static void set_pageblock_isolate(struct page *page)
 {
-	BUG_ON(page_zone(page) != zone);
-
 	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
 		return;
 
 	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-	zone->nr_pageblock_isolate++;
+	page_zone(page)->nr_pageblock_isolate++;
 }
 
-/* called by holding zone->lock */
-static void restore_pageblock_isolate(struct zone *zone, struct page *page,
-		int migratetype)
+/* called while holding zone->lock */
+static void restore_pageblock_isolate(struct page *page, int migratetype)
 {
-	BUG_ON(page_zone(page) != zone);
+	struct zone *zone = page_zone(page);
 	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
 		return;
 
@@ -79,7 +76,7 @@ int set_migratetype_isolate(struct page 
 
 out:
 	if (!ret) {
-		set_pageblock_isolate(zone, page);
+		set_pageblock_isolate(page);
 		move_freepages_block(zone, page, MIGRATE_ISOLATE);
 	}
 
@@ -98,7 +95,7 @@ void unset_migratetype_isolate(struct pa
 	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 		goto out;
 	move_freepages_block(zone, page, migratetype);
-	restore_pageblock_isolate(zone, page, migratetype);
+	restore_pageblock_isolate(page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
_


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
@ 2012-07-12 21:01     ` Andrew Morton
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2012-07-12 21:01 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar

On Thu, 12 Jul 2012 11:50:49 +0900
Minchan Kim <minchan@kernel.org> wrote:

> When hotplug offlining happens on zone A, it starts to mark freed page
> as MIGRATE_ISOLATE type in buddy for preventing further allocation.
> (MIGRATE_ISOLATE is very irony type because it's apparently on buddy
> but we can't allocate them).
> When the memory shortage happens during hotplug offlining,
> current task starts to reclaim, then wake up kswapd.
> Kswapd checks watermark, then go sleep because current zone_watermark_ok_safe
> doesn't consider MIGRATE_ISOLATE freed page count.
> Current task continue to reclaim in direct reclaim path without kswapd's helping.
> The problem is that zone->all_unreclaimable is set by only kswapd
> so that current task would be looping forever like below.
> 
> __alloc_pages_slowpath
> restart:
> 	wake_all_kswapd
> rebalance:
> 	__alloc_pages_direct_reclaim
> 		do_try_to_free_pages
> 			if global_reclaim && !all_unreclaimable
> 				return 1; /* It means we did did_some_progress */
> 	skip __alloc_pages_may_oom
> 	should_alloc_retry
> 		goto rebalance;
> 
> If we apply KOSAKI's patch[1] which doesn't depends on kswapd
> about setting zone->all_unreclaimable, we can solve this problem
> by killing some task in direct reclaim path. But it doesn't wake up kswapd, still.
> It could be a problem still if other subsystem needs GFP_ATOMIC request.
> So kswapd should consider MIGRATE_ISOLATE when it calculate free pages
> BEFORE going sleep.
> 
> This patch counts the number of MIGRATE_ISOLATE page block and
> zone_watermark_ok_safe will consider it if the system has such blocks
> (fortunately, it's very rare so no problem in POV overhead and kswapd is never
> hotpath).
> 
> Copy/modify from Mel's quote
> "
> Ideal solution would be "allocating" the pageblock.
> It would keep the free space accounting as it is but historically,
> memory hotplug didn't allocate pages because it would be difficult to
> detect if a pageblock was isolated or if part of some balloon.
> Allocating just full pageblocks would work around this, However,
> it would play very badly with CMA.
> "
> 
> [1] http://lkml.org/lkml/2012/6/14/74
>
> ...
>
> +#ifdef CONFIG_MEMORY_ISOLATION
> +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> +{
> +	unsigned long nr_pages = 0;
> +
> +	if (unlikely(zone->nr_pageblock_isolate)) {
> +		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
> +	}
> +	return nr_pages;
> +}

That's pretty verbose.  I couldn't resist fiddling with it ;)

> +#else
> +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> +{
> +	return 0;
> +}
> +#endif
> +
>  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
>  		      int classzone_idx, int alloc_flags)
>  {
> @@ -1633,6 +1655,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
>  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
>  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
>  
> +	/*
> +	 * If the zone has MIGRATE_ISOLATE type free page,
> +	 * we should consider it. nr_zone_isolate_freepages is never
> +	 * accurate so kswapd might not sleep although she can.
> +	 * But it's more desirable for memory hotplug rather than
> +	 * forever sleep which cause livelock in direct reclaim path.
> +	 */

I had a go at clarifying this comment.  Please review the result.

> +	free_pages -= nr_zone_isolate_freepages(z);
>  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
>  								free_pages);
>  }
> @@ -4397,6 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>  		lruvec_init(&zone->lruvec, zone);
>  		zap_zone_vm_stats(zone);
>  		zone->flags = 0;
> +		zone->nr_pageblock_isolate = 0;
>  		if (!size)
>  			continue;
>  
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index fb482cf..64abb33 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -8,6 +8,31 @@
>  #include <linux/memory.h>
>  #include "internal.h"
>  
> +/* called by holding zone->lock */
> +static void set_pageblock_isolate(struct zone *zone, struct page *page)
> +{
> +	BUG_ON(page_zone(page) != zone);

Well.  If this is the case then why not eliminate this function's
`zone' argument?

> +	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
> +		return;
> +
> +	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> +	zone->nr_pageblock_isolate++;
> +}
> +
> +/* called by holding zone->lock */
> +static void restore_pageblock_isolate(struct zone *zone, struct page *page,
> +		int migratetype)
> +{
> +	BUG_ON(page_zone(page) != zone);

ditto

> +	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
> +		return;
> +
> +	BUG_ON(zone->nr_pageblock_isolate <= 0);
> +	set_pageblock_migratetype(page, migratetype);
> +	zone->nr_pageblock_isolate--;
> +}

From: Andrew Morton <akpm@linux-foundation.org>
Subject: memory-hotplug-fix-kswapd-looping-forever-problem-fix

simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate().

Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/page_alloc.c     |   19 ++++++++-----------
 mm/page_isolation.c |   19 ++++++++-----------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff -puN include/linux/mmzone.h~memory-hotplug-fix-kswapd-looping-forever-problem-fix include/linux/mmzone.h
diff -puN mm/page_alloc.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix mm/page_alloc.c
--- a/mm/page_alloc.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix
+++ a/mm/page_alloc.c
@@ -1626,12 +1626,9 @@ static bool __zone_watermark_ok(struct z
 #ifdef CONFIG_MEMORY_ISOLATION
 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
 {
-	unsigned long nr_pages = 0;
-
-	if (unlikely(zone->nr_pageblock_isolate)) {
-		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
-	}
-	return nr_pages;
+	if (unlikely(zone->nr_pageblock_isolate))
+		return zone->nr_pageblock_isolate * pageblock_nr_pages;
+	return 0;
 }
 #else
 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
@@ -1656,11 +1653,11 @@ bool zone_watermark_ok_safe(struct zone 
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
 	/*
-	 * If the zone has MIGRATE_ISOLATE type free page,
-	 * we should consider it. nr_zone_isolate_freepages is never
-	 * accurate so kswapd might not sleep although she can.
-	 * But it's more desirable for memory hotplug rather than
-	 * forever sleep which cause livelock in direct reclaim path.
+	 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+	 * it.  nr_zone_isolate_freepages is never accurate so kswapd might not
+	 * sleep although it could do so.  But this is more desirable for memory
+	 * hotplug than sleeping which can cause a livelock in the direct
+	 * reclaim path.
 	 */
 	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
diff -puN mm/page_isolation.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix mm/page_isolation.c
--- a/mm/page_isolation.c~memory-hotplug-fix-kswapd-looping-forever-problem-fix
+++ a/mm/page_isolation.c
@@ -8,23 +8,20 @@
 #include <linux/memory.h>
 #include "internal.h"
 
-/* called by holding zone->lock */
-static void set_pageblock_isolate(struct zone *zone, struct page *page)
+/* called while holding zone->lock */
+static void set_pageblock_isolate(struct page *page)
 {
-	BUG_ON(page_zone(page) != zone);
-
 	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
 		return;
 
 	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-	zone->nr_pageblock_isolate++;
+	page_zone(page)->nr_pageblock_isolate++;
 }
 
-/* called by holding zone->lock */
-static void restore_pageblock_isolate(struct zone *zone, struct page *page,
-		int migratetype)
+/* called while holding zone->lock */
+static void restore_pageblock_isolate(struct page *page, int migratetype)
 {
-	BUG_ON(page_zone(page) != zone);
+	struct zone *zone = page_zone(page);
 	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
 		return;
 
@@ -79,7 +76,7 @@ int set_migratetype_isolate(struct page 
 
 out:
 	if (!ret) {
-		set_pageblock_isolate(zone, page);
+		set_pageblock_isolate(page);
 		move_freepages_block(zone, page, MIGRATE_ISOLATE);
 	}
 
@@ -98,7 +95,7 @@ void unset_migratetype_isolate(struct pa
 	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 		goto out;
 	move_freepages_block(zone, page, migratetype);
-	restore_pageblock_isolate(zone, page, migratetype);
+	restore_pageblock_isolate(page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
  2012-07-12 21:01     ` Andrew Morton
@ 2012-07-13  0:57       ` Minchan Kim
  -1 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-13  0:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar

On Thu, Jul 12, 2012 at 02:01:54PM -0700, Andrew Morton wrote:
> On Thu, 12 Jul 2012 11:50:49 +0900
> Minchan Kim <minchan@kernel.org> wrote:
> 
> > When hotplug offlining happens on zone A, it starts to mark freed page
> > as MIGRATE_ISOLATE type in buddy for preventing further allocation.
> > (MIGRATE_ISOLATE is very irony type because it's apparently on buddy
> > but we can't allocate them).
> > When the memory shortage happens during hotplug offlining,
> > current task starts to reclaim, then wake up kswapd.
> > Kswapd checks watermark, then go sleep because current zone_watermark_ok_safe
> > doesn't consider MIGRATE_ISOLATE freed page count.
> > Current task continue to reclaim in direct reclaim path without kswapd's helping.
> > The problem is that zone->all_unreclaimable is set by only kswapd
> > so that current task would be looping forever like below.
> > 
> > __alloc_pages_slowpath
> > restart:
> > 	wake_all_kswapd
> > rebalance:
> > 	__alloc_pages_direct_reclaim
> > 		do_try_to_free_pages
> > 			if global_reclaim && !all_unreclaimable
> > 				return 1; /* It means we did did_some_progress */
> > 	skip __alloc_pages_may_oom
> > 	should_alloc_retry
> > 		goto rebalance;
> > 
> > If we apply KOSAKI's patch[1] which doesn't depends on kswapd
> > about setting zone->all_unreclaimable, we can solve this problem
> > by killing some task in direct reclaim path. But it doesn't wake up kswapd, still.
> > It could be a problem still if other subsystem needs GFP_ATOMIC request.
> > So kswapd should consider MIGRATE_ISOLATE when it calculate free pages
> > BEFORE going sleep.
> > 
> > This patch counts the number of MIGRATE_ISOLATE page block and
> > zone_watermark_ok_safe will consider it if the system has such blocks
> > (fortunately, it's very rare so no problem in POV overhead and kswapd is never
> > hotpath).
> > 
> > Copy/modify from Mel's quote
> > "
> > Ideal solution would be "allocating" the pageblock.
> > It would keep the free space accounting as it is but historically,
> > memory hotplug didn't allocate pages because it would be difficult to
> > detect if a pageblock was isolated or if part of some balloon.
> > Allocating just full pageblocks would work around this, However,
> > it would play very badly with CMA.
> > "
> > 
> > [1] http://lkml.org/lkml/2012/6/14/74
> >
> > ...
> >
> > +#ifdef CONFIG_MEMORY_ISOLATION
> > +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> > +{
> > +	unsigned long nr_pages = 0;
> > +
> > +	if (unlikely(zone->nr_pageblock_isolate)) {
> > +		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
> > +	}
> > +	return nr_pages;
> > +}
> 
> That's pretty verbose.  I couldn't resist fiddling with it ;)
> 
> > +#else
> > +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> > +{
> > +	return 0;
> > +}
> > +#endif
> > +
> >  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> >  		      int classzone_idx, int alloc_flags)
> >  {
> > @@ -1633,6 +1655,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> >  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
> >  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
> >  
> > +	/*
> > +	 * If the zone has MIGRATE_ISOLATE type free page,
> > +	 * we should consider it. nr_zone_isolate_freepages is never
> > +	 * accurate so kswapd might not sleep although she can.
> > +	 * But it's more desirable for memory hotplug rather than
> > +	 * forever sleep which cause livelock in direct reclaim path.
> > +	 */
> 
> I had a go at clarifying this comment.  Please review the result.
> 
> > +	free_pages -= nr_zone_isolate_freepages(z);
> >  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
> >  								free_pages);
> >  }
> > @@ -4397,6 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> >  		lruvec_init(&zone->lruvec, zone);
> >  		zap_zone_vm_stats(zone);
> >  		zone->flags = 0;
> > +		zone->nr_pageblock_isolate = 0;
> >  		if (!size)
> >  			continue;
> >  
> > diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> > index fb482cf..64abb33 100644
> > --- a/mm/page_isolation.c
> > +++ b/mm/page_isolation.c
> > @@ -8,6 +8,31 @@
> >  #include <linux/memory.h>
> >  #include "internal.h"
> >  
> > +/* called by holding zone->lock */
> > +static void set_pageblock_isolate(struct zone *zone, struct page *page)
> > +{
> > +	BUG_ON(page_zone(page) != zone);
> 
> Well.  If this is the case then why not eliminate this function's
> `zone' argument?
> 
> > +	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
> > +		return;
> > +
> > +	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > +	zone->nr_pageblock_isolate++;
> > +}
> > +
> > +/* called by holding zone->lock */
> > +static void restore_pageblock_isolate(struct zone *zone, struct page *page,
> > +		int migratetype)
> > +{
> > +	BUG_ON(page_zone(page) != zone);
> 
> ditto
> 
> > +	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
> > +		return;
> > +
> > +	BUG_ON(zone->nr_pageblock_isolate <= 0);
> > +	set_pageblock_migratetype(page, migratetype);
> > +	zone->nr_pageblock_isolate--;
> > +}
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: memory-hotplug-fix-kswapd-looping-forever-problem-fix
> 
> simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate().
> 
> Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Minchan Kim <minchan@kernel.org>

Looks better.
Thanks, Andrew!

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
@ 2012-07-13  0:57       ` Minchan Kim
  0 siblings, 0 replies; 18+ messages in thread
From: Minchan Kim @ 2012-07-13  0:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Kamezawa Hiroyuki, Aaditya Kumar

On Thu, Jul 12, 2012 at 02:01:54PM -0700, Andrew Morton wrote:
> On Thu, 12 Jul 2012 11:50:49 +0900
> Minchan Kim <minchan@kernel.org> wrote:
> 
> > When hotplug offlining happens on zone A, it starts to mark freed page
> > as MIGRATE_ISOLATE type in buddy for preventing further allocation.
> > (MIGRATE_ISOLATE is a rather ironic type because such pages apparently
> > sit in the buddy allocator, yet we cannot allocate them).
> > When the memory shortage happens during hotplug offlining,
> > current task starts to reclaim, then wake up kswapd.
> > Kswapd checks the watermark, then goes to sleep because the current
> > zone_watermark_ok_safe doesn't consider the MIGRATE_ISOLATE freed page count.
> > The current task continues to reclaim in the direct reclaim path without kswapd's help.
> > The problem is that zone->all_unreclaimable is set by only kswapd
> > so that current task would be looping forever like below.
> > 
> > __alloc_pages_slowpath
> > restart:
> > 	wake_all_kswapd
> > rebalance:
> > 	__alloc_pages_direct_reclaim
> > 		do_try_to_free_pages
> > 			if global_reclaim && !all_unreclaimable
> > 				return 1; /* It means we did did_some_progress */
> > 	skip __alloc_pages_may_oom
> > 	should_alloc_retry
> > 		goto rebalance;
> > 
> > If we apply KOSAKI's patch[1], which doesn't depend on kswapd
> > for setting zone->all_unreclaimable, we can solve this problem
> > by killing some task in the direct reclaim path. But it still doesn't wake up kswapd.
> > That could still be a problem if another subsystem issues a GFP_ATOMIC request.
> > So kswapd should consider MIGRATE_ISOLATE when it calculate free pages
> > BEFORE going sleep.
> > 
> > This patch counts the number of MIGRATE_ISOLATE page block and
> > zone_watermark_ok_safe will consider it if the system has such blocks
> > (fortunately, it's very rare so no problem in POV overhead and kswapd is never
> > hotpath).
> > 
> > Copy/modify from Mel's quote
> > "
> > Ideal solution would be "allocating" the pageblock.
> > It would keep the free space accounting as it is but historically,
> > memory hotplug didn't allocate pages because it would be difficult to
> > detect if a pageblock was isolated or if part of some balloon.
> > Allocating just full pageblocks would work around this, However,
> > it would play very badly with CMA.
> > "
> > 
> > [1] http://lkml.org/lkml/2012/6/14/74
> >
> > ...
> >
> > +#ifdef CONFIG_MEMORY_ISOLATION
> > +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> > +{
> > +	unsigned long nr_pages = 0;
> > +
> > +	if (unlikely(zone->nr_pageblock_isolate)) {
> > +		nr_pages = zone->nr_pageblock_isolate * pageblock_nr_pages;
> > +	}
> > +	return nr_pages;
> > +}
> 
> That's pretty verbose.  I couldn't resist fiddling with it ;)
> 
> > +#else
> > +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
> > +{
> > +	return 0;
> > +}
> > +#endif
> > +
> >  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> >  		      int classzone_idx, int alloc_flags)
> >  {
> > @@ -1633,6 +1655,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> >  	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
> >  		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
> >  
> > +	/*
> > +	 * If the zone has MIGRATE_ISOLATE type free page,
> > +	 * we should consider it. nr_zone_isolate_freepages is never
> > +	 * accurate so kswapd might not sleep although she can.
> > +	 * But it's more desirable for memory hotplug rather than
> > +	 * forever sleep which cause livelock in direct reclaim path.
> > +	 */
> 
> I had a go at clarifying this comment.  Please review the result.
> 
> > +	free_pages -= nr_zone_isolate_freepages(z);
> >  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
> >  								free_pages);
> >  }
> > @@ -4397,6 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> >  		lruvec_init(&zone->lruvec, zone);
> >  		zap_zone_vm_stats(zone);
> >  		zone->flags = 0;
> > +		zone->nr_pageblock_isolate = 0;
> >  		if (!size)
> >  			continue;
> >  
> > diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> > index fb482cf..64abb33 100644
> > --- a/mm/page_isolation.c
> > +++ b/mm/page_isolation.c
> > @@ -8,6 +8,31 @@
> >  #include <linux/memory.h>
> >  #include "internal.h"
> >  
> > +/* called by holding zone->lock */
> > +static void set_pageblock_isolate(struct zone *zone, struct page *page)
> > +{
> > +	BUG_ON(page_zone(page) != zone);
> 
> Well.  If this is the case then why not eliminate this function's
> `zone' argument?
> 
> > +	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
> > +		return;
> > +
> > +	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > +	zone->nr_pageblock_isolate++;
> > +}
> > +
> > +/* called by holding zone->lock */
> > +static void restore_pageblock_isolate(struct zone *zone, struct page *page,
> > +		int migratetype)
> > +{
> > +	BUG_ON(page_zone(page) != zone);
> 
> ditto
> 
> > +	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
> > +		return;
> > +
> > +	BUG_ON(zone->nr_pageblock_isolate <= 0);
> > +	set_pageblock_migratetype(page, migratetype);
> > +	zone->nr_pageblock_isolate--;
> > +}
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: memory-hotplug-fix-kswapd-looping-forever-problem-fix
> 
> simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate().
> 
> Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Minchan Kim <minchan@kernel.org>

Looks better.
Thanks, Andrew!

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
  2012-07-12 21:01     ` Andrew Morton
@ 2012-07-19 10:09       ` Kamezawa Hiroyuki
  -1 siblings, 0 replies; 18+ messages in thread
From: Kamezawa Hiroyuki @ 2012-07-19 10:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Minchan Kim, linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Aaditya Kumar


> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: memory-hotplug-fix-kswapd-looping-forever-problem-fix
>
> simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate().
>
> Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem
@ 2012-07-19 10:09       ` Kamezawa Hiroyuki
  0 siblings, 0 replies; 18+ messages in thread
From: Kamezawa Hiroyuki @ 2012-07-19 10:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Minchan Kim, linux-kernel, linux-mm, Mel Gorman, KOSAKI Motohiro,
	Aaditya Kumar


> From: Andrew Morton <akpm@linux-foundation.org>
> Subject: memory-hotplug-fix-kswapd-looping-forever-problem-fix
>
> simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate().
>
> Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Minchan Kim <minchan@kernel.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2012-07-19 10:11 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-12  2:50 [PATCH 1/3 v3] mm: Factor out memory isolate functions Minchan Kim
2012-07-12  2:50 ` Minchan Kim
2012-07-12  2:50 ` [PATCH 2/3 v3] mm: bug fix free page check in zone_watermark_ok Minchan Kim
2012-07-12  2:50   ` Minchan Kim
2012-07-12  8:19   ` Michal Hocko
2012-07-12  8:19     ` Michal Hocko
2012-07-12  9:05     ` Minchan Kim
2012-07-12  9:05       ` Minchan Kim
2012-07-12 10:16       ` Michal Hocko
2012-07-12 10:16         ` Michal Hocko
2012-07-12  2:50 ` [PATCH 3/3 v3] memory-hotplug: fix kswapd looping forever problem Minchan Kim
2012-07-12  2:50   ` Minchan Kim
2012-07-12 21:01   ` Andrew Morton
2012-07-12 21:01     ` Andrew Morton
2012-07-13  0:57     ` Minchan Kim
2012-07-13  0:57       ` Minchan Kim
2012-07-19 10:09     ` Kamezawa Hiroyuki
2012-07-19 10:09       ` Kamezawa Hiroyuki

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.