linux-mm.kvack.org archive mirror
* [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
@ 2017-06-29 16:11 Thomas Gleixner
  2017-06-30  9:27 ` Michal Hocko
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Thomas Gleixner @ 2017-06-29 16:11 UTC (permalink / raw)
  To: Andrey Ryabinin
  Cc: LKML, linux-mm, Andrew Morton, Michal Hocko, Vlastimil Babka

Andrey reported a potential deadlock with the memory hotplug lock and the
cpu hotplug lock.

The reason is that memory hotplug takes the memory hotplug lock and then
calls stop_machine(), which calls get_online_cpus(). That's the reverse lock
order to get_online_cpus(); get_online_mems(); in mm/slab_common.c.
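
Schematically, the inverted lock order looks like this (an illustrative
sketch, not code from the tree):

	/* memory hotplug path */	/* mm/slab_common.c path */
	mem_hotplug_begin();		get_online_cpus();
	stop_machine()			get_online_mems();
	  -> get_online_cpus()

If both paths run concurrently, each side can end up waiting for the lock
the other already holds: a classic ABBA deadlock.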

The problem has been there forever. The reason why this was never reported
is that the cpu hotplug locking had a homebrewed recursive reader-writer
semaphore construct which, due to the recursion, evaded full lockdep
coverage. The memory hotplug code copied that construct verbatim and
therefore has similar issues.

Two steps to fix this:

1) Convert the memory hotplug locking to a per-cpu rwsem so the potential
   issues get reported properly by lockdep.

2) Lock the online cpus in mem_hotplug_begin() before taking the memory
   hotplug rwsem and use stop_machine_cpuslocked() in the page_alloc code
   to avoid recursive locking.

Reported-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
---

Note 1:
 Applies against -next or
     
   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git smp/hotplug

 which contains the hotplug locking rework including stop_machine_cpuslocked()

Note 2:

 Most of the call sites of get_online_mems() are also calling get_online_cpus().

 So we could switch the whole machinery to use the CPU hotplug locking for
 protecting both memory and CPU hotplug. That actually works and removes
 another 40 lines of code.
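
 For example (sketch only; the actual conversion is the follow-up patch
 later in this thread), such a call site would collapse from:

	get_online_cpus();
	get_online_mems();
	...
	put_online_mems();
	put_online_cpus();

 to a single get_online_cpus()/put_online_cpus() pair.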

---
 mm/memory_hotplug.c |   85 +++++++---------------------------------------------
 mm/page_alloc.c     |    2 -
 2 files changed, 14 insertions(+), 73 deletions(-)

--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,32 +52,17 @@ static void generic_online_page(struct p
 static online_page_callback_t online_page_callback = generic_online_page;
 static DEFINE_MUTEX(online_page_callback_lock);
 
-/* The same as the cpu_hotplug lock, but for memory hotplug. */
-static struct {
-	struct task_struct *active_writer;
-	struct mutex lock; /* Synchronizes accesses to refcount, */
-	/*
-	 * Also blocks the new readers during
-	 * an ongoing mem hotplug operation.
-	 */
-	int refcount;
+DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-} mem_hotplug = {
-	.active_writer = NULL,
-	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
-	.refcount = 0,
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	.dep_map = {.name = "mem_hotplug.lock" },
-#endif
-};
+void get_online_mems(void)
+{
+	percpu_down_read(&mem_hotplug_lock);
+}
 
-/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
-#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
-#define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
-#define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)
+void put_online_mems(void)
+{
+	percpu_up_read(&mem_hotplug_lock);
+}
 
 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
 bool memhp_auto_online;
@@ -97,60 +82,16 @@ static int __init setup_memhp_default_st
 }
 __setup("memhp_default_state=", setup_memhp_default_state);
 
-void get_online_mems(void)
-{
-	might_sleep();
-	if (mem_hotplug.active_writer == current)
-		return;
-	memhp_lock_acquire_read();
-	mutex_lock(&mem_hotplug.lock);
-	mem_hotplug.refcount++;
-	mutex_unlock(&mem_hotplug.lock);
-
-}
-
-void put_online_mems(void)
-{
-	if (mem_hotplug.active_writer == current)
-		return;
-	mutex_lock(&mem_hotplug.lock);
-
-	if (WARN_ON(!mem_hotplug.refcount))
-		mem_hotplug.refcount++; /* try to fix things up */
-
-	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
-		wake_up_process(mem_hotplug.active_writer);
-	mutex_unlock(&mem_hotplug.lock);
-	memhp_lock_release();
-
-}
-
-/* Serializes write accesses to mem_hotplug.active_writer. */
-static DEFINE_MUTEX(memory_add_remove_lock);
-
 void mem_hotplug_begin(void)
 {
-	mutex_lock(&memory_add_remove_lock);
-
-	mem_hotplug.active_writer = current;
-
-	memhp_lock_acquire();
-	for (;;) {
-		mutex_lock(&mem_hotplug.lock);
-		if (likely(!mem_hotplug.refcount))
-			break;
-		__set_current_state(TASK_UNINTERRUPTIBLE);
-		mutex_unlock(&mem_hotplug.lock);
-		schedule();
-	}
+	cpus_read_lock();
+	percpu_down_write(&mem_hotplug_lock);
 }
 
 void mem_hotplug_done(void)
 {
-	mem_hotplug.active_writer = NULL;
-	mutex_unlock(&mem_hotplug.lock);
-	memhp_lock_release();
-	mutex_unlock(&memory_add_remove_lock);
+	percpu_up_write(&mem_hotplug_lock);
+	cpus_read_unlock();
 }
 
 /* add this memory to iomem resource */
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5216,7 +5216,7 @@ void __ref build_all_zonelists(pg_data_t
 #endif
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
-		stop_machine(__build_all_zonelists, pgdat, NULL);
+		stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
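
For reference, callers stay unchanged; with this patch the begin/done
pair boils down to (per the hunk above):

	mem_hotplug_begin();	/* cpus_read_lock() + percpu_down_write() */
	/* online/offline memory, rebuild zonelists via
	 * stop_machine_cpuslocked() */
	mem_hotplug_done();	/* percpu_up_write() + cpus_read_unlock() */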


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-29 16:11 [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem Thomas Gleixner
@ 2017-06-30  9:27 ` Michal Hocko
  2017-06-30 10:15   ` Thomas Gleixner
  2017-07-03 12:41 ` Vladimir Davydov
  2017-07-03 16:38 ` Michal Hocko
  2 siblings, 1 reply; 10+ messages in thread
From: Michal Hocko @ 2017-06-30  9:27 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andrey Ryabinin, LKML, linux-mm, Andrew Morton, Vlastimil Babka,
	Vladimir Davydov, Heiko Carstens

[CC Vladimir and Heiko who were touching this area lately]

On Thu 29-06-17 18:11:15, Thomas Gleixner wrote:
> Andrey reported a potential deadlock with the memory hotplug lock and the
> cpu hotplug lock.
> 
> The reason is that memory hotplug takes the memory hotplug lock and then
> calls stop_machine(), which calls get_online_cpus(). That's the reverse lock
> order to get_online_cpus(); get_online_mems(); in mm/slab_common.c.

I always considered the stop_machine usage there totally gross. But
never had time to look into it properly. Memory hotplug locking is a
story of its own.
 
> The problem has been there forever. The reason why this was never reported
> is that the cpu hotplug locking had a homebrewed recursive reader-writer
> semaphore construct which, due to the recursion, evaded full lockdep
> coverage. The memory hotplug code copied that construct verbatim and
> therefore has similar issues.
> 
> Two steps to fix this:
> 
> 1) Convert the memory hotplug locking to a per-cpu rwsem so the potential
>    issues get reported properly by lockdep.
> 
> 2) Lock the online cpus in mem_hotplug_begin() before taking the memory
>    hotplug rwsem and use stop_machine_cpuslocked() in the page_alloc code
>    to avoid recursive locking.

So I like this simplification a lot! Even if we can get rid of the
stop_machine eventually, this patch would be an improvement. A short
comment on why the per-cpu semaphore is better than a regular one
would be nice.

I cannot give my ack yet, I have to mull over the patch some
more because this has been an area of subtle bugs (especially
the lock dependency with the hotplug device locking - look at
lock_device_hotplug_sysfs if you dare) but it looks good at first
glance. Give me a few days, please.

> [...]
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-30  9:27 ` Michal Hocko
@ 2017-06-30 10:15   ` Thomas Gleixner
  2017-06-30 11:49     ` Andrey Ryabinin
  2017-07-03 16:32     ` Michal Hocko
  0 siblings, 2 replies; 10+ messages in thread
From: Thomas Gleixner @ 2017-06-30 10:15 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrey Ryabinin, LKML, linux-mm, Andrew Morton, Vlastimil Babka,
	Vladimir Davydov, Heiko Carstens

On Fri, 30 Jun 2017, Michal Hocko wrote:
> So I like this simplification a lot! Even if we can get rid of the
> stop_machine eventually, this patch would be an improvement. A short
> comment on why the per-cpu semaphore is better than a regular one
> would be nice.

Yes, will add one.

The main point is that the current locking construct is evading lockdep due
to the ability to support recursive locking, which I did not observe so
far.
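
The recursion is visible in the removed code: the old reader side bails
out early when the current task is the active writer, so that nested
"acquisition" never goes through lockdep at all (simplified from the
deleted code):

	void get_online_mems(void)
	{
		if (mem_hotplug.active_writer == current)
			return;	/* recursive reader, invisible to lockdep */
		...
	}

The percpu rwsem has no such escape hatch, and its reader fast path is a
per-cpu counter increment instead of bouncing a shared mutex cacheline.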

> I cannot give my ack yet, I have to mull over the patch some more because
> this has been an area of subtle bugs (especially the lock dependency with
> the hotplug device locking - look at lock_device_hotplug_sysfs if you
> dare) but it looks good at first glance. Give me a few days, please.

Sure. Just to make you mull over more stuff, find below the patch which
moves all of this to use the cpu hotplug lock.

Thanks,

	tglx

8<--------------------
Subject: mm/memory-hotplug: Use cpu hotplug lock
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2017 16:30:00 +0200

Most places which take the memory hotplug lock take the cpu hotplug lock as
well. Avoid the double locking and use the cpu hotplug lock for both.

Not-Yet-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/base/memory.c          |    5 ++--
 include/linux/memory_hotplug.h |   12 -----------
 mm/kmemleak.c                  |    4 +--
 mm/memory-failure.c            |    5 ++--
 mm/memory_hotplug.c            |   44 +++++++++--------------------------------
 mm/slab_common.c               |   14 -------------
 mm/slub.c                      |    4 +--
 7 files changed, 20 insertions(+), 68 deletions(-)

Index: b/drivers/base/memory.c
===================================================================
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -21,6 +21,7 @@
 #include <linux/mutex.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
+#include <linux/cpu.h>
 
 #include <linux/atomic.h>
 #include <linux/uaccess.h>
@@ -339,7 +340,7 @@ store_mem_state(struct device *dev,
 	 * inversion, memory_subsys_online() callbacks will be implemented by
 	 * assuming it's already protected.
 	 */
-	mem_hotplug_begin();
+	cpus_write_lock();
 
 	switch (online_type) {
 	case MMOP_ONLINE_KERNEL:
@@ -355,7 +356,7 @@ store_mem_state(struct device *dev,
 		ret = -EINVAL; /* should never happen */
 	}
 
-	mem_hotplug_done();
+	cpus_write_unlock();
 err:
 	unlock_device_hotplug();
 
Index: b/include/linux/memory_hotplug.h
===================================================================
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -193,12 +193,6 @@ extern void put_page_bootmem(struct page
 extern void get_page_bootmem(unsigned long ingo, struct page *page,
 			     unsigned long type);
 
-void get_online_mems(void);
-void put_online_mems(void);
-
-void mem_hotplug_begin(void);
-void mem_hotplug_done(void);
-
 extern void set_zone_contiguous(struct zone *zone);
 extern void clear_zone_contiguous(struct zone *zone);
 
@@ -238,12 +232,6 @@ static inline int try_online_node(int ni
 	return 0;
 }
 
-static inline void get_online_mems(void) {}
-static inline void put_online_mems(void) {}
-
-static inline void mem_hotplug_begin(void) {}
-static inline void mem_hotplug_done(void) {}
-
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
Index: b/mm/kmemleak.c
===================================================================
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1428,7 +1428,7 @@ static void kmemleak_scan(void)
 	/*
 	 * Struct page scanning for each node.
 	 */
-	get_online_mems();
+	get_online_cpus();
 	for_each_online_node(i) {
 		unsigned long start_pfn = node_start_pfn(i);
 		unsigned long end_pfn = node_end_pfn(i);
@@ -1446,7 +1446,7 @@ static void kmemleak_scan(void)
 			scan_block(page, page + 1, NULL);
 		}
 	}
-	put_online_mems();
+	put_online_cpus();
 
 	/*
 	 * Scanning the task stacks (may introduce false negatives).
Index: b/mm/memory-failure.c
===================================================================
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -58,6 +58,7 @@
 #include <linux/mm_inline.h>
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
+#include <linux/cpu.h>
 #include "internal.h"
 #include "ras/ras_event.h"
 
@@ -1773,9 +1774,9 @@ int soft_offline_page(struct page *page,
 		return -EBUSY;
 	}
 
-	get_online_mems();
+	get_online_cpus();
 	ret = get_any_page(page, pfn, flags);
-	put_online_mems();
+	put_online_cpus();
 
 	if (ret > 0)
 		ret = soft_offline_in_use_page(page, flags);
Index: b/mm/memory_hotplug.c
===================================================================
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,18 +52,6 @@ static void generic_online_page(struct p
 static online_page_callback_t online_page_callback = generic_online_page;
 static DEFINE_MUTEX(online_page_callback_lock);
 
-DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
-
-void get_online_mems(void)
-{
-	percpu_down_read(&mem_hotplug_lock);
-}
-
-void put_online_mems(void)
-{
-	percpu_up_read(&mem_hotplug_lock);
-}
-
 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
 bool memhp_auto_online;
 #else
@@ -82,18 +70,6 @@ static int __init setup_memhp_default_st
 }
 __setup("memhp_default_state=", setup_memhp_default_state);
 
-void mem_hotplug_begin(void)
-{
-	cpus_read_lock();
-	percpu_down_write(&mem_hotplug_lock);
-}
-
-void mem_hotplug_done(void)
-{
-	percpu_up_write(&mem_hotplug_lock);
-	cpus_read_unlock();
-}
-
 /* add this memory to iomem resource */
 static struct resource *register_memory_resource(u64 start, u64 size)
 {
@@ -816,7 +792,7 @@ int set_online_page_callback(online_page
 {
 	int rc = -EINVAL;
 
-	get_online_mems();
+	get_online_cpus();
 	mutex_lock(&online_page_callback_lock);
 
 	if (online_page_callback == generic_online_page) {
@@ -825,7 +801,7 @@ int set_online_page_callback(online_page
 	}
 
 	mutex_unlock(&online_page_callback_lock);
-	put_online_mems();
+	put_online_cpus();
 
 	return rc;
 }
@@ -835,7 +811,7 @@ int restore_online_page_callback(online_
 {
 	int rc = -EINVAL;
 
-	get_online_mems();
+	get_online_cpus();
 	mutex_lock(&online_page_callback_lock);
 
 	if (online_page_callback == callback) {
@@ -844,7 +820,7 @@ int restore_online_page_callback(online_
 	}
 
 	mutex_unlock(&online_page_callback_lock);
-	put_online_mems();
+	put_online_cpus();
 
 	return rc;
 }
@@ -1213,7 +1189,7 @@ int try_online_node(int nid)
 	if (node_online(nid))
 		return 0;
 
-	mem_hotplug_begin();
+	cpus_write_lock();
 	pgdat = hotadd_new_pgdat(nid, 0);
 	if (!pgdat) {
 		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
@@ -1231,7 +1207,7 @@ int try_online_node(int nid)
 	}
 
 out:
-	mem_hotplug_done();
+	cpus_write_unlock();
 	return ret;
 }
 
@@ -1311,7 +1287,7 @@ int __ref add_memory_resource(int nid, s
 		new_pgdat = !p;
 	}
 
-	mem_hotplug_begin();
+	cpus_write_lock();
 
 	/*
 	 * Add new range to memblock so that when hotadd_new_pgdat() is called
@@ -1365,7 +1341,7 @@ int __ref add_memory_resource(int nid, s
 	memblock_remove(start, size);
 
 out:
-	mem_hotplug_done();
+	cpus_write_unlock();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory_resource);
@@ -2117,7 +2093,7 @@ void __ref remove_memory(int nid, u64 st
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
-	mem_hotplug_begin();
+	cpus_write_lock();
 
 	/*
 	 * All memory blocks must be offlined before removing memory.  Check
@@ -2138,7 +2114,7 @@ void __ref remove_memory(int nid, u64 st
 
 	try_offline_node(nid);
 
-	mem_hotplug_done();
+	cpus_write_lock();
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
Index: b/mm/slab_common.c
===================================================================
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -430,7 +430,6 @@ kmem_cache_create(const char *name, size
 	int err;
 
 	get_online_cpus();
-	get_online_mems();
 	memcg_get_cache_ids();
 
 	mutex_lock(&slab_mutex);
@@ -476,7 +475,6 @@ kmem_cache_create(const char *name, size
 	mutex_unlock(&slab_mutex);
 
 	memcg_put_cache_ids();
-	put_online_mems();
 	put_online_cpus();
 
 	if (err) {
@@ -572,7 +570,6 @@ void memcg_create_kmem_cache(struct mem_
 	int idx;
 
 	get_online_cpus();
-	get_online_mems();
 
 	mutex_lock(&slab_mutex);
 
@@ -626,7 +623,6 @@ void memcg_create_kmem_cache(struct mem_
 out_unlock:
 	mutex_unlock(&slab_mutex);
 
-	put_online_mems();
 	put_online_cpus();
 }
 
@@ -636,7 +632,6 @@ static void kmemcg_deactivate_workfn(str
 					    memcg_params.deact_work);
 
 	get_online_cpus();
-	get_online_mems();
 
 	mutex_lock(&slab_mutex);
 
@@ -644,7 +639,6 @@ static void kmemcg_deactivate_workfn(str
 
 	mutex_unlock(&slab_mutex);
 
-	put_online_mems();
 	put_online_cpus();
 
 	/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
@@ -699,7 +693,6 @@ void memcg_deactivate_kmem_caches(struct
 	idx = memcg_cache_id(memcg);
 
 	get_online_cpus();
-	get_online_mems();
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
@@ -714,7 +707,6 @@ void memcg_deactivate_kmem_caches(struct
 	}
 	mutex_unlock(&slab_mutex);
 
-	put_online_mems();
 	put_online_cpus();
 }
 
@@ -723,7 +715,6 @@ void memcg_destroy_kmem_caches(struct me
 	struct kmem_cache *s, *s2;
 
 	get_online_cpus();
-	get_online_mems();
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
@@ -736,7 +727,6 @@ void memcg_destroy_kmem_caches(struct me
 	}
 	mutex_unlock(&slab_mutex);
 
-	put_online_mems();
 	put_online_cpus();
 }
 
@@ -817,7 +807,6 @@ void kmem_cache_destroy(struct kmem_cach
 		return;
 
 	get_online_cpus();
-	get_online_mems();
 
 	mutex_lock(&slab_mutex);
 
@@ -837,7 +826,6 @@ void kmem_cache_destroy(struct kmem_cach
 out_unlock:
 	mutex_unlock(&slab_mutex);
 
-	put_online_mems();
 	put_online_cpus();
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
@@ -854,10 +842,8 @@ int kmem_cache_shrink(struct kmem_cache
 	int ret;
 
 	get_online_cpus();
-	get_online_mems();
 	kasan_cache_shrink(cachep);
 	ret = __kmem_cache_shrink(cachep);
-	put_online_mems();
 	put_online_cpus();
 	return ret;
 }
Index: b/mm/slub.c
===================================================================
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4775,7 +4775,7 @@ static ssize_t show_slab_objects(struct
 		}
 	}
 
-	get_online_mems();
+	get_online_cpus();
 #ifdef CONFIG_SLUB_DEBUG
 	if (flags & SO_ALL) {
 		struct kmem_cache_node *n;
@@ -4816,7 +4816,7 @@ static ssize_t show_slab_objects(struct
 			x += sprintf(buf + x, " N%d=%lu",
 					node, nodes[node]);
 #endif
-	put_online_mems();
+	put_online_cpus();
 	kfree(nodes);
 	return x + sprintf(buf + x, "\n");
 }


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-30 10:15   ` Thomas Gleixner
@ 2017-06-30 11:49     ` Andrey Ryabinin
  2017-06-30 13:00       ` Thomas Gleixner
  2017-07-03 16:32     ` Michal Hocko
  1 sibling, 1 reply; 10+ messages in thread
From: Andrey Ryabinin @ 2017-06-30 11:49 UTC (permalink / raw)
  To: Thomas Gleixner, Michal Hocko
  Cc: LKML, linux-mm, Andrew Morton, Vlastimil Babka, Vladimir Davydov,
	Heiko Carstens



On 06/30/2017 01:15 PM, Thomas Gleixner wrote:
> On Fri, 30 Jun 2017, Michal Hocko wrote:
>> So I like this simplification a lot! Even if we can get rid of the
>> stop_machine eventually, this patch would be an improvement. A short
>> comment on why the per-cpu semaphore is better than a regular one
>> would be nice.
> 
> Yes, will add one.
> 
> The main point is that the current locking construct is evading lockdep due
> to the ability to support recursive locking, which I did not observe so
> far.
> 

Like this?


[  131.022567] ============================================
[  131.023034] WARNING: possible recursive locking detected
[  131.023034] 4.12.0-rc7-next-20170630 #10 Not tainted
[  131.023034] --------------------------------------------
[  131.023034] bash/2266 is trying to acquire lock:
[  131.023034]  (cpu_hotplug_lock.rw_sem){++++++}, at: [<ffffffff8117fcd2>] lru_add_drain_all+0x42/0x190
[  131.023034] 
               but task is already holding lock:
[  131.023034]  (cpu_hotplug_lock.rw_sem){++++++}, at: [<ffffffff811d5489>] mem_hotplug_begin+0x9/0x20
[  131.023034] 
               other info that might help us debug this:
[  131.023034]  Possible unsafe locking scenario:

[  131.023034]        CPU0
[  131.023034]        ----
[  131.023034]   lock(cpu_hotplug_lock.rw_sem);
[  131.023034]   lock(cpu_hotplug_lock.rw_sem);
[  131.023034] 
                *** DEADLOCK ***

[  131.023034]  May be due to missing lock nesting notation

[  131.023034] 8 locks held by bash/2266:
[  131.023034]  #0:  (sb_writers#8){.+.+.+}, at: [<ffffffff811e81f8>] vfs_write+0x1a8/0x1d0
[  131.023034]  #1:  (&of->mutex){+.+.+.}, at: [<ffffffff81274b2c>] kernfs_fop_write+0xfc/0x1b0
[  131.023034]  #2:  (s_active#48){.+.+.+}, at: [<ffffffff81274b34>] kernfs_fop_write+0x104/0x1b0
[  131.023034]  #3:  (device_hotplug_lock){+.+.+.}, at: [<ffffffff816d4810>] lock_device_hotplug_sysfs+0x10/0x40
[  131.023034]  #4:  (cpu_hotplug_lock.rw_sem){++++++}, at: [<ffffffff811d5489>] mem_hotplug_begin+0x9/0x20
[  131.023034]  #5:  (mem_hotplug_lock.rw_sem){++++++}, at: [<ffffffff810ada81>] percpu_down_write+0x21/0x110
[  131.023034]  #6:  (&dev->mutex){......}, at: [<ffffffff816d5bd5>] device_offline+0x45/0xb0
[  131.023034]  #7:  (lock#3){+.+...}, at: [<ffffffff8117fccd>] lru_add_drain_all+0x3d/0x190
[  131.023034] 
               stack backtrace:
[  131.023034] CPU: 0 PID: 2266 Comm: bash Not tainted 4.12.0-rc7-next-20170630 #10
[  131.023034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
[  131.023034] Call Trace:
[  131.023034]  dump_stack+0x85/0xc7
[  131.023034]  __lock_acquire+0x1747/0x17a0
[  131.023034]  ? lru_add_drain_all+0x3d/0x190
[  131.023034]  ? __mutex_lock+0x218/0x940
[  131.023034]  ? trace_hardirqs_on+0xd/0x10
[  131.023034]  lock_acquire+0x103/0x200
[  131.023034]  ? lock_acquire+0x103/0x200
[  131.023034]  ? lru_add_drain_all+0x42/0x190
[  131.023034]  cpus_read_lock+0x3d/0x80
[  131.023034]  ? lru_add_drain_all+0x42/0x190
[  131.023034]  lru_add_drain_all+0x42/0x190
[  131.023034]  __offline_pages.constprop.25+0x5de/0x870
[  131.023034]  offline_pages+0xc/0x10
[  131.023034]  memory_subsys_offline+0x43/0x70
[  131.023034]  device_offline+0x83/0xb0
[  131.023034]  store_mem_state+0xdb/0xe0
[  131.023034]  dev_attr_store+0x13/0x20
[  131.023034]  sysfs_kf_write+0x40/0x50
[  131.023034]  kernfs_fop_write+0x130/0x1b0
[  131.023034]  __vfs_write+0x23/0x130
[  131.023034]  ? rcu_read_lock_sched_held+0x6d/0x80
[  131.023034]  ? rcu_sync_lockdep_assert+0x2a/0x50
[  131.023034]  ? __sb_start_write+0xd4/0x1c0
[  131.023034]  ? vfs_write+0x1a8/0x1d0
[  131.023034]  vfs_write+0xc8/0x1d0
[  131.023034]  SyS_write+0x44/0xa0
[  131.023034]  entry_SYSCALL_64_fastpath+0x1f/0xbe
[  131.023034] RIP: 0033:0x7fb6b54ac310
[  131.023034] RSP: 002b:00007ffcb7b123e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  131.023034] RAX: ffffffffffffffda RBX: 00007fb6b5767640 RCX: 00007fb6b54ac310
[  131.023034] RDX: 0000000000000008 RSI: 00007fb6b5e2d000 RDI: 0000000000000001
[  131.023034] RBP: 0000000000000007 R08: 00007fb6b57687a0 R09: 00007fb6b5e23700
[  131.023034] R10: 0000000000000098 R11: 0000000000000246 R12: 0000000000000007
[  131.023034] R13: 000000000173e9f0 R14: 0000000000000000 R15: 0000000000491569



* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-30 11:49     ` Andrey Ryabinin
@ 2017-06-30 13:00       ` Thomas Gleixner
  2017-06-30 15:56         ` Andrey Ryabinin
  0 siblings, 1 reply; 10+ messages in thread
From: Thomas Gleixner @ 2017-06-30 13:00 UTC (permalink / raw)
  To: Andrey Ryabinin
  Cc: Michal Hocko, LKML, linux-mm, Andrew Morton, Vlastimil Babka,
	Vladimir Davydov, Heiko Carstens

On Fri, 30 Jun 2017, Andrey Ryabinin wrote:
> On 06/30/2017 01:15 PM, Thomas Gleixner wrote:
> > On Fri, 30 Jun 2017, Michal Hocko wrote:
> >> So I like this simplification a lot! Even if we can get rid of the
> >> stop_machine eventually, this patch would be an improvement. A short
> >> comment on why the per-cpu semaphore is better than a regular one
> >> would be nice.
> > 
> > Yes, will add one.
> > 
> > The main point is that the current locking construct is evading lockdep due
> > to the ability to support recursive locking, which I did not observe so
> > far.
> > 
> 
> Like this?

Cute.....

> [  131.023034] Call Trace:
> [  131.023034]  dump_stack+0x85/0xc7
> [  131.023034]  __lock_acquire+0x1747/0x17a0
> [  131.023034]  ? lru_add_drain_all+0x3d/0x190
> [  131.023034]  ? __mutex_lock+0x218/0x940
> [  131.023034]  ? trace_hardirqs_on+0xd/0x10
> [  131.023034]  lock_acquire+0x103/0x200
> [  131.023034]  ? lock_acquire+0x103/0x200
> [  131.023034]  ? lru_add_drain_all+0x42/0x190
> [  131.023034]  cpus_read_lock+0x3d/0x80
> [  131.023034]  ? lru_add_drain_all+0x42/0x190
> [  131.023034]  lru_add_drain_all+0x42/0x190
> [  131.023034]  __offline_pages.constprop.25+0x5de/0x870
> [  131.023034]  offline_pages+0xc/0x10
> [  131.023034]  memory_subsys_offline+0x43/0x70
> [  131.023034]  device_offline+0x83/0xb0
> [  131.023034]  store_mem_state+0xdb/0xe0
> [  131.023034]  dev_attr_store+0x13/0x20
> [  131.023034]  sysfs_kf_write+0x40/0x50
> [  131.023034]  kernfs_fop_write+0x130/0x1b0
> [  131.023034]  __vfs_write+0x23/0x130
> [  131.023034]  ? rcu_read_lock_sched_held+0x6d/0x80
> [  131.023034]  ? rcu_sync_lockdep_assert+0x2a/0x50
> [  131.023034]  ? __sb_start_write+0xd4/0x1c0
> [  131.023034]  ? vfs_write+0x1a8/0x1d0
> [  131.023034]  vfs_write+0xc8/0x1d0
> [  131.023034]  SyS_write+0x44/0xa0

Why didn't that trigger here? Bah, I should have become suspicious due to
not seeing a splat ....

The patch below should cure that.

Thanks,

	tglx

8<-------------------
Subject: mm: Change cpuhotplug lock order in lru_add_drain_all()
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 30 Jun 2017 14:25:24 +0200

Not-Yet-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/swap.h |    1 +
 mm/memory_hotplug.c  |    4 ++--
 mm/swap.c            |   11 ++++++++---
 3 files changed, 11 insertions(+), 5 deletions(-)

Index: b/include/linux/swap.h
===================================================================
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -277,6 +277,7 @@ extern void mark_page_accessed(struct pa
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
+extern void lru_add_drain_all_cpuslocked(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
Index: b/mm/memory_hotplug.c
===================================================================
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1860,7 +1860,7 @@ static int __ref __offline_pages(unsigne
 		goto failed_removal;
 	ret = 0;
 	if (drain) {
-		lru_add_drain_all();
+		lru_add_drain_all_cpuslocked();
 		cond_resched();
 		drain_all_pages(zone);
 	}
@@ -1881,7 +1881,7 @@ static int __ref __offline_pages(unsigne
 		}
 	}
 	/* drain all zone's lru pagevec, this is asynchronous... */
-	lru_add_drain_all();
+	lru_add_drain_all_cpuslocked();
 	yield();
 	/* drain pcp pages, this is synchronous. */
 	drain_all_pages(zone);
Index: b/mm/swap.c
===================================================================
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -687,7 +687,7 @@ static void lru_add_drain_per_cpu(struct
 
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
-void lru_add_drain_all(void)
+void lru_add_drain_all_cpuslocked(void)
 {
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
@@ -701,7 +701,6 @@ void lru_add_drain_all(void)
 		return;
 
 	mutex_lock(&lock);
-	get_online_cpus();
 	cpumask_clear(&has_work);
 
 	for_each_online_cpu(cpu) {
@@ -721,10 +720,16 @@ void lru_add_drain_all(void)
 	for_each_cpu(cpu, &has_work)
 		flush_work(&per_cpu(lru_add_drain_work, cpu));
 
-	put_online_cpus();
 	mutex_unlock(&lock);
 }
 
+void lru_add_drain_all(void)
+{
+	get_online_cpus();
+	lru_add_drain_all_cpuslocked();
+	put_online_cpus();
+}
+
 /**
  * release_pages - batched put_page()
  * @pages: array of pages to release
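
The naming follows the stop_machine_cpuslocked() convention: the
_cpuslocked variant assumes the caller already holds the cpu hotplug
lock, while the plain wrapper takes and drops it itself. Caller-side
sketch:

	/* under mem_hotplug_begin(), cpu hotplug lock already held: */
	lru_add_drain_all_cpuslocked();

	/* everywhere else, unchanged: */
	lru_add_drain_all();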


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-30 13:00       ` Thomas Gleixner
@ 2017-06-30 15:56         ` Andrey Ryabinin
  0 siblings, 0 replies; 10+ messages in thread
From: Andrey Ryabinin @ 2017-06-30 15:56 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Michal Hocko, LKML, linux-mm, Andrew Morton, Vlastimil Babka,
	Vladimir Davydov, Heiko Carstens

On 06/30/2017 04:00 PM, Thomas Gleixner wrote:
> On Fri, 30 Jun 2017, Andrey Ryabinin wrote:
>> On 06/30/2017 01:15 PM, Thomas Gleixner wrote:
>>> On Fri, 30 Jun 2017, Michal Hocko wrote:
>>>> So I like this simplification a lot! Even if we can get rid of the
>>>> stop_machine eventually, this patch would be an improvement. A short
>>>> comment on why the per-cpu semaphore is better than a regular one
>>>> would be nice.
>>>
>>> Yes, will add one.
>>>
>>> The main point is that the current locking construct is evading lockdep due
>>> to the ability to support recursive locking, which I did not observe so
>>> far.
>>>
>>
>> Like this?
> 
> Cute.....
> 
>> [...]
> 
> Why didn't that trigger here? Bah, I should have become suspicious due to
> not seeing a splat ....
> 
> The patch below should cure that.
> 

FWIW, it works for me.

> Thanks,
> 
> 	tglx
> 


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-29 16:11 [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem Thomas Gleixner
  2017-06-30  9:27 ` Michal Hocko
@ 2017-07-03 12:41 ` Vladimir Davydov
  2017-07-03 16:38 ` Michal Hocko
  2 siblings, 0 replies; 10+ messages in thread
From: Vladimir Davydov @ 2017-07-03 12:41 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andrey Ryabinin, LKML, linux-mm, Andrew Morton, Michal Hocko,
	Vlastimil Babka

On Thu, Jun 29, 2017 at 06:11:15PM +0200, Thomas Gleixner wrote:
> Andrey reported a potential deadlock with the memory hotplug lock and the
> cpu hotplug lock.
> 
> The reason is that memory hotplug takes the memory hotplug lock and then
> calls stop_machine(), which calls get_online_cpus(). That's the reverse lock
> order to get_online_cpus(); get_online_mems(); in mm/slab_common.c.
> 
> The problem has been there forever. The reason why this was never reported
> is that the cpu hotplug locking had a homebrewed recursive reader-writer
> semaphore construct which, due to the recursion, evaded full lockdep
> coverage. The memory hotplug code copied that construct verbatim and
> therefore has similar issues.

The only reason I copied the get_online_cpus() implementation instead of
using an rw semaphore was that I didn't want to deal with potential
deadlocks caused by calling get_online_mems() from the memory hotplug
code, like the one reported by Andrey below. However, these bugs should
be pretty easy to fix, as you clearly demonstrated in response to
Andrey's report. Apart from that, I don't see any problems with this
patch, and the code simplification does look compelling. FWIW,

Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>

> 
> Two steps to fix this:
> 
> 1) Convert the memory hotplug locking to a per-cpu rwsem so the potential
>    issues get reported properly by lockdep.
> 
> 2) Lock the online cpus in mem_hotplug_begin() before taking the memory
>    hotplug rwsem and use stop_machine_cpuslocked() in the page_alloc code
>    to avoid recursive locking.
> 
> Reported-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: linux-mm@kvack.org
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Vlastimil Babka <vbabka@suse.cz>


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-30 10:15   ` Thomas Gleixner
  2017-06-30 11:49     ` Andrey Ryabinin
@ 2017-07-03 16:32     ` Michal Hocko
  2017-07-03 19:57       ` Thomas Gleixner
  1 sibling, 1 reply; 10+ messages in thread
From: Michal Hocko @ 2017-07-03 16:32 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andrey Ryabinin, LKML, linux-mm, Andrew Morton, Vlastimil Babka,
	Vladimir Davydov, Heiko Carstens

On Fri 30-06-17 12:15:21, Thomas Gleixner wrote:
[...]
> Sure. Just to make you mull over more stuff, find below the patch which
> moves all of this to use the cpu hotplug lock.
> 
> Thanks,
> 
> 	tglx
> 
> 8<--------------------
> Subject: mm/memory-hotplug: Use cpu hotplug lock
> From: Thomas Gleixner <tglx@linutronix.de>
> Date: Thu, 29 Jun 2017 16:30:00 +0200
> 
> Most places which take the memory hotplug lock take the cpu hotplug lock as
> well. Avoid the double locking and use the cpu hotplug lock for both.

Hmm, I am usually not a fan of conflating locks because it is then less
clear what the lock actually protects. Memory and cpu hotplug should
be largely independent, so I am not sure this patch simplifies things a
lot. It is nice to see a few lines go away, but I am a little bit worried
that we will eventually develop separate locking again in the future for
some weird memory hotplug usecases.
 
> Not-Yet-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[...]
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
[...]
> @@ -2138,7 +2114,7 @@ void __ref remove_memory(int nid, u64 st
>  
>  	try_offline_node(nid);
>  
> -	mem_hotplug_done();
> +	cpus_write_lock();

unlock you meant here, right?

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-06-29 16:11 [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem Thomas Gleixner
  2017-06-30  9:27 ` Michal Hocko
  2017-07-03 12:41 ` Vladimir Davydov
@ 2017-07-03 16:38 ` Michal Hocko
  2 siblings, 0 replies; 10+ messages in thread
From: Michal Hocko @ 2017-07-03 16:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andrey Ryabinin, LKML, linux-mm, Andrew Morton, Vlastimil Babka

On Thu 29-06-17 18:11:15, Thomas Gleixner wrote:
> Andrey reported a potential deadlock with the memory hotplug lock and the
> cpu hotplug lock.
> 
> The reason is that memory hotplug takes the memory hotplug lock and then
> calls stop_machine(), which calls get_online_cpus(). That's the reverse lock
> order to get_online_cpus(); get_online_mems(); in mm/slab_common.c.
> 
> The problem has been there forever. The reason why this was never reported
> is that the cpu hotplug locking had a homebrewed recursive reader-writer
> semaphore construct which, due to the recursion, evaded full lockdep
> coverage. The memory hotplug code copied that construct verbatim and
> therefore has similar issues.
> 
> Two steps to fix this:
> 
> 1) Convert the memory hotplug locking to a per-cpu rwsem so the potential
>    issues get reported properly by lockdep.
> 
> 2) Lock the online cpus in mem_hotplug_begin() before taking the memory
>    hotplug rwsem and use stop_machine_cpuslocked() in the page_alloc code
>    to avoid recursive locking.
> 
> Reported-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: linux-mm@kvack.org
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Vlastimil Babka <vbabka@suse.cz>

With the full patch for the lockdep splat
Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> [...]
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm/memory-hotplug: Switch locking to a percpu rwsem
  2017-07-03 16:32     ` Michal Hocko
@ 2017-07-03 19:57       ` Thomas Gleixner
  0 siblings, 0 replies; 10+ messages in thread
From: Thomas Gleixner @ 2017-07-03 19:57 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrey Ryabinin, LKML, linux-mm, Andrew Morton, Vlastimil Babka,
	Vladimir Davydov, Heiko Carstens

On Mon, 3 Jul 2017, Michal Hocko wrote:
> On Fri 30-06-17 12:15:21, Thomas Gleixner wrote:
> [...]
> > Sure. Just to make you mull over more stuff, find below the patch which
> > moves all of this to use the cpu hotplug lock.
> > 
> > Thanks,
> > 
> > 	tglx
> > 
> > 8<--------------------
> > Subject: mm/memory-hotplug: Use cpu hotplug lock
> > From: Thomas Gleixner <tglx@linutronix.de>
> > Date: Thu, 29 Jun 2017 16:30:00 +0200
> > 
> > Most places which take the memory hotplug lock take the cpu hotplug lock as
> > well. Avoid the double locking and use the cpu hotplug lock for both.
> 
> Hmm, I am usually not a fan of conflating locks because it is then less
> clear what the lock actually protects. Memory and cpu hotplug should
> be largely independent, so I am not sure this patch simplifies things a
> lot. It is nice to see a few lines go away, but I am a little bit worried
> that we will eventually develop separate locking again in the future for
> some weird memory hotplug usecases.

Fair enough.

>  
> > Not-Yet-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> [...]
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> [...]
> > @@ -2138,7 +2114,7 @@ void __ref remove_memory(int nid, u64 st
> >  
> >  	try_offline_node(nid);
> >  
> > -	mem_hotplug_done();
> > +	cpus_write_lock();
> 
> unlock you meant here, right?

Doh, -ENOQUILTREFRESH

