Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
From: Michal Hocko <mhocko@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	linux-mm@kvack.org, LKML <linux-kernel@vger.kernel.org>,
	Michal Hocko <mhocko@suse.com>
Subject: [PATCH 8/9] mm, memory_hotplug: get rid of zonelists_mutex
Date: Fri, 21 Jul 2017 16:39:14 +0200
Message-ID: <20170721143915.14161-9-mhocko@kernel.org> (raw)
In-Reply-To: <20170721143915.14161-1-mhocko@kernel.org>

From: Michal Hocko <mhocko@suse.com>

zonelists_mutex has been introduced by 4eaf3f64397c ("mem-hotplug: fix
potential race while building zonelist for new populated zone") to
protect zonelist building from races. This is no longer needed though
because both memory online and offline are fully serialized. New users
have grown since then.

Notably setup_per_zone_wmarks wants to prevent from races between memory
hotplug, khugepaged setup and manual min_free_kbytes update via sysctl
(see cfd3da1e49bb ("mm: Serialize access to min_free_kbytes"). Let's
add a private lock for that purpose. This will not prevent from seeing
halfway through memory hotplug operation but that shouldn't be a big
deal becuse memory hotplug will update watermarks explicitly so we will
eventually get a full picture. The lock just makes sure we won't race
when updating watermarks leading to weird results.

Also __build_all_zonelists manipulates global data so add a private lock
for it as well. This doesn't seem to be necessary today but it is more
robust to have a lock there.

While we are at it make sure we document that memory online/offline
depends on a full serialization either via mem_hotplug_begin() or
device_lock.

Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 include/linux/mmzone.h |  1 -
 mm/memory_hotplug.c    | 12 ++----------
 mm/page_alloc.c        | 18 +++++++++---------
 3 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0add5ee09729..fda9afbd14d9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -770,7 +770,6 @@ static inline bool is_dev_zone(const struct zone *zone)
 
 #include <linux/memory_hotplug.h>
 
-extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d2f6a11075c..c1b0077eb9d1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -917,7 +917,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
 	return zone;
 }
 
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
 {
 	unsigned long flags;
@@ -949,7 +949,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	 * This means the page allocator ignores this zone.
 	 * So, zonelist must be updated after online.
 	 */
-	mutex_lock(&zonelists_mutex);
 	if (!populated_zone(zone)) {
 		need_zonelists_rebuild = 1;
 		setup_zone_pageset(zone);
@@ -960,7 +959,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	if (ret) {
 		if (need_zonelists_rebuild)
 			zone_pcp_reset(zone);
-		mutex_unlock(&zonelists_mutex);
 		goto failed_addition;
 	}
 
@@ -978,8 +976,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 			zone_pcp_update(zone);
 	}
 
-	mutex_unlock(&zonelists_mutex);
-
 	init_per_zone_wmark_min();
 
 	if (onlined_pages) {
@@ -1050,9 +1046,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	 * The node we allocated has no zone fallback lists. For avoiding
 	 * to access not-initialized zonelist, build here.
 	 */
-	mutex_lock(&zonelists_mutex);
 	build_all_zonelists(pgdat);
-	mutex_unlock(&zonelists_mutex);
 
 	/*
 	 * zone->managed_pages is set to an approximate value in
@@ -1719,9 +1713,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 
 	if (!populated_zone(zone)) {
 		zone_pcp_reset(zone);
-		mutex_lock(&zonelists_mutex);
 		build_all_zonelists(NULL);
-		mutex_unlock(&zonelists_mutex);
 	} else
 		zone_pcp_update(zone);
 
@@ -1747,7 +1739,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	return ret;
 }
 
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cf2eb3cf2cc5..369a263ce2fb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5067,17 +5067,14 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-
 static void __build_all_zonelists(void *data)
 {
 	int nid;
 	int __maybe_unused cpu;
 	pg_data_t *self = data;
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
@@ -5109,6 +5106,8 @@ static void __build_all_zonelists(void *data)
 			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
 #endif
 	}
+
+	spin_unlock(&lock);
 }
 
 static noinline void __init
@@ -5139,7 +5138,6 @@ build_all_zonelists_init(void)
 }
 
 /*
- * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  *
  * __ref due to call of __init annotated helper build_all_zonelists_init
@@ -6880,9 +6878,11 @@ static void __setup_per_zone_wmarks(void)
  */
 void setup_per_zone_wmarks(void)
 {
-	mutex_lock(&zonelists_mutex);
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 	__setup_per_zone_wmarks();
-	mutex_unlock(&zonelists_mutex);
+	spin_unlock(&lock);
 }
 
 /*
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply index

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-07-21 14:39 [PATCH -v1 0/9] cleanup zonelists initialization Michal Hocko
2017-07-21 14:39 ` [PATCH 1/9] mm, page_alloc: rip out ZONELIST_ORDER_ZONE Michal Hocko
2017-07-21 14:45   ` Michal Hocko
2017-07-21 14:39 ` [PATCH 2/9] mm, page_alloc: remove boot pageset initialization from memory hotplug Michal Hocko
2017-07-21 14:39 ` [PATCH 3/9] mm, page_alloc: do not set_cpu_numa_mem on empty nodes initialization Michal Hocko
2017-07-21 14:39 ` [PATCH 4/9] mm, memory_hotplug: drop zone from build_all_zonelists Michal Hocko
2017-07-21 14:39 ` [PATCH 5/9] mm, memory_hotplug: remove explicit build_all_zonelists from try_online_node Michal Hocko
2017-07-21 14:39 ` [PATCH 6/9] mm, page_alloc: simplify zonelist initialization Michal Hocko
2017-07-24  9:25   ` Vlastimil Babka
2017-07-21 14:39 ` [PATCH 7/9] mm, page_alloc: remove stop_machine from build_all_zonelists Michal Hocko
2017-07-21 14:39 ` Michal Hocko [this message]
2017-07-21 14:39 ` [PATCH 9/9] mm, sparse, page_ext: drop ugly N_HIGH_MEMORY branches for allocations Michal Hocko
  -- strict thread matches above, loose matches on Subject: below --
2017-07-14  7:59 [PATCH 0/9] cleanup zonelists initialization Michal Hocko
2017-07-14  8:00 ` [PATCH 8/9] mm, memory_hotplug: get rid of zonelists_mutex Michal Hocko

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170721143915.14161-9-mhocko@kernel.org \
    --to=mhocko@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.com \
    --cc=vbabka@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git