* [PATCH 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
From: Tahsin Erdogan @ 2017-02-25 21:00 UTC
  To: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, Michal Hocko,
	zijun_hu, Joonsoo Kim
  Cc: linux-mm, linux-kernel, Tahsin Erdogan

When pcpu_alloc() is called with gfp != GFP_KERNEL, the likelihood of
failure is higher than in the GFP_KERNEL case. This is mainly because
pcpu_alloc() relies on previously allocated reserves and makes no
effort to add memory to its pools in the non-GFP_KERNEL case.

This issue is somewhat mitigated by kicking off background work when a
memory allocation failure occurs, but that does not help the original
victim of the failure.

This problem affects blkg_lookup_create() callers on machines with a
large number of CPUs.

This patch reduces the failure cases by trying to expand the memory
pools. The gfp flags are passed along, so it is safe to allocate
memory this way.
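
For reference, the callers this targets allocate percpu memory from
contexts that cannot sleep, via the existing alloc_percpu_gfp()
interface. A minimal sketch of such a caller (the structure and
function names are made up for illustration; this is not code from the
patch):

  /*
   * Hypothetical atomic-context caller: GFP_NOWAIT forbids sleeping
   * and direct reclaim, so before this patch the request could only
   * be served from pre-populated percpu reserves.
   */
  struct example_stats {
  	u64 ios;
  	u64 bytes;
  };

  static struct example_stats __percpu *example_alloc_atomic(void)
  {
  	return alloc_percpu_gfp(struct example_stats, GFP_NOWAIT);
  }

  /* pair with free_percpu() once the user is done */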

To make this work, a gfp-aware vmalloc_gfp() function is added. Also,
locking around vmap_area_lock has been updated to save/restore irq
flags; this is needed to avoid a lockdep problem between
request_queue->queue_lock and vmap_area_lock.
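
The ordering that lockdep complains about, roughly: a caller such as
blkg_lookup_create() enters pcpu_alloc() while holding an irq-safe
spinlock (request_queue->queue_lock), and pcpu_alloc() can now reach
vmap_area_lock, so every vmap_area_lock critical section must also run
with irqs disabled. A hypothetical illustration of the pattern applied
throughout mm/vmalloc.c in this patch (not code from the patch
itself):

  static void example_vmap_area_lock_section(void)
  {
  	unsigned long flags;

  	/* disable local irqs and remember their previous state */
  	spin_lock_irqsave(&vmap_area_lock, flags);
  	/* ... inspect or modify vmap area structures ... */
  	spin_unlock_irqrestore(&vmap_area_lock, flags);
  }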

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 include/linux/vmalloc.h |   5 +-
 mm/percpu-km.c          |   8 +--
 mm/percpu-vm.c          | 119 +++++++++++-------------------------
 mm/percpu.c             | 156 ++++++++++++++++++++++++++++--------------------
 mm/vmalloc.c            |  74 ++++++++++++++---------
 5 files changed, 179 insertions(+), 183 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index d68edffbf142..8110a0040b9d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -72,6 +72,7 @@ extern void *vzalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
+extern void *vmalloc_gfp(unsigned long size, gfp_t gfp_mask);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
@@ -165,14 +166,14 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 # ifdef CONFIG_MMU
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     const size_t *sizes, int nr_vms,
-				     size_t align);
+				     size_t align, gfp_t gfp_mask);
 
 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
 # else
 static inline struct vm_struct **
 pcpu_get_vm_areas(const unsigned long *offsets,
 		const size_t *sizes, int nr_vms,
-		size_t align)
+		size_t align, gfp_t gfp_mask)
 {
 	return NULL;
 }
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d66911ff42d9..599a9ce84544 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -34,7 +34,7 @@
 #include <linux/log2.h>
 
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-			       int page_start, int page_end)
+			       int page_start, int page_end, gfp_t gfp)
 {
 	return 0;
 }
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
 	/* nada */
 }
 
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
 	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
 	struct pcpu_chunk *chunk;
 	struct page *pages;
 	int i;
 
-	chunk = pcpu_alloc_chunk();
+	chunk = pcpu_alloc_chunk(gfp);
 	if (!chunk)
 		return NULL;
 
-	pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+	pages = alloc_pages(gfp, order_base_2(nr_pages));
 	if (!pages) {
 		pcpu_free_chunk(chunk);
 		return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9ac639499bd1..42348a421ccf 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -20,28 +20,6 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
 }
 
 /**
- * pcpu_get_pages - get temp pages array
- *
- * Returns pointer to array of pointers to struct page which can be indexed
- * with pcpu_page_idx().  Note that there is only one array and accesses
- * should be serialized by pcpu_alloc_mutex.
- *
- * RETURNS:
- * Pointer to temp pages array on success.
- */
-static struct page **pcpu_get_pages(void)
-{
-	static struct page **pages;
-	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
-
-	lockdep_assert_held(&pcpu_alloc_mutex);
-
-	if (!pages)
-		pages = pcpu_mem_zalloc(pages_size);
-	return pages;
-}
-
-/**
  * pcpu_free_pages - free pages which were allocated for @chunk
  * @chunk: chunk pages were allocated for
  * @pages: array of pages to be freed, indexed by pcpu_page_idx()
@@ -73,15 +51,16 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
  * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
  * @page_start: page index of the first page to be allocated
  * @page_end: page index of the last page to be allocated + 1
+ * @gfp: gfp flags
  *
  * Allocate pages [@page_start,@page_end) into @pages for all units.
  * The allocation is for @chunk.  Percpu core doesn't care about the
  * content of @pages and will pass it verbatim to pcpu_map_pages().
  */
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
-			    struct page **pages, int page_start, int page_end)
+			    struct page **pages, int page_start, int page_end,
+			    gfp_t gfp)
 {
-	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 	unsigned int cpu, tcpu;
 	int i;
 
@@ -135,38 +114,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
 }
 
 /**
- * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
- * @chunk: chunk of interest
- * @pages: pages array which can be used to pass information to free
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- *
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * Corresponding elements in @pages were cleared by the caller and can
- * be used to carry information to pcpu_free_pages() which will be
- * called after all unmaps are finished.  The caller should call
- * proper pre/post flush functions.
- */
-static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
-			     struct page **pages, int page_start, int page_end)
-{
-	unsigned int cpu;
-	int i;
-
-	for_each_possible_cpu(cpu) {
-		for (i = page_start; i < page_end; i++) {
-			struct page *page;
-
-			page = pcpu_chunk_page(chunk, cpu, i);
-			WARN_ON(!page);
-			pages[pcpu_page_idx(cpu, i)] = page;
-		}
-		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
-				   page_end - page_start);
-	}
-}
-
-/**
  * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
  * @chunk: pcpu_chunk the regions to be flushed belong to
  * @page_start: page index of the first page to be flushed
@@ -262,32 +209,38 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
  * @chunk: chunk of interest
  * @page_start: the start page
  * @page_end: the end page
+ * @gfp: gfp flags
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.
- *
- * CONTEXT:
- * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-			       int page_start, int page_end)
+			       int page_start, int page_end, gfp_t gfp)
 {
 	struct page **pages;
+	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+	int ret;
 
-	pages = pcpu_get_pages();
+	pages = pcpu_mem_zalloc(pages_size, gfp);
 	if (!pages)
 		return -ENOMEM;
 
-	if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
-		return -ENOMEM;
+	if (pcpu_alloc_pages(chunk, pages, page_start, page_end,
+			     gfp | __GFP_HIGHMEM | __GFP_COLD)) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
 		pcpu_free_pages(chunk, pages, page_start, page_end);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 	pcpu_post_map_flush(chunk, page_start, page_end);
-
-	return 0;
+	ret = 0;
+out:
+	pcpu_mem_free(pages);
+	return ret;
 }
 
 /**
@@ -298,44 +251,40 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
  *
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
 				  int page_start, int page_end)
 {
-	struct page **pages;
-
-	/*
-	 * If control reaches here, there must have been at least one
-	 * successful population attempt so the temp pages array must
-	 * be available now.
-	 */
-	pages = pcpu_get_pages();
-	BUG_ON(!pages);
+	unsigned int cpu;
+	int i;
 
-	/* unmap and free */
 	pcpu_pre_unmap_flush(chunk, page_start, page_end);
 
-	pcpu_unmap_pages(chunk, pages, page_start, page_end);
+	for_each_possible_cpu(cpu)
+		for (i = page_start; i < page_end; i++) {
+			struct page *page;
+
+			page = pcpu_chunk_page(chunk, cpu, i);
+			WARN_ON(!page);
 
-	/* no need to flush tlb, vmalloc will handle it lazily */
+			__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, i), 1);
 
-	pcpu_free_pages(chunk, pages, page_start, page_end);
+			if (likely(page))
+				__free_page(page);
+		}
 }
 
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
 	struct pcpu_chunk *chunk;
 	struct vm_struct **vms;
 
-	chunk = pcpu_alloc_chunk();
+	chunk = pcpu_alloc_chunk(gfp);
 	if (!chunk)
 		return NULL;
 
 	vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
-				pcpu_nr_groups, pcpu_atom_size);
+				pcpu_nr_groups, pcpu_atom_size, gfp);
 	if (!vms) {
 		pcpu_free_chunk(chunk);
 		return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 232356a2d914..f2cee0ae8688 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -103,6 +103,11 @@
 #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
 #endif	/* CONFIG_SMP */
 
+#define PCPU_BUSY_EXPAND_MAP		1	/* pcpu_alloc() is expanding
+						 * the map
+						 */
+#define PCPU_BUSY_POPULATE_CHUNK	2	/* chunk is being populated */
+
 struct pcpu_chunk {
 	struct list_head	list;		/* linked to pcpu_slot lists */
 	int			free_size;	/* free bytes in the chunk */
@@ -118,6 +123,7 @@ struct pcpu_chunk {
 	int			first_free;	/* no free below this */
 	bool			immutable;	/* no [de]population allowed */
 	int			nr_populated;	/* # of populated pages */
+	int			busy_flags;	/* type of work in progress */
 	unsigned long		populated[];	/* populated bitmap */
 };
 
@@ -162,7 +168,6 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 
@@ -282,29 +287,31 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
 	     (rs) < (re);						    \
 	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
 
+static bool pcpu_has_unpop_pages(struct pcpu_chunk *chunk, int start, int end)
+{
+	return find_next_zero_bit(chunk->populated, end, start) < end;
+}
+
 /**
  * pcpu_mem_zalloc - allocate memory
  * @size: bytes to allocate
  *
  * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used.  The returned
+ * kzalloc() is used; otherwise, vmalloc_gfp() is used.  The returned
  * memory is always zeroed.
  *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
- *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
 {
 	if (WARN_ON_ONCE(!slab_is_available()))
 		return NULL;
 
 	if (size <= PAGE_SIZE)
-		return kzalloc(size, GFP_KERNEL);
+		return kzalloc(size, gfp);
 	else
-		return vzalloc(size);
+		return vmalloc_gfp(size, gfp | __GFP_HIGHMEM | __GFP_ZERO);
 }
 
 /**
@@ -438,15 +445,14 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc,
+				gfp_t gfp)
 {
 	int *old = NULL, *new = NULL;
 	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
 	unsigned long flags;
 
-	lockdep_assert_held(&pcpu_alloc_mutex);
-
-	new = pcpu_mem_zalloc(new_size);
+	new = pcpu_mem_zalloc(new_size, gfp);
 	if (!new)
 		return -ENOMEM;
 
@@ -716,16 +722,16 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
 	pcpu_chunk_relocate(chunk, oslot);
 }
 
-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
 {
 	struct pcpu_chunk *chunk;
 
-	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
 	if (!chunk)
 		return NULL;
 
 	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
-						sizeof(chunk->map[0]));
+						sizeof(chunk->map[0]), gfp);
 	if (!chunk->map) {
 		pcpu_mem_free(chunk);
 		return NULL;
@@ -811,9 +817,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  * pcpu_addr_to_page		- translate address to physical address
  * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
  */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size,
+			       gfp_t gfp);
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -874,6 +881,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
 	int occ_pages = 0;
 	int slot, off, new_alloc, cpu, ret;
+	int page_start, page_end;
 	unsigned long flags;
 	void __percpu *ptr;
 
@@ -893,9 +901,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		return NULL;
 	}
 
-	if (!is_atomic)
-		mutex_lock(&pcpu_alloc_mutex);
-
 	spin_lock_irqsave(&pcpu_lock, flags);
 
 	/* serve reserved allocations from the reserved chunk if available */
@@ -909,8 +914,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 		while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
 			spin_unlock_irqrestore(&pcpu_lock, flags);
-			if (is_atomic ||
-			    pcpu_extend_area_map(chunk, new_alloc) < 0) {
+			if (pcpu_extend_area_map(chunk, new_alloc, gfp) < 0) {
 				err = "failed to extend area map of reserved chunk";
 				goto fail;
 			}
@@ -933,17 +937,24 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 			if (size > chunk->contig_hint)
 				continue;
 
+			if (chunk->busy_flags & PCPU_BUSY_POPULATE_CHUNK)
+				continue;
+
 			new_alloc = pcpu_need_to_extend(chunk, is_atomic);
 			if (new_alloc) {
-				if (is_atomic)
-					continue;
+				chunk->busy_flags |= PCPU_BUSY_EXPAND_MAP;
 				spin_unlock_irqrestore(&pcpu_lock, flags);
-				if (pcpu_extend_area_map(chunk,
-							 new_alloc) < 0) {
+
+				ret = pcpu_extend_area_map(chunk, new_alloc,
+							   gfp);
+				spin_lock_irqsave(&pcpu_lock, flags);
+				chunk->busy_flags &= ~PCPU_BUSY_EXPAND_MAP;
+				if (ret < 0) {
+					spin_unlock_irqrestore(&pcpu_lock,
+							       flags);
 					err = "failed to extend area map";
 					goto fail;
 				}
-				spin_lock_irqsave(&pcpu_lock, flags);
 				/*
 				 * pcpu_lock has been dropped, need to
 				 * restart cpu_slot list walking.
@@ -953,53 +964,59 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 			off = pcpu_alloc_area(chunk, size, align, is_atomic,
 					      &occ_pages);
+			if (off < 0 && is_atomic) {
+				/* Try non-populated areas. */
+				off = pcpu_alloc_area(chunk, size, align, false,
+						      &occ_pages);
+			}
+
 			if (off >= 0)
 				goto area_found;
 		}
 	}
 
+	WARN_ON(!list_empty(&pcpu_slot[pcpu_nr_slots - 1]));
+
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 
-	/*
-	 * No space left.  Create a new chunk.  We don't want multiple
-	 * tasks to create chunks simultaneously.  Serialize and create iff
-	 * there's still no empty chunk after grabbing the mutex.
-	 */
-	if (is_atomic)
+	chunk = pcpu_create_chunk(gfp);
+	if (!chunk) {
+		err = "failed to allocate new chunk";
 		goto fail;
+	}
 
-	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
-		chunk = pcpu_create_chunk();
-		if (!chunk) {
-			err = "failed to allocate new chunk";
-			goto fail;
-		}
+	spin_lock_irqsave(&pcpu_lock, flags);
 
-		spin_lock_irqsave(&pcpu_lock, flags);
+	/* Check whether someone else added a chunk while the lock
+	 * was dropped.
+	 */
+	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1]))
 		pcpu_chunk_relocate(chunk, -1);
-	} else {
-		spin_lock_irqsave(&pcpu_lock, flags);
-	}
+	else
+		pcpu_destroy_chunk(chunk);
 
 	goto restart;
 
 area_found:
-	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	page_start = PFN_DOWN(off);
+	page_end = PFN_UP(off + size);
 
 	/* populate if not all pages are already there */
-	if (!is_atomic) {
-		int page_start, page_end, rs, re;
+	if (pcpu_has_unpop_pages(chunk, page_start, page_end)) {
+		int rs, re;
 
-		page_start = PFN_DOWN(off);
-		page_end = PFN_UP(off + size);
+		chunk->busy_flags |= PCPU_BUSY_POPULATE_CHUNK;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
 
 		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 			WARN_ON(chunk->immutable);
 
-			ret = pcpu_populate_chunk(chunk, rs, re);
+			ret = pcpu_populate_chunk(chunk, rs, re, gfp);
 
 			spin_lock_irqsave(&pcpu_lock, flags);
 			if (ret) {
+				chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
 				pcpu_free_area(chunk, off, &occ_pages);
 				err = "failed to populate";
 				goto fail_unlock;
@@ -1008,18 +1025,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 			spin_unlock_irqrestore(&pcpu_lock, flags);
 		}
 
-		mutex_unlock(&pcpu_alloc_mutex);
+		spin_lock_irqsave(&pcpu_lock, flags);
+		chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
 	}
 
-	if (chunk != pcpu_reserved_chunk) {
-		spin_lock_irqsave(&pcpu_lock, flags);
+	if (chunk != pcpu_reserved_chunk)
 		pcpu_nr_empty_pop_pages -= occ_pages;
-		spin_unlock_irqrestore(&pcpu_lock, flags);
-	}
 
 	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
 		pcpu_schedule_balance_work();
 
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
 	/* clear the areas and return address relative to base address */
 	for_each_possible_cpu(cpu)
 		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
@@ -1042,8 +1059,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		/* see the flag handling in pcpu_blance_workfn() */
 		pcpu_atomic_alloc_failed = true;
 		pcpu_schedule_balance_work();
-	} else {
-		mutex_unlock(&pcpu_alloc_mutex);
 	}
 	return NULL;
 }
@@ -1118,7 +1133,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
 	 * There's no reason to keep around multiple unused chunks and VM
 	 * areas can be scarce.  Destroy all free chunks except for one.
 	 */
-	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, free_head, list) {
@@ -1128,6 +1142,10 @@ static void pcpu_balance_workfn(struct work_struct *work)
 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;
 
+		if (chunk->busy_flags & (PCPU_BUSY_POPULATE_CHUNK |
+					 PCPU_BUSY_EXPAND_MAP))
+			continue;
+
 		list_del_init(&chunk->map_extend_list);
 		list_move(&chunk->list, &to_free);
 	}
@@ -1162,7 +1180,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
 		spin_unlock_irq(&pcpu_lock);
 
 		if (new_alloc)
-			pcpu_extend_area_map(chunk, new_alloc);
+			pcpu_extend_area_map(chunk, new_alloc, GFP_KERNEL);
 	} while (chunk);
 
 	/*
@@ -1194,20 +1212,29 @@ static void pcpu_balance_workfn(struct work_struct *work)
 
 		spin_lock_irq(&pcpu_lock);
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (chunk->busy_flags & PCPU_BUSY_POPULATE_CHUNK)
+				continue;
 			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
 			if (nr_unpop)
 				break;
 		}
+
+		if (nr_unpop)
+			chunk->busy_flags |= PCPU_BUSY_POPULATE_CHUNK;
+
 		spin_unlock_irq(&pcpu_lock);
 
 		if (!nr_unpop)
 			continue;
 
-		/* @chunk can't go away while pcpu_alloc_mutex is held */
+		/* @chunk can't go away because only pcpu_balance_workfn
+		 * destroys it.
+		 */
 		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
 			int nr = min(re - rs, nr_to_pop);
 
-			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr,
+						  GFP_KERNEL);
 			if (!ret) {
 				nr_to_pop -= nr;
 				spin_lock_irq(&pcpu_lock);
@@ -1220,11 +1247,14 @@ static void pcpu_balance_workfn(struct work_struct *work)
 			if (!nr_to_pop)
 				break;
 		}
+		spin_lock_irq(&pcpu_lock);
+		chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
+		spin_unlock_irq(&pcpu_lock);
 	}
 
 	if (nr_to_pop) {
 		/* ran out of chunks to populate, create a new one and retry */
-		chunk = pcpu_create_chunk();
+		chunk = pcpu_create_chunk(GFP_KERNEL);
 		if (chunk) {
 			spin_lock_irq(&pcpu_lock);
 			pcpu_chunk_relocate(chunk, -1);
@@ -1232,8 +1262,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
 			goto retry_pop;
 		}
 	}
-
-	mutex_unlock(&pcpu_alloc_mutex);
 }
 
 /**
@@ -2297,7 +2325,7 @@ void __init percpu_init_late(void)
 
 		BUILD_BUG_ON(size > PAGE_SIZE);
 
-		map = pcpu_mem_zalloc(size);
+		map = pcpu_mem_zalloc(size, GFP_KERNEL);
 		BUG_ON(!map);
 
 		spin_lock_irqsave(&pcpu_lock, flags);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d89034a393f2..01abc9ed5224 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -360,6 +360,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	unsigned long addr;
 	int purged = 0;
 	struct vmap_area *first;
+	unsigned long flags;
 
 	BUG_ON(!size);
 	BUG_ON(offset_in_page(size));
@@ -379,7 +380,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
 
 retry:
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	/*
 	 * Invalidate cache if we have more permissive parameters.
 	 * cached_hole_size notes the largest hole noticed _below_
@@ -457,7 +458,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->flags = 0;
 	__insert_vmap_area(va);
 	free_vmap_cache = &va->rb_node;
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -466,7 +467,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	return va;
 
 overflow:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	if (!purged) {
 		purge_vmap_area_lazy();
 		purged = 1;
@@ -541,9 +542,11 @@ static void __free_vmap_area(struct vmap_area *va)
  */
 static void free_vmap_area(struct vmap_area *va)
 {
-	spin_lock(&vmap_area_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	__free_vmap_area(va);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 }
 
 /*
@@ -629,6 +632,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 	struct vmap_area *va;
 	struct vmap_area *n_va;
 	bool do_free = false;
+	unsigned long flags;
 
 	lockdep_assert_held(&vmap_purge_lock);
 
@@ -646,15 +650,17 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 
 	flush_tlb_kernel_range(start, end);
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
 		int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 
 		__free_vmap_area(va);
 		atomic_sub(nr, &vmap_lazy_nr);
-		cond_resched_lock(&vmap_area_lock);
+		spin_unlock_irqrestore(&vmap_area_lock, flags);
+		cond_resched();
+		spin_lock_irqsave(&vmap_area_lock, flags);
 	}
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	return true;
 }
 
@@ -713,10 +719,11 @@ static void free_unmap_vmap_area(struct vmap_area *va)
 static struct vmap_area *find_vmap_area(unsigned long addr)
 {
 	struct vmap_area *va;
+	unsigned long flags;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	va = __find_vmap_area(addr);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	return va;
 }
@@ -1313,14 +1320,16 @@ EXPORT_SYMBOL_GPL(map_vm_area);
 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 			      unsigned long flags, const void *caller)
 {
-	spin_lock(&vmap_area_lock);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&vmap_area_lock, irq_flags);
 	vm->flags = flags;
 	vm->addr = (void *)va->va_start;
 	vm->size = va->va_end - va->va_start;
 	vm->caller = caller;
 	va->vm = vm;
 	va->flags |= VM_VM_AREA;
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, irq_flags);
 }
 
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -1443,11 +1452,12 @@ struct vm_struct *remove_vm_area(const void *addr)
 	va = find_vmap_area((unsigned long)addr);
 	if (va && va->flags & VM_VM_AREA) {
 		struct vm_struct *vm = va->vm;
+		unsigned long flags;
 
-		spin_lock(&vmap_area_lock);
+		spin_lock_irqsave(&vmap_area_lock, flags);
 		va->vm = NULL;
 		va->flags &= ~VM_VM_AREA;
-		spin_unlock(&vmap_area_lock);
+		spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 		vmap_debug_free_range(va->va_start, va->va_end);
 		kasan_free_shadow(vm);
@@ -1858,6 +1868,11 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
+void *vmalloc_gfp(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc_node_flags(size, NUMA_NO_NODE, gfp_mask);
+}
+
 #ifndef PAGE_KERNEL_EXEC
 # define PAGE_KERNEL_EXEC PAGE_KERNEL
 #endif
@@ -2038,12 +2053,13 @@ long vread(char *buf, char *addr, unsigned long count)
 	char *vaddr, *buf_start = buf;
 	unsigned long buflen = count;
 	unsigned long n;
+	unsigned long flags;
 
 	/* Don't allow overflow */
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	list_for_each_entry(va, &vmap_area_list, list) {
 		if (!count)
 			break;
@@ -2075,7 +2091,7 @@ long vread(char *buf, char *addr, unsigned long count)
 		count -= n;
 	}
 finished:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	if (buf == buf_start)
 		return 0;
@@ -2119,13 +2135,14 @@ long vwrite(char *buf, char *addr, unsigned long count)
 	char *vaddr;
 	unsigned long n, buflen;
 	int copied = 0;
+	unsigned long flags;
 
 	/* Don't allow overflow */
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 	buflen = count;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	list_for_each_entry(va, &vmap_area_list, list) {
 		if (!count)
 			break;
@@ -2156,7 +2173,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
 		count -= n;
 	}
 finished:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	if (!copied)
 		return 0;
 	return buflen;
@@ -2416,7 +2433,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
  */
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     const size_t *sizes, int nr_vms,
-				     size_t align)
+				     size_t align, gfp_t gfp_mask)
 {
 	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
 	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2425,6 +2442,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	int area, area2, last_area, term_area;
 	unsigned long base, start, end, last_end;
 	bool purged = false;
+	unsigned long flags;
 
 	/* verify parameters and allocate data structures */
 	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -2458,19 +2476,19 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		return NULL;
 	}
 
-	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
-	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+	vms = kcalloc(nr_vms, sizeof(vms[0]), gfp_mask);
+	vas = kcalloc(nr_vms, sizeof(vas[0]), gfp_mask);
 	if (!vas || !vms)
 		goto err_free2;
 
 	for (area = 0; area < nr_vms; area++) {
-		vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
-		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+		vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
+		vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
 		if (!vas[area] || !vms[area])
 			goto err_free;
 	}
 retry:
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 
 	/* start scanning - we scan from the top, begin with the last area */
 	area = term_area = last_area;
@@ -2492,7 +2510,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		 * comparing.
 		 */
 		if (base + last_end < vmalloc_start + last_end) {
-			spin_unlock(&vmap_area_lock);
+			spin_unlock_irqrestore(&vmap_area_lock, flags);
 			if (!purged) {
 				purge_vmap_area_lazy();
 				purged = true;
@@ -2547,7 +2565,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 
 	vmap_area_pcpu_hole = base + offsets[last_area];
 
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	/* insert all vm's */
 	for (area = 0; area < nr_vms; area++)
@@ -2589,7 +2607,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 static void *s_start(struct seq_file *m, loff_t *pos)
 	__acquires(&vmap_area_lock)
 {
-	spin_lock(&vmap_area_lock);
+	spin_lock_irq(&vmap_area_lock);
 	return seq_list_start(&vmap_area_list, *pos);
 }
 
@@ -2601,7 +2619,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 static void s_stop(struct seq_file *m, void *p)
 	__releases(&vmap_area_lock)
 {
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irq(&vmap_area_lock);
 }
 
 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-- 
2.11.0.483.g087da7b7c-goog

 	for (area = 0; area < nr_vms; area++)
@@ -2589,7 +2607,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 static void *s_start(struct seq_file *m, loff_t *pos)
 	__acquires(&vmap_area_lock)
 {
-	spin_lock(&vmap_area_lock);
+	spin_lock_irq(&vmap_area_lock);
 	return seq_list_start(&vmap_area_list, *pos);
 }
 
@@ -2601,7 +2619,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 static void s_stop(struct seq_file *m, void *p)
 	__releases(&vmap_area_lock)
 {
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irq(&vmap_area_lock);
 }
 
 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-- 
2.11.0.483.g087da7b7c-goog

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: email@kvack.org

^ permalink raw reply related	[flat|nested] 38+ messages in thread
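
One detail of the irqsave conversion worth calling out from the
__purge_vmap_area_lazy() hunk above: cond_resched_lock() only knows how
to drop and retake a plain spin_lock, and rescheduling with interrupts
disabled is not allowed, so the drop/resched/retake sequence has to be
open-coded (a sketch of the pattern, using the names from the hunk):

        spin_unlock_irqrestore(&vmap_area_lock, flags);
        cond_resched();         /* interrupts are enabled again here */
        spin_lock_irqsave(&vmap_area_lock, flags);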

* Re: [PATCH 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-25 21:00 ` Tahsin Erdogan
@ 2017-02-25 23:54   ` kbuild test robot
  -1 siblings, 0 replies; 38+ messages in thread
From: kbuild test robot @ 2017-02-25 23:54 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Andrew Morton,
	Chris Wilson, Andrey Ryabinin, Roman Pen, Joonas Lahtinen,
	Michal Hocko, zijun_hu, Joonsoo Kim, linux-mm, linux-kernel,
	Tahsin Erdogan

[-- Attachment #1: Type: text/plain, Size: 1069 bytes --]

Hi Tahsin,

[auto build test ERROR on mmotm/master]
[also build test ERROR on v4.10 next-20170224]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tahsin-Erdogan/percpu-remove-unused-chunk_alloc-parameter-from-pcpu_get_pages/20170226-052515
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: sh-rsk7269_defconfig (attached as .config)
compiler: sh4-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=sh 

All errors (new ones prefixed by >>):

   mm/built-in.o: In function `pcpu_mem_zalloc':
>> percpu.c:(.text+0x12670): undefined reference to `vmalloc_gfp'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 10840 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread
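
The link failure above is a nommu artifact: v1 adds vmalloc_gfp() only
to mm/vmalloc.c, which is compiled only when CONFIG_MMU is set, so nommu
targets such as this SH configuration cannot resolve the symbol. A stub
along these lines in mm/nommu.c (essentially what v2 below adds) fixes
the build:

        void *vmalloc_gfp(unsigned long size, gfp_t gfp_mask)
        {
                return __vmalloc(size, gfp_mask, PAGE_KERNEL);
        }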

* Re: [PATCH 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-25 21:00 ` Tahsin Erdogan
@ 2017-02-26  0:48   ` kbuild test robot
  -1 siblings, 0 replies; 38+ messages in thread
From: kbuild test robot @ 2017-02-26  0:48 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Andrew Morton,
	Chris Wilson, Andrey Ryabinin, Roman Pen, Joonas Lahtinen,
	Michal Hocko, zijun_hu, Joonsoo Kim, linux-mm, linux-kernel,
	Tahsin Erdogan

[-- Attachment #1: Type: text/plain, Size: 1337 bytes --]

Hi Tahsin,

[auto build test ERROR on mmotm/master]
[also build test ERROR on v4.10 next-20170224]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tahsin-Erdogan/percpu-remove-unused-chunk_alloc-parameter-from-pcpu_get_pages/20170226-052515
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: blackfin-BF561-EZKIT-SMP_defconfig (attached as .config)
compiler: bfin-uclinux-gcc (GCC) 6.2.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=blackfin 

All errors (new ones prefixed by >>):

   mm/built-in.o: In function `pcpu_mem_zalloc':
>> mm/percpu.c:314: undefined reference to `vmalloc_gfp'

vim +314 mm/percpu.c

   308		if (WARN_ON_ONCE(!slab_is_available()))
   309			return NULL;
   310	
   311		if (size <= PAGE_SIZE)
   312			return kzalloc(size, gfp);
   313		else
 > 314			return vmalloc_gfp(size, gfp | __GFP_HIGHMEM | __GFP_ZERO);
   315	}
   316	
   317	/**

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 10736 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread
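
The excerpt above shows where the flag finally lands: pcpu_mem_zalloc()
now receives whatever gfp mask the original percpu caller passed in. As
a hypothetical example (illustrative caller, not from the patch), an
atomic percpu allocation would funnel a non-GFP_KERNEL mask down this
path:

        /* no GFP_KERNEL bits set, so pcpu_alloc() treats this as atomic */
        void __percpu *ctr = __alloc_percpu_gfp(sizeof(u64),
                                                __alignof__(u64),
                                                GFP_NOWAIT);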

* [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-26  0:48   ` kbuild test robot
@ 2017-02-26  4:38     ` Tahsin Erdogan
  -1 siblings, 0 replies; 38+ messages in thread
From: Tahsin Erdogan @ 2017-02-26  4:38 UTC (permalink / raw)
  To: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, Tahsin Erdogan,
	Michal Hocko, zijun_hu, Joonsoo Kim, David Rientjes
  Cc: linux-mm, linux-kernel

When pcpu_alloc() is called with gfp != GFP_KERNEL, the likelihood of
a failure is higher than in the GFP_KERNEL case. This is mainly because
pcpu_alloc() relies on previously allocated reserves and does not make
an effort to add memory to its pools in the non-GFP_KERNEL case.

This issue is somewhat mitigated by kicking off background work when
a memory allocation failure occurs, but that doesn't really help the
original victim of the allocation failure.

This problem affects blkg_lookup_create() callers on machines with a
lot of cpus.

This patch reduces failure cases by trying to expand the memory pools.
It passes the gfp flag along, so it is safe to allocate memory this way.

To make this work, a gfp-flag-aware vmalloc_gfp() function is added.
Also, the locking around vmap_area_lock has been updated to save/restore
irq flags. This was needed to avoid a lockdep inversion between
request_queue->queue_lock and vmap_area_lock.
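
As an illustration of the conversion (a sketch, not a hunk from this
patch), every vmap_area_lock critical section now follows the usual
irqsave pattern:

        unsigned long flags;

        spin_lock_irqsave(&vmap_area_lock, flags);
        /* ... walk or update the vmap area rbtree/list ... */
        spin_unlock_irqrestore(&vmap_area_lock, flags);

The save/restore form is used rather than spin_lock_irq() because these
paths can now be entered with interrupts already disabled, e.g. under an
irq-safe lock such as request_queue->queue_lock.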

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v2:
 added vmalloc_gfp() to mm/nommu.c as well

 include/linux/vmalloc.h |   5 +-
 mm/nommu.c              |   5 ++
 mm/percpu-km.c          |   8 +--
 mm/percpu-vm.c          | 119 +++++++++++-------------------------
 mm/percpu.c             | 156 ++++++++++++++++++++++++++++--------------------
 mm/vmalloc.c            |  74 ++++++++++++++---------
 6 files changed, 184 insertions(+), 183 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index d68edffbf142..8110a0040b9d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -72,6 +72,7 @@ extern void *vzalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
+extern void *vmalloc_gfp(unsigned long size, gfp_t gfp_mask);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
@@ -165,14 +166,14 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 # ifdef CONFIG_MMU
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     const size_t *sizes, int nr_vms,
-				     size_t align);
+				     size_t align, gfp_t gfp_mask);
 
 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
 # else
 static inline struct vm_struct **
 pcpu_get_vm_areas(const unsigned long *offsets,
 		const size_t *sizes, int nr_vms,
-		size_t align)
+		size_t align, gfp_t gfp_mask)
 {
 	return NULL;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index bc964c26be8c..e81d4724ac07 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -359,6 +359,11 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
+void *vmalloc_gfp(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc(size, gfp_mask, PAGE_KERNEL);
+}
+
 #ifndef PAGE_KERNEL_EXEC
 # define PAGE_KERNEL_EXEC PAGE_KERNEL
 #endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d66911ff42d9..599a9ce84544 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -34,7 +34,7 @@
 #include <linux/log2.h>
 
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-			       int page_start, int page_end)
+			       int page_start, int page_end, gfp_t gfp)
 {
 	return 0;
 }
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
 	/* nada */
 }
 
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
 	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
 	struct pcpu_chunk *chunk;
 	struct page *pages;
 	int i;
 
-	chunk = pcpu_alloc_chunk();
+	chunk = pcpu_alloc_chunk(gfp);
 	if (!chunk)
 		return NULL;
 
-	pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+	pages = alloc_pages(gfp, order_base_2(nr_pages));
 	if (!pages) {
 		pcpu_free_chunk(chunk);
 		return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9ac639499bd1..42348a421ccf 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -20,28 +20,6 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
 }
 
 /**
- * pcpu_get_pages - get temp pages array
- *
- * Returns pointer to array of pointers to struct page which can be indexed
- * with pcpu_page_idx().  Note that there is only one array and accesses
- * should be serialized by pcpu_alloc_mutex.
- *
- * RETURNS:
- * Pointer to temp pages array on success.
- */
-static struct page **pcpu_get_pages(void)
-{
-	static struct page **pages;
-	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
-
-	lockdep_assert_held(&pcpu_alloc_mutex);
-
-	if (!pages)
-		pages = pcpu_mem_zalloc(pages_size);
-	return pages;
-}
-
-/**
  * pcpu_free_pages - free pages which were allocated for @chunk
  * @chunk: chunk pages were allocated for
  * @pages: array of pages to be freed, indexed by pcpu_page_idx()
@@ -73,15 +51,16 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
  * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
  * @page_start: page index of the first page to be allocated
  * @page_end: page index of the last page to be allocated + 1
+ * @gfp: gfp flags
  *
  * Allocate pages [@page_start,@page_end) into @pages for all units.
  * The allocation is for @chunk.  Percpu core doesn't care about the
  * content of @pages and will pass it verbatim to pcpu_map_pages().
  */
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
-			    struct page **pages, int page_start, int page_end)
+			    struct page **pages, int page_start, int page_end,
+			    gfp_t gfp)
 {
-	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 	unsigned int cpu, tcpu;
 	int i;
 
@@ -135,38 +114,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
 }
 
 /**
- * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
- * @chunk: chunk of interest
- * @pages: pages array which can be used to pass information to free
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- *
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * Corresponding elements in @pages were cleared by the caller and can
- * be used to carry information to pcpu_free_pages() which will be
- * called after all unmaps are finished.  The caller should call
- * proper pre/post flush functions.
- */
-static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
-			     struct page **pages, int page_start, int page_end)
-{
-	unsigned int cpu;
-	int i;
-
-	for_each_possible_cpu(cpu) {
-		for (i = page_start; i < page_end; i++) {
-			struct page *page;
-
-			page = pcpu_chunk_page(chunk, cpu, i);
-			WARN_ON(!page);
-			pages[pcpu_page_idx(cpu, i)] = page;
-		}
-		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
-				   page_end - page_start);
-	}
-}
-
-/**
  * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
  * @chunk: pcpu_chunk the regions to be flushed belong to
  * @page_start: page index of the first page to be flushed
@@ -262,32 +209,38 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
  * @chunk: chunk of interest
  * @page_start: the start page
  * @page_end: the end page
+ * @gfp: gfp flags
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.
- *
- * CONTEXT:
- * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-			       int page_start, int page_end)
+			       int page_start, int page_end, gfp_t gfp)
 {
 	struct page **pages;
+	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+	int ret;
 
-	pages = pcpu_get_pages();
+	pages = pcpu_mem_zalloc(pages_size, gfp);
 	if (!pages)
 		return -ENOMEM;
 
-	if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
-		return -ENOMEM;
+	if (pcpu_alloc_pages(chunk, pages, page_start, page_end,
+			     gfp | __GFP_HIGHMEM | __GFP_COLD)) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
 		pcpu_free_pages(chunk, pages, page_start, page_end);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 	pcpu_post_map_flush(chunk, page_start, page_end);
-
-	return 0;
+	ret = 0;
+out:
+	pcpu_mem_free(pages);
+	return ret;
 }
 
 /**
@@ -298,44 +251,40 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
  *
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
 				  int page_start, int page_end)
 {
-	struct page **pages;
-
-	/*
-	 * If control reaches here, there must have been at least one
-	 * successful population attempt so the temp pages array must
-	 * be available now.
-	 */
-	pages = pcpu_get_pages();
-	BUG_ON(!pages);
+	unsigned int cpu;
+	int i;
 
-	/* unmap and free */
 	pcpu_pre_unmap_flush(chunk, page_start, page_end);
 
-	pcpu_unmap_pages(chunk, pages, page_start, page_end);
+	for_each_possible_cpu(cpu)
+		for (i = page_start; i < page_end; i++) {
+			struct page *page;
+
+			page = pcpu_chunk_page(chunk, cpu, i);
+			WARN_ON(!page);
 
-	/* no need to flush tlb, vmalloc will handle it lazily */
+			__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, i), 1);
 
-	pcpu_free_pages(chunk, pages, page_start, page_end);
+			if (likely(page))
+				__free_page(page);
+		}
 }
 
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
 	struct pcpu_chunk *chunk;
 	struct vm_struct **vms;
 
-	chunk = pcpu_alloc_chunk();
+	chunk = pcpu_alloc_chunk(gfp);
 	if (!chunk)
 		return NULL;
 
 	vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
-				pcpu_nr_groups, pcpu_atom_size);
+				pcpu_nr_groups, pcpu_atom_size, gfp);
 	if (!vms) {
 		pcpu_free_chunk(chunk);
 		return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 232356a2d914..f2cee0ae8688 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -103,6 +103,11 @@
 #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
 #endif	/* CONFIG_SMP */
 
+#define PCPU_BUSY_EXPAND_MAP		1	/* pcpu_alloc() is expanding
+						 * the map
+						 */
+#define PCPU_BUSY_POPULATE_CHUNK	2	/* chunk is being populated */
+
 struct pcpu_chunk {
 	struct list_head	list;		/* linked to pcpu_slot lists */
 	int			free_size;	/* free bytes in the chunk */
@@ -118,6 +123,7 @@ struct pcpu_chunk {
 	int			first_free;	/* no free below this */
 	bool			immutable;	/* no [de]population allowed */
 	int			nr_populated;	/* # of populated pages */
+	int			busy_flags;	/* type of work in progress */
 	unsigned long		populated[];	/* populated bitmap */
 };
 
@@ -162,7 +168,6 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 
@@ -282,29 +287,31 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
 	     (rs) < (re);						    \
 	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
 
+static bool pcpu_has_unpop_pages(struct pcpu_chunk *chunk, int start, int end)
+{
+	return find_next_zero_bit(chunk->populated, end, start) < end;
+}
+
 /**
  * pcpu_mem_zalloc - allocate memory
  * @size: bytes to allocate
  *
  * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used.  The returned
+ * kzalloc() is used; otherwise, vmalloc_gfp() is used.  The returned
  * memory is always zeroed.
  *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
- *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
 {
 	if (WARN_ON_ONCE(!slab_is_available()))
 		return NULL;
 
 	if (size <= PAGE_SIZE)
-		return kzalloc(size, GFP_KERNEL);
+		return kzalloc(size, gfp);
 	else
-		return vzalloc(size);
+		return vmalloc_gfp(size, gfp | __GFP_HIGHMEM | __GFP_ZERO);
 }
 
 /**
@@ -438,15 +445,14 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc,
+				gfp_t gfp)
 {
 	int *old = NULL, *new = NULL;
 	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
 	unsigned long flags;
 
-	lockdep_assert_held(&pcpu_alloc_mutex);
-
-	new = pcpu_mem_zalloc(new_size);
+	new = pcpu_mem_zalloc(new_size, gfp);
 	if (!new)
 		return -ENOMEM;
 
@@ -716,16 +722,16 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
 	pcpu_chunk_relocate(chunk, oslot);
 }
 
-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
 {
 	struct pcpu_chunk *chunk;
 
-	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
 	if (!chunk)
 		return NULL;
 
 	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
-						sizeof(chunk->map[0]));
+						sizeof(chunk->map[0]), gfp);
 	if (!chunk->map) {
 		pcpu_mem_free(chunk);
 		return NULL;
@@ -811,9 +817,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  * pcpu_addr_to_page		- translate address to physical address
  * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
  */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size,
+			       gfp_t gfp);
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -874,6 +881,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
 	int occ_pages = 0;
 	int slot, off, new_alloc, cpu, ret;
+	int page_start, page_end;
 	unsigned long flags;
 	void __percpu *ptr;
 
@@ -893,9 +901,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		return NULL;
 	}
 
-	if (!is_atomic)
-		mutex_lock(&pcpu_alloc_mutex);
-
 	spin_lock_irqsave(&pcpu_lock, flags);
 
 	/* serve reserved allocations from the reserved chunk if available */
@@ -909,8 +914,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 		while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
 			spin_unlock_irqrestore(&pcpu_lock, flags);
-			if (is_atomic ||
-			    pcpu_extend_area_map(chunk, new_alloc) < 0) {
+			if (pcpu_extend_area_map(chunk, new_alloc, gfp) < 0) {
 				err = "failed to extend area map of reserved chunk";
 				goto fail;
 			}
@@ -933,17 +937,24 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 			if (size > chunk->contig_hint)
 				continue;
 
+			if (chunk->busy_flags & PCPU_BUSY_POPULATE_CHUNK)
+				continue;
+
 			new_alloc = pcpu_need_to_extend(chunk, is_atomic);
 			if (new_alloc) {
-				if (is_atomic)
-					continue;
+				chunk->busy_flags |= PCPU_BUSY_EXPAND_MAP;
 				spin_unlock_irqrestore(&pcpu_lock, flags);
-				if (pcpu_extend_area_map(chunk,
-							 new_alloc) < 0) {
+
+				ret = pcpu_extend_area_map(chunk, new_alloc,
+							   gfp);
+				spin_lock_irqsave(&pcpu_lock, flags);
+				chunk->busy_flags &= ~PCPU_BUSY_EXPAND_MAP;
+				if (ret < 0) {
+					spin_unlock_irqrestore(&pcpu_lock,
+							       flags);
 					err = "failed to extend area map";
 					goto fail;
 				}
-				spin_lock_irqsave(&pcpu_lock, flags);
 				/*
 				 * pcpu_lock has been dropped, need to
 				 * restart cpu_slot list walking.
@@ -953,53 +964,59 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 			off = pcpu_alloc_area(chunk, size, align, is_atomic,
 					      &occ_pages);
+			if (off < 0 && is_atomic) {
+				/* Try non-populated areas. */
+				off = pcpu_alloc_area(chunk, size, align, false,
+						      &occ_pages);
+			}
+
 			if (off >= 0)
 				goto area_found;
 		}
 	}
 
+	WARN_ON(!list_empty(&pcpu_slot[pcpu_nr_slots - 1]));
+
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 
-	/*
-	 * No space left.  Create a new chunk.  We don't want multiple
-	 * tasks to create chunks simultaneously.  Serialize and create iff
-	 * there's still no empty chunk after grabbing the mutex.
-	 */
-	if (is_atomic)
+	chunk = pcpu_create_chunk(gfp);
+	if (!chunk) {
+		err = "failed to allocate new chunk";
 		goto fail;
+	}
 
-	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
-		chunk = pcpu_create_chunk();
-		if (!chunk) {
-			err = "failed to allocate new chunk";
-			goto fail;
-		}
+	spin_lock_irqsave(&pcpu_lock, flags);
 
-		spin_lock_irqsave(&pcpu_lock, flags);
+	/* Check whether someone else added a chunk while the lock was
+	 * dropped.
+	 */
+	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1]))
 		pcpu_chunk_relocate(chunk, -1);
-	} else {
-		spin_lock_irqsave(&pcpu_lock, flags);
-	}
+	else
+		pcpu_destroy_chunk(chunk);
 
 	goto restart;
 
 area_found:
-	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	page_start = PFN_DOWN(off);
+	page_end = PFN_UP(off + size);
 
 	/* populate if not all pages are already there */
-	if (!is_atomic) {
-		int page_start, page_end, rs, re;
+	if (pcpu_has_unpop_pages(chunk, page_start, page_end)) {
+		int rs, re;
 
-		page_start = PFN_DOWN(off);
-		page_end = PFN_UP(off + size);
+		chunk->busy_flags |= PCPU_BUSY_POPULATE_CHUNK;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
 
 		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 			WARN_ON(chunk->immutable);
 
-			ret = pcpu_populate_chunk(chunk, rs, re);
+			ret = pcpu_populate_chunk(chunk, rs, re, gfp);
 
 			spin_lock_irqsave(&pcpu_lock, flags);
 			if (ret) {
+				chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
 				pcpu_free_area(chunk, off, &occ_pages);
 				err = "failed to populate";
 				goto fail_unlock;
@@ -1008,18 +1025,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 			spin_unlock_irqrestore(&pcpu_lock, flags);
 		}
 
-		mutex_unlock(&pcpu_alloc_mutex);
+		spin_lock_irqsave(&pcpu_lock, flags);
+		chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
 	}
 
-	if (chunk != pcpu_reserved_chunk) {
-		spin_lock_irqsave(&pcpu_lock, flags);
+	if (chunk != pcpu_reserved_chunk)
 		pcpu_nr_empty_pop_pages -= occ_pages;
-		spin_unlock_irqrestore(&pcpu_lock, flags);
-	}
 
 	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
 		pcpu_schedule_balance_work();
 
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
 	/* clear the areas and return address relative to base address */
 	for_each_possible_cpu(cpu)
 		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
@@ -1042,8 +1059,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		/* see the flag handling in pcpu_blance_workfn() */
 		pcpu_atomic_alloc_failed = true;
 		pcpu_schedule_balance_work();
-	} else {
-		mutex_unlock(&pcpu_alloc_mutex);
 	}
 	return NULL;
 }
@@ -1118,7 +1133,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
 	 * There's no reason to keep around multiple unused chunks and VM
 	 * areas can be scarce.  Destroy all free chunks except for one.
 	 */
-	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, free_head, list) {
@@ -1128,6 +1142,10 @@ static void pcpu_balance_workfn(struct work_struct *work)
 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;
 
+		if (chunk->busy_flags & (PCPU_BUSY_POPULATE_CHUNK |
+					 PCPU_BUSY_EXPAND_MAP))
+			continue;
+
 		list_del_init(&chunk->map_extend_list);
 		list_move(&chunk->list, &to_free);
 	}
@@ -1162,7 +1180,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
 		spin_unlock_irq(&pcpu_lock);
 
 		if (new_alloc)
-			pcpu_extend_area_map(chunk, new_alloc);
+			pcpu_extend_area_map(chunk, new_alloc, GFP_KERNEL);
 	} while (chunk);
 
 	/*
@@ -1194,20 +1212,29 @@ static void pcpu_balance_workfn(struct work_struct *work)
 
 		spin_lock_irq(&pcpu_lock);
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (chunk->busy_flags & PCPU_BUSY_POPULATE_CHUNK)
+				continue;
 			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
 			if (nr_unpop)
 				break;
 		}
+
+		if (nr_unpop)
+			chunk->busy_flags |= PCPU_BUSY_POPULATE_CHUNK;
+
 		spin_unlock_irq(&pcpu_lock);
 
 		if (!nr_unpop)
 			continue;
 
-		/* @chunk can't go away while pcpu_alloc_mutex is held */
+		/* @chunk can't go away because only pcpu_balance_workfn
+		 * destroys it.
+		 */
 		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
 			int nr = min(re - rs, nr_to_pop);
 
-			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr,
+						  GFP_KERNEL);
 			if (!ret) {
 				nr_to_pop -= nr;
 				spin_lock_irq(&pcpu_lock);
@@ -1220,11 +1247,14 @@ static void pcpu_balance_workfn(struct work_struct *work)
 			if (!nr_to_pop)
 				break;
 		}
+		spin_lock_irq(&pcpu_lock);
+		chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
+		spin_unlock_irq(&pcpu_lock);
 	}
 
 	if (nr_to_pop) {
 		/* ran out of chunks to populate, create a new one and retry */
-		chunk = pcpu_create_chunk();
+		chunk = pcpu_create_chunk(GFP_KERNEL);
 		if (chunk) {
 			spin_lock_irq(&pcpu_lock);
 			pcpu_chunk_relocate(chunk, -1);
@@ -1232,8 +1262,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
 			goto retry_pop;
 		}
 	}
-
-	mutex_unlock(&pcpu_alloc_mutex);
 }
 
 /**
@@ -2297,7 +2325,7 @@ void __init percpu_init_late(void)
 
 		BUILD_BUG_ON(size > PAGE_SIZE);
 
-		map = pcpu_mem_zalloc(size);
+		map = pcpu_mem_zalloc(size, GFP_KERNEL);
 		BUG_ON(!map);
 
 		spin_lock_irqsave(&pcpu_lock, flags);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d89034a393f2..01abc9ed5224 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -360,6 +360,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	unsigned long addr;
 	int purged = 0;
 	struct vmap_area *first;
+	unsigned long flags;
 
 	BUG_ON(!size);
 	BUG_ON(offset_in_page(size));
@@ -379,7 +380,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
 
 retry:
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	/*
 	 * Invalidate cache if we have more permissive parameters.
 	 * cached_hole_size notes the largest hole noticed _below_
@@ -457,7 +458,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->flags = 0;
 	__insert_vmap_area(va);
 	free_vmap_cache = &va->rb_node;
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -466,7 +467,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	return va;
 
 overflow:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	if (!purged) {
 		purge_vmap_area_lazy();
 		purged = 1;
@@ -541,9 +542,11 @@ static void __free_vmap_area(struct vmap_area *va)
  */
 static void free_vmap_area(struct vmap_area *va)
 {
-	spin_lock(&vmap_area_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	__free_vmap_area(va);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 }
 
 /*
@@ -629,6 +632,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 	struct vmap_area *va;
 	struct vmap_area *n_va;
 	bool do_free = false;
+	unsigned long flags;
 
 	lockdep_assert_held(&vmap_purge_lock);
 
@@ -646,15 +650,17 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 
 	flush_tlb_kernel_range(start, end);
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
 		int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 
 		__free_vmap_area(va);
 		atomic_sub(nr, &vmap_lazy_nr);
-		cond_resched_lock(&vmap_area_lock);
+		spin_unlock_irqrestore(&vmap_area_lock, flags);
+		cond_resched();
+		spin_lock_irqsave(&vmap_area_lock, flags);
 	}
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	return true;
 }
 
@@ -713,10 +719,11 @@ static void free_unmap_vmap_area(struct vmap_area *va)
 static struct vmap_area *find_vmap_area(unsigned long addr)
 {
 	struct vmap_area *va;
+	unsigned long flags;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	va = __find_vmap_area(addr);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	return va;
 }
@@ -1313,14 +1320,16 @@ EXPORT_SYMBOL_GPL(map_vm_area);
 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 			      unsigned long flags, const void *caller)
 {
-	spin_lock(&vmap_area_lock);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&vmap_area_lock, irq_flags);
 	vm->flags = flags;
 	vm->addr = (void *)va->va_start;
 	vm->size = va->va_end - va->va_start;
 	vm->caller = caller;
 	va->vm = vm;
 	va->flags |= VM_VM_AREA;
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, irq_flags);
 }
 
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -1443,11 +1452,12 @@ struct vm_struct *remove_vm_area(const void *addr)
 	va = find_vmap_area((unsigned long)addr);
 	if (va && va->flags & VM_VM_AREA) {
 		struct vm_struct *vm = va->vm;
+		unsigned long flags;
 
-		spin_lock(&vmap_area_lock);
+		spin_lock_irqsave(&vmap_area_lock, flags);
 		va->vm = NULL;
 		va->flags &= ~VM_VM_AREA;
-		spin_unlock(&vmap_area_lock);
+		spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 		vmap_debug_free_range(va->va_start, va->va_end);
 		kasan_free_shadow(vm);
@@ -1858,6 +1868,11 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
+void *vmalloc_gfp(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc_node_flags(size, NUMA_NO_NODE, gfp_mask);
+}
+
 #ifndef PAGE_KERNEL_EXEC
 # define PAGE_KERNEL_EXEC PAGE_KERNEL
 #endif
@@ -2038,12 +2053,13 @@ long vread(char *buf, char *addr, unsigned long count)
 	char *vaddr, *buf_start = buf;
 	unsigned long buflen = count;
 	unsigned long n;
+	unsigned long flags;
 
 	/* Don't allow overflow */
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	list_for_each_entry(va, &vmap_area_list, list) {
 		if (!count)
 			break;
@@ -2075,7 +2091,7 @@ long vread(char *buf, char *addr, unsigned long count)
 		count -= n;
 	}
 finished:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	if (buf == buf_start)
 		return 0;
@@ -2119,13 +2135,14 @@ long vwrite(char *buf, char *addr, unsigned long count)
 	char *vaddr;
 	unsigned long n, buflen;
 	int copied = 0;
+	unsigned long flags;
 
 	/* Don't allow overflow */
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 	buflen = count;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	list_for_each_entry(va, &vmap_area_list, list) {
 		if (!count)
 			break;
@@ -2156,7 +2173,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
 		count -= n;
 	}
 finished:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	if (!copied)
 		return 0;
 	return buflen;
@@ -2416,7 +2433,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
  */
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     const size_t *sizes, int nr_vms,
-				     size_t align)
+				     size_t align, gfp_t gfp_mask)
 {
 	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
 	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2425,6 +2442,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	int area, area2, last_area, term_area;
 	unsigned long base, start, end, last_end;
 	bool purged = false;
+	unsigned long flags;
 
 	/* verify parameters and allocate data structures */
 	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -2458,19 +2476,19 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		return NULL;
 	}
 
-	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
-	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+	vms = kcalloc(nr_vms, sizeof(vms[0]), gfp_mask);
+	vas = kcalloc(nr_vms, sizeof(vas[0]), gfp_mask);
 	if (!vas || !vms)
 		goto err_free2;
 
 	for (area = 0; area < nr_vms; area++) {
-		vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
-		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+		vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
+		vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
 		if (!vas[area] || !vms[area])
 			goto err_free;
 	}
 retry:
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 
 	/* start scanning - we scan from the top, begin with the last area */
 	area = term_area = last_area;
@@ -2492,7 +2510,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		 * comparing.
 		 */
 		if (base + last_end < vmalloc_start + last_end) {
-			spin_unlock(&vmap_area_lock);
+			spin_unlock_irqrestore(&vmap_area_lock, flags);
 			if (!purged) {
 				purge_vmap_area_lazy();
 				purged = true;
@@ -2547,7 +2565,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 
 	vmap_area_pcpu_hole = base + offsets[last_area];
 
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	/* insert all vm's */
 	for (area = 0; area < nr_vms; area++)
@@ -2589,7 +2607,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 static void *s_start(struct seq_file *m, loff_t *pos)
 	__acquires(&vmap_area_lock)
 {
-	spin_lock(&vmap_area_lock);
+	spin_lock_irq(&vmap_area_lock);
 	return seq_list_start(&vmap_area_list, *pos);
 }
 
@@ -2601,7 +2619,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 static void s_stop(struct seq_file *m, void *p)
 	__releases(&vmap_area_lock)
 {
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irq(&vmap_area_lock);
 }
 
 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-- 
2.11.0.483.g087da7b7c-goog

^ permalink raw reply related	[flat|nested] 38+ messages in thread
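
With pcpu_alloc_mutex gone, v2 relies on the new busy_flags bits to keep
pcpu_balance_workfn() from destroying or repopulating a chunk that an
allocator is still working on. Schematically (condensed from the
pcpu_alloc() hunks above, not a verbatim excerpt):

        spin_lock_irqsave(&pcpu_lock, flags);
        chunk->busy_flags |= PCPU_BUSY_POPULATE_CHUNK;  /* claim the chunk */
        spin_unlock_irqrestore(&pcpu_lock, flags);

        ret = pcpu_populate_chunk(chunk, rs, re, gfp);  /* may allocate */

        spin_lock_irqsave(&pcpu_lock, flags);
        chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK; /* release the claim */
        spin_unlock_irqrestore(&pcpu_lock, flags);

pcpu_balance_workfn() skips any chunk with a busy bit set, which
replaces the mutual exclusion the mutex used to provide.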

-			goto fail;
-		}
+	spin_lock_irqsave(&pcpu_lock, flags);
 
-		spin_lock_irqsave(&pcpu_lock, flags);
+	/* Check whether someone else added a chunk while the lock was
+	 * dropped.
+	 */
+	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1]))
 		pcpu_chunk_relocate(chunk, -1);
-	} else {
-		spin_lock_irqsave(&pcpu_lock, flags);
-	}
+	else
+		pcpu_destroy_chunk(chunk);
 
 	goto restart;
 
 area_found:
-	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	page_start = PFN_DOWN(off);
+	page_end = PFN_UP(off + size);
 
 	/* populate if not all pages are already there */
-	if (!is_atomic) {
-		int page_start, page_end, rs, re;
+	if (pcpu_has_unpop_pages(chunk, page_start, page_end)) {
+		int rs, re;
 
-		page_start = PFN_DOWN(off);
-		page_end = PFN_UP(off + size);
+		chunk->busy_flags |= PCPU_BUSY_POPULATE_CHUNK;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
 
 		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 			WARN_ON(chunk->immutable);
 
-			ret = pcpu_populate_chunk(chunk, rs, re);
+			ret = pcpu_populate_chunk(chunk, rs, re, gfp);
 
 			spin_lock_irqsave(&pcpu_lock, flags);
 			if (ret) {
+				chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
 				pcpu_free_area(chunk, off, &occ_pages);
 				err = "failed to populate";
 				goto fail_unlock;
@@ -1008,18 +1025,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 			spin_unlock_irqrestore(&pcpu_lock, flags);
 		}
 
-		mutex_unlock(&pcpu_alloc_mutex);
+		spin_lock_irqsave(&pcpu_lock, flags);
+		chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
 	}
 
-	if (chunk != pcpu_reserved_chunk) {
-		spin_lock_irqsave(&pcpu_lock, flags);
+	if (chunk != pcpu_reserved_chunk)
 		pcpu_nr_empty_pop_pages -= occ_pages;
-		spin_unlock_irqrestore(&pcpu_lock, flags);
-	}
 
 	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
 		pcpu_schedule_balance_work();
 
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
 	/* clear the areas and return address relative to base address */
 	for_each_possible_cpu(cpu)
 		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
@@ -1042,8 +1059,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		/* see the flag handling in pcpu_blance_workfn() */
 		pcpu_atomic_alloc_failed = true;
 		pcpu_schedule_balance_work();
-	} else {
-		mutex_unlock(&pcpu_alloc_mutex);
 	}
 	return NULL;
 }
@@ -1118,7 +1133,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
 	 * There's no reason to keep around multiple unused chunks and VM
 	 * areas can be scarce.  Destroy all free chunks except for one.
 	 */
-	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, free_head, list) {
@@ -1128,6 +1142,10 @@ static void pcpu_balance_workfn(struct work_struct *work)
 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;
 
+		if (chunk->busy_flags & (PCPU_BUSY_POPULATE_CHUNK |
+					 PCPU_BUSY_EXPAND_MAP))
+			continue;
+
 		list_del_init(&chunk->map_extend_list);
 		list_move(&chunk->list, &to_free);
 	}
@@ -1162,7 +1180,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
 		spin_unlock_irq(&pcpu_lock);
 
 		if (new_alloc)
-			pcpu_extend_area_map(chunk, new_alloc);
+			pcpu_extend_area_map(chunk, new_alloc, GFP_KERNEL);
 	} while (chunk);
 
 	/*
@@ -1194,20 +1212,29 @@ static void pcpu_balance_workfn(struct work_struct *work)
 
 		spin_lock_irq(&pcpu_lock);
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (chunk->busy_flags & PCPU_BUSY_POPULATE_CHUNK)
+				continue;
 			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
 			if (nr_unpop)
 				break;
 		}
+
+		if (nr_unpop)
+			chunk->busy_flags |= PCPU_BUSY_POPULATE_CHUNK;
+
 		spin_unlock_irq(&pcpu_lock);
 
 		if (!nr_unpop)
 			continue;
 
-		/* @chunk can't go away while pcpu_alloc_mutex is held */
+		/* @chunk can't go away because only pcpu_balance_workfn
+		 * destroys it.
+		 */
 		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
 			int nr = min(re - rs, nr_to_pop);
 
-			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr,
+						  GFP_KERNEL);
 			if (!ret) {
 				nr_to_pop -= nr;
 				spin_lock_irq(&pcpu_lock);
@@ -1220,11 +1247,14 @@ static void pcpu_balance_workfn(struct work_struct *work)
 			if (!nr_to_pop)
 				break;
 		}
+		spin_lock_irq(&pcpu_lock);
+		chunk->busy_flags &= ~PCPU_BUSY_POPULATE_CHUNK;
+		spin_unlock_irq(&pcpu_lock);
 	}
 
 	if (nr_to_pop) {
 		/* ran out of chunks to populate, create a new one and retry */
-		chunk = pcpu_create_chunk();
+		chunk = pcpu_create_chunk(GFP_KERNEL);
 		if (chunk) {
 			spin_lock_irq(&pcpu_lock);
 			pcpu_chunk_relocate(chunk, -1);
@@ -1232,8 +1262,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
 			goto retry_pop;
 		}
 	}
-
-	mutex_unlock(&pcpu_alloc_mutex);
 }
 
 /**
@@ -2297,7 +2325,7 @@ void __init percpu_init_late(void)
 
 		BUILD_BUG_ON(size > PAGE_SIZE);
 
-		map = pcpu_mem_zalloc(size);
+		map = pcpu_mem_zalloc(size, GFP_KERNEL);
 		BUG_ON(!map);
 
 		spin_lock_irqsave(&pcpu_lock, flags);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d89034a393f2..01abc9ed5224 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -360,6 +360,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	unsigned long addr;
 	int purged = 0;
 	struct vmap_area *first;
+	unsigned long flags;
 
 	BUG_ON(!size);
 	BUG_ON(offset_in_page(size));
@@ -379,7 +380,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
 
 retry:
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	/*
 	 * Invalidate cache if we have more permissive parameters.
 	 * cached_hole_size notes the largest hole noticed _below_
@@ -457,7 +458,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->flags = 0;
 	__insert_vmap_area(va);
 	free_vmap_cache = &va->rb_node;
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -466,7 +467,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	return va;
 
 overflow:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	if (!purged) {
 		purge_vmap_area_lazy();
 		purged = 1;
@@ -541,9 +542,11 @@ static void __free_vmap_area(struct vmap_area *va)
  */
 static void free_vmap_area(struct vmap_area *va)
 {
-	spin_lock(&vmap_area_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	__free_vmap_area(va);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 }
 
 /*
@@ -629,6 +632,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 	struct vmap_area *va;
 	struct vmap_area *n_va;
 	bool do_free = false;
+	unsigned long flags;
 
 	lockdep_assert_held(&vmap_purge_lock);
 
@@ -646,15 +650,17 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 
 	flush_tlb_kernel_range(start, end);
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
 		int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 
 		__free_vmap_area(va);
 		atomic_sub(nr, &vmap_lazy_nr);
-		cond_resched_lock(&vmap_area_lock);
+		spin_unlock_irqrestore(&vmap_area_lock, flags);
+		cond_resched();
+		spin_lock_irqsave(&vmap_area_lock, flags);
 	}
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	return true;
 }
 
@@ -713,10 +719,11 @@ static void free_unmap_vmap_area(struct vmap_area *va)
 static struct vmap_area *find_vmap_area(unsigned long addr)
 {
 	struct vmap_area *va;
+	unsigned long flags;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	va = __find_vmap_area(addr);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	return va;
 }
@@ -1313,14 +1320,16 @@ EXPORT_SYMBOL_GPL(map_vm_area);
 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 			      unsigned long flags, const void *caller)
 {
-	spin_lock(&vmap_area_lock);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&vmap_area_lock, irq_flags);
 	vm->flags = flags;
 	vm->addr = (void *)va->va_start;
 	vm->size = va->va_end - va->va_start;
 	vm->caller = caller;
 	va->vm = vm;
 	va->flags |= VM_VM_AREA;
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, irq_flags);
 }
 
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -1443,11 +1452,12 @@ struct vm_struct *remove_vm_area(const void *addr)
 	va = find_vmap_area((unsigned long)addr);
 	if (va && va->flags & VM_VM_AREA) {
 		struct vm_struct *vm = va->vm;
+		unsigned long flags;
 
-		spin_lock(&vmap_area_lock);
+		spin_lock_irqsave(&vmap_area_lock, flags);
 		va->vm = NULL;
 		va->flags &= ~VM_VM_AREA;
-		spin_unlock(&vmap_area_lock);
+		spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 		vmap_debug_free_range(va->va_start, va->va_end);
 		kasan_free_shadow(vm);
@@ -1858,6 +1868,11 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
+void *vmalloc_gfp(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc_node_flags(size, NUMA_NO_NODE, gfp_mask);
+}
+
 #ifndef PAGE_KERNEL_EXEC
 # define PAGE_KERNEL_EXEC PAGE_KERNEL
 #endif
@@ -2038,12 +2053,13 @@ long vread(char *buf, char *addr, unsigned long count)
 	char *vaddr, *buf_start = buf;
 	unsigned long buflen = count;
 	unsigned long n;
+	unsigned long flags;
 
 	/* Don't allow overflow */
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	list_for_each_entry(va, &vmap_area_list, list) {
 		if (!count)
 			break;
@@ -2075,7 +2091,7 @@ long vread(char *buf, char *addr, unsigned long count)
 		count -= n;
 	}
 finished:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	if (buf == buf_start)
 		return 0;
@@ -2119,13 +2135,14 @@ long vwrite(char *buf, char *addr, unsigned long count)
 	char *vaddr;
 	unsigned long n, buflen;
 	int copied = 0;
+	unsigned long flags;
 
 	/* Don't allow overflow */
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 	buflen = count;
 
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 	list_for_each_entry(va, &vmap_area_list, list) {
 		if (!count)
 			break;
@@ -2156,7 +2173,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
 		count -= n;
 	}
 finished:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 	if (!copied)
 		return 0;
 	return buflen;
@@ -2416,7 +2433,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
  */
 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     const size_t *sizes, int nr_vms,
-				     size_t align)
+				     size_t align, gfp_t gfp_mask)
 {
 	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
 	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2425,6 +2442,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	int area, area2, last_area, term_area;
 	unsigned long base, start, end, last_end;
 	bool purged = false;
+	unsigned long flags;
 
 	/* verify parameters and allocate data structures */
 	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -2458,19 +2476,19 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		return NULL;
 	}
 
-	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
-	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+	vms = kcalloc(nr_vms, sizeof(vms[0]), gfp_mask);
+	vas = kcalloc(nr_vms, sizeof(vas[0]), gfp_mask);
 	if (!vas || !vms)
 		goto err_free2;
 
 	for (area = 0; area < nr_vms; area++) {
-		vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
-		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+		vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
+		vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
 		if (!vas[area] || !vms[area])
 			goto err_free;
 	}
 retry:
-	spin_lock(&vmap_area_lock);
+	spin_lock_irqsave(&vmap_area_lock, flags);
 
 	/* start scanning - we scan from the top, begin with the last area */
 	area = term_area = last_area;
@@ -2492,7 +2510,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		 * comparing.
 		 */
 		if (base + last_end < vmalloc_start + last_end) {
-			spin_unlock(&vmap_area_lock);
+			spin_unlock_irqrestore(&vmap_area_lock, flags);
 			if (!purged) {
 				purge_vmap_area_lazy();
 				purged = true;
@@ -2547,7 +2565,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 
 	vmap_area_pcpu_hole = base + offsets[last_area];
 
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irqrestore(&vmap_area_lock, flags);
 
 	/* insert all vm's */
 	for (area = 0; area < nr_vms; area++)
@@ -2589,7 +2607,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 static void *s_start(struct seq_file *m, loff_t *pos)
 	__acquires(&vmap_area_lock)
 {
-	spin_lock(&vmap_area_lock);
+	spin_lock_irq(&vmap_area_lock);
 	return seq_list_start(&vmap_area_list, *pos);
 }
 
@@ -2601,7 +2619,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 static void s_stop(struct seq_file *m, void *p)
 	__releases(&vmap_area_lock)
 {
-	spin_unlock(&vmap_area_lock);
+	spin_unlock_irq(&vmap_area_lock);
 }
 
 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-- 
2.11.0.483.g087da7b7c-goog

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-26  4:38     ` Tahsin Erdogan
@ 2017-02-27  9:52       ` Michal Hocko
  -1 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2017-02-27  9:52 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Sat 25-02-17 20:38:29, Tahsin Erdogan wrote:
> When pcpu_alloc() is called with gfp != GFP_KERNEL, the likelihood of
> a failure is higher than GFP_KERNEL case. This is mainly because
> pcpu_alloc() relies on previously allocated reserves and does not make
> an effort to add memory to its pools for non-GFP_KERNEL case.

Who is going to use a different mask?
 
> This issue is somewhat mitigated by kicking off a background work when
> a memory allocation failure occurs. But this doesn't really help the
> original victim of allocation failure.
> 
> This problem affects blkg_lookup_create() callers on machines with a
> lot of cpus.
> 
> This patch reduces failure cases by trying to expand the memory pools.
> It passes along gfp flag so it is safe to allocate memory this way.
> 
> To make this work, a gfp flag aware vmalloc_gfp() function is added.
> Also, locking around vmap_area_lock has been updated to save/restore
> irq flags. This was needed to avoid a lockdep problem between
> request_queue->queue_lock and vmap_area_lock.

We already have __vmalloc_gfp, so why can't this be used? Also note that
vmalloc doesn't really support arbitrary gfp flags. One has to be really
careful because there are some internal allocations which are hardcoded
GFP_KERNEL. Also this patch doesn't really add any new callers so it is
hard to tell whether what you do actually makes sense and is correct.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27  9:52       ` Michal Hocko
@ 2017-02-27 13:00         ` Tahsin Erdogan
  -1 siblings, 0 replies; 38+ messages in thread
From: Tahsin Erdogan @ 2017-02-27 13:00 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon, Feb 27, 2017 at 1:52 AM, Michal Hocko <mhocko@kernel.org> wrote:
> On Sat 25-02-17 20:38:29, Tahsin Erdogan wrote:
>> When pcpu_alloc() is called with gfp != GFP_KERNEL, the likelihood of
>> a failure is higher than GFP_KERNEL case. This is mainly because
>> pcpu_alloc() relies on previously allocated reserves and does not make
>> an effort to add memory to its pools for non-GFP_KERNEL case.
>
> Who is going to use a different mask?

blkg_create() makes a call with a non-GFP_KERNEL mask:
   new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);

which turns into a call stack like below:

__vmalloc+0x45/0x50
pcpu_mem_zalloc+0x50/0x80
pcpu_populate_chunk+0x3b/0x380
pcpu_alloc+0x588/0x6e0
__alloc_percpu_gfp+0xd/0x10
__percpu_counter_init+0x55/0xc0
blkg_alloc+0x76/0x230
blkg_create+0x489/0x670
blkg_lookup_create+0x9a/0x230
generic_make_request_checks+0x7dd/0x890
generic_make_request+0x1f/0x180
submit_bio+0x61/0x120


> We already have __vmalloc_gfp, so why can't this be used? Also note that
> vmalloc doesn't really support arbitrary gfp flags. One has to be really
> careful because there are some internal allocations which are hardcoded
> GFP_KERNEL. Also this patch doesn't really add any new callers so it is
> hard to tell whether what you do actually makes sense and is correct.

Did you mean to say __vmalloc? If so, yes, I should use that.

By the way, I now noticed the might_sleep() in alloc_vmap_area() which makes
it unsafe to call vmalloc* in GFP_ATOMIC contexts. It was added recently:

commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem as
potentially sleeping")

Any suggestions on how to deal with that? For instance, would it be
safe to replace it with:

might_sleep_if(gfpflags_allow_blocking(gfp_mask));

and then skip purge_vmap_area_lazy() if blocking is not allowed?
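
To make this concrete, here is a rough sketch of what I mean in
alloc_vmap_area() (gfp_mask is already a parameter there; the
irqsave/irqrestore form assumes this series):

	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
	...
overflow:
	spin_unlock_irqrestore(&vmap_area_lock, flags);
	if (!purged && gfpflags_allow_blocking(gfp_mask)) {
		/* only callers that are allowed to block purge and retry */
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}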

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 13:00         ` Tahsin Erdogan
@ 2017-02-27 15:25           ` Michal Hocko
  -1 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2017-02-27 15:25 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon 27-02-17 05:00:31, Tahsin Erdogan wrote:
> On Mon, Feb 27, 2017 at 1:52 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > On Sat 25-02-17 20:38:29, Tahsin Erdogan wrote:
> >> When pcpu_alloc() is called with gfp != GFP_KERNEL, the likelihood of
> >> a failure is higher than GFP_KERNEL case. This is mainly because
> >> pcpu_alloc() relies on previously allocated reserves and does not make
> >> an effort to add memory to its pools for non-GFP_KERNEL case.
> >
> > Who is going to use a different mask?
> 
> blkg_create() makes a call with a non-GFP_KERNEL mask:
>    new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
> 
> which turns into a call stack like below:
> 
> __vmalloc+0x45/0x50
> pcpu_mem_zalloc+0x50/0x80
> pcpu_populate_chunk+0x3b/0x380
> pcpu_alloc+0x588/0x6e0
> __alloc_percpu_gfp+0xd/0x10
> __percpu_counter_init+0x55/0xc0
> blkg_alloc+0x76/0x230
> blkg_create+0x489/0x670
> blkg_lookup_create+0x9a/0x230
> generic_make_request_checks+0x7dd/0x890
> generic_make_request+0x1f/0x180
> submit_bio+0x61/0x120

OK, I see. Thanks for the clarification. I am not all that familiar with
the pcpu allocator, but we have
	/*
	 * No space left.  Create a new chunk.  We don't want multiple
	 * tasks to create chunks simultaneously.  Serialize and create iff
	 * there's still no empty chunk after grabbing the mutex.
	 */
	if (is_atomic)
		goto fail;

right before pcpu_populate_chunk so is this actually a problem?

> > We already have __vmalloc_gfp, so why can't this be used? Also note that
> > vmalloc doesn't really support arbitrary gfp flags. One has to be really
> > careful because there are some internal allocations which are hardcoded
> > GFP_KERNEL. Also this patch doesn't really add any new callers so it is
> > hard to tell whether what you do actually makes sense and is correct.
> 
> Did you mean to say __vmalloc? If so, yes, I should use that.

yeah

> By the way, I now noticed the might_sleep() in alloc_vmap_area() which makes
> it unsafe to call vmalloc* in GFP_ATOMIC contexts. It was added recently:

Do we call alloc_vmap_area from true atomic contexts (aka from under
spinlocks etc.)? I thought this was a no-go, and that GFP_NOWAIT and
GFP_ATOMIC were more about optimistic requests and access to memory
reserves, respectively, rather than true atomicity requirements.
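
(For reference, the include/linux/gfp.h definitions, as far as I
remember them:

	#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
	#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)

i.e. neither allows direct reclaim, and GFP_ATOMIC additionally grants
access to the reserves via __GFP_HIGH.)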

> commit 5803ed292e63 ("mm: mark all calls into the vmalloc subsystem as
> potentially sleeping")
> 
> Any suggestions on how to deal with that? For instance, would it be
> safe to replace it with:
> 
> might_sleep_if(gfpflags_allow_blocking(gfp_mask));
> 
> and then skip purge_vmap_area_lazy() if blocking is not allowed?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 15:25           ` Michal Hocko
@ 2017-02-27 17:01             ` Tahsin Erdogan
  -1 siblings, 0 replies; 38+ messages in thread
From: Tahsin Erdogan @ 2017-02-27 17:01 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon, Feb 27, 2017 at 7:25 AM, Michal Hocko <mhocko@kernel.org> wrote:
>         /*
>          * No space left.  Create a new chunk.  We don't want multiple
>          * tasks to create chunks simultaneously.  Serialize and create iff
>          * there's still no empty chunk after grabbing the mutex.
>          */
>         if (is_atomic)
>                 goto fail;
>
> right before pcpu_populate_chunk so is this actually a problem?

Yes, this prevents adding more pcpu chunks and so causes "atomic" allocations
to fail more easily.

>> By the way, I now noticed the might_sleep() in alloc_vmap_area() which makes
>> it unsafe to call vmalloc* in GFP_ATOMIC contexts. It was added recently:
>
> Do we call alloc_vmap_area from true atomic contexts (aka from under
> spinlocks etc)? I thought this was a nogo and GFP_NOWAIT resp.
> GFP_ATOMIC was more about optimistic request resp. access to memory
> reserves rather than true atomicity requirements.

In the call path that I am trying to fix, the caller uses the GFP_NOWAIT
mask. The caller is holding a spinlock (request_queue->queue_lock), so we
can't afford to sleep.
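
The calling context looks roughly like this (simplified from the
blk-cgroup code, not the literal source):

	spin_lock_irq(q->queue_lock);
	/* ends up in blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN) */
	blkg = blkg_lookup_create(blkcg, q);
	spin_unlock_irq(q->queue_lock);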

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 17:01             ` Tahsin Erdogan
@ 2017-02-27 17:07               ` Michal Hocko
  -1 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2017-02-27 17:07 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon 27-02-17 09:01:09, Tahsin Erdogan wrote:
> On Mon, Feb 27, 2017 at 7:25 AM, Michal Hocko <mhocko@kernel.org> wrote:
> >         /*
> >          * No space left.  Create a new chunk.  We don't want multiple
> >          * tasks to create chunks simultaneously.  Serialize and create iff
> >          * there's still no empty chunk after grabbing the mutex.
> >          */
> >         if (is_atomic)
> >                 goto fail;
> >
> > right before pcpu_populate_chunk so is this actually a problem?
> 
> Yes, this prevents adding more pcpu chunks and so causes "atomic" allocations
> to fail more easily.

Then I fail to see what is the problem you are trying to fix.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 17:07               ` Michal Hocko
@ 2017-02-27 17:14                 ` Michal Hocko
  -1 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2017-02-27 17:14 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon 27-02-17 18:07:53, Michal Hocko wrote:
> On Mon 27-02-17 09:01:09, Tahsin Erdogan wrote:
> > On Mon, Feb 27, 2017 at 7:25 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > >         /*
> > >          * No space left.  Create a new chunk.  We don't want multiple
> > >          * tasks to create chunks simultaneously.  Serialize and create iff
> > >          * there's still no empty chunk after grabbing the mutex.
> > >          */
> > >         if (is_atomic)
> > >                 goto fail;
> > >
> > > right before pcpu_populate_chunk so is this actually a problem?
> > 
> > Yes, this prevents adding more pcpu chunks and so causes "atomic" allocations
> > to fail more easily.
> 
> Then I fail to see what is the problem you are trying to fix.

To be more specific: could you describe what more we can do in the
vmalloc layer for GFP_NOWAIT allocations? They certainly cannot sleep
and cannot perform reclaim, so you have to rely on the background
work.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 17:14                 ` Michal Hocko
@ 2017-02-27 19:32                   ` Tahsin Erdogan
  -1 siblings, 0 replies; 38+ messages in thread
From: Tahsin Erdogan @ 2017-02-27 19:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

>> >
>> > Yes, this prevents adding more pcpu chunks and so causes "atomic" allocations
>> > to fail more easily.
>>
>> Then I fail to see what is the problem you are trying to fix.
>
> To be more specific: could you describe what more we can do in the
> vmalloc layer for GFP_NOWAIT allocations? They certainly cannot sleep
> and cannot perform reclaim, so you have to rely on the background
> work.

The main problem that I am trying to fix is in the percpu.c code. It
currently doesn't even attempt to call vmalloc() for the GFP_NOWAIT
case. It solely relies on the background allocator to replenish the
reserves. I would like percpu.c to call __vmalloc(GFP_NOWAIT) inline and
see whether that succeeds. If that fails, it is fair to fail the call.

For this to work, __vmalloc() should be ready to serve a caller that is
holding a spinlock. The might_sleep() in alloc_vmap_area() basically
prevents us from calling vmalloc in this context.
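
A minimal sketch of what I want pcpu_mem_zalloc() to do, assuming
__vmalloc() becomes safe to call under a spinlock (gfp here is whatever
the original percpu caller passed in):

	if (size <= PAGE_SIZE)
		return kzalloc(size, gfp);
	/*
	 * Try the vmalloc inline even for GFP_NOWAIT instead of relying
	 * solely on the background worker; if it fails, the percpu
	 * allocation simply fails.
	 */
	return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);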

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 19:32                   ` Tahsin Erdogan
@ 2017-02-27 19:47                     ` Michal Hocko
  -1 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2017-02-27 19:47 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Tejun Heo, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon 27-02-17 11:32:50, Tahsin Erdogan wrote:
> >> >
> >> > Yes, this prevents adding more pcpu chunks and so causes "atomic" allocations
> >> > to fail more easily.
> >>
> >> Then I fail to see what is the problem you are trying to fix.
> >
> > To be more specific: could you describe what more we can do in the
> > vmalloc layer for GFP_NOWAIT allocations? They certainly cannot sleep
> > and cannot perform reclaim, so you have to rely on the background
> > work.
> 
> The main problem that I am trying to fix is in the percpu.c code. It
> currently doesn't even attempt to call vmalloc() for the GFP_NOWAIT
> case. It solely relies on the background allocator to replenish the
> reserves. I would like percpu.c to call __vmalloc(GFP_NOWAIT) inline
> and see whether that succeeds. If that fails, it is fair to fail the
> call.

OK, that wasn't really clear to me from the patch. I guess it would be
much easier if a preparatory patch did the gfp mask propagation and a
second patch then changed the pcpu allocator the way you need.
 
> For this to work, __vmalloc() should be ready to serve a caller
> that is holding a spinlock. The might_sleep() in alloc_vmap_area()
> basically prevents us from calling vmalloc in this context.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 13:00         ` Tahsin Erdogan
@ 2017-02-27 19:51           ` Tejun Heo
  -1 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2017-02-27 19:51 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Michal Hocko, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

Hello,

On Mon, Feb 27, 2017 at 05:00:31AM -0800, Tahsin Erdogan wrote:
> On Mon, Feb 27, 2017 at 1:52 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > On Sat 25-02-17 20:38:29, Tahsin Erdogan wrote:
> >> When pcpu_alloc() is called with gfp != GFP_KERNEL, the likelihood of
> >> a failure is higher than GFP_KERNEL case. This is mainly because
> >> pcpu_alloc() relies on previously allocated reserves and does not make
> >> an effort to add memory to its pools for non-GFP_KERNEL case.
> >
> > Who is going to use a different mask?
> 
> blkg_create() makes a call with a non-GFP_KERNEL mask:
>    new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
> 
> which turns into a call stack like below:
> 
> __vmalloc+0x45/0x50
> pcpu_mem_zalloc+0x50/0x80
> pcpu_populate_chunk+0x3b/0x380
> pcpu_alloc+0x588/0x6e0
> __alloc_percpu_gfp+0xd/0x10
> __percpu_counter_init+0x55/0xc0
> blkg_alloc+0x76/0x230
> blkg_create+0x489/0x670
> blkg_lookup_create+0x9a/0x230
> generic_make_request_checks+0x7dd/0x890
> generic_make_request+0x1f/0x180
> submit_bio+0x61/0x120

As indicated by GFP_NOWAIT | __GFP_NOWARN, it's okay to fail there.
It's not okay to fail consistently for a long time but it's not a big
issue to fail occasionally even if somewhat bunched up.  The only bad
side effect of that is temporary misaccounting of some IOs, which
shouldn't be noticeable outside of pathological cases.  If you're
actually seeing adverse effects of this, I'd love to learn about it.

> > We already have __vmalloc_gfp, so why can't this be used? Also note that
> > vmalloc doesn't really support arbitrary gfp flags. One has to be really
> > careful because there are some internal allocations which are hardcoded
> > GFP_KERNEL. Also this patch doesn't really add any new callers so it is
> > hard to tell whether what you do actually makes sense and is correct.
>
> Did you mean to say __vmalloc? If so, yes, I should use that.

So, the last time I looked at it, the thorny ones in that path are the
page table (pgd, pud...) allocation functions.  There are several
layers of indirection there, but they end up in arch-specific
implementations which hardcode GFP_KERNEL.  Without fixing them up, we
can't guarantee mapping the allocated pages, making things kinda moot.

The only reason the percpu allocator has the background allocator stuff
is that the vmalloc path can't do non-blocking allocations.  If we can
properly fix that up, we can get rid of all that code from the percpu
allocator and simply pass the gfp flag to the vmap functions.  Please
take a look at __pcpu_map_pages() in mm/percpu-vm.c.
map_kernel_range_noflush() is the function which has the implicit
GFP_KERNEL allocation in it, and it is what requires the reserve.
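
To illustrate, the implicit allocation sits in the page table
population underneath it; the chain is roughly (a sketch, not the
literal source):

	map_kernel_range_noflush()
	  -> vmap_page_range_noflush()
	    -> vmap_pud_range() / vmap_pmd_range() / vmap_pte_range()
	      -> pud_alloc() / pmd_alloc() / pte_alloc_kernel()
	        -> arch page table allocators, which hardcode GFP_KERNEL

so even if every explicit allocation in percpu takes a gfp, mapping the
pages can still block.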

If you can get rid of that, awesome, but given that your patch doesn't
touch that at all, I can't see how it's supposed to work.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 19:51           ` Tejun Heo
@ 2017-02-27 20:27             ` Tahsin Erdogan
  -1 siblings, 0 replies; 38+ messages in thread
From: Tahsin Erdogan @ 2017-02-27 20:27 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

Hi Tejun,

On Mon, Feb 27, 2017 at 11:51 AM, Tejun Heo <tj@kernel.org> wrote:
>> __vmalloc+0x45/0x50
>> pcpu_mem_zalloc+0x50/0x80
>> pcpu_populate_chunk+0x3b/0x380
>> pcpu_alloc+0x588/0x6e0
>> __alloc_percpu_gfp+0xd/0x10
>> __percpu_counter_init+0x55/0xc0
>> blkg_alloc+0x76/0x230
>> blkg_create+0x489/0x670
>> blkg_lookup_create+0x9a/0x230
>> generic_make_request_checks+0x7dd/0x890
>> generic_make_request+0x1f/0x180
>> submit_bio+0x61/0x120
>
> As indicated by GFP_NOWAIT | __GFP_NOWARN, it's okay to fail there.
> It's not okay to fail consistently for a long time but it's not a big
> issue to fail occasionally even if somewhat bunched up.  The only bad
> side effect of that is temporary misaccounting of some IOs, which
> shouldn't be noticeable outside of pathological cases.  If you're
> actually seeing adverse effects of this, I'd love to learn about it.

A better example is the call path below:

pcpu_alloc+0x68f/0x710
__alloc_percpu_gfp+0xd/0x10
__percpu_counter_init+0x55/0xc0
cfq_pd_alloc+0x3b2/0x4e0
blkg_alloc+0x187/0x230
blkg_create+0x489/0x670
blkg_lookup_create+0x9a/0x230
blkg_conf_prep+0x1fb/0x240
__cfqg_set_weight_device.isra.105+0x5c/0x180
cfq_set_weight_on_dfl+0x69/0xc0
cgroup_file_write+0x39/0x1c0
kernfs_fop_write+0x13f/0x1d0
__vfs_write+0x23/0x120
vfs_write+0xc2/0x1f0
SyS_write+0x44/0xb0
entry_SYSCALL_64_fastpath+0x18/0xad

A failure in this call path gives grief to tools which are trying to
configure io weights. We see occasional failures happen here shortly
after reboots even when the system is not under any memory pressure.
Machines with a lot of cpus are obviously more vulnerable.
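
For concreteness, the trigger is an ordinary configuration write such as
the one below (a hedged userspace sketch; the cgroup path and the
MAJ:MIN device numbers are illustrative, not from a real machine):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* path and device are illustrative */
		int fd = open("/sys/fs/cgroup/mygrp/io.weight", O_WRONLY);

		if (fd < 0)
			return 1;
		/*
		 * "MAJ:MIN WEIGHT"; when pcpu_alloc() fails in the stack
		 * above, this write fails with errno == ENOMEM.
		 */
		dprintf(fd, "8:16 500\n");
		close(fd);
		return 0;
	}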

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 20:27             ` Tahsin Erdogan
@ 2017-02-27 20:29               ` Tejun Heo
  -1 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2017-02-27 20:29 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Michal Hocko, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

Hello,

On Mon, Feb 27, 2017 at 12:27:08PM -0800, Tahsin Erdogan wrote:
> A better example is the call path below:
> 
> pcpu_alloc+0x68f/0x710
> __alloc_percpu_gfp+0xd/0x10
> __percpu_counter_init+0x55/0xc0
> cfq_pd_alloc+0x3b2/0x4e0
> blkg_alloc+0x187/0x230
> blkg_create+0x489/0x670
> blkg_lookup_create+0x9a/0x230
> blkg_conf_prep+0x1fb/0x240
> __cfqg_set_weight_device.isra.105+0x5c/0x180
> cfq_set_weight_on_dfl+0x69/0xc0
> cgroup_file_write+0x39/0x1c0
> kernfs_fop_write+0x13f/0x1d0
> __vfs_write+0x23/0x120
> vfs_write+0xc2/0x1f0
> SyS_write+0x44/0xb0
> entry_SYSCALL_64_fastpath+0x18/0xad
> 
> A failure in this call path gives grief to tools which are trying to
> configure io weights. We see occasional failures happen here shortly
> after reboots even when the system is not under any memory pressure.
> Machines with a lot of cpus are obviously more vulnerable.

Ah, absolutely, that's a stupid failure but we should be able to fix
that by making the blkg functions take gfp mask and allocate
accordingly, right?  It'll probably take preallocation tricks because
of locking but should be doable.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 20:29               ` Tejun Heo
@ 2017-02-27 20:37                 ` Tahsin Erdogan
  -1 siblings, 0 replies; 38+ messages in thread
From: Tahsin Erdogan @ 2017-02-27 20:37 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon, Feb 27, 2017 at 12:29 PM, Tejun Heo <tj@kernel.org> wrote:
> Hello,
>
> On Mon, Feb 27, 2017 at 12:27:08PM -0800, Tahsin Erdogan wrote:
>> A better example is the call path below:
>>
>> pcpu_alloc+0x68f/0x710
>> __alloc_percpu_gfp+0xd/0x10
>> __percpu_counter_init+0x55/0xc0
>> cfq_pd_alloc+0x3b2/0x4e0
>> blkg_alloc+0x187/0x230
>> blkg_create+0x489/0x670
>> blkg_lookup_create+0x9a/0x230
>> blkg_conf_prep+0x1fb/0x240
>> __cfqg_set_weight_device.isra.105+0x5c/0x180
>> cfq_set_weight_on_dfl+0x69/0xc0
>> cgroup_file_write+0x39/0x1c0
>> kernfs_fop_write+0x13f/0x1d0
>> __vfs_write+0x23/0x120
>> vfs_write+0xc2/0x1f0
>> SyS_write+0x44/0xb0
>> entry_SYSCALL_64_fastpath+0x18/0xad
>>
>> A failure in this call path gives grief to tools which are trying to
>> configure io weights. We see occasional failures happen here shortly
>> after reboots even when the system is not under any memory pressure.
>> Machines with a lot of cpus are obviously more vulnerable.
>
> Ah, absolutely, that's a stupid failure but we should be able to fix
> that by making the blkg functions take gfp mask and allocate
> accordingly, right?  It'll probably take preallocation tricks because
> of locking but should be doable.

My initial goal was to allow calls to vmalloc(), but I now see the
challenges in that approach.

Doing preallocations would probably work, but I am not sure that can
be done without complicating the code too much. Could you describe
what you have in mind?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 20:37                 ` Tahsin Erdogan
@ 2017-02-27 20:45                   ` Tejun Heo
  -1 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2017-02-27 20:45 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Michal Hocko, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

Hello, Tahsin.

On Mon, Feb 27, 2017 at 12:37:59PM -0800, Tahsin Erdogan wrote:
> > Ah, absolutely, that's a stupid failure but we should be able to fix
> > that by making the blkg functions take gfp mask and allocate
> > accordingly, right?  It'll probably take preallocation tricks because
> > of locking but should be doable.
> 
> My initial goal was to allow calls to vmalloc(), but I now see the
> challenges in that approach.

I'd love to see that working too but this is a different issue.  Even
GFP_ATOMIC can fail under pressure and it's kinda wrong to depend on
that for userspace interactions.

> Doing preallocations would probably work, but I am not sure that can
> be done without complicating the code too much. Could you describe
> what you have in mind?

So, blkg_create() already takes a @new_blkg argument, which is the
preallocated blkg used during q init.  Wouldn't it work to make
blkg_lookup_create() take @new_blkg too and pass it down to
blkg_create() (and also free it if it doesn't get used)?  Then,
blkg_conf_prep() can always (or after a failure with -ENOMEM) allocate
a new blkg before calling into blkg_lookup_create().  I don't think
it'll complicate the code path that much.
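
Very roughly, something like this in blkg_conf_prep() (just a sketch of
the idea; the @new_blkg parameter to blkg_lookup_create() is the
proposed extension, and locking and error paths are simplified):

	new_blkg = NULL;
retry:
	spin_lock_irq(q->queue_lock);
	/* hand in the preallocated blkg; consumed or freed inside */
	blkg = blkg_lookup_create(blkcg, q, new_blkg);
	if (IS_ERR(blkg) && PTR_ERR(blkg) == -ENOMEM) {
		spin_unlock_irq(q->queue_lock);
		/* preallocate with GFP_KERNEL outside the queue lock */
		new_blkg = blkg_alloc(blkcg, q, GFP_KERNEL);
		if (unlikely(!new_blkg))
			return -ENOMEM;
		goto retry;
	}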

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 20:45                   ` Tejun Heo
@ 2017-02-27 21:12                     ` Tahsin Erdogan
  -1 siblings, 0 replies; 38+ messages in thread
From: Tahsin Erdogan @ 2017-02-27 21:12 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

>> Doing preallocations would probably work, but I am not sure that can
>> be done without complicating the code too much. Could you describe
>> what you have in mind?
>
> So, blkg_create() already takes a @new_blkg argument, which is the
> preallocated blkg used during q init.  Wouldn't it work to make
> blkg_lookup_create() take @new_blkg too and pass it down to
> blkg_create() (and also free it if it doesn't get used)?  Then,
> blkg_conf_prep() can always (or after a failure with -ENOMEM) allocate
> a new blkg before calling into blkg_lookup_create().  I don't think
> it'll complicate the code path that much.

That makes sense. I will work on a patch that does that (unless you
are interested in implementing it yourself).

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers
  2017-02-27 21:12                     ` Tahsin Erdogan
@ 2017-02-27 21:28                       ` Tejun Heo
  -1 siblings, 0 replies; 38+ messages in thread
From: Tejun Heo @ 2017-02-27 21:28 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Michal Hocko, Christoph Lameter, Andrew Morton, Chris Wilson,
	Andrey Ryabinin, Roman Pen, Joonas Lahtinen, zijun_hu,
	Joonsoo Kim, David Rientjes, linux-mm, linux-kernel

On Mon, Feb 27, 2017 at 01:12:11PM -0800, Tahsin Erdogan wrote:
> That makes sense. I will work on a patch that does that (unless you are
> interested in implementing it yourself).

I'd really appreciate it if you can work on it.  Thanks a lot!

-- 
tejun

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2017-02-27 22:39 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-25 21:00 [PATCH 3/3] percpu: improve allocation success rate for non-GFP_KERNEL callers Tahsin Erdogan
2017-02-25 23:54 ` kbuild test robot
2017-02-26  0:48 ` kbuild test robot
2017-02-26  4:38   ` [PATCH v2 " Tahsin Erdogan
2017-02-27  9:52     ` Michal Hocko
2017-02-27 13:00       ` Tahsin Erdogan
2017-02-27 15:25         ` Michal Hocko
2017-02-27 17:01           ` Tahsin Erdogan
2017-02-27 17:07             ` Michal Hocko
2017-02-27 17:14               ` Michal Hocko
2017-02-27 19:32                 ` Tahsin Erdogan
2017-02-27 19:47                   ` Michal Hocko
2017-02-27 19:51         ` Tejun Heo
2017-02-27 20:27           ` Tahsin Erdogan
2017-02-27 20:29             ` Tejun Heo
2017-02-27 20:37               ` Tahsin Erdogan
2017-02-27 20:45                 ` Tejun Heo
2017-02-27 21:12                   ` Tahsin Erdogan
2017-02-27 21:28                     ` Tejun Heo
