* incoming
@ 2019-12-18  4:50 Andrew Morton
  2019-12-18  4:51 ` [patch 1/6] kasan: fix crashes on access to memory mapped by vm_map_ram() Andrew Morton
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Andrew Morton @ 2019-12-18  4:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-mm, mm-commits

6 fixes based on 2187f215ebaac73ddbd814696d7c7fa34f0c3de0:

    Andrey Ryabinin <aryabinin@virtuozzo.com>:
      kasan: fix crashes on access to memory mapped by vm_map_ram()

    Daniel Axtens <dja@axtens.net>:
      mm/memory.c: add apply_to_existing_page_range() helper
      kasan: use apply_to_existing_page_range() for releasing vmalloc shadow
      kasan: don't assume percpu shadow allocations will succeed

    Yang Shi <yang.shi@linux.alibaba.com>:
      mm: vmscan: protect shrinker idr replace with CONFIG_MEMCG

    Changbin Du <changbin.du@gmail.com>:
      lib/Kconfig.debug: fix some messed up configurations

 include/linux/kasan.h |   15 +++--
 include/linux/mm.h    |    3 +
 lib/Kconfig.debug     |  100 ++++++++++++++++++------------------
 mm/kasan/common.c     |   36 ++++++++-----
 mm/memory.c           |  136 ++++++++++++++++++++++++++++++++++----------------
 mm/vmalloc.c          |  133 ++++++++++++++++++++++++++++--------------------
 mm/vmscan.c           |    2 
 7 files changed, 260 insertions(+), 165 deletions(-)




* [patch 1/6] kasan: fix crashes on access to memory mapped by vm_map_ram()
  2019-12-18  4:50 incoming Andrew Morton
@ 2019-12-18  4:51 ` Andrew Morton
  2019-12-18  4:51 ` [patch 2/6] mm/memory.c: add apply_to_existing_page_range() helper Andrew Morton
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Andrew Morton @ 2019-12-18  4:51 UTC (permalink / raw)
  To: akpm, aryabinin, cai, dja, dvyukov, glider, linux-mm, mm-commits,
	torvalds, urezki

From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Subject: kasan: fix crashes on access to memory mapped by vm_map_ram()

With CONFIG_KASAN_VMALLOC=y any use of memory obtained via vm_map_ram()
will crash because there is no shadow backing that memory.

Instead of sprinkling additional kasan_populate_vmalloc() calls all over
the vmalloc code, move it into alloc_vmap_area(). This will fix
vm_map_ram() and simplify the code a bit.
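
A condensed sketch of the resulting flow, simplified from the mm/vmalloc.c
hunks below (locking and error handling are elided):

	/* alloc_vmap_area(): every new KVA region now gets shadow backing */
	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	/* vm_map_ram()/vm_unmap_ram() only (un)poison the existing shadow */
	kasan_unpoison_vmalloc(mem, size);	/* on map */
	kasan_poison_vmalloc(mem, size);	/* on unmap */

Since the shadow is populated when the vmap_area is allocated, vm_map_ram()
gets backing without special-casing every vmalloc entry point.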

[aryabinin@virtuozzo.com: v2]
  Link: http://lkml.kernel.org/r/20191205095942.1761-1-aryabinin@virtuozzo.com
Link: http://lkml.kernel.org/r/20191204204534.32202-1-aryabinin@virtuozzo.com
Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Daniel Axtens <dja@axtens.net>
Cc: Alexander Potapenko <glider@google.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/kasan.h |   15 ++++---
 mm/kasan/common.c     |   27 ++++++++----
 mm/vmalloc.c          |   85 ++++++++++++++++++----------------------
 3 files changed, 67 insertions(+), 60 deletions(-)

--- a/include/linux/kasan.h~kasan-fix-crashes-on-access-to-memory-mapped-by-vm_map_ram
+++ a/include/linux/kasan.h
@@ -205,20 +205,23 @@ static inline void *kasan_reset_tag(cons
 #endif /* CONFIG_KASAN_SW_TAGS */
 
 #ifdef CONFIG_KASAN_VMALLOC
-int kasan_populate_vmalloc(unsigned long requested_size,
-			   struct vm_struct *area);
-void kasan_poison_vmalloc(void *start, unsigned long size);
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
+void kasan_poison_vmalloc(const void *start, unsigned long size);
+void kasan_unpoison_vmalloc(const void *start, unsigned long size);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
 			   unsigned long free_region_end);
 #else
-static inline int kasan_populate_vmalloc(unsigned long requested_size,
-					 struct vm_struct *area)
+static inline int kasan_populate_vmalloc(unsigned long start,
+					unsigned long size)
 {
 	return 0;
 }
 
-static inline void kasan_poison_vmalloc(void *start, unsigned long size) {}
+static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
+{ }
+static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size)
+{ }
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
--- a/mm/kasan/common.c~kasan-fix-crashes-on-access-to-memory-mapped-by-vm_map_ram
+++ a/mm/kasan/common.c
@@ -778,15 +778,17 @@ static int kasan_populate_vmalloc_pte(pt
 	return 0;
 }
 
-int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area)
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
 {
 	unsigned long shadow_start, shadow_end;
 	int ret;
 
-	shadow_start = (unsigned long)kasan_mem_to_shadow(area->addr);
+	if (!is_vmalloc_or_module_addr((void *)addr))
+		return 0;
+
+	shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
 	shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
-	shadow_end = (unsigned long)kasan_mem_to_shadow(area->addr +
-							area->size);
+	shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
 	shadow_end = ALIGN(shadow_end, PAGE_SIZE);
 
 	ret = apply_to_page_range(&init_mm, shadow_start,
@@ -797,10 +799,6 @@ int kasan_populate_vmalloc(unsigned long
 
 	flush_cache_vmap(shadow_start, shadow_end);
 
-	kasan_unpoison_shadow(area->addr, requested_size);
-
-	area->flags |= VM_KASAN;
-
 	/*
 	 * We need to be careful about inter-cpu effects here. Consider:
 	 *
@@ -843,12 +841,23 @@ int kasan_populate_vmalloc(unsigned long
  * Poison the shadow for a vmalloc region. Called as part of the
  * freeing process at the time the region is freed.
  */
-void kasan_poison_vmalloc(void *start, unsigned long size)
+void kasan_poison_vmalloc(const void *start, unsigned long size)
 {
+	if (!is_vmalloc_or_module_addr(start))
+		return;
+
 	size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
 	kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID);
 }
 
+void kasan_unpoison_vmalloc(const void *start, unsigned long size)
+{
+	if (!is_vmalloc_or_module_addr(start))
+		return;
+
+	kasan_unpoison_shadow(start, size);
+}
+
 static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
 					void *unused)
 {
--- a/mm/vmalloc.c~kasan-fix-crashes-on-access-to-memory-mapped-by-vm_map_ram
+++ a/mm/vmalloc.c
@@ -1062,6 +1062,26 @@ __alloc_vmap_area(unsigned long size, un
 }
 
 /*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
+{
+	/*
+	 * Remove from the busy tree/list.
+	 */
+	spin_lock(&vmap_area_lock);
+	unlink_va(va, &vmap_area_root);
+	spin_unlock(&vmap_area_lock);
+
+	/*
+	 * Insert/Merge it back to the free tree/list.
+	 */
+	spin_lock(&free_vmap_area_lock);
+	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+	spin_unlock(&free_vmap_area_lock);
+}
+
+/*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend.
  */
@@ -1073,6 +1093,7 @@ static struct vmap_area *alloc_vmap_area
 	struct vmap_area *va, *pva;
 	unsigned long addr;
 	int purged = 0;
+	int ret;
 
 	BUG_ON(!size);
 	BUG_ON(offset_in_page(size));
@@ -1139,6 +1160,7 @@ retry:
 	va->va_end = addr + size;
 	va->vm = NULL;
 
+
 	spin_lock(&vmap_area_lock);
 	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
 	spin_unlock(&vmap_area_lock);
@@ -1147,6 +1169,12 @@ retry:
 	BUG_ON(va->va_start < vstart);
 	BUG_ON(va->va_end > vend);
 
+	ret = kasan_populate_vmalloc(addr, size);
+	if (ret) {
+		free_vmap_area(va);
+		return ERR_PTR(ret);
+	}
+
 	return va;
 
 overflow:
@@ -1186,26 +1214,6 @@ int unregister_vmap_purge_notifier(struc
 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
 
 /*
- * Free a region of KVA allocated by alloc_vmap_area
- */
-static void free_vmap_area(struct vmap_area *va)
-{
-	/*
-	 * Remove from the busy tree/list.
-	 */
-	spin_lock(&vmap_area_lock);
-	unlink_va(va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
-
-	/*
-	 * Insert/Merge it back to the free tree/list.
-	 */
-	spin_lock(&free_vmap_area_lock);
-	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
-	spin_unlock(&free_vmap_area_lock);
-}
-
-/*
  * Clear the pagetable entries of a given vmap_area
  */
 static void unmap_vmap_area(struct vmap_area *va)
@@ -1771,6 +1779,8 @@ void vm_unmap_ram(const void *mem, unsig
 	BUG_ON(addr > VMALLOC_END);
 	BUG_ON(!PAGE_ALIGNED(addr));
 
+	kasan_poison_vmalloc(mem, size);
+
 	if (likely(count <= VMAP_MAX_ALLOC)) {
 		debug_check_no_locks_freed(mem, size);
 		vb_free(mem, size);
@@ -1821,6 +1831,9 @@ void *vm_map_ram(struct page **pages, un
 		addr = va->va_start;
 		mem = (void *)addr;
 	}
+
+	kasan_unpoison_vmalloc(mem, size);
+
 	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
 		vm_unmap_ram(mem, count);
 		return NULL;
@@ -2075,6 +2088,7 @@ static struct vm_struct *__get_vm_area_n
 {
 	struct vmap_area *va;
 	struct vm_struct *area;
+	unsigned long requested_size = size;
 
 	BUG_ON(in_interrupt());
 	size = PAGE_ALIGN(size);
@@ -2098,23 +2112,9 @@ static struct vm_struct *__get_vm_area_n
 		return NULL;
 	}
 
-	setup_vmalloc_vm(area, va, flags, caller);
+	kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
 
-	/*
-	 * For KASAN, if we are in vmalloc space, we need to cover the shadow
-	 * area with real memory. If we come here through VM_ALLOC, this is
-	 * done by a higher level function that has access to the true size,
-	 * which might not be a full page.
-	 *
-	 * We assume module space comes via VM_ALLOC path.
-	 */
-	if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) {
-		if (kasan_populate_vmalloc(area->size, area)) {
-			unmap_vmap_area(va);
-			kfree(area);
-			return NULL;
-		}
-	}
+	setup_vmalloc_vm(area, va, flags, caller);
 
 	return area;
 }
@@ -2293,8 +2293,7 @@ static void __vunmap(const void *addr, i
 	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
 	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
 
-	if (area->flags & VM_KASAN)
-		kasan_poison_vmalloc(area->addr, area->size);
+	kasan_poison_vmalloc(area->addr, area->size);
 
 	vm_remove_mappings(area, deallocate_pages);
 
@@ -2539,7 +2538,7 @@ void *__vmalloc_node_range(unsigned long
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
 		goto fail;
 
-	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
+	area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
 				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
@@ -2548,11 +2547,6 @@ void *__vmalloc_node_range(unsigned long
 	if (!addr)
 		return NULL;
 
-	if (is_vmalloc_or_module_addr(area->addr)) {
-		if (kasan_populate_vmalloc(real_size, area))
-			return NULL;
-	}
-
 	/*
 	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
 	 * flag. It means that vm_struct is not fully initialized.
@@ -3437,7 +3431,8 @@ retry:
 	/* populate the shadow space outside of the lock */
 	for (area = 0; area < nr_vms; area++) {
 		/* assume success here */
-		kasan_populate_vmalloc(sizes[area], vms[area]);
+		kasan_populate_vmalloc(vas[area]->va_start, sizes[area]);
+		kasan_unpoison_vmalloc((void *)vms[area]->addr, sizes[area]);
 	}
 
 	kfree(vas);
_



* [patch 2/6] mm/memory.c: add apply_to_existing_page_range() helper
  2019-12-18  4:50 incoming Andrew Morton
  2019-12-18  4:51 ` [patch 1/6] kasan: fix crashes on access to memory mapped by vm_map_ram() Andrew Morton
@ 2019-12-18  4:51 ` Andrew Morton
  2019-12-18  4:51 ` [patch 3/6] kasan: use apply_to_existing_page_range() for releasing vmalloc shadow Andrew Morton
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Andrew Morton @ 2019-12-18  4:51 UTC (permalink / raw)
  To: akpm, aryabinin, cai, dja, dvyukov, glider, linux-mm, mm-commits,
	torvalds, urezki

From: Daniel Axtens <dja@axtens.net>
Subject: mm/memory.c: add apply_to_existing_page_range() helper

apply_to_page_range() takes an address range, and if any parts of it are
not covered by the existing page table hierarchy, it allocates memory to
fill them in.

In some use cases, this is not what we want - we want to be able to
operate exclusively on PTEs that are already in the tables.

Add apply_to_existing_page_range() for this.  Adjust the walker functions
for apply_to_page_range to take 'create', which switches them between the
old and new modes.

This will be used in KASAN vmalloc.
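
As a rough usage sketch (the callback here is hypothetical; only the
prototypes match what this patch adds), a caller that wants to visit just
the PTEs that already exist would do:

	/* hypothetical pte_fn_t callback, for illustration only */
	static int clear_one_pte(pte_t *ptep, unsigned long addr, void *unused)
	{
		pte_clear(&init_mm, addr, ptep);
		return 0;
	}

	/* walks only existing page tables, never allocates missing levels */
	apply_to_existing_page_range(&init_mm, addr, size, clear_one_pte, NULL);

	/* whereas this would allocate p4d/pud/pmd/pte entries on the way down */
	apply_to_page_range(&init_mm, addr, size, clear_one_pte, NULL);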

[akpm@linux-foundation.org: reduce code duplication]
[akpm@linux-foundation.org: s/apply_to_existing_pages/apply_to_existing_page_range/]
[akpm@linux-foundation.org: initialize __apply_to_page_range::err]
Link: http://lkml.kernel.org/r/20191205140407.1874-1-dja@axtens.net
Signed-off-by: Daniel Axtens <dja@axtens.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Daniel Axtens <dja@axtens.net>
Cc: Qian Cai <cai@lca.pw>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/mm.h |    3 
 mm/memory.c        |  136 +++++++++++++++++++++++++++++--------------
 2 files changed, 97 insertions(+), 42 deletions(-)

--- a/include/linux/mm.h~mm-add-apply_to_existing_pages-helper
+++ a/include/linux/mm.h
@@ -2621,6 +2621,9 @@ static inline int vm_fault_to_errno(vm_f
 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
+extern int apply_to_existing_page_range(struct mm_struct *mm,
+				   unsigned long address, unsigned long size,
+				   pte_fn_t fn, void *data);
 
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
--- a/mm/memory.c~mm-add-apply_to_existing_pages-helper
+++ a/mm/memory.c
@@ -2021,26 +2021,34 @@ EXPORT_SYMBOL(vm_iomap_memory);
 
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pte_t *pte;
-	int err;
+	int err = 0;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (mm == &init_mm) ?
-		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(mm, pmd, addr, &ptl);
-	if (!pte)
-		return -ENOMEM;
+	if (create) {
+		pte = (mm == &init_mm) ?
+			pte_alloc_kernel(pmd, addr) :
+			pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		if (!pte)
+			return -ENOMEM;
+	} else {
+		pte = (mm == &init_mm) ?
+			pte_offset_kernel(pmd, addr) :
+			pte_offset_map_lock(mm, pmd, addr, &ptl);
+	}
 
 	BUG_ON(pmd_huge(*pmd));
 
 	arch_enter_lazy_mmu_mode();
 
 	do {
-		err = fn(pte++, addr, data);
-		if (err)
-			break;
+		if (create || !pte_none(*pte)) {
+			err = fn(pte++, addr, data);
+			if (err)
+				break;
+		}
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
@@ -2052,77 +2060,95 @@ static int apply_to_pte_range(struct mm_
 
 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err;
+	int err = 0;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		return -ENOMEM;
+	if (create) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (!pmd)
+			return -ENOMEM;
+	} else {
+		pmd = pmd_offset(pud, addr);
+	}
 	do {
 		next = pmd_addr_end(addr, end);
-		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pmd_none_or_clear_bad(pmd)) {
+			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	pud = pud_alloc(mm, p4d, addr);
-	if (!pud)
-		return -ENOMEM;
+	if (create) {
+		pud = pud_alloc(mm, p4d, addr);
+		if (!pud)
+			return -ENOMEM;
+	} else {
+		pud = pud_offset(p4d, addr);
+	}
 	do {
 		next = pud_addr_end(addr, end);
-		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pud_none_or_clear_bad(pud)) {
+			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	p4d = p4d_alloc(mm, pgd, addr);
-	if (!p4d)
-		return -ENOMEM;
+	if (create) {
+		p4d = p4d_alloc(mm, pgd, addr);
+		if (!p4d)
+			return -ENOMEM;
+	} else {
+		p4d = p4d_offset(pgd, addr);
+	}
 	do {
 		next = p4d_addr_end(addr, end);
-		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !p4d_none_or_clear_bad(p4d)) {
+			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
 
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
- */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn,
+				 void *data, bool create)
 {
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + size;
-	int err;
+	int err = 0;
 
 	if (WARN_ON(addr >= end))
 		return -EINVAL;
@@ -2130,16 +2156,42 @@ int apply_to_page_range(struct mm_struct
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+		if (!create && pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
+{
+	return __apply_to_page_range(mm, addr, size, fn, data, true);
+}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
+ * Scan a region of virtual memory, calling a provided function on
+ * each leaf page table where it exists.
+ *
+ * Unlike apply_to_page_range, this does _not_ fill in page tables
+ * where they are absent.
+ */
+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn, void *data)
+{
+	return __apply_to_page_range(mm, addr, size, fn, data, false);
+}
+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
+
+/*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically.  Before making any commitment, on those architectures
  * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
_



* [patch 3/6] kasan: use apply_to_existing_page_range() for releasing vmalloc shadow
  2019-12-18  4:50 incoming Andrew Morton
  2019-12-18  4:51 ` [patch 1/6] kasan: fix crashes on access to memory mapped by vm_map_ram() Andrew Morton
  2019-12-18  4:51 ` [patch 2/6] mm/memory.c: add apply_to_existing_page_range() helper Andrew Morton
@ 2019-12-18  4:51 ` Andrew Morton
  2019-12-18  4:51 ` [patch 4/6] kasan: don't assume percpu shadow allocations will succeed Andrew Morton
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Andrew Morton @ 2019-12-18  4:51 UTC (permalink / raw)
  To: akpm, aryabinin, cai, dja, dvyukov, glider, linux-mm, mm-commits,
	torvalds, urezki

From: Daniel Axtens <dja@axtens.net>
Subject: kasan: use apply_to_existing_page_range() for releasing vmalloc shadow

kasan_release_vmalloc uses apply_to_page_range to release vmalloc shadow. 
Unfortunately, apply_to_page_range can allocate memory to fill in page
table entries, which is not what we want.

Also, kasan_release_vmalloc is called under free_vmap_area_lock, so if
apply_to_page_range does allocate memory, we get a sleep-in-atomic bug:

	BUG: sleeping function called from invalid context at mm/page_alloc.c:4681
	in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15087, name:

	Call Trace:
	 __dump_stack lib/dump_stack.c:77 [inline]
	 dump_stack+0x199/0x216 lib/dump_stack.c:118
	 ___might_sleep.cold.97+0x1f5/0x238 kernel/sched/core.c:6800
	 __might_sleep+0x95/0x190 kernel/sched/core.c:6753
	 prepare_alloc_pages mm/page_alloc.c:4681 [inline]
	 __alloc_pages_nodemask+0x3cd/0x890 mm/page_alloc.c:4730
	 alloc_pages_current+0x10c/0x210 mm/mempolicy.c:2211
	 alloc_pages include/linux/gfp.h:532 [inline]
	 __get_free_pages+0xc/0x40 mm/page_alloc.c:4786
	 __pte_alloc_one_kernel include/asm-generic/pgalloc.h:21 [inline]
	 pte_alloc_one_kernel include/asm-generic/pgalloc.h:33 [inline]
	 __pte_alloc_kernel+0x1d/0x200 mm/memory.c:459
	 apply_to_pte_range mm/memory.c:2031 [inline]
	 apply_to_pmd_range mm/memory.c:2068 [inline]
	 apply_to_pud_range mm/memory.c:2088 [inline]
	 apply_to_p4d_range mm/memory.c:2108 [inline]
	 apply_to_page_range+0x77d/0xa00 mm/memory.c:2133
	 kasan_release_vmalloc+0xa7/0xc0 mm/kasan/common.c:970
	 __purge_vmap_area_lazy+0xcbb/0x1f30 mm/vmalloc.c:1313
	 try_purge_vmap_area_lazy mm/vmalloc.c:1332 [inline]
	 free_vmap_area_noflush+0x2ca/0x390 mm/vmalloc.c:1368
	 free_unmap_vmap_area mm/vmalloc.c:1381 [inline]
	 remove_vm_area+0x1cc/0x230 mm/vmalloc.c:2209
	 vm_remove_mappings mm/vmalloc.c:2236 [inline]
	 __vunmap+0x223/0xa20 mm/vmalloc.c:2299
	 __vfree+0x3f/0xd0 mm/vmalloc.c:2356
	 __vmalloc_area_node mm/vmalloc.c:2507 [inline]
	 __vmalloc_node_range+0x5d5/0x810 mm/vmalloc.c:2547
	 __vmalloc_node mm/vmalloc.c:2607 [inline]
	 __vmalloc_node_flags mm/vmalloc.c:2621 [inline]
	 vzalloc+0x6f/0x80 mm/vmalloc.c:2666
	 alloc_one_pg_vec_page net/packet/af_packet.c:4233 [inline]
	 alloc_pg_vec net/packet/af_packet.c:4258 [inline]
	 packet_set_ring+0xbc0/0x1b50 net/packet/af_packet.c:4342
	 packet_setsockopt+0xed7/0x2d90 net/packet/af_packet.c:3695
	 __sys_setsockopt+0x29b/0x4d0 net/socket.c:2117
	 __do_sys_setsockopt net/socket.c:2133 [inline]
	 __se_sys_setsockopt net/socket.c:2130 [inline]
	 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:2130
	 do_syscall_64+0xfa/0x780 arch/x86/entry/common.c:294
	 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Switch to using the apply_to_existing_page_range() helper instead, which
won't allocate memory.

[akpm@linux-foundation.org: s/apply_to_existing_pages/apply_to_existing_page_range/]
Link: http://lkml.kernel.org/r/20191205140407.1874-2-dja@axtens.net
Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Signed-off-by: Daniel Axtens <dja@axtens.net>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/kasan/common.c |    9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

--- a/mm/kasan/common.c~kasan-use-apply_to_existing_pages-for-releasing-vmalloc-shadow
+++ a/mm/kasan/common.c
@@ -957,6 +957,7 @@ void kasan_release_vmalloc(unsigned long
 {
 	void *shadow_start, *shadow_end;
 	unsigned long region_start, region_end;
+	unsigned long size;
 
 	region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
 	region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
@@ -979,9 +980,11 @@ void kasan_release_vmalloc(unsigned long
 	shadow_end = kasan_mem_to_shadow((void *)region_end);
 
 	if (shadow_end > shadow_start) {
-		apply_to_page_range(&init_mm, (unsigned long)shadow_start,
-				    (unsigned long)(shadow_end - shadow_start),
-				    kasan_depopulate_vmalloc_pte, NULL);
+		size = shadow_end - shadow_start;
+		apply_to_existing_page_range(&init_mm,
+					     (unsigned long)shadow_start,
+					     size, kasan_depopulate_vmalloc_pte,
+					     NULL);
 		flush_tlb_kernel_range((unsigned long)shadow_start,
 				       (unsigned long)shadow_end);
 	}
_



* [patch 4/6] kasan: don't assume percpu shadow allocations will succeed
  2019-12-18  4:50 incoming Andrew Morton
                   ` (2 preceding siblings ...)
  2019-12-18  4:51 ` [patch 3/6] kasan: use apply_to_existing_page_range() for releasing vmalloc shadow Andrew Morton
@ 2019-12-18  4:51 ` Andrew Morton
  2019-12-18  4:51 ` [patch 5/6] mm: vmscan: protect shrinker idr replace with CONFIG_MEMCG Andrew Morton
  2019-12-18  4:51 ` [patch 6/6] lib/Kconfig.debug: fix some messed up configurations Andrew Morton
  5 siblings, 0 replies; 7+ messages in thread
From: Andrew Morton @ 2019-12-18  4:51 UTC (permalink / raw)
  To: akpm, aryabinin, cai, dja, dvyukov, glider, linux-mm, mm-commits,
	torvalds, urezki

From: Daniel Axtens <dja@axtens.net>
Subject: kasan: don't assume percpu shadow allocations will succeed

syzkaller and the fault injector showed that I was wrong to assume
that we could ignore percpu shadow allocation failures.

Handle failures properly. Merge all the allocated areas back into the free
list and release the shadow, then clean up and return NULL. The shadow
is released unconditionally, which relies upon the fact that the release
function is able to tolerate pages not being present.

Also clean up shadows in the recovery path - currently they are not
released, which leaks a bit of memory.
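
The recovery paths (see the hunks below) follow one pattern: save the
region's original bounds, merge it back into the free tree, then release
the shadow for that original range.  Saving the bounds first matters
because merge_or_add_vmap_area() can hand back a vmap_area covering a
larger, coalesced free region:

	orig_start = vas[area]->va_start;
	orig_end = vas[area]->va_end;
	va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
				    &free_vmap_area_list);
	/* tolerates shadow pages that were never populated */
	kasan_release_vmalloc(orig_start, orig_end,
			      va->va_start, va->va_end);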

Link: http://lkml.kernel.org/r/20191205140407.1874-3-dja@axtens.net
Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Signed-off-by: Daniel Axtens <dja@axtens.net>
Reported-by: syzbot+82e323920b78d54aaed5@syzkaller.appspotmail.com
Reported-by: syzbot+59b7daa4315e07a994f1@syzkaller.appspotmail.com
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/vmalloc.c |   48 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 10 deletions(-)

--- a/mm/vmalloc.c~kasan-dont-assume-percpu-shadow-allocations-will-succeed
+++ a/mm/vmalloc.c
@@ -3288,7 +3288,7 @@ struct vm_struct **pcpu_get_vm_areas(con
 	struct vmap_area **vas, *va;
 	struct vm_struct **vms;
 	int area, area2, last_area, term_area;
-	unsigned long base, start, size, end, last_end;
+	unsigned long base, start, size, end, last_end, orig_start, orig_end;
 	bool purged = false;
 	enum fit_type type;
 
@@ -3418,6 +3418,15 @@ retry:
 
 	spin_unlock(&free_vmap_area_lock);
 
+	/* populate the kasan shadow space */
+	for (area = 0; area < nr_vms; area++) {
+		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+			goto err_free_shadow;
+
+		kasan_unpoison_vmalloc((void *)vas[area]->va_start,
+				       sizes[area]);
+	}
+
 	/* insert all vm's */
 	spin_lock(&vmap_area_lock);
 	for (area = 0; area < nr_vms; area++) {
@@ -3428,13 +3437,6 @@ retry:
 	}
 	spin_unlock(&vmap_area_lock);
 
-	/* populate the shadow space outside of the lock */
-	for (area = 0; area < nr_vms; area++) {
-		/* assume success here */
-		kasan_populate_vmalloc(vas[area]->va_start, sizes[area]);
-		kasan_unpoison_vmalloc((void *)vms[area]->addr, sizes[area]);
-	}
-
 	kfree(vas);
 	return vms;
 
@@ -3446,8 +3448,12 @@ recovery:
 	 * and when pcpu_get_vm_areas() is success.
 	 */
 	while (area--) {
-		merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-				       &free_vmap_area_list);
+		orig_start = vas[area]->va_start;
+		orig_end = vas[area]->va_end;
+		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+					    &free_vmap_area_list);
+		kasan_release_vmalloc(orig_start, orig_end,
+				      va->va_start, va->va_end);
 		vas[area] = NULL;
 	}
 
@@ -3482,6 +3488,28 @@ err_free2:
 	kfree(vas);
 	kfree(vms);
 	return NULL;
+
+err_free_shadow:
+	spin_lock(&free_vmap_area_lock);
+	/*
+	 * We release all the vmalloc shadows, even the ones for regions that
+	 * hadn't been successfully added. This relies on kasan_release_vmalloc
+	 * being able to tolerate this case.
+	 */
+	for (area = 0; area < nr_vms; area++) {
+		orig_start = vas[area]->va_start;
+		orig_end = vas[area]->va_end;
+		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+					    &free_vmap_area_list);
+		kasan_release_vmalloc(orig_start, orig_end,
+				      va->va_start, va->va_end);
+		vas[area] = NULL;
+		kfree(vms[area]);
+	}
+	spin_unlock(&free_vmap_area_lock);
+	kfree(vas);
+	kfree(vms);
+	return NULL;
 }
 
 /**
_



* [patch 5/6] mm: vmscan: protect shrinker idr replace with CONFIG_MEMCG
  2019-12-18  4:50 incoming Andrew Morton
                   ` (3 preceding siblings ...)
  2019-12-18  4:51 ` [patch 4/6] kasan: don't assume percpu shadow allocations will succeed Andrew Morton
@ 2019-12-18  4:51 ` Andrew Morton
  2019-12-18  4:51 ` [patch 6/6] lib/Kconfig.debug: fix some messed up configurations Andrew Morton
  5 siblings, 0 replies; 7+ messages in thread
From: Andrew Morton @ 2019-12-18  4:51 UTC (permalink / raw)
  To: akpm, guro, hannes, ktkhai, linux-mm, mhocko, mm-commits,
	shakeelb, stable, torvalds, yang.shi

From: Yang Shi <yang.shi@linux.alibaba.com>
Subject: mm: vmscan: protect shrinker idr replace with CONFIG_MEMCG

Since commit 0a432dcbeb32edc ("mm: shrinker: make shrinker not depend on
memcg kmem"), shrinkers' idr is protected by CONFIG_MEMCG instead of
CONFIG_MEMCG_KMEM, so it makes no sense to protect shrinker idr replace
with CONFIG_MEMCG_KMEM.

In the CONFIG_MEMCG && CONFIG_SLOB case, shrinker_idr contains only one
shrinker, deferred_split_shrinker.  But it is never actually called, since
idr_replace() is never compiled in due to the wrong #ifdef.  As a result,
deferred_split_shrinker stays in a half-registered state the whole time
and is never called for subordinate mem cgroups.
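
Roughly, the mismatch looks like this (paraphrased sketch, not an exact
copy of mm/vmscan.c):

	#ifdef CONFIG_MEMCG			/* idr lives under CONFIG_MEMCG ... */
	static DEFINE_IDR(shrinker_idr);
	#endif

	void register_shrinker_prepared(struct shrinker *shrinker)
	{
		...
	#ifdef CONFIG_MEMCG_KMEM		/* ... but was only updated under KMEM */
		if (shrinker->flags & SHRINKER_MEMCG_AWARE)
			idr_replace(&shrinker_idr, shrinker, shrinker->id);
	#endif
		...
	}

With CONFIG_MEMCG=y and CONFIG_SLOB=y (hence CONFIG_MEMCG_KMEM=n), the
idr_replace() call is compiled out entirely, which is what the one-line
change below corrects.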

Link: http://lkml.kernel.org/r/1575486978-45249-1-git-send-email-yang.shi@linux.alibaba.com
Fixes: 0a432dcbeb32 ("mm: shrinker: make shrinker not depend on memcg kmem")
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: <stable@vger.kernel.org>	[5.4+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/vmscan.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/mm/vmscan.c~mm-vmscan-protect-shrinker-idr-replace-with-config_memcg
+++ a/mm/vmscan.c
@@ -387,7 +387,7 @@ void register_shrinker_prepared(struct s
 {
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
 	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 		idr_replace(&shrinker_idr, shrinker, shrinker->id);
 #endif
_


* [patch 6/6] lib/Kconfig.debug: fix some messed up configurations
  2019-12-18  4:50 incoming Andrew Morton
                   ` (4 preceding siblings ...)
  2019-12-18  4:51 ` [patch 5/6] mm: vmscan: protect shrinker idr replace with CONFIG_MEMCG Andrew Morton
@ 2019-12-18  4:51 ` Andrew Morton
  5 siblings, 0 replies; 7+ messages in thread
From: Andrew Morton @ 2019-12-18  4:51 UTC (permalink / raw)
  To: akpm, changbin.du, linux-mm, mm-commits, torvalds

From: Changbin Du <changbin.du@gmail.com>
Subject: lib/Kconfig.debug: fix some messed up configurations

Some configuration items were messed up during conflict resolution.  For
example, STRICT_DEVMEM should not be in the testing menu, but kunit should
be.  This patch fixes all of them.

[akpm@linux-foundation.org: coding style fixes]
Link: http://lkml.kernel.org/r/20191209155653.7509-1-changbin.du@gmail.com
Signed-off-by: Changbin Du <changbin.du@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 lib/Kconfig.debug |  100 ++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 49 deletions(-)

--- a/lib/Kconfig.debug~lib-kconfigdebug-fix-some-messed-up-configurations
+++ a/lib/Kconfig.debug
@@ -1483,6 +1483,55 @@ config PROVIDE_OHCI1394_DMA_INIT
 
 	  See Documentation/debugging-via-ohci1394.txt for more information.
 
+source "samples/Kconfig"
+
+config ARCH_HAS_DEVMEM_IS_ALLOWED
+	bool
+
+config STRICT_DEVMEM
+	bool "Filter access to /dev/mem"
+	depends on MMU && DEVMEM
+	depends on ARCH_HAS_DEVMEM_IS_ALLOWED
+	default y if PPC || X86 || ARM64
+	help
+	  If this option is disabled, you allow userspace (root) access to all
+	  of memory, including kernel and userspace memory. Accidental
+	  access to this is obviously disastrous, but specific access can
+	  be used by people debugging the kernel. Note that with PAT support
+	  enabled, even in this case there are restrictions on /dev/mem
+	  use due to the cache aliasing requirements.
+
+	  If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem
+	  file only allows userspace access to PCI space and the BIOS code and
+	  data regions.  This is sufficient for dosemu and X and all common
+	  users of /dev/mem.
+
+	  If in doubt, say Y.
+
+config IO_STRICT_DEVMEM
+	bool "Filter I/O access to /dev/mem"
+	depends on STRICT_DEVMEM
+	help
+	  If this option is disabled, you allow userspace (root) access to all
+	  io-memory regardless of whether a driver is actively using that
+	  range.  Accidental access to this is obviously disastrous, but
+	  specific access can be used by people debugging kernel drivers.
+
+	  If this option is switched on, the /dev/mem file only allows
+	  userspace access to *idle* io-memory ranges (see /proc/iomem) This
+	  may break traditional users of /dev/mem (dosemu, legacy X, etc...)
+	  if the driver using a given range cannot be disabled.
+
+	  If in doubt, say Y.
+
+menu "$(SRCARCH) Debugging"
+
+source "arch/$(SRCARCH)/Kconfig.debug"
+
+endmenu
+
+menu "Kernel Testing and Coverage"
+
 source "lib/kunit/Kconfig"
 
 config NOTIFIER_ERROR_INJECTION
@@ -1643,10 +1692,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 	help
 	  Provide stacktrace filter for fault-injection capabilities
 
-endmenu # "Kernel Testing and Coverage"
-
-menu "Kernel Testing and Coverage"
-
 config ARCH_HAS_KCOV
 	bool
 	help
@@ -2130,52 +2175,7 @@ config MEMTEST
 	        memtest=17, mean do 17 test patterns.
 	  If you are unsure how to answer this question, answer N.
 
-source "samples/Kconfig"
-
-config ARCH_HAS_DEVMEM_IS_ALLOWED
-	bool
-
-config STRICT_DEVMEM
-	bool "Filter access to /dev/mem"
-	depends on MMU && DEVMEM
-	depends on ARCH_HAS_DEVMEM_IS_ALLOWED
-	default y if PPC || X86 || ARM64
-	---help---
-	  If this option is disabled, you allow userspace (root) access to all
-	  of memory, including kernel and userspace memory. Accidental
-	  access to this is obviously disastrous, but specific access can
-	  be used by people debugging the kernel. Note that with PAT support
-	  enabled, even in this case there are restrictions on /dev/mem
-	  use due to the cache aliasing requirements.
-
-	  If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem
-	  file only allows userspace access to PCI space and the BIOS code and
-	  data regions.  This is sufficient for dosemu and X and all common
-	  users of /dev/mem.
-
-	  If in doubt, say Y.
 
-config IO_STRICT_DEVMEM
-	bool "Filter I/O access to /dev/mem"
-	depends on STRICT_DEVMEM
-	---help---
-	  If this option is disabled, you allow userspace (root) access to all
-	  io-memory regardless of whether a driver is actively using that
-	  range.  Accidental access to this is obviously disastrous, but
-	  specific access can be used by people debugging kernel drivers.
-
-	  If this option is switched on, the /dev/mem file only allows
-	  userspace access to *idle* io-memory ranges (see /proc/iomem) This
-	  may break traditional users of /dev/mem (dosemu, legacy X, etc...)
-	  if the driver using a given range cannot be disabled.
-
-	  If in doubt, say Y.
-
-menu "$(SRCARCH) Debugging"
-
-source "arch/$(SRCARCH)/Kconfig.debug"
-
-endmenu
 
 config HYPERV_TESTING
 	bool "Microsoft Hyper-V driver testing"
@@ -2184,4 +2184,6 @@ config HYPERV_TESTING
 	help
 	  Select this option to enable Hyper-V vmbus testing.
 
+endmenu # "Kernel Testing and Coverage"
+
 endmenu # Kernel hacking
_


