Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 1/3] mm: add apply_to_existing_pages helper
@ 2019-12-05 14:04 Daniel Axtens
  2019-12-05 14:04 ` [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow Daniel Axtens
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Daniel Axtens @ 2019-12-05 14:04 UTC (permalink / raw)
  To: kasan-dev, linux-mm, aryabinin, glider, linux-kernel, dvyukov
  Cc: daniel, cai, Daniel Axtens

apply_to_page_range takes an address range, and if any parts of it
are not covered by the existing page table hierarchy, it allocates
memory to fill them in.

In some use cases, this is not what we want - we want to be able to
operate exclusively on PTEs that are already in the tables.

Add apply_to_existing_pages for this. Adjust the walker functions
for apply_to_page_range to take 'create', which switches them between
the old and new modes.

This will be used in KASAN vmalloc.

Signed-off-by: Daniel Axtens <dja@axtens.net>
---
 include/linux/mm.h |   3 ++
 mm/memory.c        | 131 +++++++++++++++++++++++++++++++++------------
 2 files changed, 99 insertions(+), 35 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c97ea3b694e6..f4dba827d76e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2621,6 +2621,9 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
+extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
+				   unsigned long size, pte_fn_t fn,
+				   void *data);
 
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index 606da187d1de..e508ba7e0a19 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2021,26 +2021,34 @@ EXPORT_SYMBOL(vm_iomap_memory);
 
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pte_t *pte;
-	int err;
+	int err = 0;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (mm == &init_mm) ?
-		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(mm, pmd, addr, &ptl);
-	if (!pte)
-		return -ENOMEM;
+	if (create) {
+		pte = (mm == &init_mm) ?
+			pte_alloc_kernel(pmd, addr) :
+			pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		if (!pte)
+			return -ENOMEM;
+	} else {
+		pte = (mm == &init_mm) ?
+			pte_offset_kernel(pmd, addr) :
+			pte_offset_map_lock(mm, pmd, addr, &ptl);
+	}
 
 	BUG_ON(pmd_huge(*pmd));
 
 	arch_enter_lazy_mmu_mode();
 
 	do {
-		err = fn(pte++, addr, data);
-		if (err)
-			break;
+		if (create || !pte_none(*pte)) {
+			err = fn(pte++, addr, data);
+			if (err)
+				break;
+		}
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
@@ -2052,62 +2060,83 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err;
+	int err = 0;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		return -ENOMEM;
+	if (create) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (!pmd)
+			return -ENOMEM;
+	} else {
+		pmd = pmd_offset(pud, addr);
+	}
 	do {
 		next = pmd_addr_end(addr, end);
-		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pmd_none_or_clear_bad(pmd)) {
+			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	pud = pud_alloc(mm, p4d, addr);
-	if (!pud)
-		return -ENOMEM;
+	if (create) {
+		pud = pud_alloc(mm, p4d, addr);
+		if (!pud)
+			return -ENOMEM;
+	} else {
+		pud = pud_offset(p4d, addr);
+	}
 	do {
 		next = pud_addr_end(addr, end);
-		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pud_none_or_clear_bad(pud)) {
+			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	p4d = p4d_alloc(mm, pgd, addr);
-	if (!p4d)
-		return -ENOMEM;
+	if (create) {
+		p4d = p4d_alloc(mm, pgd, addr);
+		if (!p4d)
+			return -ENOMEM;
+	} else {
+		p4d = p4d_offset(pgd, addr);
+	}
 	do {
 		next = p4d_addr_end(addr, end);
-		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !p4d_none_or_clear_bad(p4d)) {
+			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
@@ -2130,7 +2159,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, true);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
@@ -2139,6 +2168,38 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+/*
+ * Scan a region of virtual memory, calling a provided function on
+ * each leaf page table where it exists.
+ *
+ * Unlike apply_to_page_range, this does _not_ fill in page tables
+ * where they are absent.
+ */
+int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
+			    unsigned long size, pte_fn_t fn, void *data)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long end = addr + size;
+	int err = 0;
+
+	if (WARN_ON(addr >= end))
+		return -EINVAL;
+
+	pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_existing_pages);
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically.  Before making any commitment, on those architectures
-- 
2.20.1



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow
  2019-12-05 14:04 [PATCH 1/3] mm: add apply_to_existing_pages helper Daniel Axtens
@ 2019-12-05 14:04 ` Daniel Axtens
  2019-12-06 16:24   ` Andrey Ryabinin
  2019-12-05 14:04 ` [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed Daniel Axtens
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Daniel Axtens @ 2019-12-05 14:04 UTC (permalink / raw)
  To: kasan-dev, linux-mm, aryabinin, glider, linux-kernel, dvyukov
  Cc: daniel, cai, Daniel Axtens

kasan_release_vmalloc uses apply_to_page_range to release vmalloc
shadow. Unfortunately, apply_to_page_range can allocate memory to
fill in page table entries, which is not what we want.

Also, kasan_release_vmalloc is called under free_vmap_area_lock,
so if apply_to_page_range does allocate memory, we get a sleep in
atomic bug:

	BUG: sleeping function called from invalid context at mm/page_alloc.c:4681
	in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15087, name:

	Call Trace:
	 __dump_stack lib/dump_stack.c:77 [inline]
	 dump_stack+0x199/0x216 lib/dump_stack.c:118
	 ___might_sleep.cold.97+0x1f5/0x238 kernel/sched/core.c:6800
	 __might_sleep+0x95/0x190 kernel/sched/core.c:6753
	 prepare_alloc_pages mm/page_alloc.c:4681 [inline]
	 __alloc_pages_nodemask+0x3cd/0x890 mm/page_alloc.c:4730
	 alloc_pages_current+0x10c/0x210 mm/mempolicy.c:2211
	 alloc_pages include/linux/gfp.h:532 [inline]
	 __get_free_pages+0xc/0x40 mm/page_alloc.c:4786
	 __pte_alloc_one_kernel include/asm-generic/pgalloc.h:21 [inline]
	 pte_alloc_one_kernel include/asm-generic/pgalloc.h:33 [inline]
	 __pte_alloc_kernel+0x1d/0x200 mm/memory.c:459
	 apply_to_pte_range mm/memory.c:2031 [inline]
	 apply_to_pmd_range mm/memory.c:2068 [inline]
	 apply_to_pud_range mm/memory.c:2088 [inline]
	 apply_to_p4d_range mm/memory.c:2108 [inline]
	 apply_to_page_range+0x77d/0xa00 mm/memory.c:2133
	 kasan_release_vmalloc+0xa7/0xc0 mm/kasan/common.c:970
	 __purge_vmap_area_lazy+0xcbb/0x1f30 mm/vmalloc.c:1313
	 try_purge_vmap_area_lazy mm/vmalloc.c:1332 [inline]
	 free_vmap_area_noflush+0x2ca/0x390 mm/vmalloc.c:1368
	 free_unmap_vmap_area mm/vmalloc.c:1381 [inline]
	 remove_vm_area+0x1cc/0x230 mm/vmalloc.c:2209
	 vm_remove_mappings mm/vmalloc.c:2236 [inline]
	 __vunmap+0x223/0xa20 mm/vmalloc.c:2299
	 __vfree+0x3f/0xd0 mm/vmalloc.c:2356
	 __vmalloc_area_node mm/vmalloc.c:2507 [inline]
	 __vmalloc_node_range+0x5d5/0x810 mm/vmalloc.c:2547
	 __vmalloc_node mm/vmalloc.c:2607 [inline]
	 __vmalloc_node_flags mm/vmalloc.c:2621 [inline]
	 vzalloc+0x6f/0x80 mm/vmalloc.c:2666
	 alloc_one_pg_vec_page net/packet/af_packet.c:4233 [inline]
	 alloc_pg_vec net/packet/af_packet.c:4258 [inline]
	 packet_set_ring+0xbc0/0x1b50 net/packet/af_packet.c:4342
	 packet_setsockopt+0xed7/0x2d90 net/packet/af_packet.c:3695
	 __sys_setsockopt+0x29b/0x4d0 net/socket.c:2117
	 __do_sys_setsockopt net/socket.c:2133 [inline]
	 __se_sys_setsockopt net/socket.c:2130 [inline]
	 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:2130
	 do_syscall_64+0xfa/0x780 arch/x86/entry/common.c:294
	 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Switch to using the apply_to_existing_pages helper instead, which
won't allocate memory.

Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Daniel Axtens <dja@axtens.net>

---

Andrew, if you want to take this, it replaces
"kasan: Don't allocate page tables in kasan_release_vmalloc()"
---
 mm/kasan/common.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index e04e73603dfc..26fd0c13dd28 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -957,6 +957,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 {
 	void *shadow_start, *shadow_end;
 	unsigned long region_start, region_end;
+	unsigned long size;
 
 	region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
 	region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
@@ -979,9 +980,10 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
 	shadow_end = kasan_mem_to_shadow((void *)region_end);
 
 	if (shadow_end > shadow_start) {
-		apply_to_page_range(&init_mm, (unsigned long)shadow_start,
-				    (unsigned long)(shadow_end - shadow_start),
-				    kasan_depopulate_vmalloc_pte, NULL);
+		size = shadow_end - shadow_start;
+		apply_to_existing_pages(&init_mm, (unsigned long)shadow_start,
+					size, kasan_depopulate_vmalloc_pte,
+					NULL);
 		flush_tlb_kernel_range((unsigned long)shadow_start,
 				       (unsigned long)shadow_end);
 	}
-- 
2.20.1



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed
  2019-12-05 14:04 [PATCH 1/3] mm: add apply_to_existing_pages helper Daniel Axtens
  2019-12-05 14:04 ` [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow Daniel Axtens
@ 2019-12-05 14:04 ` Daniel Axtens
  2019-12-06 16:24   ` Andrey Ryabinin
  2019-12-06 16:24 ` [PATCH 1/3] mm: add apply_to_existing_pages helper Andrey Ryabinin
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Daniel Axtens @ 2019-12-05 14:04 UTC (permalink / raw)
  To: kasan-dev, linux-mm, aryabinin, glider, linux-kernel, dvyukov
  Cc: daniel, cai, Daniel Axtens, syzbot+82e323920b78d54aaed5,
	syzbot+59b7daa4315e07a994f1

syzkaller and the fault injector showed that I was wrong to assume
that we could ignore percpu shadow allocation failures.

Handle failures properly. Merge all the allocated areas back into the free
list and release the shadow, then clean up and return NULL. The shadow
is released unconditionally, which relies upon the fact that the release
function is able to tolerate pages not being present.

Also clean up shadows in the recovery path - currently they are not
released, which leaks a bit of memory.

Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Reported-by: syzbot+82e323920b78d54aaed5@syzkaller.appspotmail.com
Reported-by: syzbot+59b7daa4315e07a994f1@syzkaller.appspotmail.com
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Daniel Axtens <dja@axtens.net>
---
 mm/vmalloc.c | 48 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37af94b6cf30..fa5688093a88 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3291,7 +3291,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	struct vmap_area **vas, *va;
 	struct vm_struct **vms;
 	int area, area2, last_area, term_area;
-	unsigned long base, start, size, end, last_end;
+	unsigned long base, start, size, end, last_end, orig_start, orig_end;
 	bool purged = false;
 	enum fit_type type;
 
@@ -3421,6 +3421,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 
 	spin_unlock(&free_vmap_area_lock);
 
+	/* populate the kasan shadow space */
+	for (area = 0; area < nr_vms; area++) {
+		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+			goto err_free_shadow;
+
+		kasan_unpoison_vmalloc((void *)vas[area]->va_start,
+				       sizes[area]);
+	}
+
 	/* insert all vm's */
 	spin_lock(&vmap_area_lock);
 	for (area = 0; area < nr_vms; area++) {
@@ -3431,13 +3440,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	}
 	spin_unlock(&vmap_area_lock);
 
-	/* populate the shadow space outside of the lock */
-	for (area = 0; area < nr_vms; area++) {
-		/* assume success here */
-		kasan_populate_vmalloc(vas[area]->va_start, sizes[area]);
-		kasan_unpoison_vmalloc((void *)vms[area]->addr, sizes[area]);
-	}
-
 	kfree(vas);
 	return vms;
 
@@ -3449,8 +3451,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	 * and when pcpu_get_vm_areas() is success.
 	 */
 	while (area--) {
-		merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-				       &free_vmap_area_list);
+		orig_start = vas[area]->va_start;
+		orig_end = vas[area]->va_end;
+		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+					    &free_vmap_area_list);
+		kasan_release_vmalloc(orig_start, orig_end,
+				      va->va_start, va->va_end);
 		vas[area] = NULL;
 	}
 
@@ -3485,6 +3491,28 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	kfree(vas);
 	kfree(vms);
 	return NULL;
+
+err_free_shadow:
+	spin_lock(&free_vmap_area_lock);
+	/*
+	 * We release all the vmalloc shadows, even the ones for regions that
+	 * hadn't been successfully added. This relies on kasan_release_vmalloc
+	 * being able to tolerate this case.
+	 */
+	for (area = 0; area < nr_vms; area++) {
+		orig_start = vas[area]->va_start;
+		orig_end = vas[area]->va_end;
+		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+					    &free_vmap_area_list);
+		kasan_release_vmalloc(orig_start, orig_end,
+				      va->va_start, va->va_end);
+		vas[area] = NULL;
+		kfree(vms[area]);
+	}
+	spin_unlock(&free_vmap_area_lock);
+	kfree(vas);
+	kfree(vms);
+	return NULL;
 }
 
 /**
-- 
2.20.1



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] mm: add apply_to_existing_pages helper
  2019-12-05 14:04 [PATCH 1/3] mm: add apply_to_existing_pages helper Daniel Axtens
  2019-12-05 14:04 ` [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow Daniel Axtens
  2019-12-05 14:04 ` [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed Daniel Axtens
@ 2019-12-06 16:24 ` Andrey Ryabinin
  2019-12-07  0:37 ` Andrew Morton
  2019-12-07  0:38 ` Andrew Morton
  4 siblings, 0 replies; 11+ messages in thread
From: Andrey Ryabinin @ 2019-12-06 16:24 UTC (permalink / raw)
  To: Daniel Axtens, kasan-dev, linux-mm, glider, linux-kernel, dvyukov
  Cc: daniel, cai, Andrew Morton



On 12/5/19 5:04 PM, Daniel Axtens wrote:
> apply_to_page_range takes an address range, and if any parts of it
> are not covered by the existing page table hierarchy, it allocates
> memory to fill them in.
> 
> In some use cases, this is not what we want - we want to be able to
> operate exclusively on PTEs that are already in the tables.
> 
> Add apply_to_existing_pages for this. Adjust the walker functions
> for apply_to_page_range to take 'create', which switches them between
> the old and new modes.
> 
> This will be used in KASAN vmalloc.
> 
> Signed-off-by: Daniel Axtens <dja@axtens.net>

Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow
  2019-12-05 14:04 ` [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow Daniel Axtens
@ 2019-12-06 16:24   ` Andrey Ryabinin
  0 siblings, 0 replies; 11+ messages in thread
From: Andrey Ryabinin @ 2019-12-06 16:24 UTC (permalink / raw)
  To: Daniel Axtens, kasan-dev, linux-mm, glider, linux-kernel, dvyukov
  Cc: daniel, cai, Andrew Morton



On 12/5/19 5:04 PM, Daniel Axtens wrote:
> kasan_release_vmalloc uses apply_to_page_range to release vmalloc
> shadow. Unfortunately, apply_to_page_range can allocate memory to
> fill in page table entries, which is not what we want.
> 
> Also, kasan_release_vmalloc is called under free_vmap_area_lock,
> so if apply_to_page_range does allocate memory, we get a sleep in
> atomic bug:
> 
> 	BUG: sleeping function called from invalid context at mm/page_alloc.c:4681
> 	in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15087, name:
> 
> 	Call Trace:
> 	 __dump_stack lib/dump_stack.c:77 [inline]
> 	 dump_stack+0x199/0x216 lib/dump_stack.c:118
> 	 ___might_sleep.cold.97+0x1f5/0x238 kernel/sched/core.c:6800
> 	 __might_sleep+0x95/0x190 kernel/sched/core.c:6753
> 	 prepare_alloc_pages mm/page_alloc.c:4681 [inline]
> 	 __alloc_pages_nodemask+0x3cd/0x890 mm/page_alloc.c:4730
> 	 alloc_pages_current+0x10c/0x210 mm/mempolicy.c:2211
> 	 alloc_pages include/linux/gfp.h:532 [inline]
> 	 __get_free_pages+0xc/0x40 mm/page_alloc.c:4786
> 	 __pte_alloc_one_kernel include/asm-generic/pgalloc.h:21 [inline]
> 	 pte_alloc_one_kernel include/asm-generic/pgalloc.h:33 [inline]
> 	 __pte_alloc_kernel+0x1d/0x200 mm/memory.c:459
> 	 apply_to_pte_range mm/memory.c:2031 [inline]
> 	 apply_to_pmd_range mm/memory.c:2068 [inline]
> 	 apply_to_pud_range mm/memory.c:2088 [inline]
> 	 apply_to_p4d_range mm/memory.c:2108 [inline]
> 	 apply_to_page_range+0x77d/0xa00 mm/memory.c:2133
> 	 kasan_release_vmalloc+0xa7/0xc0 mm/kasan/common.c:970
> 	 __purge_vmap_area_lazy+0xcbb/0x1f30 mm/vmalloc.c:1313
> 	 try_purge_vmap_area_lazy mm/vmalloc.c:1332 [inline]
> 	 free_vmap_area_noflush+0x2ca/0x390 mm/vmalloc.c:1368
> 	 free_unmap_vmap_area mm/vmalloc.c:1381 [inline]
> 	 remove_vm_area+0x1cc/0x230 mm/vmalloc.c:2209
> 	 vm_remove_mappings mm/vmalloc.c:2236 [inline]
> 	 __vunmap+0x223/0xa20 mm/vmalloc.c:2299
> 	 __vfree+0x3f/0xd0 mm/vmalloc.c:2356
> 	 __vmalloc_area_node mm/vmalloc.c:2507 [inline]
> 	 __vmalloc_node_range+0x5d5/0x810 mm/vmalloc.c:2547
> 	 __vmalloc_node mm/vmalloc.c:2607 [inline]
> 	 __vmalloc_node_flags mm/vmalloc.c:2621 [inline]
> 	 vzalloc+0x6f/0x80 mm/vmalloc.c:2666
> 	 alloc_one_pg_vec_page net/packet/af_packet.c:4233 [inline]
> 	 alloc_pg_vec net/packet/af_packet.c:4258 [inline]
> 	 packet_set_ring+0xbc0/0x1b50 net/packet/af_packet.c:4342
> 	 packet_setsockopt+0xed7/0x2d90 net/packet/af_packet.c:3695
> 	 __sys_setsockopt+0x29b/0x4d0 net/socket.c:2117
> 	 __do_sys_setsockopt net/socket.c:2133 [inline]
> 	 __se_sys_setsockopt net/socket.c:2130 [inline]
> 	 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:2130
> 	 do_syscall_64+0xfa/0x780 arch/x86/entry/common.c:294
> 	 entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> Switch to using the apply_to_existing_pages helper instead, which
> won't allocate memory.
> 
> Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
> Reported-by: Dmitry Vyukov <dvyukov@google.com>
> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Signed-off-by: Daniel Axtens <dja@axtens.net>
> 
> ---
> 
> Andrew, if you want to take this, it replaces
> "kasan: Don't allocate page tables in kasan_release_vmalloc()"
> ---
>  mm/kasan/common.c | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 


Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed
  2019-12-05 14:04 ` [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed Daniel Axtens
@ 2019-12-06 16:24   ` Andrey Ryabinin
  0 siblings, 0 replies; 11+ messages in thread
From: Andrey Ryabinin @ 2019-12-06 16:24 UTC (permalink / raw)
  To: Daniel Axtens, kasan-dev, linux-mm, glider, linux-kernel, dvyukov
  Cc: daniel, cai, syzbot+82e323920b78d54aaed5,
	syzbot+59b7daa4315e07a994f1, Andrew Morton



On 12/5/19 5:04 PM, Daniel Axtens wrote:
> syzkaller and the fault injector showed that I was wrong to assume
> that we could ignore percpu shadow allocation failures.
> 
> Handle failures properly. Merge all the allocated areas back into the free
> list and release the shadow, then clean up and return NULL. The shadow
> is released unconditionally, which relies upon the fact that the release
> function is able to tolerate pages not being present.
> 
> Also clean up shadows in the recovery path - currently they are not
> released, which leaks a bit of memory.
> 
> Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
> Reported-by: syzbot+82e323920b78d54aaed5@syzkaller.appspotmail.com
> Reported-by: syzbot+59b7daa4315e07a994f1@syzkaller.appspotmail.com
> Cc: Dmitry Vyukov <dvyukov@google.com>
> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Signed-off-by: Daniel Axtens <dja@axtens.net>
> ---

Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] mm: add apply_to_existing_pages helper
  2019-12-05 14:04 [PATCH 1/3] mm: add apply_to_existing_pages helper Daniel Axtens
                   ` (2 preceding siblings ...)
  2019-12-06 16:24 ` [PATCH 1/3] mm: add apply_to_existing_pages helper Andrey Ryabinin
@ 2019-12-07  0:37 ` Andrew Morton
  2019-12-07  0:38 ` Andrew Morton
  4 siblings, 0 replies; 11+ messages in thread
From: Andrew Morton @ 2019-12-07  0:37 UTC (permalink / raw)
  To: Daniel Axtens
  Cc: kasan-dev, linux-mm, aryabinin, glider, linux-kernel, dvyukov,
	daniel, cai

On Fri,  6 Dec 2019 01:04:05 +1100 Daniel Axtens <dja@axtens.net> wrote:

> +/*
> + * Scan a region of virtual memory, calling a provided function on
> + * each leaf page table where it exists.
> + *
> + * Unlike apply_to_page_range, this does _not_ fill in page tables
> + * where they are absent.
> + */
> +int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
> +			    unsigned long size, pte_fn_t fn, void *data)
> +{
> +	pgd_t *pgd;
> +	unsigned long next;
> +	unsigned long end = addr + size;
> +	int err = 0;
> +
> +	if (WARN_ON(addr >= end))
> +		return -EINVAL;
> +
> +	pgd = pgd_offset(mm, addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (pgd_none_or_clear_bad(pgd))
> +			continue;
> +		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
> +		if (err)
> +			break;
> +	} while (pgd++, addr = next, addr != end);
> +
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(apply_to_existing_pages);

This is almost identical to apply_to_page_range() and cries out for
some deduplication.  This?

--- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix
+++ a/mm/memory.c
@@ -2141,12 +2141,9 @@ static int apply_to_p4d_range(struct mm_
 	return err;
 }
 
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
- */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn,
+				 void *data, bool create)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -2159,13 +2156,25 @@ int apply_to_page_range(struct mm_struct
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, true);
+		if (!create && pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
+{
+	return __apply_to_page_range(mm, addr, size, fn, data, true);
+}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
@@ -2178,25 +2187,7 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
 int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
 			    unsigned long size, pte_fn_t fn, void *data)
 {
-	pgd_t *pgd;
-	unsigned long next;
-	unsigned long end = addr + size;
-	int err = 0;
-
-	if (WARN_ON(addr >= end))
-		return -EINVAL;
-
-	pgd = pgd_offset(mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
-		if (err)
-			break;
-	} while (pgd++, addr = next, addr != end);
-
-	return err;
+	return __apply_to_page_range(mm, addr, size, fn, data, false);
 }
 EXPORT_SYMBOL_GPL(apply_to_existing_pages);
 
_




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] mm: add apply_to_existing_pages helper
  2019-12-05 14:04 [PATCH 1/3] mm: add apply_to_existing_pages helper Daniel Axtens
                   ` (3 preceding siblings ...)
  2019-12-07  0:37 ` Andrew Morton
@ 2019-12-07  0:38 ` Andrew Morton
  2019-12-07  2:16   ` Daniel Axtens
  2019-12-09  7:34   ` Christoph Hellwig
  4 siblings, 2 replies; 11+ messages in thread
From: Andrew Morton @ 2019-12-07  0:38 UTC (permalink / raw)
  To: Daniel Axtens
  Cc: kasan-dev, linux-mm, aryabinin, glider, linux-kernel, dvyukov,
	daniel, cai

On Fri,  6 Dec 2019 01:04:05 +1100 Daniel Axtens <dja@axtens.net> wrote:

> apply_to_page_range takes an address range, and if any parts of it
> are not covered by the existing page table hierarchy, it allocates
> memory to fill them in.
> 
> In some use cases, this is not what we want - we want to be able to
> operate exclusively on PTEs that are already in the tables.
> 
> Add apply_to_existing_pages for this. Adjust the walker functions
> for apply_to_page_range to take 'create', which switches them between
> the old and new modes.

Wouldn't apply_to_existing_page_range() be a better name?

--- a/include/linux/mm.h~mm-add-apply_to_existing_pages-helper-fix-fix
+++ a/include/linux/mm.h
@@ -2621,9 +2621,9 @@ static inline int vm_fault_to_errno(vm_f
 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
-extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
-				   unsigned long size, pte_fn_t fn,
-				   void *data);
+extern int apply_to_existing_page_range(struct mm_struct *mm,
+				   unsigned long address, unsigned long size,
+				   pte_fn_t fn, void *data);
 
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
--- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix-fix
+++ a/mm/memory.c
@@ -2184,12 +2184,12 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
  * Unlike apply_to_page_range, this does _not_ fill in page tables
  * where they are absent.
  */
-int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
-			    unsigned long size, pte_fn_t fn, void *data)
+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn, void *data)
 {
 	return __apply_to_page_range(mm, addr, size, fn, data, false);
 }
-EXPORT_SYMBOL_GPL(apply_to_existing_pages);
+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
 
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
_



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] mm: add apply_to_existing_pages helper
  2019-12-07  0:38 ` Andrew Morton
@ 2019-12-07  2:16   ` Daniel Axtens
  2019-12-09  7:34   ` Christoph Hellwig
  1 sibling, 0 replies; 11+ messages in thread
From: Daniel Axtens @ 2019-12-07  2:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: kasan-dev, linux-mm, aryabinin, glider, linux-kernel, dvyukov,
	daniel, cai

>
> Wouldn't apply_to_existing_page_range() be a better name?

I agree with both of those fixups, thanks!

Regards,
Daniel

>
> --- a/include/linux/mm.h~mm-add-apply_to_existing_pages-helper-fix-fix
> +++ a/include/linux/mm.h
> @@ -2621,9 +2621,9 @@ static inline int vm_fault_to_errno(vm_f
>  typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
>  extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
>  			       unsigned long size, pte_fn_t fn, void *data);
> -extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
> -				   unsigned long size, pte_fn_t fn,
> -				   void *data);
> +extern int apply_to_existing_page_range(struct mm_struct *mm,
> +				   unsigned long address, unsigned long size,
> +				   pte_fn_t fn, void *data);
>  
>  #ifdef CONFIG_PAGE_POISONING
>  extern bool page_poisoning_enabled(void);
> --- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix-fix
> +++ a/mm/memory.c
> @@ -2184,12 +2184,12 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
>   * Unlike apply_to_page_range, this does _not_ fill in page tables
>   * where they are absent.
>   */
> -int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
> -			    unsigned long size, pte_fn_t fn, void *data)
> +int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
> +				 unsigned long size, pte_fn_t fn, void *data)
>  {
>  	return __apply_to_page_range(mm, addr, size, fn, data, false);
>  }
> -EXPORT_SYMBOL_GPL(apply_to_existing_pages);
> +EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
>  
>  /*
>   * handle_pte_fault chooses page fault handler according to an entry which was
> _


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] mm: add apply_to_existing_pages helper
  2019-12-07  0:38 ` Andrew Morton
  2019-12-07  2:16   ` Daniel Axtens
@ 2019-12-09  7:34   ` Christoph Hellwig
  2019-12-11  1:31     ` Andrew Morton
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2019-12-09  7:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Daniel Axtens, kasan-dev, linux-mm, aryabinin, glider,
	linux-kernel, dvyukov, daniel, cai

Or a flags argument with descriptive flags to the existing function?
These magic bool arguments don't scale..


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] mm: add apply_to_existing_pages helper
  2019-12-09  7:34   ` Christoph Hellwig
@ 2019-12-11  1:31     ` Andrew Morton
  0 siblings, 0 replies; 11+ messages in thread
From: Andrew Morton @ 2019-12-11  1:31 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Daniel Axtens, kasan-dev, linux-mm, aryabinin, glider,
	linux-kernel, dvyukov, daniel, cai

On Sun, 8 Dec 2019 23:34:58 -0800 Christoph Hellwig <hch@infradead.org> wrote:

> Or a flags argument with descriptive flags to the existing function?
> These magic bool arguments don't scale..

True.  But it's easy enough to do s/bool create/enum mode/ in the
future should the need arise.  For now, the code is clearer.



^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, back to index

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-05 14:04 [PATCH 1/3] mm: add apply_to_existing_pages helper Daniel Axtens
2019-12-05 14:04 ` [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow Daniel Axtens
2019-12-06 16:24   ` Andrey Ryabinin
2019-12-05 14:04 ` [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed Daniel Axtens
2019-12-06 16:24   ` Andrey Ryabinin
2019-12-06 16:24 ` [PATCH 1/3] mm: add apply_to_existing_pages helper Andrey Ryabinin
2019-12-07  0:37 ` Andrew Morton
2019-12-07  0:38 ` Andrew Morton
2019-12-07  2:16   ` Daniel Axtens
2019-12-09  7:34   ` Christoph Hellwig
2019-12-11  1:31     ` Andrew Morton

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git