* [RFC 0/5] vmalloc_exec for modules and BPF programs
@ 2022-08-18 22:42 Song Liu
  2022-08-18 22:42 ` [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec Song Liu
                   ` (5 more replies)
  0 siblings, 6 replies; 20+ messages in thread
From: Song Liu @ 2022-08-18 22:42 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: akpm, x86, peterz, hch, kernel-team, rick.p.edgecombe, mcgrof,
	dave.hansen, Song Liu

This set is a prototype that allows dynamic kernel text (modules, bpf
programs, various trampolines, etc.) to share huge pages. The idea is
similar to Peter's suggestion in [1]. Please refer to each patch for
more details.

The ultimate goal is to only host kernel text in 2MB pages (for x86_64).

Please share your comments on this.

Thanks!

[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@hirez.programming.kicks-ass.net/

Song Liu (5):
  vmalloc: introduce vmalloc_exec and vfree_exec
  bpf: use vmalloc_exec
  modules, x86: use vmalloc_exec for module core
  vmalloc_exec: share a huge page with kernel text
  vmalloc: vfree_exec: free unused vm_struct

 arch/x86/Kconfig              |   1 +
 arch/x86/kernel/alternative.c |  30 ++++-
 arch/x86/kernel/module.c      |   1 +
 arch/x86/mm/init_64.c         |   3 +-
 include/linux/vmalloc.h       |  16 +--
 kernel/bpf/core.c             | 155 ++------------------------
 kernel/module/main.c          |  23 ++--
 kernel/module/strict_rwx.c    |   3 -
 kernel/trace/ftrace.c         |   3 +-
 mm/nommu.c                    |   7 ++
 mm/vmalloc.c                  | 200 +++++++++++++++++++++++++++++-----
 11 files changed, 239 insertions(+), 203 deletions(-)

--
2.30.2



* [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec
  2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
@ 2022-08-18 22:42 ` Song Liu
  2022-10-06 23:15   ` Luis Chamberlain
  2022-08-18 22:42 ` [RFC 2/5] bpf: use vmalloc_exec Song Liu
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: Song Liu @ 2022-08-18 22:42 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: akpm, x86, peterz, hch, kernel-team, rick.p.edgecombe, mcgrof,
	dave.hansen, Song Liu

This is a prototype to host dynamic kernel text (modules, BPF programs,
etc.) with huge pages. This is similar to the proposal by Peter in [1].

A new tree of vmap_area, free_text_area_* tree, is introduced in addition
to free_vmap_area_* and vmap_area_*. vmalloc_exec allocates pages from
free_text_area_*. When there isn't enough space left in free_text_area_*,
new PMD_SIZE pages are allocated from free_vmap_area_* and added to
free_text_area_*.

The new tree allows separate handling of allocations smaller than PAGE_SIZE,
as the current vmalloc code mostly assumes PAGE_SIZE-aligned allocations. This
version of vmalloc_exec can handle BPF programs, which use 64-byte-aligned
allocations, and modules, which use PAGE_SIZE-aligned allocations.

[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@hirez.programming.kicks-ass.net/
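
Below is a minimal usage sketch of the new interface (illustrative only, not
part of the patch; the function and variable names are made up, and the error
check covers both the ERR_PTR convention used here and the NULL return of the
nommu stub):

	#include <linux/err.h>
	#include <linux/vmalloc.h>

	static void *chunk;

	static int example_init(void)
	{
		/* 256 bytes, 64-byte aligned, carved out of a shared RO+X
		 * huge page; the mapping is already read-only and executable
		 * when this returns
		 */
		chunk = vmalloc_exec(256, 64);
		if (IS_ERR_OR_NULL(chunk))
			return -ENOMEM;

		/* the mapping is not writable, so the caller populates it
		 * via text_poke()/bpf_arch_text_copy() style helpers
		 */
		return 0;
	}

	static void example_exit(void)
	{
		/* return the range to the free_text_area_* tree */
		vfree_exec(chunk);
	}
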
---
 include/linux/vmalloc.h |   4 +
 mm/nommu.c              |   7 ++
 mm/vmalloc.c            | 163 +++++++++++++++++++++++++++++++++-------
 3 files changed, 147 insertions(+), 27 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..691c02ffe3db 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,8 @@ struct notifier_block;		/* in notifier.h */
 #define VM_DEFER_KMEMLEAK	0
 #endif
 
+#define VM_KERNEL_EXEC		0x00001000	/* kernel text mapped as RO+X */
+
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -154,6 +156,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_exec(unsigned long size, unsigned long align) __alloc_size(1);
+void vfree_exec(const void *addr);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..11e0fc996006 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
 }
 EXPORT_SYMBOL(vm_map_pages_zero);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	return NULL;
+}
+
+void vfree_exec(const void *addr) { }
+
 /*
  *  sys_brk() for the most part doesn't need the global kernel
  *  lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index effd1ff6a4b4..472287e71bf1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -753,6 +753,10 @@ static LIST_HEAD(free_vmap_area_list);
  */
 static struct rb_root free_vmap_area_root = RB_ROOT;
 
+static DEFINE_SPINLOCK(free_text_area_lock);
+static LIST_HEAD(free_text_area_list);
+static struct rb_root free_text_area_root = RB_ROOT;
+
 /*
  * Preload a CPU with one object for "no edge" split case. The
  * aim is to get rid of allocations from the atomic context, thus
@@ -814,9 +818,11 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
 	return va;
 }
 
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_node *root)
 {
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n;
+
+	n = root ? root : vmap_area_root.rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -926,7 +932,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
 
 	/* Insert to the rb-tree */
 	rb_link_node(&va->rb_node, parent, link);
-	if (root == &free_vmap_area_root) {
+	if (root == &free_vmap_area_root || root == &free_text_area_root) {
 		/*
 		 * Some explanation here. Just perform simple insertion
 		 * to the tree. We do not set va->subtree_max_size to
@@ -955,7 +961,7 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
 	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
 		return;
 
-	if (root == &free_vmap_area_root)
+	if (root == &free_vmap_area_root || root == &free_text_area_root)
 		rb_erase_augmented(&va->rb_node,
 			root, &free_vmap_area_rb_augment_cb);
 	else
@@ -1198,15 +1204,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
  * overhead.
  */
 static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
-	unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_node *root, unsigned long size,
+       unsigned long align, unsigned long vstart, bool adjust_search_size)
 {
 	struct vmap_area *va;
 	struct rb_node *node;
 	unsigned long length;
 
 	/* Start from the root. */
-	node = free_vmap_area_root.rb_node;
+	node = root;
 
 	/* Adjust the search size for alignment overhead. */
 	length = adjust_search_size ? size + align - 1 : size;
@@ -1290,8 +1296,9 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
 	get_random_bytes(&rnd, sizeof(rnd));
 	vstart = VMALLOC_START + rnd;
 
-	va_1 = find_vmap_lowest_match(size, align, vstart, false);
-	va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+	va_1 = find_vmap_lowest_match(free_vmap_area_root.rb_node, size,
+				      align, vstart, false);
+	va_2 = find_vmap_lowest_linear_match(root, size, align, vstart);
 
 	if (va_1 != va_2)
 		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1334,7 +1341,8 @@ classify_va_fit_type(struct vmap_area *va,
 }
 
 static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+	struct vmap_area *va,
 	unsigned long nva_start_addr, unsigned long size,
 	enum fit_type type)
 {
@@ -1348,7 +1356,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		 * V      NVA      V
 		 * |---------------|
 		 */
-		unlink_va(va, &free_vmap_area_root);
+		unlink_va(va, root);
 		kmem_cache_free(vmap_area_cachep, va);
 	} else if (type == LE_FIT_TYPE) {
 		/*
@@ -1426,8 +1434,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		augment_tree_propagate_from(va);
 
 		if (lva)	/* type == NE_FIT_TYPE */
-			insert_vmap_area_augment(lva, &va->rb_node,
-				&free_vmap_area_root, &free_vmap_area_list);
+			insert_vmap_area_augment(lva, &va->rb_node, root, head);
 	}
 
 	return 0;
@@ -1459,7 +1466,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
 		adjust_search_size = false;
 
-	va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+	va = find_vmap_lowest_match(free_vmap_area_root.rb_node,
+				    size, align, vstart, adjust_search_size);
 	if (unlikely(!va))
 		return vend;
 
@@ -1478,7 +1486,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 		return vend;
 
 	/* Update the free vmap_area. */
-	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+	ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+				    va, nva_start_addr, size, type);
 	if (ret)
 		return vend;
 
@@ -1539,7 +1548,7 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 static struct vmap_area *alloc_vmap_area(unsigned long size,
 				unsigned long align,
 				unsigned long vstart, unsigned long vend,
-				int node, gfp_t gfp_mask)
+				int node, unsigned long vm_flags, gfp_t gfp_mask)
 {
 	struct vmap_area *va;
 	unsigned long freed;
@@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->va_end = addr + size;
 	va->vm = NULL;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	if (vm_flags & VM_KERNEL_EXEC) {
+		spin_lock(&free_text_area_lock);
+		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+		/* update subtree_max_size now as we need this soon */
+		augment_tree_propagate_from(va);
+		spin_unlock(&free_text_area_lock);
+	} else {
+		spin_lock(&vmap_area_lock);
+		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+		spin_unlock(&vmap_area_lock);
+	}
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1803,7 +1820,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 	struct vmap_area *va;
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr);
+	va = __find_vmap_area(addr, vmap_area_root.rb_node);
 	spin_unlock(&vmap_area_lock);
 
 	return va;
@@ -1912,8 +1929,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 		return ERR_PTR(-ENOMEM);
 
 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
-					VMALLOC_START, VMALLOC_END,
-					node, gfp_mask);
+				     VMALLOC_START, VMALLOC_END,
+				     node, 0, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(vb);
 		return ERR_CAST(va);
@@ -2209,8 +2226,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
 		addr = (unsigned long)mem;
 	} else {
 		struct vmap_area *va;
-		va = alloc_vmap_area(size, PAGE_SIZE,
-				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+		va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
+				     node, 0, GFP_KERNEL);
 		if (IS_ERR(va))
 			return NULL;
 
@@ -2450,7 +2467,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (!(flags & VM_NO_GUARD))
 		size += PAGE_SIZE;
 
-	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	va = alloc_vmap_area(size, align, start, end, node, flags, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(area);
 		return NULL;
@@ -2546,7 +2563,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 	might_sleep();
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area((unsigned long)addr);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
 	if (va && va->vm) {
 		struct vm_struct *vm = va->vm;
 
@@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	struct vmap_area *va, *tmp;
+	unsigned long addr;
+	enum fit_type type;
+	int ret;
+
+	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+again:
+	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
+				     size, align, 1, false);
+
+	if (!tmp) {
+		unsigned long alloc_size;
+		void *ptr;
+
+		spin_unlock(&free_text_area_lock);
+
+		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
+					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
+					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+					   NUMA_NO_NODE, __builtin_return_address(0));
+		if (unlikely(!ptr)) {
+			ret = -ENOMEM;
+			goto err_out;
+		}
+		memset(ptr, 0, alloc_size);
+		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+
+		goto again;
+	}
+
+	addr = roundup(tmp->va_start, align);
+	type = classify_va_fit_type(tmp, addr, size);
+	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
+		addr = -ENOMEM;
+		goto err_out;
+	}
+
+	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+				    tmp, addr, size, type);
+	if (ret) {
+		addr = ret;
+		goto err_out;
+	}
+	spin_unlock(&free_text_area_lock);
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->vm = tmp->vm;
+
+	spin_lock(&vmap_area_lock);
+	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+	spin_unlock(&vmap_area_lock);
+
+	return (void *)addr;
+
+err_out:
+	spin_unlock(&free_text_area_lock);
+	return ERR_PTR(ret);
+}
+
+void vfree_exec(const void *addr)
+{
+	struct vmap_area *va;
+
+	might_sleep();
+
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
+	if (WARN_ON_ONCE(!va)) {
+		spin_unlock(&vmap_area_lock);
+		return;
+	}
+
+	unlink_va(va, &vmap_area_root);
+	spin_unlock(&vmap_area_lock);
+
+	spin_lock(&free_text_area_lock);
+	merge_or_add_vmap_area_augment(va,
+		&free_text_area_root, &free_text_area_list);
+	spin_unlock(&free_text_area_lock);
+	/* TODO: when the whole vm_struct is not in use, free it */
+}
+
 /**
  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
  * @size:      allocation size
@@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			/* It is a BUG(), but trigger recovery instead. */
 			goto recovery;
 
-		ret = adjust_va_to_fit_type(va, start, size, type);
+		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+					    va, start, size, type);
 		if (unlikely(ret))
 			goto recovery;
 
-- 
2.30.2




* [RFC 2/5] bpf: use vmalloc_exec
  2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
  2022-08-18 22:42 ` [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec Song Liu
@ 2022-08-18 22:42 ` Song Liu
  2022-08-18 22:42 ` [RFC 3/5] modules, x86: use vmalloc_exec for module core Song Liu
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-08-18 22:42 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: akpm, x86, peterz, hch, kernel-team, rick.p.edgecombe, mcgrof,
	dave.hansen, Song Liu

Use vmalloc_exec and vfree_exec instead of bpf_prog_pack_[alloc|free].
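
The 64-byte chunk granularity of bpf_prog_pack is kept as BPF_PROG_EXEC_ALIGN,
so bpf_jit_binary_pack_hdr() can still recover the header by masking the JITed
program address. A rough sketch of that arithmetic (the address below is
hypothetical):

	#define BPF_PROG_EXEC_ALIGN	64
	#define BPF_PROG_EXEC_MASK	(~(BPF_PROG_EXEC_ALIGN - 1))

	static unsigned long hdr_addr(unsigned long bpf_func)
	{
		/* e.g. bpf_func == 0xffffffffa0001234 -> 0xffffffffa0001200,
		 * which is where struct bpf_binary_header was placed, since
		 * the image always starts within the first 64-byte chunk
		 */
		return bpf_func & BPF_PROG_EXEC_MASK;
	}
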
---
 kernel/bpf/core.c | 155 +++-------------------------------------------
 1 file changed, 10 insertions(+), 145 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1e10d088dbb..834cce7e1ef2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -806,144 +806,6 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 	return slot;
 }
 
-/*
- * BPF program pack allocator.
- *
- * Most BPF programs are pretty small. Allocating a hole page for each
- * program is sometime a waste. Many small bpf program also adds pressure
- * to instruction TLB. To solve this issue, we introduce a BPF program pack
- * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
- * to host BPF programs.
- */
-#define BPF_PROG_CHUNK_SHIFT	6
-#define BPF_PROG_CHUNK_SIZE	(1 << BPF_PROG_CHUNK_SHIFT)
-#define BPF_PROG_CHUNK_MASK	(~(BPF_PROG_CHUNK_SIZE - 1))
-
-struct bpf_prog_pack {
-	struct list_head list;
-	void *ptr;
-	unsigned long bitmap[];
-};
-
-#define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
-
-static DEFINE_MUTEX(pack_mutex);
-static LIST_HEAD(pack_list);
-
-/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
- * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
- */
-#ifdef PMD_SIZE
-#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
-#else
-#define BPF_PROG_PACK_SIZE PAGE_SIZE
-#endif
-
-#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
-
-static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
-{
-	struct bpf_prog_pack *pack;
-
-	pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
-		       GFP_KERNEL);
-	if (!pack)
-		return NULL;
-	pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
-	if (!pack->ptr) {
-		kfree(pack);
-		return NULL;
-	}
-	bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
-	bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
-	list_add_tail(&pack->list, &pack_list);
-
-	set_vm_flush_reset_perms(pack->ptr);
-	set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
-	set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
-	return pack;
-}
-
-static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
-{
-	unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
-	struct bpf_prog_pack *pack;
-	unsigned long pos;
-	void *ptr = NULL;
-
-	mutex_lock(&pack_mutex);
-	if (size > BPF_PROG_PACK_SIZE) {
-		size = round_up(size, PAGE_SIZE);
-		ptr = module_alloc(size);
-		if (ptr) {
-			bpf_fill_ill_insns(ptr, size);
-			set_vm_flush_reset_perms(ptr);
-			set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
-			set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
-		}
-		goto out;
-	}
-	list_for_each_entry(pack, &pack_list, list) {
-		pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
-						 nbits, 0);
-		if (pos < BPF_PROG_CHUNK_COUNT)
-			goto found_free_area;
-	}
-
-	pack = alloc_new_pack(bpf_fill_ill_insns);
-	if (!pack)
-		goto out;
-
-	pos = 0;
-
-found_free_area:
-	bitmap_set(pack->bitmap, pos, nbits);
-	ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
-
-out:
-	mutex_unlock(&pack_mutex);
-	return ptr;
-}
-
-static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
-{
-	struct bpf_prog_pack *pack = NULL, *tmp;
-	unsigned int nbits;
-	unsigned long pos;
-
-	mutex_lock(&pack_mutex);
-	if (hdr->size > BPF_PROG_PACK_SIZE) {
-		module_memfree(hdr);
-		goto out;
-	}
-
-	list_for_each_entry(tmp, &pack_list, list) {
-		if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
-			pack = tmp;
-			break;
-		}
-	}
-
-	if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
-		goto out;
-
-	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
-	pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
-
-	WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
-		  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
-
-	bitmap_clear(pack->bitmap, pos, nbits);
-	if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
-				       BPF_PROG_CHUNK_COUNT, 0) == 0) {
-		list_del(&pack->list);
-		module_memfree(pack->ptr);
-		kfree(pack);
-	}
-out:
-	mutex_unlock(&pack_mutex);
-}
-
 static atomic_long_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -1043,6 +905,9 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	bpf_jit_uncharge_modmem(size);
 }
 
+#define BPF_PROG_EXEC_ALIGN	64
+#define BPF_PROG_EXEC_MASK	(~(BPF_PROG_EXEC_ALIGN - 1))
+
 /* Allocate jit binary from bpf_prog_pack allocator.
  * Since the allocated memory is RO+X, the JIT engine cannot write directly
  * to the memory. To solve this problem, a RW buffer is also allocated at
@@ -1065,11 +930,11 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 		     alignment > BPF_IMAGE_ALIGNMENT);
 
 	/* add 16 bytes for a random section of illegal instructions */
-	size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+	size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_EXEC_ALIGN);
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
-	ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
+	ro_header = vmalloc_exec(size, BPF_PROG_EXEC_ALIGN);
 	if (!ro_header) {
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
@@ -1078,7 +943,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 	*rw_header = kvmalloc(size, GFP_KERNEL);
 	if (!*rw_header) {
 		bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
-		bpf_prog_pack_free(ro_header);
+		vfree_exec(ro_header);
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
 	}
@@ -1088,7 +953,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 	(*rw_header)->size = size;
 
 	hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
-		     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+		     BPF_PROG_EXEC_ALIGN - sizeof(*ro_header));
 	start = (get_random_int() % hole) & ~(alignment - 1);
 
 	*image_ptr = &ro_header->image[start];
@@ -1109,7 +974,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
 	kvfree(rw_header);
 
 	if (IS_ERR(ptr)) {
-		bpf_prog_pack_free(ro_header);
+		vfree_exec(ro_header);
 		return PTR_ERR(ptr);
 	}
 	return 0;
@@ -1130,7 +995,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
 {
 	u32 size = ro_header->size;
 
-	bpf_prog_pack_free(ro_header);
+	vfree_exec(ro_header);
 	kvfree(rw_header);
 	bpf_jit_uncharge_modmem(size);
 }
@@ -1141,7 +1006,7 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
 	unsigned long real_start = (unsigned long)fp->bpf_func;
 	unsigned long addr;
 
-	addr = real_start & BPF_PROG_CHUNK_MASK;
+	addr = real_start & BPF_PROG_EXEC_MASK;
 	return (void *)addr;
 }
 
-- 
2.30.2




* [RFC 3/5] modules, x86: use vmalloc_exec for module core
  2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
  2022-08-18 22:42 ` [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec Song Liu
  2022-08-18 22:42 ` [RFC 2/5] bpf: use vmalloc_exec Song Liu
@ 2022-08-18 22:42 ` Song Liu
  2022-10-06 23:38   ` Luis Chamberlain
  2022-08-18 22:42 ` [RFC 4/5] vmalloc_exec: share a huge page with kernel text Song Liu
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: Song Liu @ 2022-08-18 22:42 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: akpm, x86, peterz, hch, kernel-team, rick.p.edgecombe, mcgrof,
	dave.hansen, Song Liu

This is a prototype that allows modules to share 2MB text pages with other
modules and BPF programs.

The current version only covers core_layout.
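
Since the module core is now allocated RO+X up front by vmalloc_exec(), every
later write into it must go through the text-poke machinery instead of a plain
memcpy. A simplified sketch of the pattern used in the alternative.c hunks
below (the helper name is made up; some hunks check system_state slightly
differently):

	static void patch_text(void *addr, const void *opcode, size_t len)
	{
		if (system_state == SYSTEM_BOOTING) {
			/* early boot: built-in text is still writable,
			 * so the early path is fine
			 */
			text_poke_early(addr, opcode, len);
		} else {
			/* runtime: poke through a temporary writable alias */
			mutex_lock(&text_mutex);
			text_poke(addr, opcode, len);
			mutex_unlock(&text_mutex);
		}
	}
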
---
 arch/x86/Kconfig              |  1 +
 arch/x86/kernel/alternative.c | 30 ++++++++++++++++++++++++------
 arch/x86/kernel/module.c      |  1 +
 kernel/module/main.c          | 23 +++++++++++++----------
 kernel/module/strict_rwx.c    |  3 ---
 kernel/trace/ftrace.c         |  3 ++-
 6 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb5900e2c29a..e932bceb7f23 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -91,6 +91,7 @@ config X86
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
+	select ARCH_WANTS_MODULES_DATA_IN_VMALLOC	if X86_64
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 62f6b8b7c4a5..c83888ec232b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -332,7 +332,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 
 		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 
-		text_poke_early(instr, insn_buff, insn_buff_sz);
+		if (system_state < SYSTEM_RUNNING) {
+			text_poke_early(instr, insn_buff, insn_buff_sz);
+		} else {
+			mutex_lock(&text_mutex);
+			text_poke(instr, insn_buff, insn_buff_sz);
+			mutex_unlock(&text_mutex);
+		}
 
 next:
 		optimize_nops(instr, a->instrlen);
@@ -503,7 +509,13 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 			optimize_nops(bytes, len);
 			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
 			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
-			text_poke_early(addr, bytes, len);
+			if (system_state == SYSTEM_BOOTING) {
+				text_poke_early(addr, bytes, len);
+			} else {
+				mutex_lock(&text_mutex);
+				text_poke(addr, bytes, len);
+				mutex_unlock(&text_mutex);
+			}
 		}
 	}
 }
@@ -568,7 +580,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
 		if (len == insn.length) {
 			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
 			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
-			text_poke_early(addr, bytes, len);
+			if (unlikely(system_state == SYSTEM_BOOTING)) {
+				text_poke_early(addr, bytes, len);
+			} else {
+				mutex_lock(&text_mutex);
+				text_poke(addr, bytes, len);
+				mutex_unlock(&text_mutex);
+			}
 		}
 	}
 }
@@ -609,7 +627,7 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
 		 */
 		DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
 		DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
-		text_poke_early(addr, &poison, 4);
+		text_poke(addr, &poison, 4);
 	}
 }
 
@@ -791,7 +809,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
 
 		/* Pad the rest with nops */
 		add_nops(insn_buff + used, p->len - used);
-		text_poke_early(p->instr, insn_buff, p->len);
+		text_poke(p->instr, insn_buff, p->len);
 	}
 }
 extern struct paravirt_patch_site __start_parainstructions[],
@@ -1698,7 +1716,7 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *
 	struct text_poke_loc tp;
 
 	if (unlikely(system_state == SYSTEM_BOOTING)) {
-		text_poke_early(addr, opcode, len);
+		text_poke(addr, opcode, len);
 		return;
 	}
 
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 100446ffdc1d..570af623e28f 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -229,6 +229,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 	bool early = me->state == MODULE_STATE_UNFORMED;
 	void *(*write)(void *, const void *, size_t) = memcpy;
 
+	early = false;
 	if (!early) {
 		write = text_poke;
 		mutex_lock(&text_mutex);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 57fc2821be63..c51dafa1089a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -53,6 +53,7 @@
 #include <linux/bsearch.h>
 #include <linux/dynamic_debug.h>
 #include <linux/audit.h>
+#include <linux/bpf.h>
 #include <uapi/linux/module.h>
 #include "internal.h"
 
@@ -1198,7 +1199,7 @@ static void free_module(struct module *mod)
 	lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
 
 	/* Finally, free the core (containing the module structure) */
-	module_memfree(mod->core_layout.base);
+	vfree_exec(mod->core_layout.base);
 #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 	vfree(mod->data_layout.base);
 #endif
@@ -1316,7 +1317,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 			ksym = resolve_symbol_wait(mod, info, name);
 			/* Ok if resolved.  */
 			if (ksym && !IS_ERR(ksym)) {
-				sym[i].st_value = kernel_symbol_value(ksym);
+				unsigned long val = kernel_symbol_value(ksym);
+				bpf_arch_text_copy(&sym[i].st_value, &val, sizeof(val));
 				break;
 			}
 
@@ -1337,7 +1339,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 				secbase = (unsigned long)mod_percpu(mod);
 			else
 				secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
-			sym[i].st_value += secbase;
+			secbase += sym[i].st_value;
+			bpf_arch_text_copy(&sym[i].st_value, &secbase, sizeof(secbase));
 			break;
 		}
 	}
@@ -2118,7 +2121,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	void *ptr;
 
 	/* Do the allocs. */
-	ptr = module_alloc(mod->core_layout.size);
+	ptr = vmalloc_exec(mod->core_layout.size, PAGE_SIZE);
 	/*
 	 * The pointer to this block is stored in the module structure
 	 * which is inside the block. Just mark it as not being a
@@ -2128,7 +2131,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	if (!ptr)
 		return -ENOMEM;
 
-	memset(ptr, 0, mod->core_layout.size);
+/* 	memset(ptr, 0, mod->core_layout.size); */
 	mod->core_layout.base = ptr;
 
 	if (mod->init_layout.size) {
@@ -2141,7 +2144,7 @@ static int move_module(struct module *mod, struct load_info *info)
 		 */
 		kmemleak_ignore(ptr);
 		if (!ptr) {
-			module_memfree(mod->core_layout.base);
+			vfree_exec(mod->core_layout.base);
 			return -ENOMEM;
 		}
 		memset(ptr, 0, mod->init_layout.size);
@@ -2151,7 +2154,7 @@ static int move_module(struct module *mod, struct load_info *info)
 
 #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 	/* Do the allocs. */
-	ptr = vmalloc(mod->data_layout.size);
+	ptr = module_alloc(mod->data_layout.size);
 	/*
 	 * The pointer to this block is stored in the module structure
 	 * which is inside the block. Just mark it as not being a
@@ -2159,7 +2162,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	 */
 	kmemleak_not_leak(ptr);
 	if (!ptr) {
-		module_memfree(mod->core_layout.base);
+		vfree_exec(mod->core_layout.base);
 		module_memfree(mod->init_layout.base);
 		return -ENOMEM;
 	}
@@ -2185,7 +2188,7 @@ static int move_module(struct module *mod, struct load_info *info)
 			dest = mod->core_layout.base + shdr->sh_entsize;
 
 		if (shdr->sh_type != SHT_NOBITS)
-			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+			bpf_arch_text_copy(dest, (void *)shdr->sh_addr, shdr->sh_size);
 		/* Update sh_addr to point to copy in image. */
 		shdr->sh_addr = (unsigned long)dest;
 		pr_debug("\t0x%lx %s\n",
@@ -2341,7 +2344,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
 	module_memfree(mod->init_layout.base);
-	module_memfree(mod->core_layout.base);
+	vfree_exec(mod->core_layout.base);
 #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 	vfree(mod->data_layout.base);
 #endif
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 14fbea66f12f..d392eb7bf574 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -85,7 +85,6 @@ void module_enable_x(const struct module *mod)
 	    !PAGE_ALIGNED(mod->init_layout.base))
 		return;
 
-	frob_text(&mod->core_layout, set_memory_x);
 	frob_text(&mod->init_layout, set_memory_x);
 }
 
@@ -98,9 +97,7 @@ void module_enable_ro(const struct module *mod, bool after_init)
 		return;
 #endif
 
-	set_vm_flush_reset_perms(mod->core_layout.base);
 	set_vm_flush_reset_perms(mod->init_layout.base);
-	frob_text(&mod->core_layout, set_memory_ro);
 
 	frob_rodata(&mod->data_layout, set_memory_ro);
 	frob_text(&mod->init_layout, set_memory_ro);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bc921a3f7ea8..8cd31dc9ac84 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3177,6 +3177,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
 	if (mod)
 		rec_flags |= FTRACE_FL_DISABLED;
 
+	ftrace_arch_code_modify_prepare();
 	for (pg = new_pgs; pg; pg = pg->next) {
 
 		for (i = 0; i < pg->index; i++) {
@@ -3198,7 +3199,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
 			update_cnt++;
 		}
 	}
-
+	ftrace_arch_code_modify_post_process();
 	stop = ftrace_now(raw_smp_processor_id());
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += update_cnt;
-- 
2.30.2




* [RFC 4/5] vmalloc_exec: share a huge page with kernel text
  2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
                   ` (2 preceding siblings ...)
  2022-08-18 22:42 ` [RFC 3/5] modules, x86: use vmalloc_exec for module core Song Liu
@ 2022-08-18 22:42 ` Song Liu
  2022-10-06 23:44   ` Luis Chamberlain
  2022-08-18 22:42 ` [RFC 5/5] vmalloc: vfree_exec: free unused vm_struct Song Liu
  2022-08-22 15:46 ` [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
  5 siblings, 1 reply; 20+ messages in thread
From: Song Liu @ 2022-08-18 22:42 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: akpm, x86, peterz, hch, kernel-team, rick.p.edgecombe, mcgrof,
	dave.hansen, Song Liu

On x86 kernels, we allocate 2MB pages for kernel text up to
round_down(_etext, 2MB), so some of the kernel text still sits on 4kB
pages. With vmalloc_exec, we can allocate 2MB pages up to
round_up(_etext, 2MB), and use the rest of that last 2MB page for
modules and BPF programs.

Here is an example:

[root@eth50-1 ~]# grep _etext /proc/kallsyms
ffffffff82202a08 T _etext

[root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]

[root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd

[root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
ffffffff822bc580 t xfs_flush_inodes     [xfs]

ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
module, and bpf programs.
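
A sketch of the arithmetic behind this example (PMD_ALIGN matches the macro
added below; the sizes are derived from the addresses above):

	#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)

	/* _etext            = 0xffffffff82202a08
	 * PFN_ALIGN(_etext) = 0xffffffff82203000
	 * PMD_ALIGN(_etext) = 0xffffffff82400000
	 *
	 * register_text_tail_vm() hands [0xffffffff82203000, 0xffffffff82400000),
	 * about 2036 kB, to the free_text_area_* tree, which is why the xfs and
	 * bpf_prog_* symbols above land inside the same 2MB PSE mapping as the
	 * core kernel text.
	 */
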
---
 arch/x86/mm/init_64.c |  3 ++-
 mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 39c5246964a9..d27d0af5beb5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
 
 int kernel_set_to_readonly;
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
-	unsigned long text_end = PFN_ALIGN(_etext);
+	unsigned long text_end = PMD_ALIGN(_etext);
 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
 	unsigned long all_end;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 472287e71bf1..5f3b5df9313f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
 static const bool vmap_allow_huge = false;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
+
+static struct vm_struct text_tail_vm;
+static struct vmap_area text_tail_va;
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
 	if (addr >= MODULES_VADDR && addr < MODULES_END)
 		return 1;
+	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
+		return 1;
 #endif
 	return is_vmalloc_addr(x);
 }
@@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
 	}
 }
 
+static void register_text_tail_vm(void)
+{
+	unsigned long start = PFN_ALIGN(_etext);
+	unsigned long end = PMD_ALIGN(_etext);
+	struct vmap_area *va;
+
+	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+	if (WARN_ON_ONCE(!va))
+		return;
+	text_tail_vm.addr = (void *)start;
+	text_tail_vm.size = end - start;
+	text_tail_vm.flags = VM_KERNEL_EXEC;
+	text_tail_va.va_start = start;
+	text_tail_va.va_end = end;
+	text_tail_va.vm = &text_tail_vm;
+	memcpy(va, &text_tail_va, sizeof(*va));
+	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
@@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
 	 * Create the cache for vmap_area objects.
 	 */
 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+	register_text_tail_vm();
 
 	for_each_possible_cpu(i) {
 		struct vmap_block_queue *vbq;
-- 
2.30.2




* [RFC 5/5] vmalloc: vfree_exec: free unused vm_struct
  2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
                   ` (3 preceding siblings ...)
  2022-08-18 22:42 ` [RFC 4/5] vmalloc_exec: share a huge page with kernel text Song Liu
@ 2022-08-18 22:42 ` Song Liu
  2022-08-22 15:46 ` [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
  5 siblings, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-08-18 22:42 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: akpm, x86, peterz, hch, kernel-team, rick.p.edgecombe, mcgrof,
	dave.hansen, Song Liu

This is clearly not done yet, but it won't be too hard.

I would like to highlight that we need both subtree_max_size and vm for
a vmap_area in the free_text tree. Therefore, we cannot keep the union in
vmap_area.
---
 include/linux/vmalloc.h | 12 ++----------
 mm/vmalloc.c            | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 691c02ffe3db..de7731caadc0 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -68,16 +68,8 @@ struct vmap_area {
 	struct rb_node rb_node;         /* address sorted rbtree */
 	struct list_head list;          /* address sorted list */
 
-	/*
-	 * The following two variables can be packed, because
-	 * a vmap_area object can be either:
-	 *    1) in "free" tree (root is free_vmap_area_root)
-	 *    2) or "busy" tree (root is vmap_area_root)
-	 */
-	union {
-		unsigned long subtree_max_size; /* in "free" tree */
-		struct vm_struct *vm;           /* in "busy" tree */
-	};
+	unsigned long subtree_max_size;
+	struct vm_struct *vm;
 };
 
 /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5f3b5df9313f..57dd18882d37 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1428,6 +1428,7 @@ adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
 		 */
 		lva->va_start = va->va_start;
 		lva->va_end = nva_start_addr;
+		lva->vm = va->vm;
 
 		/*
 		 * Shrink this VA to remaining size.
@@ -3394,10 +3395,19 @@ void vfree_exec(const void *addr)
 	spin_unlock(&vmap_area_lock);
 
 	spin_lock(&free_text_area_lock);
-	merge_or_add_vmap_area_augment(va,
+	va = merge_or_add_vmap_area_augment(va,
 		&free_text_area_root, &free_text_area_list);
+	if (va) {
+		struct vm_struct *vm = va->vm;
+
+		if (vm != &text_tail_vm) {
+			va = __find_vmap_area((unsigned long)vm->addr,
+					      free_text_area_root.rb_node);
+			if (va->va_start == (unsigned long)vm->addr)
+				pr_info("%s TODO: free vm->addr %px\n", __func__, vm->addr);
+		}
+	}
 	spin_unlock(&free_text_area_lock);
-	/* TODO: when the whole vm_struct is not in use, free it */
 }
 
 /**
-- 
2.30.2




* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
                   ` (4 preceding siblings ...)
  2022-08-18 22:42 ` [RFC 5/5] vmalloc: vfree_exec: free unused vm_struct Song Liu
@ 2022-08-22 15:46 ` Song Liu
  2022-08-22 16:34   ` Peter Zijlstra
  5 siblings, 1 reply; 20+ messages in thread
From: Song Liu @ 2022-08-22 15:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linux-MM, lkml, Andrew Morton, X86 ML, Christoph Hellwig,
	Kernel Team, Edgecombe, Rick P, Luis Chamberlain, Dave Hansen,
	Song Liu



> On Aug 18, 2022, at 3:42 PM, Song Liu <song@kernel.org> wrote:
> 
> This set is a prototype that allows dynamic kernel text (modules, bpf
> programs, various trampolines, etc.) to share huge pages. The idea is
> similar to Peter's suggestion in [1]. Please refer to each patch for
> more details.
> 
> The ultimate goal is to only host kernel text in 2MB pages (for x86_64).
> 
> Please share your comments on this.
> 
> Thanks!
> 
> [1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@hirez.programming.kicks-ass.net/

Hi Peter, 

Could you please share your feedback on this? 

Thanks,
Song

PS: I guess vger dropped my patch again. :( The set is also available at

https://git.kernel.org/pub/scm/linux/kernel/git/song/linux.git 

branch vmalloc_exec. 

> 
> Song Liu (5):
>  vmalloc: introduce vmalloc_exec and vfree_exec
>  bpf: use vmalloc_exec
>  modules, x86: use vmalloc_exec for module core
>  vmalloc_exec: share a huge page with kernel text
>  vmalloc: vfree_exec: free unused vm_struct
> 
> arch/x86/Kconfig              |   1 +
> arch/x86/kernel/alternative.c |  30 ++++-
> arch/x86/kernel/module.c      |   1 +
> arch/x86/mm/init_64.c         |   3 +-
> include/linux/vmalloc.h       |  16 +--
> kernel/bpf/core.c             | 155 ++------------------------
> kernel/module/main.c          |  23 ++--
> kernel/module/strict_rwx.c    |   3 -
> kernel/trace/ftrace.c         |   3 +-
> mm/nommu.c                    |   7 ++
> mm/vmalloc.c                  | 200 +++++++++++++++++++++++++++++-----
> 11 files changed, 239 insertions(+), 203 deletions(-)
> 
> --
> 2.30.2



* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-22 15:46 ` [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
@ 2022-08-22 16:34   ` Peter Zijlstra
  2022-08-22 16:56     ` Song Liu
  0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2022-08-22 16:34 UTC (permalink / raw)
  To: Song Liu
  Cc: Linux-MM, lkml, Andrew Morton, X86 ML, Christoph Hellwig,
	Kernel Team, Edgecombe, Rick P, Luis Chamberlain, Dave Hansen,
	Song Liu

On Mon, Aug 22, 2022 at 03:46:38PM +0000, Song Liu wrote:
> Could you please share your feedback on this? 

I've looked at it all of 5 minutes, so perhaps I've missed something.

However, I'm a little surprised you went with a second tree instead of
doing the top-down thing for data. The way you did it makes it hard to
have guard pages between text and data.


* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-22 16:34   ` Peter Zijlstra
@ 2022-08-22 16:56     ` Song Liu
  2022-08-23  5:42       ` Peter Zijlstra
  2022-08-24 17:06       ` Song Liu
  0 siblings, 2 replies; 20+ messages in thread
From: Song Liu @ 2022-08-22 16:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linux-MM, lkml, Andrew Morton, X86 ML, Christoph Hellwig,
	Kernel Team, Edgecombe, Rick P, Luis Chamberlain, Dave Hansen,
	Song Liu



> On Aug 22, 2022, at 9:34 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Mon, Aug 22, 2022 at 03:46:38PM +0000, Song Liu wrote:
>> Could you please share your feedback on this? 
> 
> I've looked at it all of 5 minutes, so perhaps I've missed something.
> 
> However, I'm a little surprised you went with a second tree instead of
> doing the top-down thing for data. The way you did it makes it hard to
> have guard pages between text and data.

I didn't realize the importance of the guard pages. But it is not too
hard to do it with this approach. For each 2MB text page, we can reserve
4kB at the beginning and end of it. Would this work?

There are a couple benefits from a second tree:

1. It allows text allocations to go below PAGE_SIZE granularity, while 
   data allocations would still use PAGE_SIZE granularity, which is the
   same as current code. 
2. Text allocation requires mapping one vm_struct to many vmap_areas. Putting
   text allocations in a separate tree makes it easier to handle this.
   (Well, I haven't finished this logic yet.)
3. A separate tree makes it easier to use the text tail page,
   [_etext, roundup(_etext, PMD_SIZE)], for modules and BPF programs.

Does this make sense? Do you see other downsides with a second tree?

Thanks,
Song


* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-22 16:56     ` Song Liu
@ 2022-08-23  5:42       ` Peter Zijlstra
  2022-08-23  6:39         ` Christophe Leroy
  2022-08-23  6:55         ` Song Liu
  2022-08-24 17:06       ` Song Liu
  1 sibling, 2 replies; 20+ messages in thread
From: Peter Zijlstra @ 2022-08-23  5:42 UTC (permalink / raw)
  To: Song Liu
  Cc: Linux-MM, lkml, Andrew Morton, X86 ML, Christoph Hellwig,
	Kernel Team, Edgecombe, Rick P, Luis Chamberlain, Dave Hansen,
	Song Liu

On Mon, Aug 22, 2022 at 04:56:47PM +0000, Song Liu wrote:
> 
> 
> > On Aug 22, 2022, at 9:34 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Mon, Aug 22, 2022 at 03:46:38PM +0000, Song Liu wrote:
> >> Could you please share your feedback on this? 
> > 
> > I've looked at it all of 5 minutes, so perhaps I've missed something.
> > 
> > However, I'm a little surprised you went with a second tree instead of
> > doing the top-down thing for data. The way you did it makes it hard to
> > have guard pages between text and data.
> 
> I didn't realize the importance of the guard pages. But it is not too

I'm not sure how important it is, just seems like a good idea to trap
anybody trying to cross that divide. Also, to me it seems like a good
idea to have a single large contiguous text region instead of splintered
2M pages.

> hard to do it with this approach. For each 2MB text page, we can reserve
> 4kB on the beginning and end of it. Would this work?

Typically a guard page has different protections (as in none whatsoever)
so that every access goes *splat*.


* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-23  5:42       ` Peter Zijlstra
@ 2022-08-23  6:39         ` Christophe Leroy
  2022-08-23  6:57           ` Song Liu
  2022-08-23  6:55         ` Song Liu
  1 sibling, 1 reply; 20+ messages in thread
From: Christophe Leroy @ 2022-08-23  6:39 UTC (permalink / raw)
  To: Peter Zijlstra, Song Liu
  Cc: Linux-MM, lkml, Andrew Morton, X86 ML, Christoph Hellwig,
	Kernel Team, Edgecombe, Rick P, Luis Chamberlain, Dave Hansen,
	Song Liu



On 23/08/2022 at 07:42, Peter Zijlstra wrote:
> On Mon, Aug 22, 2022 at 04:56:47PM +0000, Song Liu wrote:
>>
>>
>>> On Aug 22, 2022, at 9:34 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> On Mon, Aug 22, 2022 at 03:46:38PM +0000, Song Liu wrote:
>>>> Could you please share your feedback on this?
>>>
>>> I've looked at it all of 5 minutes, so perhaps I've missed something.
>>>
>>> However, I'm a little surprised you went with a second tree instead of
>>> doing the top-down thing for data. The way you did it makes it hard to
>>> have guard pages between text and data.
>>
>> I didn't realize the importance of the guard pages. But it is not too
> 
> I'm not sure how important it is, just seems like a good idea to trap
> anybody trying to cross that divide. Also, to me it seems like a good
> idea to have a single large contiguous text region instead of splintered
> 2M pages.
> 
>> hard to do it with this approach. For each 2MB text page, we can reserve
>> 4kB on the beginning and end of it. Would this work?
> 
> Typically a guard page has different protections (as in none what so
> ever) so that every access goes *splat*. >

Text is RO-X, on some architectures even only X. So the only real thing
to protect against is bad execution, isn't it? So I guess having some
areas with invalid or trap instructions would be enough?


* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-23  5:42       ` Peter Zijlstra
  2022-08-23  6:39         ` Christophe Leroy
@ 2022-08-23  6:55         ` Song Liu
  1 sibling, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-08-23  6:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linux-MM, lkml, Andrew Morton, X86 ML, Christoph Hellwig,
	Kernel Team, Edgecombe, Rick P, Luis Chamberlain, Dave Hansen,
	Song Liu



> On Aug 22, 2022, at 10:42 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Mon, Aug 22, 2022 at 04:56:47PM +0000, Song Liu wrote:
>> 
>> 
>>> On Aug 22, 2022, at 9:34 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> On Mon, Aug 22, 2022 at 03:46:38PM +0000, Song Liu wrote:
>>>> Could you please share your feedback on this? 
>>> 
>>> I've looked at it all of 5 minutes, so perhaps I've missed something.
>>> 
>>> However, I'm a little surprised you went with a second tree instead of
>>> doing the top-down thing for data. The way you did it makes it hard to
>>> have guard pages between text and data.
>> 
>> I didn't realize the importance of the guard pages. But it is not too
> 
> I'm not sure how important it is, just seems like a good idea to trap
> anybody trying to cross that divide. Also, to me it seems like a good
> idea to have a single large contiguous text region instead of splintered
> 2M pages.

A single large contiguous text region is great. However, it is not easy to
keep it contiguous. For example, when we load a big module, and then unload
it. It is not easy to recycle the space. Say we load module-x-v1, which is 
4MB, and uses 2 huge pages. Then we load a small BPF program after it. The 
address space looks like:

MODULE_VADDR to MODULE_VADDR + 4MB:			module-x-v1
MODULE_VADDR + 4MB to MODULE_VADDR + 4MB + 4kB:		bpf_prog_xxxx

When we unload module-x-v1, there will be a 4MB hole in the address space.
If we then load module-x-v2, which is 4.1MB in size, we cannot reuse that
hole, because the module is a little too big for the hole. 

AFAICT, to use the space efficiently, we will have to deal with splintered
2MB pages. 

Does this make sense?

Thanks,
Song

> 
>> hard to do it with this approach. For each 2MB text page, we can reserve
>> 4kB on the beginning and end of it. Would this work?
> 
> Typically a guard page has different protections (as in none what so
> ever) so that every access goes *splat*.



* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-23  6:39         ` Christophe Leroy
@ 2022-08-23  6:57           ` Song Liu
  0 siblings, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-08-23  6:57 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Peter Zijlstra, Linux-MM, lkml, Andrew Morton, X86 ML,
	Christoph Hellwig, Kernel Team, Edgecombe, Rick P,
	Luis Chamberlain, Dave Hansen, Song Liu



> On Aug 22, 2022, at 11:39 PM, Christophe Leroy <christophe.leroy@csgroup.eu> wrote:
> 
> 
> 
> On 23/08/2022 at 07:42, Peter Zijlstra wrote:
>> On Mon, Aug 22, 2022 at 04:56:47PM +0000, Song Liu wrote:
>>> 
>>> 
>>>> On Aug 22, 2022, at 9:34 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>>> 
>>>> On Mon, Aug 22, 2022 at 03:46:38PM +0000, Song Liu wrote:
>>>>> Could you please share your feedback on this?
>>>> 
>>>> I've looked at it all of 5 minutes, so perhaps I've missed something.
>>>> 
>>>> However, I'm a little surprised you went with a second tree instead of
>>>> doing the top-down thing for data. The way you did it makes it hard to
>>>> have guard pages between text and data.
>>> 
>>> I didn't realize the importance of the guard pages. But it is not too
>> 
>> I'm not sure how important it is, just seems like a good idea to trap
>> anybody trying to cross that divide. Also, to me it seems like a good
>> idea to have a single large contiguous text region instead of splintered
>> 2M pages.
>> 
>>> hard to do it with this approach. For each 2MB text page, we can reserve
>>> 4kB on the beginning and end of it. Would this work?
>> 
>> Typically a guard page has different protections (as in none what so
>> ever) so that every access goes *splat*. >
> 
> Text is RO-X, on some architectures even only X. So the only real thing 
> to protect against is bad execution, isn't it ?. So I guess having some 
> areas with invalid or trap instructions would be enough ?

Agreed that filling with trap instructions should be enough. 

Thanks,
Song



* Re: [RFC 0/5] vmalloc_exec for modules and BPF programs
  2022-08-22 16:56     ` Song Liu
  2022-08-23  5:42       ` Peter Zijlstra
@ 2022-08-24 17:06       ` Song Liu
  1 sibling, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-08-24 17:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linux-MM, lkml, Andrew Morton, X86 ML, Christoph Hellwig,
	Kernel Team, Edgecombe, Rick P, Luis Chamberlain, Dave Hansen,
	Song Liu

Hi Peter, 

> On Aug 22, 2022, at 9:56 AM, Song Liu <songliubraving@fb.com> wrote:
> 
>> On Aug 22, 2022, at 9:34 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>> On Mon, Aug 22, 2022 at 03:46:38PM +0000, Song Liu wrote:
>>> Could you please share your feedback on this? 
>> 
>> I've looked at it all of 5 minutes, so perhaps I've missed something.
>> 
>> However, I'm a little surprised you went with a second tree instead of
>> doing the top-down thing for data. The way you did it makes it hard to
>> have guard pages between text and data.
> 
> I didn't realize the importance of the guard pages. But it is not too
> hard to do it with this approach. For each 2MB text page, we can reserve
> 4kB on the beginning and end of it. Would this work?
> 
> There are a couple benefits from a second tree:
> 
> 1. It allows text allocations to go below PAGE_SIZE granularity, while 
>   data allocations would still use PAGE_SIZE granularity, which is the
>   same as current code. 
> 2. Text allocation requires mapping one vm_struct to many vmap_areas. Putting
>   text allocations in a separate tree makes it easier to handle this.
>   (Well, I haven't finished this logic yet.)
> 3. A separate tree makes it easier to use the text tail page,
>   [_etext, roundup(_etext, PMD_SIZE)], for modules and BPF programs.
> 
> Does this make sense? Do you see other downsides with a second tree?

Did these make sense? Do you have further comments that I would address in
future versions?

Thanks,
Song



* Re: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec
  2022-08-18 22:42 ` [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec Song Liu
@ 2022-10-06 23:15   ` Luis Chamberlain
  2022-10-07  6:39     ` Song Liu
  0 siblings, 1 reply; 20+ messages in thread
From: Luis Chamberlain @ 2022-10-06 23:15 UTC (permalink / raw)
  To: Song Liu, Vlastimil Babka, Mel Gorman
  Cc: linux-mm, linux-kernel, akpm, x86, peterz, hch, kernel-team,
	rick.p.edgecombe, dave.hansen

On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>  }
>  EXPORT_SYMBOL(vm_map_pages_zero);
>  
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	return NULL;
> +}

Well that's not so nice for no-mmu systems. Shouldn't we have a
fallback?

> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index effd1ff6a4b4..472287e71bf1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	va->va_end = addr + size;
>  	va->vm = NULL;
>  
> -	spin_lock(&vmap_area_lock);
> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> -	spin_unlock(&vmap_area_lock);
> +	if (vm_flags & VM_KERNEL_EXEC) {
> +		spin_lock(&free_text_area_lock);
> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> +		/* update subtree_max_size now as we need this soon */
> +		augment_tree_propagate_from(va);

Sorry, it is not clear to me why it's needed only for exec. Can you
elaborate a bit more?

> +		spin_unlock(&free_text_area_lock);
> +	} else {
> +		spin_lock(&vmap_area_lock);
> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +		spin_unlock(&vmap_area_lock);
> +	}
>  
>  	BUG_ON(!IS_ALIGNED(va->va_start, align));
>  	BUG_ON(va->va_start < vstart);

<-- snip -->

> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
>  }
>  EXPORT_SYMBOL(vmalloc);
>  
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	struct vmap_area *va, *tmp;
> +	unsigned long addr;
> +	enum fit_type type;
> +	int ret;
> +
> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
> +	if (unlikely(!va))
> +		return ERR_PTR(-ENOMEM);
> +
> +again:
> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
> +				     size, align, 1, false);
> +
> +	if (!tmp) {
> +		unsigned long alloc_size;
> +		void *ptr;
> +
> +		spin_unlock(&free_text_area_lock);
> +
> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
> +					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
> +					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
> +					   NUMA_NO_NODE, __builtin_return_address(0));

We can review the guard stuff on the other thread with Peter.

> +		if (unlikely(!ptr)) {
> +			ret = -ENOMEM;
> +			goto err_out;
> +		}
> +		memset(ptr, 0, alloc_size);
> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);

I *really* like that this is now not something users have to muck with, thanks!
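
From the caller's point of view it now boils down to roughly the following
(illustrative only, using the x86 text_poke_copy() helper; error handling
trimmed):

/* The region already comes back RO+X, so the caller just writes the
 * instructions through the x86 poking API instead of calling
 * set_memory_*() itself. */
static void *install_text(const void *insns, size_t len)
{
	void *image = vmalloc_exec(len, 64);	/* 64-byte align, as BPF uses */

	if (IS_ERR(image))
		return NULL;
	text_poke_copy(image, insns, len);
	return image;
}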

> +
> +		goto again;
> +	}
> +
> +	addr = roundup(tmp->va_start, align);
> +	type = classify_va_fit_type(tmp, addr, size);
> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
> +		addr = -ENOMEM;
> +		goto err_out;
> +	}
> +
> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
> +				    tmp, addr, size, type);
> +	if (ret) {
> +		addr = ret;
> +		goto err_out;
> +	}
> +	spin_unlock(&free_text_area_lock);
> +
> +	va->va_start = addr;
> +	va->va_end = addr + size;
> +	va->vm = tmp->vm;
> +
> +	spin_lock(&vmap_area_lock);
> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +	spin_unlock(&vmap_area_lock);
> +
> +	return (void *)addr;
> +
> +err_out:
> +	spin_unlock(&free_text_area_lock);
> +	return ERR_PTR(ret);
> +}
> +
> +void vfree_exec(const void *addr)
> +{
> +	struct vmap_area *va;
> +
> +	might_sleep();
> +
> +	spin_lock(&vmap_area_lock);
> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
> +	if (WARN_ON_ONCE(!va)) {
> +		spin_unlock(&vmap_area_lock);
> +		return;
> +	}
> +
> +	unlink_va(va, &vmap_area_root);

Curious why we don't memset to 0 before merge_or_add_vmap_area_augment()?
I realize other code doesn't seem to do it, though.

> +	spin_unlock(&vmap_area_lock);
> +
> +	spin_lock(&free_text_area_lock);
> +	merge_or_add_vmap_area_augment(va,
> +		&free_text_area_root, &free_text_area_list);

I'm concerned that we could be using precious physically contiguous memory
from huge pages only to end up in a situation where we create our own
pool and allow things to become non-contiguous afterwards.

I'm starting to suspect that if the allocation is > PAGE_SIZE we should just
give it back generally. Otherwise wouldn't the fragmentation cause us
to eventually eat up most of the available huge pages? Probably not for
eBPF, but if we use this on a system with tons of module insertions /
deletions this seems like it could happen?

  Luis

> +	spin_unlock(&free_text_area_lock);
> +	/* TODO: when the whole vm_struct is not in use, free it */
> +}
> +
>  /**
>   * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>   * @size:      allocation size
> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  			/* It is a BUG(), but trigger recovery instead. */
>  			goto recovery;
>  
> -		ret = adjust_va_to_fit_type(va, start, size, type);
> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
> +					    va, start, size, type);
>  		if (unlikely(ret))
>  			goto recovery;
>  
> -- 
> 2.30.2
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC 3/5] modules, x86: use vmalloc_exec for module core
  2022-08-18 22:42 ` [RFC 3/5] modules, x86: use vmalloc_exec for module core Song Liu
@ 2022-10-06 23:38   ` Luis Chamberlain
  2022-10-07  6:46     ` Song Liu
  0 siblings, 1 reply; 20+ messages in thread
From: Luis Chamberlain @ 2022-10-06 23:38 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-mm, linux-kernel, akpm, x86, peterz, hch, kernel-team,
	rick.p.edgecombe, dave.hansen

On Thu, Aug 18, 2022 at 03:42:16PM -0700, Song Liu wrote:
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index 100446ffdc1d..570af623e28f 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -229,6 +229,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
>  	bool early = me->state == MODULE_STATE_UNFORMED;
>  	void *(*write)(void *, const void *, size_t) = memcpy;
>  
> +	early = false;
>  	if (!early) {
>  		write = text_poke;
>  		mutex_lock(&text_mutex);

As per 88fc078a7a8f6 ("x86/module: Use text_poke() for late
relocations"), I'm curious why we have to take the live patching
path all the time now?

  Luis

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC 4/5] vmalloc_exec: share a huge page with kernel text
  2022-08-18 22:42 ` [RFC 4/5] vmalloc_exec: share a huge page with kernel text Song Liu
@ 2022-10-06 23:44   ` Luis Chamberlain
  2022-10-07  6:53     ` Song Liu
  0 siblings, 1 reply; 20+ messages in thread
From: Luis Chamberlain @ 2022-10-06 23:44 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-mm, linux-kernel, akpm, x86, peterz, hch, kernel-team,
	rick.p.edgecombe, dave.hansen

On Thu, Aug 18, 2022 at 03:42:17PM -0700, Song Liu wrote:
> On x86 kernel, we allocate 2MB pages for kernel text up to
> round_down(_etext, 2MB). Therefore, some of the kernel text is still
> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
> round_up(_etext, 2MB), and use the rest of the page for modules and
> BPF programs.
> 
> Here is an example:
> 
> [root@eth50-1 ~]# grep _etext /proc/kallsyms
> ffffffff82202a08 T _etext
> 
> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> 
> [root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
> 
> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
> ffffffff822bc580 t xfs_flush_inodes     [xfs]
> 
> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
> module, and bpf programs.

This is pretty rad. I'm not sure how you were able to squeeze xfs and
*more* into one 2 MiB huge page, though; at least on debian 5.17.0-1-amd64,
xfs is 3.6847 MiB. How big is your XFS module?

I don't grok mm stuff, but I'd like to understand why we gain the ability
to reuse the same 2 MiB page with this patch; from the code I really
can't tell. Any pointers?

But I'm still concerned about the freeing case in terms of
fragmentation of contiguous memory, when free huge pages are available.

  Luis

> ---
>  arch/x86/mm/init_64.c |  3 ++-
>  mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
>  2 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 39c5246964a9..d27d0af5beb5 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>  
>  int kernel_set_to_readonly;
>  
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>  void mark_rodata_ro(void)
>  {
>  	unsigned long start = PFN_ALIGN(_text);
>  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
> -	unsigned long text_end = PFN_ALIGN(_etext);
> +	unsigned long text_end = PMD_ALIGN(_etext);
>  	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>  	unsigned long all_end;
>  
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 472287e71bf1..5f3b5df9313f 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>  static const bool vmap_allow_huge = false;
>  #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>  
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
> +
> +static struct vm_struct text_tail_vm;
> +static struct vmap_area text_tail_va;
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>  	if (addr >= MODULES_VADDR && addr < MODULES_END)
>  		return 1;
> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
> +		return 1;
>  #endif
>  	return is_vmalloc_addr(x);
>  }
> @@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
>  	}
>  }
>  
> +static void register_text_tail_vm(void)
> +{
> +	unsigned long start = PFN_ALIGN(_etext);
> +	unsigned long end = PMD_ALIGN(_etext);
> +	struct vmap_area *va;
> +
> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> +	if (WARN_ON_ONCE(!va))
> +		return;
> +	text_tail_vm.addr = (void *)start;
> +	text_tail_vm.size = end - start;
> +	text_tail_vm.flags = VM_KERNEL_EXEC;
> +	text_tail_va.va_start = start;
> +	text_tail_va.va_end = end;
> +	text_tail_va.vm = &text_tail_vm;
> +	memcpy(va, &text_tail_va, sizeof(*va));
> +	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> +}
> +
>  void __init vmalloc_init(void)
>  {
>  	struct vmap_area *va;
> @@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
>  	 * Create the cache for vmap_area objects.
>  	 */
>  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
> +	register_text_tail_vm();
>  
>  	for_each_possible_cpu(i) {
>  		struct vmap_block_queue *vbq;
> -- 
> 2.30.2
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec
  2022-10-06 23:15   ` Luis Chamberlain
@ 2022-10-07  6:39     ` Song Liu
  0 siblings, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-10-07  6:39 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: Song Liu, Vlastimil Babka, Mel Gorman, Linux-MM, linux-kernel,
	akpm, x86, peterz, hch, Kernel Team, rick.p.edgecombe,
	dave.hansen



> On Oct 6, 2022, at 4:15 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
> 
> On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
>> --- a/mm/nommu.c
>> +++ b/mm/nommu.c
>> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>> }
>> EXPORT_SYMBOL(vm_map_pages_zero);
>> 
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	return NULL;
>> +}
> 
> Well that's not so nice for no-mmu systems. Shouldn't we have a
> fallback?

This is still an early version, so I am not quite sure whether we
need the fallback for no-mmu systems.

> 
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index effd1ff6a4b4..472287e71bf1 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>> 	va->va_end = addr + size;
>> 	va->vm = NULL;
>> 
>> -	spin_lock(&vmap_area_lock);
>> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> -	spin_unlock(&vmap_area_lock);
>> +	if (vm_flags & VM_KERNEL_EXEC) {
>> +		spin_lock(&free_text_area_lock);
>> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
>> +		/* update subtree_max_size now as we need this soon */
>> +		augment_tree_propagate_from(va);
> 
> Sorry, it is not clear to me why its needed only for exec, can you elaborate a
> bit more?

This version was wrong; we should use insert_vmap_area_augment() here.
Actually, I already changed this in the latest version.
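
Roughly, the branch above becomes something like this in the newer version
(sketch):

	if (vm_flags & VM_KERNEL_EXEC) {
		spin_lock(&free_text_area_lock);
		/*
		 * The augmented insert keeps subtree_max_size up to date
		 * as part of the insertion, so the explicit
		 * augment_tree_propagate_from() call goes away.
		 */
		insert_vmap_area_augment(va, NULL, &free_text_area_root,
					 &free_text_area_list);
		spin_unlock(&free_text_area_lock);
	} else {
		spin_lock(&vmap_area_lock);
		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
		spin_unlock(&vmap_area_lock);
	}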

> 
>> +		spin_unlock(&free_text_area_lock);
>> +	} else {
>> +		spin_lock(&vmap_area_lock);
>> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +		spin_unlock(&vmap_area_lock);
>> +	}
>> 
>> 	BUG_ON(!IS_ALIGNED(va->va_start, align));
>> 	BUG_ON(va->va_start < vstart);
> 
> <-- snip -->
> 
>> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
>> }
>> EXPORT_SYMBOL(vmalloc);
>> 
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	struct vmap_area *va, *tmp;
>> +	unsigned long addr;
>> +	enum fit_type type;
>> +	int ret;
>> +
>> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
>> +	if (unlikely(!va))
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +again:
>> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
>> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
>> +				     size, align, 1, false);
>> +
>> +	if (!tmp) {
>> +		unsigned long alloc_size;
>> +		void *ptr;
>> +
>> +		spin_unlock(&free_text_area_lock);
>> +
>> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
>> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
>> +					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
>> +					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
>> +					   NUMA_NO_NODE, __builtin_return_address(0));
> 
> We can review the guard stuff on the other thread with Peter.
> 
>> +		if (unlikely(!ptr)) {
>> +			ret = -ENOMEM;
>> +			goto err_out;
>> +		}
>> +		memset(ptr, 0, alloc_size);
>> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
>> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
> 
> I *really* like that this is now not something users have to muck with thanks!

Well, this pushed some other complexity to the user side, for example, all
those hacks with text_poke in 3/5. 

> 
>> +
>> +		goto again;
>> +	}
>> +
>> +	addr = roundup(tmp->va_start, align);
>> +	type = classify_va_fit_type(tmp, addr, size);
>> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
>> +		addr = -ENOMEM;
>> +		goto err_out;
>> +	}
>> +
>> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
>> +				    tmp, addr, size, type);
>> +	if (ret) {
>> +		addr = ret;
>> +		goto err_out;
>> +	}
>> +	spin_unlock(&free_text_area_lock);
>> +
>> +	va->va_start = addr;
>> +	va->va_end = addr + size;
>> +	va->vm = tmp->vm;
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	return (void *)addr;
>> +
>> +err_out:
>> +	spin_unlock(&free_text_area_lock);
>> +	return ERR_PTR(ret);
>> +}
>> +
>> +void vfree_exec(const void *addr)
>> +{
>> +	struct vmap_area *va;
>> +
>> +	might_sleep();
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
>> +	if (WARN_ON_ONCE(!va)) {
>> +		spin_unlock(&vmap_area_lock);
>> +		return;
>> +	}
>> +
>> +	unlink_va(va, &vmap_area_root);
> 
> Curious why we don't memset to 0 before merge_or_add_vmap_area_augment()?
> I realize other code doesn't seem to do it, though.

We should do the memset here. We will need the text_poke version of it. 
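
On x86 it could look something like this (sketch only; the range is mapped
RO+X, so a plain memset() would fault, and filling with 0xcc is safer than
zeros for executable memory):

static void erase_freed_text(struct vmap_area *va)
{
	/* text_poke_set() takes text_mutex internally */
	text_poke_set((void *)va->va_start, 0xcc,
		      va->va_end - va->va_start);
}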

> 
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	spin_lock(&free_text_area_lock);
>> +	merge_or_add_vmap_area_augment(va,
>> +		&free_text_area_root, &free_text_area_list);
> 
> I have concern that we can be using precious physically contigous memory
> from huge pages to then end up in a situation where we create our own
> pool and allow things to be non-contigous afterwards.
> 
> I'm starting to suspect that if the allocation is > PAGE_SIZE we just
> give it back generally. Otherwise wouldn't the fragmentation cause us
> to eventually just eat up most huge pages available? Probably not for
> eBPF but if we use this on a system with tons of module insertions /
> deletions this seems like it could happen?

Currently, bpf_prog_pack doesn't let allocations > PMD_SIZE share with
smaller allocations. I guess that is similar to the idea here? I am not
sure what the proper threshold for modules is. We can discuss this later.
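
For reference, the size cutoff could be as simple as the sketch below. The
helper for the large case is hypothetical and the threshold value is exactly
the part that needs discussion:

#define EXEC_POOL_MAX_ALLOC	PMD_SIZE	/* threshold TBD */

/*
 * Allocations above the cutoff bypass the shared huge-page pool, so
 * freeing them hands the memory back as a whole and cannot fragment
 * the pool.
 */
void *alloc_dynamic_text(unsigned long size, unsigned long align)
{
	if (size > EXEC_POOL_MAX_ALLOC)
		return vmalloc_exec_standalone(size, align);	/* hypothetical */

	return vmalloc_exec(size, align);	/* shared huge-page pool */
}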

Thanks,
Song

> 
>  Luis
> 
>> +	spin_unlock(&free_text_area_lock);
>> +	/* TODO: when the whole vm_struct is not in use, free it */
>> +}
>> +
>> /**
>>  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>>  * @size:      allocation size
>> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>> 			/* It is a BUG(), but trigger recovery instead. */
>> 			goto recovery;
>> 
>> -		ret = adjust_va_to_fit_type(va, start, size, type);
>> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
>> +					    va, start, size, type);
>> 		if (unlikely(ret))
>> 			goto recovery;
>> 
>> -- 
>> 2.30.2
>> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC 3/5] modules, x86: use vmalloc_exec for module core
  2022-10-06 23:38   ` Luis Chamberlain
@ 2022-10-07  6:46     ` Song Liu
  0 siblings, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-10-07  6:46 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: Song Liu, linux-mm, linux-kernel, akpm, x86, peterz, hch,
	Kernel Team, rick.p.edgecombe, dave.hansen



> On Oct 6, 2022, at 4:38 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
> 
> On Thu, Aug 18, 2022 at 03:42:16PM -0700, Song Liu wrote:
>> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
>> index 100446ffdc1d..570af623e28f 100644
>> --- a/arch/x86/kernel/module.c
>> +++ b/arch/x86/kernel/module.c
>> @@ -229,6 +229,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
>> 	bool early = me->state == MODULE_STATE_UNFORMED;
>> 	void *(*write)(void *, const void *, size_t) = memcpy;
>> 
>> +	early = false;
>> 	if (!early) {
>> 		write = text_poke;
>> 		mutex_lock(&text_mutex);
> 
> As per 88fc078a7a8f6 ("x86/module: Use text_poke() for late
> relocations") I'm curious why we have to take the live patching
> path now all the time?

Since vmalloc_exec() returns read-only memory, we need text_poke()
for any write to it.
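
To make that concrete, the write path for a single relocation ends up
looking roughly like this (simplified sketch of what 3/5 does; the real
code takes text_mutex once around the whole relocation loop):

static void write_reloc(void *loc, u64 val, int size)
{
	/* the module core is read-only, so no plain memcpy() here */
	mutex_lock(&text_mutex);
	text_poke(loc, &val, size);
	mutex_unlock(&text_mutex);
}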

Does this answer your question?

Song

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC 4/5] vmalloc_exec: share a huge page with kernel text
  2022-10-06 23:44   ` Luis Chamberlain
@ 2022-10-07  6:53     ` Song Liu
  0 siblings, 0 replies; 20+ messages in thread
From: Song Liu @ 2022-10-07  6:53 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: Song Liu, linux-mm, linux-kernel, akpm, x86, peterz, hch,
	Kernel Team, rick.p.edgecombe, dave.hansen



> On Oct 6, 2022, at 4:44 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
> 
> On Thu, Aug 18, 2022 at 03:42:17PM -0700, Song Liu wrote:
>> On x86 kernel, we allocate 2MB pages for kernel text up to
>> round_down(_etext, 2MB). Therefore, some of the kernel text is still
>> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
>> round_up(_etext, 2MB), and use the rest of the page for modules and
>> BPF programs.
>> 
>> Here is an example:
>> 
>> [root@eth50-1 ~]# grep _etext /proc/kallsyms
>> ffffffff82202a08 T _etext
>> 
>> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
>> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
>> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
>> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
>> 
>> [root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
>> 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
>> 
>> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
>> ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
>> ffffffff822bc580 t xfs_flush_inodes     [xfs]
>> 
>> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
>> module, and bpf programs.
> 
> This is pretty rad. I'm not sure how you were able to squeeze xfs and
> *more* into one 2 MiB huge page though at least on debian 5.17.0-1-amd64
> xfs is 3.6847 MiB. How big is your XFS module?

In my build, xfs.ko is 50MB before strip, and 3.1MB after strip. But the
text section is about 1.3MB, so it fits in one 2MB page. 

> 
> I don't grok mm stuff, but I'd like to understand why we gain the ability
> of re-use the same 2 MiB page with this patch, from the code I really
> can't tail. Any pointers?

I don't quite follow the question here. In this case, we allocate one more
2MB page so that some static kernel text can use it and share it with
dynamic kernel text. Does this answer your question?
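
To put numbers on it, with the addresses from the example quoted above:

/* Illustrative arithmetic:
 *   _etext            = 0xffffffff82202a08
 *   PFN_ALIGN(_etext) = 0xffffffff82203000   start of the reusable tail
 *   PMD_ALIGN(_etext) = 0xffffffff82400000   end of the 2MB text page
 * => 0x1fd000 bytes (~2036 kB) of the last kernel-text huge page become
 *    available to vmalloc_exec for modules and BPF programs.
 */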

I am working on a newer version of this. I am planning to resend when it 
is stable for BPF programs. For modules, I think we will need more 
discussion about the interface with arch code. 

Thanks,
Song

> 
> But I'm still concerned about the freeing case in terms of
> fragmentation of contiguous memory, when free huge pages are available.
> 
>  Luis
> 
>> ---
>> arch/x86/mm/init_64.c |  3 ++-
>> mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
>> 2 files changed, 29 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index 39c5246964a9..d27d0af5beb5 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>> 
>> int kernel_set_to_readonly;
>> 
>> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>> void mark_rodata_ro(void)
>> {
>> 	unsigned long start = PFN_ALIGN(_text);
>> 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>> 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
>> -	unsigned long text_end = PFN_ALIGN(_etext);
>> +	unsigned long text_end = PMD_ALIGN(_etext);
>> 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>> 	unsigned long all_end;
>> 
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index 472287e71bf1..5f3b5df9313f 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>> static const bool vmap_allow_huge = false;
>> #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>> 
>> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>> +
>> +static struct vm_struct text_tail_vm;
>> +static struct vmap_area text_tail_va;
>> +
>> bool is_vmalloc_addr(const void *x)
>> {
>> 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> @@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
>> 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> 	if (addr >= MODULES_VADDR && addr < MODULES_END)
>> 		return 1;
>> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
>> +		return 1;
>> #endif
>> 	return is_vmalloc_addr(x);
>> }
>> @@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
>> 	}
>> }
>> 
>> +static void register_text_tail_vm(void)
>> +{
>> +	unsigned long start = PFN_ALIGN(_etext);
>> +	unsigned long end = PMD_ALIGN(_etext);
>> +	struct vmap_area *va;
>> +
>> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
>> +	if (WARN_ON_ONCE(!va))
>> +		return;
>> +	text_tail_vm.addr = (void *)start;
>> +	text_tail_vm.size = end - start;
>> +	text_tail_vm.flags = VM_KERNEL_EXEC;
>> +	text_tail_va.va_start = start;
>> +	text_tail_va.va_end = end;
>> +	text_tail_va.vm = &text_tail_vm;
>> +	memcpy(va, &text_tail_va, sizeof(*va));
>> +	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
>> +}
>> +
>> void __init vmalloc_init(void)
>> {
>> 	struct vmap_area *va;
>> @@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
>> 	 * Create the cache for vmap_area objects.
>> 	 */
>> 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
>> +	register_text_tail_vm();
>> 
>> 	for_each_possible_cpu(i) {
>> 		struct vmap_block_queue *vbq;
>> -- 
>> 2.30.2
>> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-10-07  6:54 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
2022-08-18 22:42 ` [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec Song Liu
2022-10-06 23:15   ` Luis Chamberlain
2022-10-07  6:39     ` Song Liu
2022-08-18 22:42 ` [RFC 2/5] bpf: use vmalloc_exec Song Liu
2022-08-18 22:42 ` [RFC 3/5] modules, x86: use vmalloc_exec for module core Song Liu
2022-10-06 23:38   ` Luis Chamberlain
2022-10-07  6:46     ` Song Liu
2022-08-18 22:42 ` [RFC 4/5] vmalloc_exec: share a huge page with kernel text Song Liu
2022-10-06 23:44   ` Luis Chamberlain
2022-10-07  6:53     ` Song Liu
2022-08-18 22:42 ` [RFC 5/5] vmalloc: vfree_exec: free unused vm_struct Song Liu
2022-08-22 15:46 ` [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
2022-08-22 16:34   ` Peter Zijlstra
2022-08-22 16:56     ` Song Liu
2022-08-23  5:42       ` Peter Zijlstra
2022-08-23  6:39         ` Christophe Leroy
2022-08-23  6:57           ` Song Liu
2022-08-23  6:55         ` Song Liu
2022-08-24 17:06       ` Song Liu
