* [PATCH v7 bpf-next 1/9] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 2/9] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem Song Liu
` (7 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
From: Song Liu <songliubraving@fb.com>
This enables module_alloc() to allocate huge pages for 2MB+ requests.
To check the effect of this change, we need to enable
CONFIG_PTDUMP_DEBUGFS and call module_alloc(2MB). Before the change,
/sys/kernel/debug/page_tables/kernel shows pte for this map. With the
change, /sys/kernel/debug/page_tables/kernel shows pmd for this map.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
arch/x86/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8910b09b5601..b5d1582ea848 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -158,6 +158,7 @@ config X86
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
+ select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 2/9] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 1/9] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 3/9] bpf: use size instead of pages in bpf_binary_header Song Liu
` (6 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
From: Song Liu <songliubraving@fb.com>
This enables sub-page memory charge and allocation.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/bpf.h | 4 ++--
kernel/bpf/core.c | 17 ++++++++---------
kernel/bpf/trampoline.c | 6 +++---
3 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e8ec8d2f2fe3..28ac752f21ba 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -838,8 +838,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
-int bpf_jit_charge_modmem(u32 pages);
-void bpf_jit_uncharge_modmem(u32 pages);
+int bpf_jit_charge_modmem(u32 size);
+void bpf_jit_uncharge_modmem(u32 size);
bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
#else
static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0a1cfd8544b9..d96ad87f0a2c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -833,12 +833,11 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
if (!bpf_capable()) {
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -846,9 +845,9 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -879,11 +878,11 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
@@ -906,7 +905,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
u32 pages = hdr->pages;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
}
/* This symbol is only overridden by archs that have different
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4b6974a195c1..e76a488c09c3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(1);
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
goto out_free_im;
@@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
out_free_image:
bpf_jit_free_exec(im->image);
out_uncharge:
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
out_free_im:
kfree(im);
out:
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 3/9] bpf: use size instead of pages in bpf_binary_header
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 1/9] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 2/9] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 4/9] bpf: use prog->jited_len in bpf_prog_ksym_set_addr() Song Liu
` (5 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
From: Song Liu <songliubraving@fb.com>
This is necessary to charge sub page memory for the BPF program.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/filter.h | 6 +++---
kernel/bpf/core.c | 11 +++++------
2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d23e999dc032..5855eb474c62 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -548,7 +548,7 @@ struct sock_fprog_kern {
#define BPF_IMAGE_ALIGNMENT 8
struct bpf_binary_header {
- u32 pages;
+ u32 size;
u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};
@@ -886,8 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
set_vm_flush_reset_perms(hdr);
- set_memory_ro((unsigned long)hdr, hdr->pages);
- set_memory_x((unsigned long)hdr, hdr->pages);
+ set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
+ set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
}
static inline struct bpf_binary_header *
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d96ad87f0a2c..69f348d9f816 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -543,7 +543,7 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog)
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = addr + hdr->size;
}
static void
@@ -866,7 +866,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -876,7 +876,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
if (bpf_jit_charge_modmem(size))
return NULL;
@@ -889,7 +888,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -902,10 +901,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
+ bpf_jit_uncharge_modmem(size);
}
/* This symbol is only overridden by archs that have different
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 4/9] bpf: use prog->jited_len in bpf_prog_ksym_set_addr()
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
` (2 preceding siblings ...)
2022-01-28 23:45 ` [PATCH v7 bpf-next 3/9] bpf: use size instead of pages in bpf_binary_header Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 5/9] x86/alternative: introduce text_poke_copy Song Liu
` (4 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
Using prog->jited_len is simpler and more accurate than current
estimation (header + header->size).
Signed-off-by: Song Liu <song@kernel.org>
---
kernel/bpf/core.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 69f348d9f816..7cbcf6bfbb52 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -537,13 +537,10 @@ long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
- unsigned long addr = (unsigned long)hdr;
-
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->size;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 5/9] x86/alternative: introduce text_poke_copy
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
` (3 preceding siblings ...)
2022-01-28 23:45 ` [PATCH v7 bpf-next 4/9] bpf: use prog->jited_len in bpf_prog_ksym_set_addr() Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 6/9] bpf: introduce bpf_arch_text_copy Song Liu
` (3 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
This will be used by the BPF JIT compiler to dump JITed binaries to an RX huge
page, and thus allow multiple BPF programs to share a huge (2MB) page.
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Song Liu <song@kernel.org>
---
arch/x86/include/asm/text-patching.h | 1 +
arch/x86/kernel/alternative.c | 32 ++++++++++++++++++++++++++++
2 files changed, 33 insertions(+)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index b7421780e4e9..4cc18ba1b75e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
+extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 23fb4d51a5da..903a415c19fa 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1102,6 +1102,38 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
return __text_poke(addr, opcode, len);
}
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing functions
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke((void *)ptr, opcode + patched, s);
+ patched += s;
+ }
+ return addr;
+}
+
static void do_sync_core(void *info)
{
sync_core();
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 6/9] bpf: introduce bpf_arch_text_copy
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
` (4 preceding siblings ...)
2022-01-28 23:45 ` [PATCH v7 bpf-next 5/9] x86/alternative: introduce text_poke_copy Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 7/9] bpf: introduce bpf_prog_pack allocator Song Liu
` (2 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
This will be used to copy JITed text to RO protected module memory. On
x86, bpf_arch_text_copy is implemented with text_poke_copy.
bpf_arch_text_copy returns pointer to dst on success, and ERR_PTR(errno)
on errors.
Signed-off-by: Song Liu <song@kernel.org>
---
arch/x86/net/bpf_jit_comp.c | 7 +++++++
include/linux/bpf.h | 2 ++
kernel/bpf/core.c | 5 +++++
3 files changed, 14 insertions(+)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ce1f86f245c9..9792bf10d881 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2413,3 +2413,10 @@ bool bpf_jit_supports_kfunc_call(void)
{
return true;
}
+
+void *bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ if (text_poke_copy(dst, src, len) == NULL)
+ return ERR_PTR(-EINVAL);
+ return dst;
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 28ac752f21ba..7f58fe256671 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2350,6 +2350,8 @@ enum bpf_text_poke_type {
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
+void *bpf_arch_text_copy(void *dst, void *src, size_t len);
+
struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7cbcf6bfbb52..dc0142e20c72 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2448,6 +2448,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 7/9] bpf: introduce bpf_prog_pack allocator
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
` (5 preceding siblings ...)
2022-01-28 23:45 ` [PATCH v7 bpf-next 6/9] bpf: introduce bpf_arch_text_copy Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-02-01 0:06 ` Daniel Borkmann
2022-01-28 23:45 ` [PATCH v7 bpf-next 8/9] bpf: introduce bpf_jit_binary_pack_[alloc|finalize|free] Song Liu
2022-01-28 23:45 ` [PATCH v7 bpf-next 9/9] bpf, x86_64: use bpf_jit_binary_pack_alloc Song Liu
8 siblings, 1 reply; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this could add significant
pressure to instruction TLB.
Introduce bpf_prog_pack allocator to pack multiple BPF programs in a huge
page. The memory is then allocated in 64 byte chunks.
Memory allocated by the bpf_prog_pack allocator is RO protected after initial
allocation. To write to it, the user (JIT engine) needs to use the text poke
API.
Signed-off-by: Song Liu <song@kernel.org>
---
kernel/bpf/core.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 127 insertions(+)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dc0142e20c72..25e34caa9a95 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -805,6 +805,133 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a whole page for each
+ * program is sometimes a waste. Many small bpf programs also add pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
+};
+
+#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+ struct bpf_prog_pack *pack;
+
+ pack = kzalloc(sizeof(*pack), GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+ if (!pack->ptr) {
+ kfree(pack);
+ return NULL;
+ }
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+ list_add_tail(&pack->list, &pack_list);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ if (size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = module_alloc(size);
+ if (ptr) {
+ set_vm_flush_reset_perms(ptr);
+ set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
+ set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+ }
+ return ptr;
+ }
+ mutex_lock(&pack_mutex);
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ nbits, 0);
+ if (pos < BPF_PROG_CHUNK_COUNT)
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack();
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+ void *pack_ptr;
+
+ if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+ module_memfree(hdr);
+ return;
+ }
+
+ pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
+ mutex_lock(&pack_mutex);
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (tmp->ptr == pack_ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+ pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
+ list_del(&pack->list);
+ module_memfree(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v7 bpf-next 7/9] bpf: introduce bpf_prog_pack allocator
2022-01-28 23:45 ` [PATCH v7 bpf-next 7/9] bpf: introduce bpf_prog_pack allocator Song Liu
@ 2022-02-01 0:06 ` Daniel Borkmann
2022-02-01 1:34 ` Song Liu
0 siblings, 1 reply; 14+ messages in thread
From: Daniel Borkmann @ 2022-02-01 0:06 UTC (permalink / raw)
To: Song Liu, bpf, netdev, linux-kernel
Cc: ast, andrii, kernel-team, peterz, x86, iii, npiggin
On 1/29/22 12:45 AM, Song Liu wrote:
> Most BPF programs are small, but they consume a page each. For systems
> with busy traffic and many BPF programs, this could add significant
> pressure to instruction TLB.
>
> Introduce bpf_prog_pack allocator to pack multiple BPF programs in a huge
> page. The memory is then allocated in 64 byte chunks.
>
> Memory allocated by bpf_prog_pack allocator is RO protected after initial
> allocation. To write to it, the user (jit engine) need to use text poke
> API.
Did you benchmark the program load times under this API, e.g. how much
overhead is expected for very large programs?
> Signed-off-by: Song Liu <song@kernel.org>
> ---
> kernel/bpf/core.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 127 insertions(+)
>
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index dc0142e20c72..25e34caa9a95 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -805,6 +805,133 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
> return slot;
> }
>
> +/*
> + * BPF program pack allocator.
> + *
> + * Most BPF programs are pretty small. Allocating a hole page for each
> + * program is sometime a waste. Many small bpf program also adds pressure
> + * to instruction TLB. To solve this issue, we introduce a BPF program pack
> + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
> + * to host BPF programs.
> + */
> +#define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE
> +#define BPF_PROG_CHUNK_SHIFT 6
> +#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
> +#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
> +#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
> +
> +struct bpf_prog_pack {
> + struct list_head list;
> + void *ptr;
> + unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
> +};
> +
> +#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE
> +#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
> +
> +static DEFINE_MUTEX(pack_mutex);
> +static LIST_HEAD(pack_list);
> +
> +static struct bpf_prog_pack *alloc_new_pack(void)
> +{
> + struct bpf_prog_pack *pack;
> +
> + pack = kzalloc(sizeof(*pack), GFP_KERNEL);
> + if (!pack)
> + return NULL;
> + pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
> + if (!pack->ptr) {
> + kfree(pack);
> + return NULL;
> + }
> + bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
> + list_add_tail(&pack->list, &pack_list);
> +
> + set_vm_flush_reset_perms(pack->ptr);
> + set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
> + set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
> + return pack;
> +}
> +
> +static void *bpf_prog_pack_alloc(u32 size)
> +{
> + unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
> + struct bpf_prog_pack *pack;
> + unsigned long pos;
> + void *ptr = NULL;
> +
> + if (size > BPF_PROG_MAX_PACK_PROG_SIZE) {
> + size = round_up(size, PAGE_SIZE);
> + ptr = module_alloc(size);
> + if (ptr) {
> + set_vm_flush_reset_perms(ptr);
> + set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
> + set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
> + }
> + return ptr;
> + }
> + mutex_lock(&pack_mutex);
> + list_for_each_entry(pack, &pack_list, list) {
> + pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
> + nbits, 0);
> + if (pos < BPF_PROG_CHUNK_COUNT)
> + goto found_free_area;
> + }
> +
> + pack = alloc_new_pack();
> + if (!pack)
> + goto out;
Will this effectively disable the JIT for all bpf_prog_pack_alloc requests <=
BPF_PROG_MAX_PACK_PROG_SIZE when vmap_allow_huge is false (e.g. boot param via
nohugevmalloc) ?
> + pos = 0;
> +
> +found_free_area:
> + bitmap_set(pack->bitmap, pos, nbits);
> + ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
> +
> +out:
> + mutex_unlock(&pack_mutex);
> + return ptr;
> +}
> +
> +static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
> +{
> + struct bpf_prog_pack *pack = NULL, *tmp;
> + unsigned int nbits;
> + unsigned long pos;
> + void *pack_ptr;
> +
> + if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) {
> + module_memfree(hdr);
> + return;
> + }
> +
> + pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
> + mutex_lock(&pack_mutex);
> +
> + list_for_each_entry(tmp, &pack_list, list) {
> + if (tmp->ptr == pack_ptr) {
> + pack = tmp;
> + break;
> + }
> + }
> +
> + if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
> + goto out;
> +
> + nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
> + pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
> +
> + bitmap_clear(pack->bitmap, pos, nbits);
> + if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
> + BPF_PROG_CHUNK_COUNT, 0) == 0) {
> + list_del(&pack->list);
> + module_memfree(pack->ptr);
> + kfree(pack);
> + }
> +out:
> + mutex_unlock(&pack_mutex);
> +}
> +
> static atomic_long_t bpf_jit_current;
>
> /* Can be overridden by an arch's JIT compiler if it has a custom,
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 bpf-next 7/9] bpf: introduce bpf_prog_pack allocator
2022-02-01 0:06 ` Daniel Borkmann
@ 2022-02-01 1:34 ` Song Liu
0 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-02-01 1:34 UTC (permalink / raw)
To: Daniel Borkmann
Cc: bpf, Networking, open list, Alexei Starovoitov, Andrii Nakryiko,
Kernel Team, Peter Zijlstra, X86 ML, Ilya Leoshkevich,
Nicholas Piggin
On Mon, Jan 31, 2022 at 4:06 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 1/29/22 12:45 AM, Song Liu wrote:
> > Most BPF programs are small, but they consume a page each. For systems
> > with busy traffic and many BPF programs, this could add significant
> > pressure to instruction TLB.
> >
> > Introduce bpf_prog_pack allocator to pack multiple BPF programs in a huge
> > page. The memory is then allocated in 64 byte chunks.
> >
> > Memory allocated by bpf_prog_pack allocator is RO protected after initial
> > allocation. To write to it, the user (jit engine) need to use text poke
> > API.
>
> Did you benchmark the program load times under this API, e.g. how much
> overhead is expected for very large programs?
For the two scale tests in test_verifier:
./test_verifier 965 966
#965/p scale: scale test 1 OK
#966/p scale: scale test 2 OK
The runtime is about 0.6 seconds before the patch set and 0.7 seconds after.
Is this a good benchmark?
>
> > Signed-off-by: Song Liu <song@kernel.org>
> > ---
> > kernel/bpf/core.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 127 insertions(+)
> >
[...]
> > + }
> > + mutex_lock(&pack_mutex);
> > + list_for_each_entry(pack, &pack_list, list) {
> > + pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
> > + nbits, 0);
> > + if (pos < BPF_PROG_CHUNK_COUNT)
> > + goto found_free_area;
> > + }
> > +
> > + pack = alloc_new_pack();
> > + if (!pack)
> > + goto out;
>
> Will this effectively disable the JIT for all bpf_prog_pack_alloc requests <=
> BPF_PROG_MAX_PACK_PROG_SIZE when vmap_allow_huge is false (e.g. boot param via
> nohugevmalloc) ?
This won't disable JIT. It will just allocate 512x 4k pages for a 2MB pack. We
will mark the whole 2MB RO, same as a 2MB huge page. We still benefit
from this as this avoids poking the linear mapping (1GB pages) to 4kB pages
with set_memory_ro().
Thanks,
Song
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 8/9] bpf: introduce bpf_jit_binary_pack_[alloc|finalize|free]
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
` (6 preceding siblings ...)
2022-01-28 23:45 ` [PATCH v7 bpf-next 7/9] bpf: introduce bpf_prog_pack allocator Song Liu
@ 2022-01-28 23:45 ` Song Liu
2022-02-01 0:21 ` Daniel Borkmann
2022-01-28 23:45 ` [PATCH v7 bpf-next 9/9] bpf, x86_64: use bpf_jit_binary_pack_alloc Song Liu
8 siblings, 1 reply; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
From: Song Liu <songliubraving@fb.com>
This is the jit binary allocator built on top of bpf_prog_pack.
bpf_prog_pack allocates RO memory, which cannot be used directly by the
JIT engine. Therefore, a temporary rw buffer is allocated for the JIT
engine. Once JIT is done, bpf_jit_binary_pack_finalize is used to copy
the program to the RO memory.
bpf_jit_binary_pack_alloc reserves 16 bytes of extra space for illegal
instructions, which is smaller than the 128 bytes of space reserved by
bpf_jit_binary_alloc. This change is necessary for bpf_jit_binary_hdr
to find the correct header. Also, flag use_bpf_prog_pack is added to
differentiate a program allocated by bpf_jit_binary_pack_alloc.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/bpf.h | 1 +
include/linux/filter.h | 21 ++++----
kernel/bpf/core.c | 108 ++++++++++++++++++++++++++++++++++++++++-
3 files changed, 120 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7f58fe256671..06d119c472e7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -945,6 +945,7 @@ struct bpf_prog_aux {
bool sleepable;
bool tail_call_reachable;
bool xdp_has_frags;
+ bool use_bpf_prog_pack;
struct hlist_node tramp_hlist;
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5855eb474c62..1cb1af917617 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -890,15 +890,6 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
}
-static inline struct bpf_binary_header *
-bpf_jit_binary_hdr(const struct bpf_prog *fp)
-{
- unsigned long real_start = (unsigned long)fp->bpf_func;
- unsigned long addr = real_start & PAGE_MASK;
-
- return (void *)addr;
-}
-
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
@@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_hdr,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns);
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+ struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header);
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header);
+
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
struct bpf_jit_poke_descriptor *poke);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 25e34caa9a95..ff0c51ef1cb7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
bpf_jit_uncharge_modmem(size);
}
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_prog_pack_free(ro_header);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = (get_random_int() % hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+ struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header);
+ return PTR_ERR(ptr);
+ }
+ prog->aux->use_bpf_prog_pack = true;
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after bpf_jit_binary_pack_finalize;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ * Also, ro_header->size in 2) is not properly set yet, so rw_header->size
+ * is used for uncharge.
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = rw_header ? rw_header->size : ro_header->size;
+
+ bpf_prog_pack_free(ro_header);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ if (fp->aux->use_bpf_prog_pack)
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ else
+ addr = real_start & PAGE_MASK;
+
+ return (void *)addr;
+}
+
/* This symbol is only overridden by archs that have different
* requirements than the usual eBPF JITs, f.e. when they only
* implement cBPF JIT, do not set images read-only, etc.
@@ -1040,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- bpf_jit_binary_free(hdr);
+ if (fp->aux->use_bpf_prog_pack)
+ bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
+ else
+ bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v7 bpf-next 8/9] bpf: introduce bpf_jit_binary_pack_[alloc|finalize|free]
2022-01-28 23:45 ` [PATCH v7 bpf-next 8/9] bpf: introduce bpf_jit_binary_pack_[alloc|finalize|free] Song Liu
@ 2022-02-01 0:21 ` Daniel Borkmann
2022-02-01 0:35 ` Song Liu
0 siblings, 1 reply; 14+ messages in thread
From: Daniel Borkmann @ 2022-02-01 0:21 UTC (permalink / raw)
To: Song Liu, bpf, netdev, linux-kernel
Cc: ast, andrii, kernel-team, peterz, x86, iii, Song Liu
On 1/29/22 12:45 AM, Song Liu wrote:
[...]
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 25e34caa9a95..ff0c51ef1cb7 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
> bpf_jit_uncharge_modmem(size);
> }
>
> +/* Allocate jit binary from bpf_prog_pack allocator.
> + * Since the allocated meory is RO+X, the JIT engine cannot write directly
nit: meory
> + * to the memory. To solve this problem, a RW buffer is also allocated at
> + * as the same time. The JIT engine should calculate offsets based on the
> + * RO memory address, but write JITed program to the RW buffer. Once the
> + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
> + * the JITed program to the RO memory.
> + */
> +struct bpf_binary_header *
> +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
> + unsigned int alignment,
> + struct bpf_binary_header **rw_header,
> + u8 **rw_image,
> + bpf_jit_fill_hole_t bpf_fill_ill_insns)
> +{
> + struct bpf_binary_header *ro_header;
> + u32 size, hole, start;
> +
> + WARN_ON_ONCE(!is_power_of_2(alignment) ||
> + alignment > BPF_IMAGE_ALIGNMENT);
> +
> + /* add 16 bytes for a random section of illegal instructions */
> + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
> +
> + if (bpf_jit_charge_modmem(size))
> + return NULL;
> + ro_header = bpf_prog_pack_alloc(size);
> + if (!ro_header) {
> + bpf_jit_uncharge_modmem(size);
> + return NULL;
> + }
> +
> + *rw_header = kvmalloc(size, GFP_KERNEL);
> + if (!*rw_header) {
> + bpf_prog_pack_free(ro_header);
> + bpf_jit_uncharge_modmem(size);
> + return NULL;
> + }
> +
> + /* Fill space with illegal/arch-dep instructions. */
> + bpf_fill_ill_insns(*rw_header, size);
> + (*rw_header)->size = size;
> +
> + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
> + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
> + start = (get_random_int() % hole) & ~(alignment - 1);
> +
> + *image_ptr = &ro_header->image[start];
> + *rw_image = &(*rw_header)->image[start];
> +
> + return ro_header;
> +}
> +
> +/* Copy JITed text from rw_header to its final location, the ro_header. */
> +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
> + struct bpf_binary_header *ro_header,
> + struct bpf_binary_header *rw_header)
> +{
> + void *ptr;
> +
> + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
Does this need to be wrapped with a text_mutex lock/unlock pair given
text_poke_copy() internally relies on __text_poke() ?
> + kvfree(rw_header);
> +
> + if (IS_ERR(ptr)) {
> + bpf_prog_pack_free(ro_header);
> + return PTR_ERR(ptr);
> + }
> + prog->aux->use_bpf_prog_pack = true;
> + return 0;
> +}
> +
[...]
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v7 bpf-next 8/9] bpf: introduce bpf_jit_binary_pack_[alloc|finalize|free]
2022-02-01 0:21 ` Daniel Borkmann
@ 2022-02-01 0:35 ` Song Liu
0 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-02-01 0:35 UTC (permalink / raw)
To: Daniel Borkmann
Cc: bpf, Networking, open list, Alexei Starovoitov, Andrii Nakryiko,
Kernel Team, Peter Zijlstra, X86 ML, Ilya Leoshkevich, Song Liu
On Mon, Jan 31, 2022 at 4:21 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 1/29/22 12:45 AM, Song Liu wrote:
> [...]
[...]
> > +}
> > +
> > +/* Copy JITed text from rw_header to its final location, the ro_header. */
> > +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
> > + struct bpf_binary_header *ro_header,
> > + struct bpf_binary_header *rw_header)
> > +{
> > + void *ptr;
> > +
> > + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
>
> Does this need to be wrapped with a text_mutex lock/unlock pair given
> text_poke_copy() internally relies on __text_poke() ?
Yes... Good catch. I guess we may do the lock in text_poke_copy().
Thanks,
Song
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v7 bpf-next 9/9] bpf, x86_64: use bpf_jit_binary_pack_alloc
2022-01-28 23:45 [PATCH v7 bpf-next 0/9] bpf_prog_pack allocator Song Liu
` (7 preceding siblings ...)
2022-01-28 23:45 ` [PATCH v7 bpf-next 8/9] bpf: introduce bpf_jit_binary_pack_[alloc|finalize|free] Song Liu
@ 2022-01-28 23:45 ` Song Liu
8 siblings, 0 replies; 14+ messages in thread
From: Song Liu @ 2022-01-28 23:45 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, iii, Song Liu
From: Song Liu <songliubraving@fb.com>
Use bpf_jit_binary_pack_alloc in x86_64 jit. The jit engine first writes
the program to the rw buffer. When the jit is done, the program is copied
to the final location with bpf_jit_binary_pack_finalize.
Note that we need to do bpf_tail_call_direct_fixup after finalize.
Therefore, the text_live = false logic in __bpf_arch_text_poke is no
longer needed.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
arch/x86/net/bpf_jit_comp.c | 59 +++++++++++++++++++------------------
1 file changed, 31 insertions(+), 28 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 9792bf10d881..afb957e63e3d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -330,8 +330,7 @@ static int emit_jump(u8 **pprog, void *func, void *ip)
}
static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
- void *old_addr, void *new_addr,
- const bool text_live)
+ void *old_addr, void *new_addr)
{
const u8 *nop_insn = x86_nops[5];
u8 old_insn[X86_PATCH_SIZE];
@@ -365,10 +364,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
goto out;
ret = 1;
if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
- if (text_live)
- text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
- else
- memcpy(ip, new_insn, X86_PATCH_SIZE);
+ text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
ret = 0;
}
out:
@@ -384,7 +380,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
/* BPF poking in modules is not supported */
return -EINVAL;
- return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
+ return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
}
#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8)
@@ -558,24 +554,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
mutex_lock(&array->aux->poke_mutex);
target = array->ptrs[poke->tail_call.key];
if (target) {
- /* Plain memcpy is used when image is not live yet
- * and still not locked as read-only. Once poke
- * location is active (poke->tailcall_target_stable),
- * any parallel bpf_arch_text_poke() might occur
- * still on the read-write image until we finally
- * locked it as read-only. Both modifications on
- * the given image are under text_mutex to avoid
- * interference.
- */
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP, NULL,
(u8 *)target->bpf_func +
- poke->adj_off, false);
+ poke->adj_off);
BUG_ON(ret < 0);
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
(u8 *)poke->tailcall_target +
- X86_PATCH_SIZE, NULL, false);
+ X86_PATCH_SIZE, NULL);
BUG_ON(ret < 0);
}
WRITE_ONCE(poke->tailcall_target_stable, true);
@@ -867,7 +854,7 @@ static void emit_nops(u8 **pprog, int len)
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
@@ -894,8 +881,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
push_callee_regs(&prog, callee_regs_used);
ilen = prog - temp;
- if (image)
- memcpy(image + proglen, temp, ilen);
+ if (rw_image)
+ memcpy(rw_image + proglen, temp, ilen);
proglen += ilen;
addrs[0] = proglen;
prog = temp;
@@ -1324,8 +1311,10 @@ st: if (is_imm8(insn->off))
pr_err("extable->insn doesn't fit into 32-bit\n");
return -EFAULT;
}
- ex->insn = delta;
+ /* switch ex to rw buffer for writes */
+ ex = (void *)rw_image + ((void *)ex - (void *)image);
+ ex->insn = delta;
ex->type = EX_TYPE_BPF;
if (dst_reg > BPF_REG_9) {
@@ -1706,7 +1695,7 @@ st: if (is_imm8(insn->off))
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
- memcpy(image + proglen, temp, ilen);
+ memcpy(rw_image + proglen, temp, ilen);
}
proglen += ilen;
addrs[i] = proglen;
@@ -2247,6 +2236,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
}
struct x64_jit_data {
+ struct bpf_binary_header *rw_header;
struct bpf_binary_header *header;
int *addrs;
u8 *image;
@@ -2259,6 +2249,7 @@ struct x64_jit_data {
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
+ struct bpf_binary_header *rw_header = NULL;
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
@@ -2267,6 +2258,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bool tmp_blinded = false;
bool extra_pass = false;
bool padding = false;
+ u8 *rw_image = NULL;
u8 *image = NULL;
int *addrs;
int pass;
@@ -2302,6 +2294,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
oldproglen = jit_data->proglen;
image = jit_data->image;
header = jit_data->header;
+ rw_header = jit_data->rw_header;
+ rw_image = (void *)rw_header + ((void *)image - (void *)header);
extra_pass = true;
padding = true;
goto skip_init_addrs;
@@ -2332,12 +2326,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
for (pass = 0; pass < MAX_PASSES || image; pass++) {
if (!padding && pass >= PADDING_PASSES)
padding = true;
- proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
+ proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding);
if (proglen <= 0) {
out_image:
image = NULL;
if (header)
- bpf_jit_binary_free(header);
+ bpf_jit_binary_pack_free(header, rw_header);
prog = orig_prog;
goto out_addrs;
}
@@ -2361,8 +2355,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
sizeof(struct exception_table_entry);
/* allocate module memory for x86 insns and extable */
- header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
- &image, align, jit_fill_hole);
+ header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size,
+ &image, align, &rw_header, &rw_image,
+ jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
@@ -2378,14 +2373,22 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (image) {
if (!prog->is_func || extra_pass) {
+ /*
+ * bpf_jit_binary_pack_finalize fails in two scenarios:
+ * 1) header is not pointing to proper module memory;
+ * 2) the arch doesn't support bpf_arch_text_copy().
+ *
+ * Both cases are serious bugs, so we should not continue.
+ */
+ BUG_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header));
bpf_tail_call_direct_fixup(prog);
- bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
+ jit_data->rw_header = rw_header;
}
prog->bpf_func = (void *)image;
prog->jited = 1;
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread