* [PATCH v3 bpf-next 1/7] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP
2022-01-06 2:25 [PATCH v3 bpf-next 0/7] bpf_prog_pack allocator Song Liu
@ 2022-01-06 2:25 ` Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem Song Liu
` (5 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2022-01-06 2:25 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
This enables module_alloc() to allocate huge page for 2MB+ requests.
To check the difference of this change, we need to enable the config
CONFIG_PTDUMP_DEBUGFS and call module_alloc(2MB). Before the change,
/sys/kernel/debug/page_tables/kernel shows a pte for this map. With the
change, /sys/kernel/debug/page_tables/ shows a pmd for this map.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
arch/x86/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5c2ccb85f2ef..21c4db9475a8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -158,6 +158,7 @@ config X86
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
+ select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
--
2.30.2
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v3 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem
2022-01-06 2:25 [PATCH v3 bpf-next 0/7] bpf_prog_pack allocator Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 1/7] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP Song Liu
@ 2022-01-06 2:25 ` Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 3/7] bpf: use size instead of pages in bpf_binary_header Song Liu
` (4 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2022-01-06 2:25 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
This enables sub-page memory charge and allocation.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/bpf.h | 4 ++--
kernel/bpf/core.c | 19 +++++++++----------
kernel/bpf/trampoline.c | 6 +++---
3 files changed, 14 insertions(+), 15 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 26753139d5b4..4d2cc13e9bf3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -828,8 +828,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
-int bpf_jit_charge_modmem(u32 pages);
-void bpf_jit_uncharge_modmem(u32 pages);
+int bpf_jit_charge_modmem(u32 size);
+void bpf_jit_uncharge_modmem(u32 size);
bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
#else
static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index de3e5bc6781f..495e3b2c36ff 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -808,7 +808,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
-static atomic_long_t bpf_jit_current;
+static atomic64_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
* dedicated BPF backend memory area, or if neither of the two
@@ -833,12 +833,11 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (atomic64_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
if (!bpf_capable()) {
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic64_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -846,9 +845,9 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic64_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -879,11 +878,11 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
@@ -906,7 +905,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
u32 pages = hdr->pages;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
}
/* This symbol is only overridden by archs that have different
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4b6974a195c1..e76a488c09c3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(1);
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
goto out_free_im;
@@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
out_free_image:
bpf_jit_free_exec(im->image);
out_uncharge:
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
out_free_im:
kfree(im);
out:
--
2.30.2
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v3 bpf-next 3/7] bpf: use size instead of pages in bpf_binary_header
2022-01-06 2:25 [PATCH v3 bpf-next 0/7] bpf_prog_pack allocator Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 1/7] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem Song Liu
@ 2022-01-06 2:25 ` Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 4/7] bpf: add a pointer of bpf_binary_header to bpf_prog Song Liu
` (3 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2022-01-06 2:25 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
This is necessary to charge sub page memory for the BPF program.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/filter.h | 6 +++---
kernel/bpf/core.c | 11 +++++------
2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 60eec80fa1d4..6d73d89c99a4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -548,7 +548,7 @@ struct sock_fprog_kern {
#define BPF_IMAGE_ALIGNMENT 8
struct bpf_binary_header {
- u32 pages;
+ u32 size;
u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};
@@ -886,8 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
set_vm_flush_reset_perms(hdr);
- set_memory_ro((unsigned long)hdr, hdr->pages);
- set_memory_x((unsigned long)hdr, hdr->pages);
+ set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
+ set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
}
static inline struct bpf_binary_header *
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 495e3b2c36ff..684a8a972adf 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -543,7 +543,7 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog)
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = addr + hdr->size;
}
static void
@@ -866,7 +866,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -876,7 +876,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
if (bpf_jit_charge_modmem(size))
return NULL;
@@ -889,7 +888,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -902,10 +901,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
+ bpf_jit_uncharge_modmem(size);
}
/* This symbol is only overridden by archs that have different
--
2.30.2
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v3 bpf-next 4/7] bpf: add a pointer of bpf_binary_header to bpf_prog
2022-01-06 2:25 [PATCH v3 bpf-next 0/7] bpf_prog_pack allocator Song Liu
` (2 preceding siblings ...)
2022-01-06 2:25 ` [PATCH v3 bpf-next 3/7] bpf: use size instead of pages in bpf_binary_header Song Liu
@ 2022-01-06 2:25 ` Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 5/7] x86/alternative: introduce text_poke_jit Song Liu
` (2 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2022-01-06 2:25 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu, Song Liu
With sub page allocation, we cannot simply use bpf_func & PAGE_MASK to
find the bpf_binary_header. Add a pointer to struct bpf_prog to avoid
this logic.
Use this pointer for x86_64. If the pointer is not set by the jit engine,
fall back to the original logic.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
arch/x86/net/bpf_jit_comp.c | 2 ++
include/linux/filter.h | 10 ++++++++--
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ce1f86f245c9..fe4f08e25a1d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2339,6 +2339,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (header)
bpf_jit_binary_free(header);
prog = orig_prog;
+ header = NULL;
goto out_addrs;
}
if (image) {
@@ -2406,6 +2407,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (tmp_blinded)
bpf_jit_prog_release_other(prog, prog == orig_prog ?
tmp : orig_prog);
+ prog->hdr = header;
return prog;
}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6d73d89c99a4..b5c7e12f7675 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -584,6 +584,7 @@ struct bpf_prog {
const struct bpf_insn *insn);
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
+ struct bpf_binary_header *hdr;
/* Instructions for interpreter */
union {
DECLARE_FLEX_ARRAY(struct sock_filter, insns);
@@ -893,9 +894,14 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
- unsigned long real_start = (unsigned long)fp->bpf_func;
- unsigned long addr = real_start & PAGE_MASK;
+ unsigned long real_start;
+ unsigned long addr;
+ if (fp->hdr)
+ return fp->hdr;
+
+ real_start = (unsigned long)fp->bpf_func;
+ addr = real_start & PAGE_MASK;
return (void *)addr;
}
--
2.30.2
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v3 bpf-next 5/7] x86/alternative: introduce text_poke_jit
2022-01-06 2:25 [PATCH v3 bpf-next 0/7] bpf_prog_pack allocator Song Liu
` (3 preceding siblings ...)
2022-01-06 2:25 ` [PATCH v3 bpf-next 4/7] bpf: add a pointer of bpf_binary_header to bpf_prog Song Liu
@ 2022-01-06 2:25 ` Song Liu
2022-01-11 12:13 ` Peter Zijlstra
2022-01-06 2:25 ` [PATCH v3 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 7/7] bpf, x86_64: use " Song Liu
6 siblings, 1 reply; 12+ messages in thread
From: Song Liu @ 2022-01-06 2:25 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
This will be used by the BPF jit compiler to dump JITed binary to an RX huge
page, and thus allow multiple BPF programs to share a huge (2MB) page.
Signed-off-by: Song Liu <song@kernel.org>
---
arch/x86/include/asm/text-patching.h | 1 +
arch/x86/kernel/alternative.c | 28 ++++++++++++++++++++++++++++
2 files changed, 29 insertions(+)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index b7421780e4e9..991058c9b4b1 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
+extern void *text_poke_jit(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 23fb4d51a5da..02c35725cc62 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1102,6 +1102,34 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
return __text_poke(addr, opcode, len);
}
+/**
+ * text_poke_jit - Update instructions on a live kernel by jit engine
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Only module memory taking jit text (e.g. for bpf) should be patched.
+ */
+void *text_poke_jit(void *addr, const void *opcode, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke((void *)ptr, opcode + patched, s);
+ patched += s;
+ }
+ return addr;
+}
+
static void do_sync_core(void *info)
{
sync_core();
--
2.30.2
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v3 bpf-next 5/7] x86/alternative: introduce text_poke_jit
2022-01-06 2:25 ` [PATCH v3 bpf-next 5/7] x86/alternative: introduce text_poke_jit Song Liu
@ 2022-01-11 12:13 ` Peter Zijlstra
2022-01-11 17:43 ` Song Liu
0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2022-01-11 12:13 UTC (permalink / raw)
To: Song Liu; +Cc: bpf, netdev, linux-kernel, ast, daniel, andrii, kernel-team, x86
On Wed, Jan 05, 2022 at 06:25:31PM -0800, Song Liu wrote:
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 23fb4d51a5da..02c35725cc62 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1102,6 +1102,34 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
> return __text_poke(addr, opcode, len);
> }
>
> +/**
> + * text_poke_jit - Update instructions on a live kernel by jit engine
> + * @addr: address to modify
> + * @opcode: source of the copy
> + * @len: length to copy, could be more than 2x PAGE_SIZE
> + *
> + * Only module memory taking jit text (e.g. for bpf) should be patched.
> + */
Maybe:
text_poke_copy() - Copy instructions into (an unused part of) RX memory
@args...
Not safe against concurrent execution; useful for JITs to dump
new code blocks into unused regions of RX memory. Can be used in
conjunction with synchronize_rcu_tasks() to wait for existing
execution to quiesce after having made sure no existing
function pointers are live.
or something along those lines?
> +void *text_poke_jit(void *addr, const void *opcode, size_t len)
> +{
> + unsigned long start = (unsigned long)addr;
> + size_t patched = 0;
> +
> + if (WARN_ON_ONCE(core_kernel_text(start)))
> + return NULL;
> +
> + while (patched < len) {
> + unsigned long ptr = start + patched;
> + size_t s;
> +
> + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
> +
> + __text_poke((void *)ptr, opcode + patched, s);
> + patched += s;
> + }
> + return addr;
> +}
> +
> static void do_sync_core(void *info)
> {
> sync_core();
> --
> 2.30.2
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 bpf-next 5/7] x86/alternative: introduce text_poke_jit
2022-01-11 12:13 ` Peter Zijlstra
@ 2022-01-11 17:43 ` Song Liu
0 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2022-01-11 17:43 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Song Liu, bpf, netdev, linux-kernel, ast, daniel, andrii,
Kernel Team, x86
> On Jan 11, 2022, at 4:13 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jan 05, 2022 at 06:25:31PM -0800, Song Liu wrote:
>
>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>> index 23fb4d51a5da..02c35725cc62 100644
>> --- a/arch/x86/kernel/alternative.c
>> +++ b/arch/x86/kernel/alternative.c
>> @@ -1102,6 +1102,34 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
>> return __text_poke(addr, opcode, len);
>> }
>>
>> +/**
>> + * text_poke_jit - Update instructions on a live kernel by jit engine
>> + * @addr: address to modify
>> + * @opcode: source of the copy
>> + * @len: length to copy, could be more than 2x PAGE_SIZE
>> + *
>> + * Only module memory taking jit text (e.g. for bpf) should be patched.
>> + */
>
> Maybe:
>
> text_poke_copy() - Copy instructions into (an unused part of) RX memory
> @args...
>
> Not safe against concurrent execution; useful for JITs to dump
> new code blocks into unused regions of RX memory. Can be used in
> conjunction with synchronize_rcu_tasks() to wait for existing
> execution to quiesce after having made sure no existing
> functions pointers are life.
>
> or something along those lines?
This sounds good! Thanks!
Song
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator
2022-01-06 2:25 [PATCH v3 bpf-next 0/7] bpf_prog_pack allocator Song Liu
` (4 preceding siblings ...)
2022-01-06 2:25 ` [PATCH v3 bpf-next 5/7] x86/alternative: introduce text_poke_jit Song Liu
@ 2022-01-06 2:25 ` Song Liu
2022-01-06 2:25 ` [PATCH v3 bpf-next 7/7] bpf, x86_64: use " Song Liu
6 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2022-01-06 2:25 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this could add significant
pressure to instruction TLB.
Introduce bpf_prog_pack allocator to pack multiple BPF programs in a huge
page. The memory is then allocated in 64 byte chunks.
Memory allocated by the bpf_prog_pack allocator is RO protected after initial
allocation. To write to it, the user (jit engine) needs to use the text poke
API.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
include/linux/filter.h | 7 ++
kernel/bpf/core.c | 187 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 190 insertions(+), 4 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b5c7e12f7675..d3a4f037b42b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1070,6 +1070,13 @@ void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
+struct bpf_binary_header *
+bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_r_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns);
+void bpf_jit_binary_free_pack(struct bpf_binary_header *hdr);
+int bpf_prog_pack_max_size(void);
+
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
struct bpf_jit_poke_descriptor *poke);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 684a8a972adf..94c7d9ff9d6c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -808,6 +808,116 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
+};
+
+#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+ struct bpf_prog_pack *pack;
+
+ pack = kzalloc(sizeof(*pack), GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+ if (!pack->ptr) {
+ kfree(pack);
+ return NULL;
+ }
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+ list_add_tail(&pack->list, &pack_list);
+
+ set_vm_flush_reset_perms(pack);
+ set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ nbits, 0);
+ if (pos < BPF_PROG_CHUNK_COUNT)
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack();
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+ void *pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+
+ mutex_lock(&pack_mutex);
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (tmp->ptr == pack_ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+ pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
+ list_del(&pack->list);
+ module_memfree(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic64_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -860,10 +970,59 @@ void __weak bpf_jit_free_exec(void *addr)
module_memfree(addr);
}
+static struct bpf_binary_header *
+__bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns,
+ u32 round_up_to)
+{
+ struct bpf_binary_header *hdr;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* Most of BPF filters are really small, but if some of them
+ * fill a page, allow at least 128 extra bytes to insert a
+ * random section of illegal instructions.
+ */
+ size = round_up(proglen + sizeof(*hdr) + 128, round_up_to);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ hdr = bpf_jit_alloc_exec(size);
+ if (!hdr) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(hdr, size);
+
+ hdr->size = size;
+ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+ PAGE_SIZE - sizeof(*hdr));
+ start = (get_random_int() % hole) & ~(alignment - 1);
+
+ /* Leave a random number of instructions before BPF code. */
+ *image_ptr = &hdr->image[start];
+
+ return hdr;
+}
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
+ bpf_fill_ill_insns, PAGE_SIZE);
+}
+
+struct bpf_binary_header *
+bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
u32 size, hole, start;
@@ -875,11 +1034,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* fill a page, allow at least 128 extra bytes to insert a
* random section of illegal instructions.
*/
- size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+ size = round_up(proglen + sizeof(*hdr) + 128, BPF_PROG_CHUNK_SIZE);
+
+ /* for too big program, use __bpf_jit_binary_alloc with round_up_to
+ * of BPF_PROG_MAX_PACK_PROG_SIZE.
+ */
+ if (size > BPF_PROG_MAX_PACK_PROG_SIZE)
+ return __bpf_jit_binary_alloc(proglen, image_ptr,
+ alignment, bpf_fill_ill_insns,
+ BPF_PROG_MAX_PACK_PROG_SIZE);
if (bpf_jit_charge_modmem(size))
return NULL;
- hdr = bpf_jit_alloc_exec(size);
+ hdr = bpf_prog_pack_alloc(size);
if (!hdr) {
bpf_jit_uncharge_modmem(size);
return NULL;
@@ -888,9 +1055,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
- PAGE_SIZE - sizeof(*hdr));
+ BPF_PROG_CHUNK_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
@@ -907,6 +1073,19 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
bpf_jit_uncharge_modmem(size);
}
+void bpf_jit_binary_free_pack(struct bpf_binary_header *hdr)
+{
+ u32 size = hdr->size;
+
+ bpf_prog_pack_free(hdr);
+ bpf_jit_uncharge_modmem(size);
+}
+
+int bpf_prog_pack_max_size(void)
+{
+ return BPF_PROG_MAX_PACK_PROG_SIZE;
+}
+
/* This symbol is only overridden by archs that have different
* requirements than the usual eBPF JITs, f.e. when they only
* implement cBPF JIT, do not set images read-only, etc.
--
2.30.2
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v3 bpf-next 7/7] bpf, x86_64: use bpf_prog_pack allocator
2022-01-06 2:25 [PATCH v3 bpf-next 0/7] bpf_prog_pack allocator Song Liu
` (5 preceding siblings ...)
2022-01-06 2:25 ` [PATCH v3 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator Song Liu
@ 2022-01-06 2:25 ` Song Liu
2022-01-11 12:04 ` Peter Zijlstra
6 siblings, 1 reply; 12+ messages in thread
From: Song Liu @ 2022-01-06 2:25 UTC (permalink / raw)
To: bpf, netdev, linux-kernel
Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
Use bpf_prog_pack allocator in x86_64 jit.
The program header from bpf_prog_pack is read only during the jit process.
Therefore, the binary is first written to a temporary buffer, and later
copied to final location with text_poke_jit().
Similarly, jit_fill_hole() is updated to fill the hole with 0xcc using
text_poke_jit().
Signed-off-by: Song Liu <songliubraving@fb.com>
---
arch/x86/net/bpf_jit_comp.c | 131 +++++++++++++++++++++++++++---------
1 file changed, 100 insertions(+), 31 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fe4f08e25a1d..ad69a64ee4fe 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -216,11 +216,33 @@ static u8 simple_alu_opcodes[] = {
[BPF_ARSH] = 0xF8,
};
+static char jit_hole_buffer[PAGE_SIZE] = {};
+
static void jit_fill_hole(void *area, unsigned int size)
+{
+ struct bpf_binary_header *hdr = area;
+ int i;
+
+ for (i = 0; i < roundup(size, PAGE_SIZE); i += PAGE_SIZE) {
+ int s;
+
+ s = min_t(int, PAGE_SIZE, size - i);
+ text_poke_jit(area + i, jit_hole_buffer, s);
+ }
+
+ /* bpf_jit_binary_alloc_pack cannot write size directly to the ro
+ * mapping. Write it here with text_poke_jit().
+ */
+ text_poke_jit(&hdr->size, &size, sizeof(size));
+}
+
+static int __init x86_jit_fill_hole_init(void)
{
/* Fill whole space with INT3 instructions */
- memset(area, 0xcc, size);
+ memset(jit_hole_buffer, 0xcc, PAGE_SIZE);
+ return 0;
}
+pure_initcall(x86_jit_fill_hole_init);
struct jit_context {
int cleanup_addr; /* Epilogue code offset */
@@ -361,14 +383,11 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
ret = -EBUSY;
mutex_lock(&text_mutex);
- if (memcmp(ip, old_insn, X86_PATCH_SIZE))
+ if (text_live && memcmp(ip, old_insn, X86_PATCH_SIZE))
goto out;
ret = 1;
if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
- if (text_live)
- text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
- else
- memcpy(ip, new_insn, X86_PATCH_SIZE);
+ text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
ret = 0;
}
out:
@@ -537,7 +556,7 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
*pprog = prog;
}
-static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
+static void bpf_tail_call_direct_fixup(struct bpf_prog *prog, bool text_live)
{
struct bpf_jit_poke_descriptor *poke;
struct bpf_array *array;
@@ -558,24 +577,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
mutex_lock(&array->aux->poke_mutex);
target = array->ptrs[poke->tail_call.key];
if (target) {
- /* Plain memcpy is used when image is not live yet
- * and still not locked as read-only. Once poke
- * location is active (poke->tailcall_target_stable),
- * any parallel bpf_arch_text_poke() might occur
- * still on the read-write image until we finally
- * locked it as read-only. Both modifications on
- * the given image are under text_mutex to avoid
- * interference.
- */
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP, NULL,
(u8 *)target->bpf_func +
- poke->adj_off, false);
+ poke->adj_off, text_live);
BUG_ON(ret < 0);
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
(u8 *)poke->tailcall_target +
- X86_PATCH_SIZE, NULL, false);
+ X86_PATCH_SIZE, NULL, text_live);
BUG_ON(ret < 0);
}
WRITE_ONCE(poke->tailcall_target_stable, true);
@@ -867,7 +877,7 @@ static void emit_nops(u8 **pprog, int len)
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *tmp_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
@@ -894,8 +904,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
push_callee_regs(&prog, callee_regs_used);
ilen = prog - temp;
- if (image)
- memcpy(image + proglen, temp, ilen);
+ if (tmp_image)
+ memcpy(tmp_image + proglen, temp, ilen);
proglen += ilen;
addrs[0] = proglen;
prog = temp;
@@ -1324,8 +1334,10 @@ st: if (is_imm8(insn->off))
pr_err("extable->insn doesn't fit into 32-bit\n");
return -EFAULT;
}
- ex->insn = delta;
+ /* switch ex to temporary buffer for writes */
+ ex = (void *)tmp_image + ((void *)ex - (void *)image);
+ ex->insn = delta;
ex->type = EX_TYPE_BPF;
if (dst_reg > BPF_REG_9) {
@@ -1706,7 +1718,7 @@ st: if (is_imm8(insn->off))
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
- memcpy(image + proglen, temp, ilen);
+ memcpy(tmp_image + proglen, temp, ilen);
}
proglen += ilen;
addrs[i] = proglen;
@@ -2248,8 +2260,10 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
struct x64_jit_data {
struct bpf_binary_header *header;
+ struct bpf_binary_header *tmp_header;
int *addrs;
u8 *image;
+ u8 *tmp_image;
int proglen;
struct jit_context ctx;
};
@@ -2259,6 +2273,7 @@ struct x64_jit_data {
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
+ struct bpf_binary_header *tmp_header = NULL;
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
@@ -2267,6 +2282,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bool tmp_blinded = false;
bool extra_pass = false;
bool padding = false;
+ u8 *tmp_image = NULL;
u8 *image = NULL;
int *addrs;
int pass;
@@ -2301,7 +2317,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
ctx = jit_data->ctx;
oldproglen = jit_data->proglen;
image = jit_data->image;
+ tmp_image = jit_data->tmp_image;
header = jit_data->header;
+ tmp_header = jit_data->tmp_header;
extra_pass = true;
padding = true;
goto skip_init_addrs;
@@ -2332,14 +2350,18 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
for (pass = 0; pass < MAX_PASSES || image; pass++) {
if (!padding && pass >= PADDING_PASSES)
padding = true;
- proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
+ proglen = do_jit(prog, addrs, image, tmp_image, oldproglen, &ctx, padding);
if (proglen <= 0) {
out_image:
image = NULL;
- if (header)
- bpf_jit_binary_free(header);
+ tmp_image = NULL;
+ if (header) {
+ bpf_jit_binary_free_pack(header);
+ kfree(tmp_header);
+ }
prog = orig_prog;
header = NULL;
+ tmp_header = NULL;
goto out_addrs;
}
if (image) {
@@ -2362,13 +2384,27 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
sizeof(struct exception_table_entry);
/* allocate module memory for x86 insns and extable */
- header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
- &image, align, jit_fill_hole);
+ header = bpf_jit_binary_alloc_pack(roundup(proglen, align) + extable_size,
+ &image, align, jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
}
- prog->aux->extable = (void *) image + roundup(proglen, align);
+ if (header->size > bpf_prog_pack_max_size()) {
+ tmp_header = header;
+ tmp_image = image;
+ } else {
+ tmp_header = kzalloc(header->size, GFP_KERNEL);
+ if (!tmp_header) {
+ bpf_jit_binary_free_pack(header);
+ header = NULL;
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ tmp_header->size = header->size;
+ tmp_image = (void *)tmp_header + ((void *)image - (void *)header);
+ }
+ prog->aux->extable = (void *)image + roundup(proglen, align);
}
oldproglen = proglen;
cond_resched();
@@ -2379,14 +2415,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (image) {
if (!prog->is_func || extra_pass) {
- bpf_tail_call_direct_fixup(prog);
- bpf_jit_binary_lock_ro(header);
+ if (header->size > bpf_prog_pack_max_size()) {
+ /* bpf_prog_pack cannot handle programs
+ * that are too big (> ~2MB). Fall back to
+ * regular module_alloc(), and do the fixup
+ * and lock_ro here.
+ */
+ bpf_tail_call_direct_fixup(prog, false);
+ bpf_jit_binary_lock_ro(header);
+ }
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
+ jit_data->tmp_image = tmp_image;
jit_data->header = header;
+ jit_data->tmp_header = tmp_header;
}
prog->bpf_func = (void *)image;
prog->jited = 1;
@@ -2402,6 +2447,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
kvfree(addrs);
kfree(jit_data);
prog->aux->jit_data = NULL;
+ jit_data = NULL;
+ if (tmp_header != header) {
+ text_poke_jit(header, tmp_header, header->size);
+ kfree(tmp_header);
+ /* Do the fixup after the final text_poke_jit().
+ * Otherwise, the fixup will be overwritten by
+ * text_poke_jit().
+ */
+ bpf_tail_call_direct_fixup(prog, true);
+ }
}
out:
if (tmp_blinded)
@@ -2415,3 +2470,17 @@ bool bpf_jit_supports_kfunc_call(void)
{
return true;
}
+
+void bpf_jit_free(struct bpf_prog *fp)
+{
+ if (fp->jited) {
+ struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+
+ if (hdr->size > bpf_prog_pack_max_size())
+ bpf_jit_binary_free(hdr);
+ else
+ bpf_jit_binary_free_pack(hdr);
+ }
+
+ bpf_prog_unlock_free(fp);
+}
--
2.30.2
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v3 bpf-next 7/7] bpf, x86_64: use bpf_prog_pack allocator
2022-01-06 2:25 ` [PATCH v3 bpf-next 7/7] bpf, x86_64: use " Song Liu
@ 2022-01-11 12:04 ` Peter Zijlstra
2022-01-11 17:26 ` Song Liu
0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2022-01-11 12:04 UTC (permalink / raw)
To: Song Liu
Cc: bpf, netdev, linux-kernel, ast, daniel, andrii, kernel-team, x86,
Song Liu
On Wed, Jan 05, 2022 at 06:25:33PM -0800, Song Liu wrote:
> From: Song Liu <songliubraving@fb.com>
>
> Use bpf_prog_pack allocator in x86_64 jit.
>
> The program header from bpf_prog_pack is read only during the jit process.
> Therefore, the binary is first written to a temporary buffer, and later
> copied to final location with text_poke_jit().
>
> Similarly, jit_fill_hole() is updated to fill the hole with 0xcc using
> text_poke_jit().
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> arch/x86/net/bpf_jit_comp.c | 131 +++++++++++++++++++++++++++---------
> 1 file changed, 100 insertions(+), 31 deletions(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index fe4f08e25a1d..ad69a64ee4fe 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -216,11 +216,33 @@ static u8 simple_alu_opcodes[] = {
> [BPF_ARSH] = 0xF8,
> };
>
> +static char jit_hole_buffer[PAGE_SIZE] = {};
> +
> static void jit_fill_hole(void *area, unsigned int size)
> +{
> + struct bpf_binary_header *hdr = area;
> + int i;
> +
> + for (i = 0; i < roundup(size, PAGE_SIZE); i += PAGE_SIZE) {
> + int s;
> +
> + s = min_t(int, PAGE_SIZE, size - i);
> + text_poke_jit(area + i, jit_hole_buffer, s);
> + }
> +
> + /* bpf_jit_binary_alloc_pack cannot write size directly to the ro
> + * mapping. Write it here with text_poke_jit().
> + */
Could we move this file towards regular comment style please? It's
already mixed style, let's take the opportunity and not add more
net-style comments.
> + text_poke_jit(&hdr->size, &size, sizeof(size));
> +}
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 bpf-next 7/7] bpf, x86_64: use bpf_prog_pack allocator
2022-01-11 12:04 ` Peter Zijlstra
@ 2022-01-11 17:26 ` Song Liu
0 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2022-01-11 17:26 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Song Liu, bpf, Networking, linux-kernel, ast, daniel, andrii,
Kernel Team, x86
> On Jan 11, 2022, at 4:04 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jan 05, 2022 at 06:25:33PM -0800, Song Liu wrote:
>> From: Song Liu <songliubraving@fb.com>
>>
>> Use bpf_prog_pack allocator in x86_64 jit.
>>
>> The program header from bpf_prog_pack is read only during the jit process.
>> Therefore, the binary is first written to a temporary buffer, and later
>> copied to final location with text_poke_jit().
>>
>> Similarly, jit_fill_hole() is updated to fill the hole with 0xcc using
>> text_poke_jit().
>>
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> arch/x86/net/bpf_jit_comp.c | 131 +++++++++++++++++++++++++++---------
>> 1 file changed, 100 insertions(+), 31 deletions(-)
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index fe4f08e25a1d..ad69a64ee4fe 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -216,11 +216,33 @@ static u8 simple_alu_opcodes[] = {
>> [BPF_ARSH] = 0xF8,
>> };
>>
>> +static char jit_hole_buffer[PAGE_SIZE] = {};
>> +
>> static void jit_fill_hole(void *area, unsigned int size)
>> +{
>> + struct bpf_binary_header *hdr = area;
>> + int i;
>> +
>> + for (i = 0; i < roundup(size, PAGE_SIZE); i += PAGE_SIZE) {
>> + int s;
>> +
>> + s = min_t(int, PAGE_SIZE, size - i);
>> + text_poke_jit(area + i, jit_hole_buffer, s);
>> + }
>> +
>> + /* bpf_jit_binary_alloc_pack cannot write size directly to the ro
>> + * mapping. Write it here with text_poke_jit().
>> + */
>
> Could we move this file towards regular comment style please? It's
> already mixed style, let's take the opportunity and not add more
> net-style comments.
Aha, I didn't realize the file is about 50:50 with the two styles. I can
change it in v4.
Thanks,
Song
^ permalink raw reply [flat|nested] 12+ messages in thread