All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
@ 2022-04-21  7:22 Song Liu
  2022-04-21 17:09 ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Song Liu @ 2022-04-21  7:22 UTC (permalink / raw)
  To: bpf, linux-mm, linux-kernel
  Cc: ast, daniel, kernel-team, akpm, rick.p.edgecombe, hch, torvalds,
	andrii, Song Liu

bpf_prog_pack enables sharing huge pages among multiple BPF programs.
These pages are marked as executable, but some part of these huge page
may not contain proper BPF programs. To make these pages safe, fill such
unused part of these pages with invalid instructions. This is done when a
pack is first allocated, and when a bpf program is freed..

Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator")
Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]")
Signed-off-by: Song Liu <song@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 22 ++++++++++++++++++++++
 include/linux/bpf.h         |  2 ++
 kernel/bpf/core.c           | 20 ++++++++++++++++----
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8fe35ed11fd6..57099d89f2bf 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -228,6 +228,28 @@ static void jit_fill_hole(void *area, unsigned int size)
 	memset(area, 0xcc, size);
 }
 
+#define INVALID_BUF_SIZE PAGE_SIZE
+static char invalid_insn_buf[INVALID_BUF_SIZE];
+
+static int __init bpf_init_invalid_insn_buf(void)
+{
+	jit_fill_hole(invalid_insn_buf, INVALID_BUF_SIZE);
+	return 0;
+}
+pure_initcall(bpf_init_invalid_insn_buf);
+
+void bpf_arch_invalidate_text(void *dst, size_t len)
+{
+	size_t i = 0;
+
+	while (i < len) {
+		size_t s = min_t(size_t, len - i, INVALID_BUF_SIZE);
+
+		bpf_arch_text_copy(dst + i, invalid_insn_buf, s);
+		i += s;
+	}
+}
+
 struct jit_context {
 	int cleanup_addr; /* Epilogue code offset */
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bdb5298735ce..2157392a94d1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2382,6 +2382,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 
 void *bpf_arch_text_copy(void *dst, void *src, size_t len);
 
+void bpf_arch_invalidate_text(void *dst, size_t len);
+
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b2a634d0f842..3f8bdbc0e994 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -873,7 +873,7 @@ static size_t select_bpf_prog_pack_size(void)
 	return size;
 }
 
-static struct bpf_prog_pack *alloc_new_pack(void)
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_prog_pack *pack;
 
@@ -886,6 +886,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
 		kfree(pack);
 		return NULL;
 	}
+	bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size);
 	bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
 	list_add_tail(&pack->list, &pack_list);
 
@@ -894,7 +895,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
 	return pack;
 }
 
-static void *bpf_prog_pack_alloc(u32 size)
+static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
 	struct bpf_prog_pack *pack;
@@ -922,7 +923,7 @@ static void *bpf_prog_pack_alloc(u32 size)
 			goto found_free_area;
 	}
 
-	pack = alloc_new_pack();
+	pack = alloc_new_pack(bpf_fill_ill_insns);
 	if (!pack)
 		goto out;
 
@@ -965,6 +966,7 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
 	pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
 
+	bpf_arch_invalidate_text(hdr, hdr->size);
 	bitmap_clear(pack->bitmap, pos, nbits);
 	if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
 				       bpf_prog_chunk_count(), 0) == 0) {
@@ -1103,7 +1105,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
-	ro_header = bpf_prog_pack_alloc(size);
+	ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
 	if (!ro_header) {
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
@@ -1204,6 +1206,16 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	bpf_prog_unlock_free(fp);
 }
 
+void __weak bpf_arch_invalidate_text(void *dst, size_t len)
+{
+	char buf[1] = {};
+	int i;
+
+	WARN_ONCE(1, "Please override bpf_arch_invalidate_text for bpf_prog_pack\n");
+	for (i = 0; i < len; i++)
+		bpf_arch_text_copy(dst + i, buf, 1);
+}
+
 int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 			  const struct bpf_insn *insn, bool extra_pass,
 			  u64 *func_addr, bool *func_addr_fixed)
-- 
2.30.2



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21  7:22 [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack Song Liu
@ 2022-04-21 17:09 ` Linus Torvalds
  2022-04-21 18:24   ` Alexei Starovoitov
  0 siblings, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2022-04-21 17:09 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Linux-MM, Linux Kernel Mailing List, Alexei Starovoitov,
	Daniel Borkmann, Kernel Team, Andrew Morton, Edgecombe, Rick P,
	Christoph Hellwig, Andrii Nakryiko

On Thu, Apr 21, 2022 at 12:27 AM Song Liu <song@kernel.org> wrote:
>
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -228,6 +228,28 @@ static void jit_fill_hole(void *area, unsigned int size)
>         memset(area, 0xcc, size);
>  }
>
> +#define INVALID_BUF_SIZE PAGE_SIZE
> +static char invalid_insn_buf[INVALID_BUF_SIZE];
> +
> +static int __init bpf_init_invalid_insn_buf(void)
> +{
> +       jit_fill_hole(invalid_insn_buf, INVALID_BUF_SIZE);
> +       return 0;
> +}
> +pure_initcall(bpf_init_invalid_insn_buf);
> +
> +void bpf_arch_invalidate_text(void *dst, size_t len)
> +{
> +       size_t i = 0;
> +
> +       while (i < len) {
> +               size_t s = min_t(size_t, len - i, INVALID_BUF_SIZE);
> +
> +               bpf_arch_text_copy(dst + i, invalid_insn_buf, s);
> +               i += s;
> +       }
> +}

Why do we need this new infrastructure?

Why bpf_arch_invalidate_text()?

Why not jit_fill_hole() unconditionally?

It seems a bit pointless to have page buffer for containing this data,
when we already have a (trivial) function to fill an area with invalid
instructions.

On x86, it's literally just "memset(0xcc)" (ie all 'int3' instructions).

And on most RISC architectures, it would be some variation of
"memset32(TRAP_INSN)".

And all bpf targets should already have that nicely as that
jit_fill_hole() function, no?

The pack-allocator bpf code already *does* that, and is already passed
that function.

But it's just that it does it too late. Instead of doing it when
allocating a new pack, it does it in the sub-allocator.

Afaik the code in bpf/core.c already has all the information it needs,
and already has that jit_fill_hole() function pointer, but is applying
it at the wrong point.

So I think the fix should be to just pass in that 'bpf_fill_ill_insns'
function pointer all the way to alloc_new_pack(), instead of using it
in bpf_jit_binary_alloc().

NOTE! Once again, I'm not actually all that familiar with the code.
Maybe I'm missing something.

             Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 17:09 ` Linus Torvalds
@ 2022-04-21 18:24   ` Alexei Starovoitov
  2022-04-21 18:59     ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Alexei Starovoitov @ 2022-04-21 18:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Song Liu, bpf, Linux-MM, Linux Kernel Mailing List,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team, Andrew Morton,
	Edgecombe, Rick P, Christoph Hellwig, Andrii Nakryiko

On Thu, Apr 21, 2022 at 10:09 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 21, 2022 at 12:27 AM Song Liu <song@kernel.org> wrote:
> >
> > --- a/arch/x86/net/bpf_jit_comp.c
> > +++ b/arch/x86/net/bpf_jit_comp.c
> > @@ -228,6 +228,28 @@ static void jit_fill_hole(void *area, unsigned int size)
> >         memset(area, 0xcc, size);
> >  }
> >
> > +#define INVALID_BUF_SIZE PAGE_SIZE
> > +static char invalid_insn_buf[INVALID_BUF_SIZE];
> > +
> > +static int __init bpf_init_invalid_insn_buf(void)
> > +{
> > +       jit_fill_hole(invalid_insn_buf, INVALID_BUF_SIZE);
> > +       return 0;
> > +}
> > +pure_initcall(bpf_init_invalid_insn_buf);
> > +
> > +void bpf_arch_invalidate_text(void *dst, size_t len)
> > +{
> > +       size_t i = 0;
> > +
> > +       while (i < len) {
> > +               size_t s = min_t(size_t, len - i, INVALID_BUF_SIZE);
> > +
> > +               bpf_arch_text_copy(dst + i, invalid_insn_buf, s);
> > +               i += s;
> > +       }
> > +}
>
> Why do we need this new infrastructure?
>
> Why bpf_arch_invalidate_text()?
>
> Why not jit_fill_hole() unconditionally?
>
> It seems a bit pointless to have page buffer for containing this data,
> when we already have a (trivial) function to fill an area with invalid
> instructions.
>
> On x86, it's literally just "memset(0xcc)" (ie all 'int3' instructions).
>
> And on most RISC architectures, it would be some variation of
> "memset32(TRAP_INSN)".
>
> And all bpf targets should already have that nicely as that
> jit_fill_hole() function, no?
>
> The pack-allocator bpf code already *does* that, and is already passed
> that function.
>
> But it's just that it does it too late. Instead of doing it when
> allocating a new pack, it does it in the sub-allocator.
>
> Afaik the code in bpf/core.c already has all the information it needs,
> and already has that jit_fill_hole() function pointer, but is applying
> it at the wrong point.
>
> So I think the fix should be to just pass in that 'bpf_fill_ill_insns'
> function pointer all the way to alloc_new_pack(), instead of using it
> in bpf_jit_binary_alloc().

jit_fill_hole is an overkill here.

Long ago when jit spraying attack was fixed there was
a concern that memset(0) essentially populates the code page
with valid 'add BYTE PTR [rax],al' instructions.
Jumping anywhere in the zero page with a valid address in rax will
eventually lead to execution of the first insn in jit-ed bpf prog.
So memset(0xcc) was added to make it a bit harder to guess
the start address.
jit spraying is only a concern for archs that can
jump in the middle of the instruction and cpus will interpret
the byte stream differently.
The existing bpf_prog_pack code still does memset(0xcc)
a random range of bytes before and after jit-ed bpf code.
So doing memset(0xcc) for the whole huge page is not necessary at all.
Just memset(0) of a huge page at init time and memset(0)
when prog is freed is enough.
Jumping into zero page of 'valid' insns the cpu
will eventually stumble on 0xcc before reaching the first insn.
Let's not complicate the logic by dragging jit_fill_hole
further into generic allocation.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 18:24   ` Alexei Starovoitov
@ 2022-04-21 18:59     ` Linus Torvalds
  2022-04-21 19:40       ` Song Liu
  0 siblings, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2022-04-21 18:59 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Song Liu, bpf, Linux-MM, Linux Kernel Mailing List,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team, Andrew Morton,
	Edgecombe, Rick P, Christoph Hellwig, Andrii Nakryiko

[-- Attachment #1: Type: text/plain, Size: 1387 bytes --]

On Thu, Apr 21, 2022 at 11:24 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> Let's not complicate the logic by dragging jit_fill_hole
> further into generic allocation.

I agree that just zeroing the page is probably perfectly fine in
practice on x86, but I'm also not really seeing the "complication" of
just doing things right.

> The existing bpf_prog_pack code still does memset(0xcc)
> a random range of bytes before and after jit-ed bpf code.

That is actually wishful thinking, and not based on reality.

From what I can tell, the end of the jit'ed bpf code is actually the
exception table entries, so we have that data being marked executable.

Honestly, what is wrong with this trivial patch?

I've not *tested* it, but it looks really really simple to me. Take it
as a "something like this" rather than anything else.

And yes, it would be better if bpf_jit_binary_pack_free did it too, so
that you don't have random old JIT'ed code lying around (and possibly
still in CPU branch history caches or whatever).

And it would be lovely if the exception table entries would be part of
another allocation and not marked executable.

But I certainly don't see the _downside_ (or complexity) of just doing
this, instead of zeroing things.

So this is by no means perfect, but it seems at least incrementally
_better_ than just zeroing. No?

                    Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1975 bytes --]

 kernel/bpf/core.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 13e9dbeeedf3..a4cfc353a52d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -873,7 +873,7 @@ static size_t select_bpf_prog_pack_size(void)
 	return size;
 }
 
-static struct bpf_prog_pack *alloc_new_pack(void)
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_prog_pack *pack;
 
@@ -889,13 +889,14 @@ static struct bpf_prog_pack *alloc_new_pack(void)
 	bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
 	list_add_tail(&pack->list, &pack_list);
 
+	bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size);
 	set_vm_flush_reset_perms(pack->ptr);
 	set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
 	set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
 	return pack;
 }
 
-static void *bpf_prog_pack_alloc(u32 size)
+static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
 	struct bpf_prog_pack *pack;
@@ -910,6 +911,7 @@ static void *bpf_prog_pack_alloc(u32 size)
 		size = round_up(size, PAGE_SIZE);
 		ptr = module_alloc(size);
 		if (ptr) {
+			bpf_fill_ill_insns(ptr, size);
 			set_vm_flush_reset_perms(ptr);
 			set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
 			set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
@@ -923,7 +925,7 @@ static void *bpf_prog_pack_alloc(u32 size)
 			goto found_free_area;
 	}
 
-	pack = alloc_new_pack();
+	pack = alloc_new_pack(bpf_fill_ill_insns);
 	if (!pack)
 		goto out;
 
@@ -1102,7 +1104,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
-	ro_header = bpf_prog_pack_alloc(size);
+	ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
 	if (!ro_header) {
 		bpf_jit_uncharge_modmem(size);
 		return NULL;

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 18:59     ` Linus Torvalds
@ 2022-04-21 19:40       ` Song Liu
  2022-04-21 21:28         ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Song Liu @ 2022-04-21 19:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexei Starovoitov, bpf, Linux-MM, Linux Kernel Mailing List,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team, Andrew Morton,
	Edgecombe, Rick P, Christoph Hellwig, Andrii Nakryiko

Hi Linus,

On Thu, Apr 21, 2022 at 11:59 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 21, 2022 at 11:24 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > Let's not complicate the logic by dragging jit_fill_hole
> > further into generic allocation.
>
> I agree that just zeroing the page is probably perfectly fine in
> practice on x86, but I'm also not really seeing the "complication" of
> just doing things right.
>
> > The existing bpf_prog_pack code still does memset(0xcc)
> > a random range of bytes before and after jit-ed bpf code.
>
> That is actually wishful thinking, and not based on reality.
>
> From what I can tell, the end of the jit'ed bpf code is actually the
> exception table entries, so we have that data being marked executable.
>
> Honestly, what is wrong with this trivial patch?

This version would fill the memory with illegal instruction when we
allocate the bpf_prog_pack.

The extra logic I had in the original patch was to erase the memory
when a BPF program is freed. In this case, the memory will be
returned to the bpf_prog_pack, and stays as RO+X. Actually, I
am not quite sure whether we need this logic. If not, we only need
the much simpler version.

Thanks,
Song

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 19:40       ` Song Liu
@ 2022-04-21 21:28         ` Linus Torvalds
  2022-04-21 21:52           ` Song Liu
  0 siblings, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2022-04-21 21:28 UTC (permalink / raw)
  To: Song Liu
  Cc: Alexei Starovoitov, bpf, Linux-MM, Linux Kernel Mailing List,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team, Andrew Morton,
	Edgecombe, Rick P, Christoph Hellwig, Andrii Nakryiko

On Thu, Apr 21, 2022 at 12:41 PM Song Liu <song@kernel.org> wrote:
>
> The extra logic I had in the original patch was to erase the memory
> when a BPF program is freed. In this case, the memory will be
> returned to the bpf_prog_pack, and stays as RO+X. Actually, I
> am not quite sure whether we need this logic. If not, we only need
> the much simpler version.

Oh, I think it would be good to do at free time too.

I just would want that to use the same function we already have for
the allocation-time thing, instead of introducing completely new
infrastructure. That was what looked very odd to me.

Now, the _smallest_ patch would likely be to just save away that
'bpf_fill_ill_insns' function pointer in the 'struct bpf_prog_pack'
thing.

It's admittedly kind of silly to do, but it matches that whole silly
"let's pass around a function pointer to a fixed function" model at
allocation time.

I say that's silly, because it's a fixed architecture function and we
could just call it directly. The only valid function there is
jit_fill_hole(), and the only reason it uses that function pointer
seems to be that it's never been exposed as a real function.

So passing it along as a function seems to be _purely_ for the silly
reason that people haven't agreed on a name, and different
architectures use different names (ie power uses
'bpf_jit_fill_ill_insns()', RISC-V calls it 'bpf_fill_ill_insns()',
and everybody else seems to use 'jit_fill_hole'.

I don't know why that decision was made. It looks like a bad one to
me, honestly.

Why not just agree on a name - I suggest 'bpf_jit_fill_hole()' - and
just get rid of that stupid 'bpf_jit_fill_hole_t' type name that only
exists because of this thing?

The bpf headers seem to literally have agreed on a name for that
function -type- only in order to be able to disagree on the name of
the function -name-, and then pass it along as a function pointer
argument instead of just calling it directly.

Very counter-productive.

                 Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 21:28         ` Linus Torvalds
@ 2022-04-21 21:52           ` Song Liu
  2022-04-21 22:30             ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Song Liu @ 2022-04-21 21:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Song Liu, Alexei Starovoitov, bpf, Linux-MM,
	Linux Kernel Mailing List, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Andrew Morton, Edgecombe, Rick P, Christoph Hellwig,
	Andrii Nakryiko

Hi Linus, 

> On Apr 21, 2022, at 2:28 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> On Thu, Apr 21, 2022 at 12:41 PM Song Liu <song@kernel.org> wrote:
>> 
>> The extra logic I had in the original patch was to erase the memory
>> when a BPF program is freed. In this case, the memory will be
>> returned to the bpf_prog_pack, and stays as RO+X. Actually, I
>> am not quite sure whether we need this logic. If not, we only need
>> the much simpler version.
> 
> Oh, I think it would be good to do at free time too.
> 
> I just would want that to use the same function we already have for
> the allocation-time thing, instead of introducing completely new
> infrastructure. That was what looked very odd to me.
> 
> Now, the _smallest_ patch would likely be to just save away that
> 'bpf_fill_ill_insns' function pointer in the 'struct bpf_prog_pack'
> thing.

[...]
> 
> Why not just agree on a name - I suggest 'bpf_jit_fill_hole()' - and
> just get rid of that stupid 'bpf_jit_fill_hole_t' type name that only
> exists because of this thing?

Last night, I had a version which is about 90% same as this idea.

However, we cannot really use the same function at free time. The
huge page is RO+X at free time, but we are only zeroing out a chunk 
of it. So regular memset/memcpy won’t work. Instead, we will need 
something like bpf_arch_text_copy(). 
 
Thanks,
Song


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 21:52           ` Song Liu
@ 2022-04-21 22:30             ` Linus Torvalds
  2022-04-21 22:51               ` Song Liu
  0 siblings, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2022-04-21 22:30 UTC (permalink / raw)
  To: Song Liu
  Cc: Song Liu, Alexei Starovoitov, bpf, Linux-MM,
	Linux Kernel Mailing List, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Andrew Morton, Edgecombe, Rick P, Christoph Hellwig,
	Andrii Nakryiko

On Thu, Apr 21, 2022 at 2:53 PM Song Liu <songliubraving@fb.com> wrote:
>
> However, we cannot really use the same function at free time. The
> huge page is RO+X at free time, but we are only zeroing out a chunk
> of it. So regular memset/memcpy won’t work. Instead, we will need
> something like bpf_arch_text_copy().

I actually think bpf_arch_text_copy() is another horribly badly done thing.

It seems only implemented on x86 (I'm not sure how anything else is
supposed to work, I didn't go look), and there it is horribly badly
done, using __text_poke() that does all these magical things just to
make it atomic wrt concurrent code execution.

None of which is *AT*ALL* relevant for this case, since concurrent
code execution simply isn't a thing (and if it were, you would already
have lost).

And if that wasn't pointless enough, it does all that magic "map the
page writably at a different virtual address using poking_addr in
poking_mm" and a different address space entirely.

All of that is required for REAL KERNEL CODE.

But the thing is, for bpf_prog_pack, all of that is just completely
pointless and stupid complexity.

We already *have* the other non-executable address that is writable:
it's the actual pages that got vmalloc'ed. Just use vmalloc_to_page()
and it's RIGHT THERE.

At which point you just use the same bpf_jit_fill_hole() function, and
you're done.

In other words, all of this seems excessively stupidly done, for no
good reason.  It's only making it much too complicated, and just not
doing the right thing at all.

                Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 22:30             ` Linus Torvalds
@ 2022-04-21 22:51               ` Song Liu
  2022-04-21 23:10                 ` Linus Torvalds
  2022-04-22  7:31                 ` Peter Zijlstra
  0 siblings, 2 replies; 13+ messages in thread
From: Song Liu @ 2022-04-21 22:51 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra
  Cc: Song Liu, Alexei Starovoitov, bpf, Linux-MM,
	Linux Kernel Mailing List, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Andrew Morton, Edgecombe, Rick P, Christoph Hellwig,
	Andrii Nakryiko

Hi Linus,

> On Apr 21, 2022, at 3:30 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> On Thu, Apr 21, 2022 at 2:53 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> However, we cannot really use the same function at free time. The
>> huge page is RO+X at free time, but we are only zeroing out a chunk
>> of it. So regular memset/memcpy won’t work. Instead, we will need
>> something like bpf_arch_text_copy().
> 
> I actually think bpf_arch_text_copy() is another horribly badly done thing.
> 
> It seems only implemented on x86 (I'm not sure how anything else is
> supposed to work, I didn't go look), and there it is horribly badly
> done, using __text_poke() that does all these magical things just to
> make it atomic wrt concurrent code execution.
> 
> None of which is *AT*ALL* relevant for this case, since concurrent
> code execution simply isn't a thing (and if it were, you would already
> have lost).
> 
> And if that wasn't pointless enough, it does all that magic "map the
> page writably at a different virtual address using poking_addr in
> poking_mm" and a different address space entirely.
> 
> All of that is required for REAL KERNEL CODE.
> 
> But the thing is, for bpf_prog_pack, all of that is just completely
> pointless and stupid complexity.
> 
> We already *have* the other non-executable address that is writable:
> it's the actual pages that got vmalloc'ed. Just use vmalloc_to_page()
> and it's RIGHT THERE.

I think this won’t work, as set_memory_ro makes all the aliases of 
these pages read only. We can probably add set_memory_ro_noalias(), 
which will be similar to set_memory_np_noalias(). This approach was 
NACKed by Peter[1]. So we went with the bpf_arch_text_copy approach.

If we can loosen the W^X requirement for BPF programs, other parts 
of bpf_prog_pack could also be simplified. 

Thanks,
Song

[1] https://lore.kernel.org/netdev/20211116080051.GU174703@worktop.programming.kicks-ass.net/

> 
> At which point you just use the same bpf_jit_fill_hole() function, and
> you're done.
> 
> In other words, all of this seems excessively stupidly done, for no
> good reason.  It's only making it much too complicated, and just not
> doing the right thing at all.
> 
>                Linus


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 22:51               ` Song Liu
@ 2022-04-21 23:10                 ` Linus Torvalds
  2022-04-22  1:31                   ` Song Liu
  2022-04-22  7:31                 ` Peter Zijlstra
  1 sibling, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2022-04-21 23:10 UTC (permalink / raw)
  To: Song Liu
  Cc: Peter Zijlstra, Song Liu, Alexei Starovoitov, bpf, Linux-MM,
	Linux Kernel Mailing List, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Andrew Morton, Edgecombe, Rick P, Christoph Hellwig,
	Andrii Nakryiko

On Thu, Apr 21, 2022 at 3:52 PM Song Liu <songliubraving@fb.com> wrote:
>
> I think this won’t work, as set_memory_ro makes all the aliases of
> these pages read only.

Argh. I thought we only did that for the whole memory type thing
(history: nasty machine checks possible on some hardware if you mix
memory types for the same physical page with virtual mappings), but if
we do it for RO too, then yeah.

It's sad to use that horrid machinery for basically non-live code, but
whatever.

                  Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 23:10                 ` Linus Torvalds
@ 2022-04-22  1:31                   ` Song Liu
  0 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-04-22  1:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Song Liu, Alexei Starovoitov, bpf, Linux-MM,
	Linux Kernel Mailing List, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Andrew Morton, Edgecombe, Rick P, Christoph Hellwig,
	Andrii Nakryiko

Hi Linus,

> On Apr 21, 2022, at 4:10 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> On Thu, Apr 21, 2022 at 3:52 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> I think this won’t work, as set_memory_ro makes all the aliases of
>> these pages read only.
> 
> Argh. I thought we only did that for the whole memory type thing
> (history: nasty machine checks possible on some hardware if you mix
> memory types for the same physical page with virtual mappings), but if
> we do it for RO too, then yeah.
> 
> It's sad to use that horrid machinery for basically non-live code, but
> whatever.

I guess we will stick with bpf_arch_text_copy(), and we will keep the 
Invalidation at BPF program free time?

I will reorder and resend pending patches. Then we can decide which ones
to ship with 5.18. 

Thanks,
Song

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-21 22:51               ` Song Liu
  2022-04-21 23:10                 ` Linus Torvalds
@ 2022-04-22  7:31                 ` Peter Zijlstra
  2022-04-23  5:25                   ` Song Liu
  1 sibling, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2022-04-22  7:31 UTC (permalink / raw)
  To: Song Liu
  Cc: Linus Torvalds, Song Liu, Alexei Starovoitov, bpf, Linux-MM,
	Linux Kernel Mailing List, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Andrew Morton, Edgecombe, Rick P, Christoph Hellwig,
	Andrii Nakryiko

> > On Apr 21, 2022, at 3:30 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:

> > I actually think bpf_arch_text_copy() is another horribly badly done thing.
> > 
> > It seems only implemented on x86 (I'm not sure how anything else is
> > supposed to work, I didn't go look), and there it is horribly badly
> > done, using __text_poke() that does all these magical things just to
> > make it atomic wrt concurrent code execution.
> > 
> > None of which is *AT*ALL* relevant for this case, since concurrent
> > code execution simply isn't a thing (and if it were, you would already
> > have lost).
> > 
> > And if that wasn't pointless enough, it does all that magic "map the
> > page writably at a different virtual address using poking_addr in
> > poking_mm" and a different address space entirely.
> > 
> > All of that is required for REAL KERNEL CODE.
> > 
> > But the thing is, for bpf_prog_pack, all of that is just completely
> > pointless and stupid complexity.

I think the point is that this hole will likely share a page with active
code, and as such there should not be a writable mapping mapping to it,
necessitating the whole __text_poke() mess.

That said; it does seem somewhat silly have a whole page worth of int3
around just for this.

Perhaps we can do something like the completely untested below?

---
 arch/x86/kernel/alternative.c | 48 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d374cb3cf024..60afa9105307 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -994,7 +994,20 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
 __ro_after_init struct mm_struct *poking_mm;
 __ro_after_init unsigned long poking_addr;
 
-static void *__text_poke(void *addr, const void *opcode, size_t len)
+static void text_poke_memcpy(void *dst, const void *src, size_t len)
+{
+	memcpy(dst, src, len);
+}
+
+static void text_poke_memset(void *dst, const void *src, size_t len)
+{
+	int c = *(int *)src;
+	memset(dst, c, len);
+}
+
+typedef void text_poke_f(void *dst, const void *src, size_t len);
+
+static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
 {
 	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
 	struct page *pages[2] = {NULL};
@@ -1059,7 +1072,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
 	prev = use_temporary_mm(poking_mm);
 
 	kasan_disable_current();
-	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+	func((void *)poking_addr + offset_in_page(addr), src, len);
 	kasan_enable_current();
 
 	/*
@@ -1091,7 +1104,8 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
 	 * If the text does not match what we just wrote then something is
 	 * fundamentally screwy; there's nothing we can really do about that.
 	 */
-	BUG_ON(memcmp(addr, opcode, len));
+	if (func == text_poke_memcpy)
+		BUG_ON(memcmp(addr, src, len));
 
 	local_irq_restore(flags);
 	pte_unmap_unlock(ptep, ptl);
@@ -1118,7 +1132,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
 {
 	lockdep_assert_held(&text_mutex);
 
-	return __text_poke(addr, opcode, len);
+	return __text_poke(text_poke_memcpy, addr, opcode, len);
 }
 
 /**
@@ -1137,7 +1151,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
  */
 void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
 {
-	return __text_poke(addr, opcode, len);
+	return __text_poke(text_poke_memcpy, addr, opcode, len);
 }
 
 /**
@@ -1167,7 +1181,29 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
 
 		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
 
-		__text_poke((void *)ptr, opcode + patched, s);
+		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
+		patched += s;
+	}
+	mutex_unlock(&text_mutex);
+	return addr;
+}
+
+void *text_poke_set(void *addr, int c, size_t len)
+{
+	unsigned long start = (unsigned long)addr;
+	size_t patched = 0;
+
+	if (WARN_ON_ONCE(core_kernel_text(start)))
+		return NULL;
+
+	mutex_lock(&text_mutex);
+	while (patched < len) {
+		unsigned long ptr = start + patched;
+		size_t s;
+
+		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
 		patched += s;
 	}
 	mutex_unlock(&text_mutex);

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack
  2022-04-22  7:31                 ` Peter Zijlstra
@ 2022-04-23  5:25                   ` Song Liu
  0 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-04-23  5:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Song Liu, Linus Torvalds, Alexei Starovoitov, bpf, Linux-MM,
	Linux Kernel Mailing List, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Andrew Morton, Edgecombe, Rick P, Christoph Hellwig,
	Andrii Nakryiko

On Fri, Apr 22, 2022 at 12:31 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > On Apr 21, 2022, at 3:30 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> > > I actually think bpf_arch_text_copy() is another horribly badly done thing.
> > >
> > > It seems only implemented on x86 (I'm not sure how anything else is
> > > supposed to work, I didn't go look), and there it is horribly badly
> > > done, using __text_poke() that does all these magical things just to
> > > make it atomic wrt concurrent code execution.
> > >
> > > None of which is *AT*ALL* relevant for this case, since concurrent
> > > code execution simply isn't a thing (and if it were, you would already
> > > have lost).
> > >
> > > And if that wasn't pointless enough, it does all that magic "map the
> > > page writably at a different virtual address using poking_addr in
> > > poking_mm" and a different address space entirely.
> > >
> > > All of that is required for REAL KERNEL CODE.
> > >
> > > But the thing is, for bpf_prog_pack, all of that is just completely
> > > pointless and stupid complexity.
>
> I think the point is that this hole will likely share a page with active
> code, and as such there should not be a writable mapping mapping to it,
> necessitating the whole __text_poke() mess.
>
> That said; it does seem somewhat silly have a whole page worth of int3
> around just for this.
>
> Perhaps we can do something like the completely untested below?

Yeah, this looks like a better approach. I will draft v2 based on this.

Thanks,
Song

>
> ---
>  arch/x86/kernel/alternative.c | 48 +++++++++++++++++++++++++++++++++++++------
>  1 file changed, 42 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index d374cb3cf024..60afa9105307 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -994,7 +994,20 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
>  __ro_after_init struct mm_struct *poking_mm;
>  __ro_after_init unsigned long poking_addr;
>
> -static void *__text_poke(void *addr, const void *opcode, size_t len)
> +static void text_poke_memcpy(void *dst, const void *src, size_t len)
> +{
> +       memcpy(dst, src, len);
> +}
> +
> +static void text_poke_memset(void *dst, const void *src, size_t len)
> +{
> +       int c = *(int *)src;
> +       memset(dst, c, len);
> +}
> +
> +typedef void text_poke_f(void *dst, const void *src, size_t len);
> +
> +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
>  {
>         bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
>         struct page *pages[2] = {NULL};
> @@ -1059,7 +1072,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
>         prev = use_temporary_mm(poking_mm);
>
>         kasan_disable_current();
> -       memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
> +       func((void *)poking_addr + offset_in_page(addr), src, len);
>         kasan_enable_current();
>
>         /*
> @@ -1091,7 +1104,8 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
>          * If the text does not match what we just wrote then something is
>          * fundamentally screwy; there's nothing we can really do about that.
>          */
> -       BUG_ON(memcmp(addr, opcode, len));
> +       if (func == text_poke_memcpy)
> +               BUG_ON(memcmp(addr, src, len));
>
>         local_irq_restore(flags);
>         pte_unmap_unlock(ptep, ptl);
> @@ -1118,7 +1132,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>  {
>         lockdep_assert_held(&text_mutex);
>
> -       return __text_poke(addr, opcode, len);
> +       return __text_poke(text_poke_memcpy, addr, opcode, len);
>  }
>
>  /**
> @@ -1137,7 +1151,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>   */
>  void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
>  {
> -       return __text_poke(addr, opcode, len);
> +       return __text_poke(text_poke_memcpy, addr, opcode, len);
>  }
>
>  /**
> @@ -1167,7 +1181,29 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
>
>                 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
>
> -               __text_poke((void *)ptr, opcode + patched, s);
> +               __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
> +               patched += s;
> +       }
> +       mutex_unlock(&text_mutex);
> +       return addr;
> +}
> +
> +void *text_poke_set(void *addr, int c, size_t len)
> +{
> +       unsigned long start = (unsigned long)addr;
> +       size_t patched = 0;
> +
> +       if (WARN_ON_ONCE(core_kernel_text(start)))
> +               return NULL;
> +
> +       mutex_lock(&text_mutex);
> +       while (patched < len) {
> +               unsigned long ptr = start + patched;
> +               size_t s;
> +
> +               s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
> +
> +               __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
>                 patched += s;
>         }
>         mutex_unlock(&text_mutex);

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-04-23  5:25 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-21  7:22 [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack Song Liu
2022-04-21 17:09 ` Linus Torvalds
2022-04-21 18:24   ` Alexei Starovoitov
2022-04-21 18:59     ` Linus Torvalds
2022-04-21 19:40       ` Song Liu
2022-04-21 21:28         ` Linus Torvalds
2022-04-21 21:52           ` Song Liu
2022-04-21 22:30             ` Linus Torvalds
2022-04-21 22:51               ` Song Liu
2022-04-21 23:10                 ` Linus Torvalds
2022-04-22  1:31                   ` Song Liu
2022-04-22  7:31                 ` Peter Zijlstra
2022-04-23  5:25                   ` Song Liu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.