* [PATCH bpf-next] selftests/bpf: two scale tests
@ 2019-04-12 21:41 Alexei Starovoitov
  2019-04-12 23:24 ` Song Liu
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Alexei Starovoitov @ 2019-04-12 21:41 UTC (permalink / raw)
  To: davem; +Cc: daniel, netdev, bpf, kernel-team

Add two tests to check that sequence of 1024 jumps is verifiable.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
 tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
 2 files changed, 88 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/verifier/scale.c

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index e2ebcaddbe78..6cb6a1074fd1 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -208,6 +208,76 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self)
 	self->retval = (uint32_t)res;
 }
 
+/* test the sequence of 1k jumps */
+static void bpf_fill_scale1(struct bpf_test *self)
+{
+	struct bpf_insn *insn = self->fill_insns;
+	int i = 0, k = 0;
+
+	insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	/* test to check that the sequence of 1024 jumps is acceptable */
+	while (k++ < 1024) {
+		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					 BPF_FUNC_get_prandom_u32);
+		insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2);
+		insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10);
+		insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+					-8 * (k % 64 + 1));
+	}
+	/* every jump adds 1024 steps to insn_processed, so to stay exactly
+	 * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT
+	 */
+	while (i < MAX_TEST_INSNS - 1025)
+		insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42);
+	insn[i] = BPF_EXIT_INSN();
+	self->prog_len = i + 1;
+	self->retval = 42;
+}
+
+/* test the sequence of 1k jumps in inner most function (function depth 8)*/
+static void bpf_fill_scale2(struct bpf_test *self)
+{
+	struct bpf_insn *insn = self->fill_insns;
+	int i = 0, k = 0;
+
+#define FUNC_NEST 7
+	for (k = 0; k < FUNC_NEST; k++) {
+		insn[i++] = BPF_CALL_REL(1);
+		insn[i++] = BPF_EXIT_INSN();
+	}
+	insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	/* test to check that the sequence of 1024 jumps is acceptable */
+	while (k++ < 1024) {
+		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					 BPF_FUNC_get_prandom_u32);
+		insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2);
+		insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10);
+		insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+					-8 * (k % (64 - 4 * FUNC_NEST) + 1));
+	}
+	/* every jump adds 1024 steps to insn_processed, so to stay exactly
+	 * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT
+	 */
+	while (i < MAX_TEST_INSNS - 1025)
+		insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42);
+	insn[i] = BPF_EXIT_INSN();
+	self->prog_len = i + 1;
+	self->retval = 42;
+}
+
+static void bpf_fill_scale(struct bpf_test *self)
+{
+	switch (self->retval) {
+	case 1:
+		return bpf_fill_scale1(self);
+	case 2:
+		return bpf_fill_scale2(self);
+	default:
+		self->prog_len = 0;
+		break;
+	}
+}
+
 /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */
 #define BPF_SK_LOOKUP(func)						\
 	/* struct bpf_sock_tuple tuple = {} */				\
diff --git a/tools/testing/selftests/bpf/verifier/scale.c b/tools/testing/selftests/bpf/verifier/scale.c
new file mode 100644
index 000000000000..7f868d4802e0
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/scale.c
@@ -0,0 +1,18 @@
+{
+	"scale: scale test 1",
+	.insns = { },
+	.data = { },
+	.fill_helper = bpf_fill_scale,
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.result = ACCEPT,
+	.retval = 1,
+},
+{
+	"scale: scale test 2",
+	.insns = { },
+	.data = { },
+	.fill_helper = bpf_fill_scale,
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.result = ACCEPT,
+	.retval = 2,
+},
-- 
2.20.0



* Re: [PATCH bpf-next] selftests/bpf: two scale tests
  2019-04-12 21:41 [PATCH bpf-next] selftests/bpf: two scale tests Alexei Starovoitov
@ 2019-04-12 23:24 ` Song Liu
  2019-04-12 23:32   ` Alexei Starovoitov
  2019-04-16  8:30 ` Daniel Borkmann
  2019-04-24 23:07 ` 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests) Jiong Wang
  2 siblings, 1 reply; 18+ messages in thread
From: Song Liu @ 2019-04-12 23:24 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S . Miller, Daniel Borkmann, Networking, bpf, Kernel Team

On Fri, Apr 12, 2019 at 2:41 PM Alexei Starovoitov <ast@kernel.org> wrote:
>
> Add two tests to check that sequence of 1024 jumps is verifiable.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>

Shall we add a test that goes beyond the 1M limit?

> ---
>  tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
>  tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
>  2 files changed, 88 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/verifier/scale.c
>
> diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
> index e2ebcaddbe78..6cb6a1074fd1 100644
> --- a/tools/testing/selftests/bpf/test_verifier.c
> +++ b/tools/testing/selftests/bpf/test_verifier.c
> @@ -208,6 +208,76 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self)
>         self->retval = (uint32_t)res;
>  }
>
> +/* test the sequence of 1k jumps */
> +static void bpf_fill_scale1(struct bpf_test *self)
> +{
> +       struct bpf_insn *insn = self->fill_insns;
> +       int i = 0, k = 0;
> +
> +       insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
> +       /* test to check that the sequence of 1024 jumps is acceptable */
> +       while (k++ < 1024) {
> +               insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
> +                                        BPF_FUNC_get_prandom_u32);
> +               insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2);
> +               insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10);
> +               insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
> +                                       -8 * (k % 64 + 1));
> +       }
> +       /* every jump adds 1024 steps to insn_processed, so to stay exactly
> +        * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT
> +        */
> +       while (i < MAX_TEST_INSNS - 1025)
> +               insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42);
> +       insn[i] = BPF_EXIT_INSN();
> +       self->prog_len = i + 1;
> +       self->retval = 42;
> +}
> +
> +/* test the sequence of 1k jumps in inner most function (function depth 8)*/
> +static void bpf_fill_scale2(struct bpf_test *self)
> +{
> +       struct bpf_insn *insn = self->fill_insns;
> +       int i = 0, k = 0;
> +
> +#define FUNC_NEST 7
> +       for (k = 0; k < FUNC_NEST; k++) {
> +               insn[i++] = BPF_CALL_REL(1);
> +               insn[i++] = BPF_EXIT_INSN();
> +       }
> +       insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
> +       /* test to check that the sequence of 1024 jumps is acceptable */
> +       while (k++ < 1024) {
> +               insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
> +                                        BPF_FUNC_get_prandom_u32);
> +               insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2);
> +               insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10);
> +               insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
> +                                       -8 * (k % (64 - 4 * FUNC_NEST) + 1));
> +       }
> +       /* every jump adds 1024 steps to insn_processed, so to stay exactly
> +        * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT
> +        */
> +       while (i < MAX_TEST_INSNS - 1025)
> +               insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42);
> +       insn[i] = BPF_EXIT_INSN();
> +       self->prog_len = i + 1;
> +       self->retval = 42;
> +}
> +
> +static void bpf_fill_scale(struct bpf_test *self)
> +{
> +       switch (self->retval) {
> +       case 1:
> +               return bpf_fill_scale1(self);
> +       case 2:
> +               return bpf_fill_scale2(self);
> +       default:
> +               self->prog_len = 0;
> +               break;
> +       }
> +}
> +
>  /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */
>  #define BPF_SK_LOOKUP(func)                                            \
>         /* struct bpf_sock_tuple tuple = {} */                          \
> diff --git a/tools/testing/selftests/bpf/verifier/scale.c b/tools/testing/selftests/bpf/verifier/scale.c
> new file mode 100644
> index 000000000000..7f868d4802e0
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/verifier/scale.c
> @@ -0,0 +1,18 @@
> +{
> +       "scale: scale test 1",
> +       .insns = { },
> +       .data = { },
> +       .fill_helper = bpf_fill_scale,
> +       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> +       .result = ACCEPT,
> +       .retval = 1,
> +},
> +{
> +       "scale: scale test 2",
> +       .insns = { },
> +       .data = { },
> +       .fill_helper = bpf_fill_scale,
> +       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> +       .result = ACCEPT,
> +       .retval = 2,
> +},
> --
> 2.20.0
>


* Re: [PATCH bpf-next] selftests/bpf: two scale tests
  2019-04-12 23:24 ` Song Liu
@ 2019-04-12 23:32   ` Alexei Starovoitov
  2019-04-15  5:59     ` Song Liu
  0 siblings, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2019-04-12 23:32 UTC (permalink / raw)
  To: Song Liu
  Cc: Alexei Starovoitov, David S . Miller, Daniel Borkmann,
	Networking, bpf, Kernel Team

On Fri, Apr 12, 2019 at 04:24:51PM -0700, Song Liu wrote:
> On Fri, Apr 12, 2019 at 2:41 PM Alexei Starovoitov <ast@kernel.org> wrote:
> >
> > Add two tests to check that sequence of 1024 jumps is verifiable.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> 
> Acked-by: Song Liu <songliubraving@fb.com>
> 
> Shall we add a test that go beyond the 1M limit?

1m is not a uapi limit. I'm working on a doc patch to stress that point.
Adding a test to check that it fails at 1m would kinda imply
that it is uapi, and I very much want to avoid that.

The purpose of these tests is to stress the verifier to its
internal limits, but not more.
In particular, in these two tests 1024, 8, 512, and another 1M
are limits too.



* Re: [PATCH bpf-next] selftests/bpf: two scale tests
  2019-04-12 23:32   ` Alexei Starovoitov
@ 2019-04-15  5:59     ` Song Liu
  0 siblings, 0 replies; 18+ messages in thread
From: Song Liu @ 2019-04-15  5:59 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, David S . Miller, Daniel Borkmann,
	Networking, bpf, Kernel Team

On Fri, Apr 12, 2019 at 4:32 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Apr 12, 2019 at 04:24:51PM -0700, Song Liu wrote:
> > On Fri, Apr 12, 2019 at 2:41 PM Alexei Starovoitov <ast@kernel.org> wrote:
> > >
> > > Add two tests to check that sequence of 1024 jumps is verifiable.
> > >
> > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> >
> > Acked-by: Song Liu <songliubraving@fb.com>
> >
> > Shall we add a test that go beyond the 1M limit?
>
> 1m is not uapi limit. I'm working on the doc patch to stress that point.
> Adding a test to check that it fails at 1m would kinda imply
> that it is uapi and I very much want to avoid that.
>
> The purpose of these tests is to stress the verifier to its
> internal limits, but not more.
> In particular in these two tests 1024, 8, 512, and another 1M
> are limits too.
>

Yeah, this makes sense. Thanks for the explanation.

Song


* Re: [PATCH bpf-next] selftests/bpf: two scale tests
  2019-04-12 21:41 [PATCH bpf-next] selftests/bpf: two scale tests Alexei Starovoitov
  2019-04-12 23:24 ` Song Liu
@ 2019-04-16  8:30 ` Daniel Borkmann
  2019-04-24 23:07 ` 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests) Jiong Wang
  2 siblings, 0 replies; 18+ messages in thread
From: Daniel Borkmann @ 2019-04-16  8:30 UTC (permalink / raw)
  To: Alexei Starovoitov, davem; +Cc: netdev, bpf, kernel-team

On 04/12/2019 11:41 PM, Alexei Starovoitov wrote:
> Add two tests to check that sequence of 1024 jumps is verifiable.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Applied, thanks!


* 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-12 21:41 [PATCH bpf-next] selftests/bpf: two scale tests Alexei Starovoitov
  2019-04-12 23:24 ` Song Liu
  2019-04-16  8:30 ` Daniel Borkmann
@ 2019-04-24 23:07 ` Jiong Wang
  2019-04-25  4:33   ` Alexei Starovoitov
  2 siblings, 1 reply; 18+ messages in thread
From: Jiong Wang @ 2019-04-24 23:07 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: daniel, netdev, bpf, Jakub Kicinski, oss-drivers


Alexei Starovoitov writes:

> Add two tests to check that sequence of 1024 jumps is verifiable.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
>  tools/testing/selftests/bpf/verifier/scale.c | 18 +++++

I am rebasing the 32-bit opt pass on top of the latest bpf-next and found these new
tests take more than 20 minutes to run and still had not finished after that.

The reason is that the following insn filling inside bpf_fill_scale1 generates
nearly 1M insns whose results are recognized as safe to be poisoned.

  bpf_fill_scale1:
    while (i < MAX_TEST_INSNS - 1025)
      insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);

For each hi32 poisoning, there will be one call to "bpf_patch_insn_data",
which actually is not cheap (adjusting jump insns, insn aux info, etc). Now,
1M calls to it have exhausted server resources as described; after 20 minutes
the run had still not finished.

For real world applications, we don't do hi32 poisoning, and there isn't much
lo32 zext. Benchmarking those bpf programs inside Cilium shows the final
zext pass adds about 8% ~ 15% to verification time.

The zext pass built on top of "bpf_patch_insn_data" looks more and more like
it is not the best approach to utilizing the read32 analysis results.

Previously, in the v1 cover letter, I listed some of my other thoughts on how to
utilize the liveness analysis results:

   1 Minor change on back-end JIT hook, also pass aux_insn information to
     back-ends so they could have per insn information and they could do
     zero extension for the marked insn themselves using the most
     efficient native insn.

   2 Introduce zero extension insn for eBPF. Then verifier could insert
     the new zext insn instead of lshift + rshift. zext could be JITed
     more efficiently.

   3 Otherwise JIT back-ends need to do peephole to catch lshift + rshift
     and turn them into native zext.

Thinking more on this, perhaps we should just go with approach 1, which
actually doesn't require much change, I guess.

There actually is no need to change the back-end JIT hook. After we finish
liveness analysis, just copy the insn zext info from env->insn_aux_data[*].zext_dst
to newly allocated storage inside something like
"bpf_prog->aux->insn_zext_dst" (a bool * pointer, with 1 * prog_len storage).
Such optimization information is then available to the JIT back-end automatically,
because the current JIT hook is

  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)

After bpf_int_jit_compile has finished, the dynamically allocated storage could be
freed; there is no need to keep it around for the bpf_prog's lifetime.
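
A rough sketch of the verifier-side copy (the "insn_zext_dst" field and the
helper name below are just placeholders, nothing like this exists yet):

  /* run after liveness analysis, right before bpf_int_jit_compile() */
  static int copy_zext_info_to_prog(struct bpf_verifier_env *env)
  {
          struct bpf_prog *prog = env->prog;
          bool *zext;
          u32 i;

          zext = kcalloc(prog->len, sizeof(*zext), GFP_KERNEL);
          if (!zext)
                  return -ENOMEM;
          /* zext_dst is the per-insn flag produced by this patch set */
          for (i = 0; i < prog->len; i++)
                  zext[i] = env->insn_aux_data[i].zext_dst;
          prog->aux->insn_zext_dst = zext;        /* placeholder field */
          return 0;
  }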

Back-ends also have finer control over how to do the zero extension, so concerns
like https://www.spinics.net/lists/netdev/msg564489.html could be addressed
naturally, and there is no need to introduce a new BPF zero-extension instruction.

I intend to send out an updated version using approach 1 so we can see how it
looks. Comments?

Regards,
Jiong


* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-24 23:07 ` 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests) Jiong Wang
@ 2019-04-25  4:33   ` Alexei Starovoitov
  2019-04-25  7:25     ` Jiong Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2019-04-25  4:33 UTC (permalink / raw)
  To: Jiong Wang
  Cc: Alexei Starovoitov, daniel, netdev, bpf, Jakub Kicinski, oss-drivers

On Thu, Apr 25, 2019 at 12:07:06AM +0100, Jiong Wang wrote:
> 
> Alexei Starovoitov writes:
> 
> > Add two tests to check that sequence of 1024 jumps is verifiable.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
> >  tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
> 
> I am rebasing 32-bit opt pass on top of latest bpf-next and found these new
> tests take more than 20 minutes to run and had not finished after that.
> 
> The reason the following insn filling insde bpf_fill_scale1 is generating
> nearly 1M insn whose results are recognized as safe to be poisoned.
> 
>   bpf_fill_scale1:
>     while (i < MAX_TEST_INSNS - 1025)
>       insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
> 
> For each hi32 poisoning, there will be one call to "bpf_patch_insn_data"
> which actually is not cheap (adjust jump insns, insn aux info etc). Now,
> 1M call to it has exhausted server resources as described, 20minutes running
> still not finished.
> 
> For real world applications, we don't do hi32 poisoning, and there isn't much
> lo32 zext. Benchmarking those bpf programs inside Cilium shows the final
> zext pass adds about 8% ~ 15% verification time.
> 
> The zext pass based on top of "bpf_patch_insn_data" looks more and more is
> not the best approach to utilize the read32 analysis results.
> 
> Previously, in v1 cover letter, I listed some of my other thoughts on how to
> utilize the liveness analysis results:
> 
>    1 Minor change on back-end JIT hook, also pass aux_insn information to
>      back-ends so they could have per insn information and they could do
>      zero extension for the marked insn themselves using the most
>      efficient native insn.
> 
>    2 Introduce zero extension insn for eBPF. Then verifier could insert
>      the new zext insn instead of lshift + rshift. zext could be JITed
>      more efficiently.
> 
>    3 Otherwise JIT back-ends need to do peephole to catch lshift + rshift
>      and turn them into native zext.

all options sound like hacks to work around the inefficient bpf_patch_insn_data().
Especially option 2, which will work only because a single insn is replaced
with another insn?
Let's fix the algo of bpf_patch_insn_data() instead, so that 1 insn -> 2+ insns
is also fast.
The main point of bumping the internal limits to 1M and of these tests
was to expose such algorithmic inefficiencies.



* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-25  4:33   ` Alexei Starovoitov
@ 2019-04-25  7:25     ` Jiong Wang
  2019-04-25  9:49       ` Jiong Wang
  2019-04-25 22:10       ` Alexei Starovoitov
  0 siblings, 2 replies; 18+ messages in thread
From: Jiong Wang @ 2019-04-25  7:25 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jiong Wang, Alexei Starovoitov, daniel, netdev, bpf,
	Jakub Kicinski, oss-drivers


Alexei Starovoitov writes:

> On Thu, Apr 25, 2019 at 12:07:06AM +0100, Jiong Wang wrote:
>> 
>> Alexei Starovoitov writes:
>> 
>> > Add two tests to check that sequence of 1024 jumps is verifiable.
>> >
>> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> > ---
>> >  tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
>> >  tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
>> 
>> I am rebasing 32-bit opt pass on top of latest bpf-next and found these new
>> tests take more than 20 minutes to run and had not finished after that.
>> 
>> The reason the following insn filling insde bpf_fill_scale1 is generating
>> nearly 1M insn whose results are recognized as safe to be poisoned.
>> 
>>   bpf_fill_scale1:
>>     while (i < MAX_TEST_INSNS - 1025)
>>       insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
>> 
>> For each hi32 poisoning, there will be one call to "bpf_patch_insn_data"
>> which actually is not cheap (adjust jump insns, insn aux info etc). Now,
>> 1M call to it has exhausted server resources as described, 20minutes running
>> still not finished.
>> 
>> For real world applications, we don't do hi32 poisoning, and there isn't much
>> lo32 zext. Benchmarking those bpf programs inside Cilium shows the final
>> zext pass adds about 8% ~ 15% verification time.
>> 
>> The zext pass based on top of "bpf_patch_insn_data" looks more and more is
>> not the best approach to utilize the read32 analysis results.
>> 
>> Previously, in v1 cover letter, I listed some of my other thoughts on how to
>> utilize the liveness analysis results:
>> 
>>    1 Minor change on back-end JIT hook, also pass aux_insn information to
>>      back-ends so they could have per insn information and they could do
>>      zero extension for the marked insn themselves using the most
>>      efficient native insn.
>> 
>>    2 Introduce zero extension insn for eBPF. Then verifier could insert
>>      the new zext insn instead of lshift + rshift. zext could be JITed
>>      more efficiently.
>> 
>>    3 Otherwise JIT back-ends need to do peephole to catch lshift + rshift
>>      and turn them into native zext.
>
> all options sounds like hacks to workaround inefficient bpf_patch_insn_data().
> Especially option 2 will work only because single insn is replaced
> with another insn ?

Option 1 should be a generic solution. It passes verifier analysis
results generated by the insn walk down to the JIT back-ends. The information
passed down could be any analysis result useful for JIT code-gen.

> Let's fix the algo of bpf_patch_insn_data() instead, so that 1 insn -> 2+ insn
> is also fast.

The issue with 1 insn -> 2+ insns should be the call to bpf_adj_branches,
which does another for_each_insn_in_prog traversal, so the zext
insertion becomes something like:

  for_each_insn_in_prog
  ...
     if (zext)
     ...
       for_each_insn_in_prog

which is quadratic. One solution is to chain all branch insns during a
previous insn traversal, for example the cfg check, and keep the information
somewhere in bpf_prog (env->insn_aux_data is a good place to keep such
information, but the insn patch helpers are supposed to work with bpf_prog);
then bpf_adj_branches could traverse this chain instead of iterating
through all insns.
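
Roughly, the shape I have in mind (all names below are made up;
"bpf_adj_one_branch" stands for the per-insn fix-up that bpf_adj_branches
already does inside its loop):

  /* indexes of all JMP-class insns, collected once, e.g. in check_subprogs() */
  struct jmp_list {
          u32 *insn_idx;
          u32  cnt;
  };

  static void bpf_adj_branches_list(struct bpf_prog *prog, struct jmp_list *jl,
                                    u32 pos, u32 delta)
  {
          u32 i;

          /* visit only the insns that can actually carry a branch offset */
          for (i = 0; i < jl->cnt; i++)
                  bpf_adj_one_branch(&prog->insnsi[jl->insn_idx[i]], pos, delta);
  }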

Regards,
Jiong



* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-25  7:25     ` Jiong Wang
@ 2019-04-25  9:49       ` Jiong Wang
  2019-04-25 22:10       ` Alexei Starovoitov
  1 sibling, 0 replies; 18+ messages in thread
From: Jiong Wang @ 2019-04-25  9:49 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, daniel, netdev, bpf, Jakub Kicinski, oss-drivers


> On 25 Apr 2019, at 08:25, Jiong Wang <jiong.wang@netronome.com> wrote:
> 
> 
> Alexei Starovoitov writes:
> 
>> On Thu, Apr 25, 2019 at 12:07:06AM +0100, Jiong Wang wrote:
>>> 
>>> Alexei Starovoitov writes:
>>> 
>>>> Add two tests to check that sequence of 1024 jumps is verifiable.
>>>> 
>>>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>>>> ---
>>>> tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
>>>> tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
>>> 
>>> I am rebasing 32-bit opt pass on top of latest bpf-next and found these new
>>> tests take more than 20 minutes to run and had not finished after that.
>>> 
>>> The reason the following insn filling insde bpf_fill_scale1 is generating
>>> nearly 1M insn whose results are recognized as safe to be poisoned.
>>> 
>>>  bpf_fill_scale1:
>>>    while (i < MAX_TEST_INSNS - 1025)
>>>      insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
>>> 
>>> For each hi32 poisoning, there will be one call to "bpf_patch_insn_data"
>>> which actually is not cheap (adjust jump insns, insn aux info etc). Now,
>>> 1M call to it has exhausted server resources as described, 20minutes running
>>> still not finished.
>>> 
>>> For real world applications, we don't do hi32 poisoning, and there isn't much
>>> lo32 zext. Benchmarking those bpf programs inside Cilium shows the final
>>> zext pass adds about 8% ~ 15% verification time.
>>> 
>>> The zext pass based on top of "bpf_patch_insn_data" looks more and more is
>>> not the best approach to utilize the read32 analysis results.
>>> 
>>> Previously, in v1 cover letter, I listed some of my other thoughts on how to
>>> utilize the liveness analysis results:
>>> 
>>>   1 Minor change on back-end JIT hook, also pass aux_insn information to
>>>     back-ends so they could have per insn information and they could do
>>>     zero extension for the marked insn themselves using the most
>>>     efficient native insn.
>>> 
>>>   2 Introduce zero extension insn for eBPF. Then verifier could insert
>>>     the new zext insn instead of lshift + rshift. zext could be JITed
>>>     more efficiently.
>>> 
>>>   3 Otherwise JIT back-ends need to do peephole to catch lshift + rshift
>>>     and turn them into native zext.
>> 
>> all options sounds like hacks to workaround inefficient bpf_patch_insn_data().
>> Especially option 2 will work only because single insn is replaced
>> with another insn ?
> 
> Option 1 should be a generic solution. It is passing verifier analysis
> results generated by insn walk down to JIT back-ends. The information
> passed down could be any analysis result useful for JIT code-gen.
> 
>> Let's fix the algo of bpf_patch_insn_data() instead, so that 1 insn -> 2+ insn
>> is also fast.
> 
> The issue with 1 insn -> 2+ insn should be calling of bpf_adj_branches
> which is doing another for_each_insn_in_prog traversal, so the zext
> insertion becomes something like:
> 
>  for_each_insn_in_prog
>  ...
>     if (zext)
>     ...
>       for_each_insn_in_prog
> 
> which is quadratic. One solution

s/solution/mitigation/

> is we chain all branch insns during
> previous insn traversal in for example cfg check, and keep the information
> somewhere info bpf_prog (env->insn_aux_data is a good place to keep such
> information, but insn patch helpers are supposed to work with bpf_prog)
> then bpf_adj_branches could traversal this chain instead of iterating
> through all insns.
> 
> Regards,
> Jiong



* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-25  7:25     ` Jiong Wang
  2019-04-25  9:49       ` Jiong Wang
@ 2019-04-25 22:10       ` Alexei Starovoitov
  2019-04-26 13:06         ` Jiong Wang
  1 sibling, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2019-04-25 22:10 UTC (permalink / raw)
  To: Jiong Wang
  Cc: Alexei Starovoitov, daniel, netdev, bpf, Jakub Kicinski, oss-drivers

On Thu, Apr 25, 2019 at 08:25:44AM +0100, Jiong Wang wrote:
> 
> Alexei Starovoitov writes:
> 
> > On Thu, Apr 25, 2019 at 12:07:06AM +0100, Jiong Wang wrote:
> >> 
> >> Alexei Starovoitov writes:
> >> 
> >> > Add two tests to check that sequence of 1024 jumps is verifiable.
> >> >
> >> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> >> > ---
> >> >  tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
> >> >  tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
> >> 
> >> I am rebasing 32-bit opt pass on top of latest bpf-next and found these new
> >> tests take more than 20 minutes to run and had not finished after that.
> >> 
> >> The reason the following insn filling insde bpf_fill_scale1 is generating
> >> nearly 1M insn whose results are recognized as safe to be poisoned.
> >> 
> >>   bpf_fill_scale1:
> >>     while (i < MAX_TEST_INSNS - 1025)
> >>       insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
> >> 
> >> For each hi32 poisoning, there will be one call to "bpf_patch_insn_data"
> >> which actually is not cheap (adjust jump insns, insn aux info etc). Now,
> >> 1M call to it has exhausted server resources as described, 20minutes running
> >> still not finished.
> >> 
> >> For real world applications, we don't do hi32 poisoning, and there isn't much
> >> lo32 zext. Benchmarking those bpf programs inside Cilium shows the final
> >> zext pass adds about 8% ~ 15% verification time.
> >> 
> >> The zext pass based on top of "bpf_patch_insn_data" looks more and more is
> >> not the best approach to utilize the read32 analysis results.
> >> 
> >> Previously, in v1 cover letter, I listed some of my other thoughts on how to
> >> utilize the liveness analysis results:
> >> 
> >>    1 Minor change on back-end JIT hook, also pass aux_insn information to
> >>      back-ends so they could have per insn information and they could do
> >>      zero extension for the marked insn themselves using the most
> >>      efficient native insn.
> >> 
> >>    2 Introduce zero extension insn for eBPF. Then verifier could insert
> >>      the new zext insn instead of lshift + rshift. zext could be JITed
> >>      more efficiently.
> >> 
> >>    3 Otherwise JIT back-ends need to do peephole to catch lshift + rshift
> >>      and turn them into native zext.
> >
> > all options sounds like hacks to workaround inefficient bpf_patch_insn_data().
> > Especially option 2 will work only because single insn is replaced
> > with another insn ?
> 
> Option 1 should be a generic solution. It is passing verifier analysis
> results generated by insn walk down to JIT back-ends. The information
> passed down could be any analysis result useful for JIT code-gen.
> 
> > Let's fix the algo of bpf_patch_insn_data() instead, so that 1 insn -> 2+ insn
> > is also fast.
> 
> The issue with 1 insn -> 2+ insn should be calling of bpf_adj_branches
> which is doing another for_each_insn_in_prog traversal, so the zext
> insertion becomes something like:
> 
>   for_each_insn_in_prog
>   ...
>      if (zext)
>      ...
>        for_each_insn_in_prog
> 
> which is quadratic. One solution is we chain all branch insns during
> previous insn traversal in for example cfg check, and keep the information
> somewhere info bpf_prog (env->insn_aux_data is a good place to keep such
> information, but insn patch helpers are supposed to work with bpf_prog)
> then bpf_adj_branches could traversal this chain instead of iterating
> through all insns.

I don't think it will make it much faster. There could be just as many
jumps as there are instructions.
Note that bpf_patch_insn_single() is calling bpf_adj_branches() twice too.
And dead_code + convert_ctx + fixup_bpf_calls are calling
bpf_patch_insn_single() a lot.
How about, before the dead_code pass, we convert the program into basic-block
format, patch it all, and then convert from bb back to offsets?
Patching will become very cheap, since no loop over the program will be
necessary. A jump from bb-N to bb-M will stay as-is regardless
of the amount of patching done inside each bb.
The loops inside these patching passes will be converted from:
for (i = 0; i < insn_cnt; i++, insn++)
into:
for each bb
  for each insn in bb
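
Roughly (none of these types exist today, it's just to show the shape):

  struct bpf_bb {
          struct bpf_insn *insns; /* bb-private buffer, grows when patched */
          u32 cnt;
          u32 succ[2];            /* successors as bb numbers, not insn offsets */
  };

Patching then only touches one bb's buffer and never has to fix up branches,
because jumps refer to bb numbers until the final linearisation pass lays the
bbs out and turns succ[] back into real branch offsets.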

As far as option 1 "also pass aux_insn information to JITs" goes...
in theory it's fine, but it looks like a big refactoring of existing code.
So if you want to make this bb conversion step 2 and unblock the
current patch set faster, I suggest going with option 2, "Introduce zero extension insn".



* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-25 22:10       ` Alexei Starovoitov
@ 2019-04-26 13:06         ` Jiong Wang
  2019-04-26 14:50           ` Edward Cree
  2019-04-27  3:05           ` Alexei Starovoitov
  0 siblings, 2 replies; 18+ messages in thread
From: Jiong Wang @ 2019-04-26 13:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jiong Wang, Alexei Starovoitov, daniel, netdev, bpf,
	Jakub Kicinski, oss-drivers


Alexei Starovoitov writes:

> On Thu, Apr 25, 2019 at 08:25:44AM +0100, Jiong Wang wrote:
>> 
>> Alexei Starovoitov writes:
>> 
>> > On Thu, Apr 25, 2019 at 12:07:06AM +0100, Jiong Wang wrote:
>> >> 
>> >> Alexei Starovoitov writes:
>> >> 
>> >> > Add two tests to check that sequence of 1024 jumps is verifiable.
>> >> >
>> >> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> >> > ---
>> >> >  tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
>> >> >  tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
>> >> 
>> >> I am rebasing 32-bit opt pass on top of latest bpf-next and found these new
>> >> tests take more than 20 minutes to run and had not finished after that.
>> >> 
>> >> The reason the following insn filling insde bpf_fill_scale1 is generating
>> >> nearly 1M insn whose results are recognized as safe to be poisoned.
>> >> 
>> >>   bpf_fill_scale1:
>> >>     while (i < MAX_TEST_INSNS - 1025)
>> >>       insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
>> >> 
>> >> For each hi32 poisoning, there will be one call to "bpf_patch_insn_data"
>> >> which actually is not cheap (adjust jump insns, insn aux info etc). Now,
>> >> 1M call to it has exhausted server resources as described, 20minutes running
>> >> still not finished.
>> >> 
>> >> For real world applications, we don't do hi32 poisoning, and there isn't much
>> >> lo32 zext. Benchmarking those bpf programs inside Cilium shows the final
>> >> zext pass adds about 8% ~ 15% verification time.
>> >> 
>> >> The zext pass based on top of "bpf_patch_insn_data" looks more and more is
>> >> not the best approach to utilize the read32 analysis results.
>> >> 
>> >> Previously, in v1 cover letter, I listed some of my other thoughts on how to
>> >> utilize the liveness analysis results:
>> >> 
>> >>    1 Minor change on back-end JIT hook, also pass aux_insn information to
>> >>      back-ends so they could have per insn information and they could do
>> >>      zero extension for the marked insn themselves using the most
>> >>      efficient native insn.
>> >> 
>> >>    2 Introduce zero extension insn for eBPF. Then verifier could insert
>> >>      the new zext insn instead of lshift + rshift. zext could be JITed
>> >>      more efficiently.
>> >> 
>> >>    3 Otherwise JIT back-ends need to do peephole to catch lshift + rshift
>> >>      and turn them into native zext.
>> >
>> > all options sounds like hacks to workaround inefficient bpf_patch_insn_data().
>> > Especially option 2 will work only because single insn is replaced
>> > with another insn ?
>> 
>> Option 1 should be a generic solution. It is passing verifier analysis
>> results generated by insn walk down to JIT back-ends. The information
>> passed down could be any analysis result useful for JIT code-gen.
>> 
>> > Let's fix the algo of bpf_patch_insn_data() instead, so that 1 insn -> 2+ insn
>> > is also fast.
>> 
>> The issue with 1 insn -> 2+ insn should be calling of bpf_adj_branches
>> which is doing another for_each_insn_in_prog traversal, so the zext
>> insertion becomes something like:
>> 
>>   for_each_insn_in_prog
>>   ...
>>      if (zext)
>>      ...
>>        for_each_insn_in_prog
>> 
>> which is quadratic. One solution is we chain all branch insns during
>> previous insn traversal in for example cfg check, and keep the information
>> somewhere info bpf_prog (env->insn_aux_data is a good place to keep such
>> information, but insn patch helpers are supposed to work with bpf_prog)
>> then bpf_adj_branches could traversal this chain instead of iterating
>> through all insns.

Thanks very much for the feedback.

> I don't think it will make it much faster. There could be just as many
> jumps as there are instructions.

Benchmarked a basic implementation on a couple of bpf programs in the Cilium repo
(the impl is a relocation list kept in an array built as a by-product of
check_subprogs; during patching, walk this relocation list instead of all
insns; the time is averaged over 10 runs):

 - The time spent on the zero-extension pass is ~30% less
 - The time spent on convert_ctx_accesses + fixup_bpf_call +
   fixup_call_args is ~15% less
 - The time spent on the dead code elim pass is unchanged, which makes sense
   as dead code is rare, so the patch infra is not triggered much.

So, it looks like this could help a little bit on daily programs, but I agree
there are no fundamental improvements; it is still N(insn_needs_patch) * N(branch),
and both N could go very big.

> Note that bpf_patch_insn_single() is calling bpf_adj_branches() twice too.
> And dead_code + convert_ctx + fixup_bpf_calls are calling
> bpf_patch_insn_single() a lot.
> How about before dead_code pass we convert the program into basic-block
> format, patch it all, and then convert from bb back to offsets.
> Patching will become very cheap, since no loop over program will be
> necessary. A jump from bb-N to bb-M will stay as-is regardless
> of amount of patching was done inside each bb.
> The loops inside these patching passes will be converted from:
> for (i = 0; i < insn_cnt; i++, insn++)
> into:
> for each bb
>   for each insn in bb

Interesting. If I am understanding correctly, each BB then needs to support
dynamic insn buffer resizing. And after all insn patching has finished and all
BBs are finalized, we then linearize the BBs (in a best order) to generate the
final bpf image.

> As far as option 1 "also pass aux_insn information to JITs"...
> in theory it's fine, but looks like big refactoring to existing code.

Will do a quick exploration; it might turn out to be a small change.

> So if you want to make this bb conversion as step 2 and unblock the
> current patch set faster I suggest to go with option 2 "Introduce zero
> extension insn".

On second thought, even with a zero extension insn introduced, it is inserted
after the sub-register write insn, so we are still doing an insert, *not* a
replacement; insn_delta inside bpf_patch_insn_single will be 1, so the slow
path will always be taken (memmove + bpf_prog_realloc + 2 x
bpf_adj_branches).

For the 1M-scale test, bpf_patch_insn_single is triggered because of hi32
poisoning, not lo32 zext. So we just need to change MOV32 to MOV64 in the
testcase, which doesn't break the initial testing purpose of this testcase
from my understanding. This then could avoid 1M calls to
bpf_patch_insn_single and pass the test after the 32-bit opt patch set is applied.

Without this change and with hi32 randomization enabled, the scale tests will
still hang until the insn patch infra is improved.

@@ -228,7 +228,7 @@ static void bpf_fill_scale1(struct bpf_test *self)           
-             insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42);
+             insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);

This change is not to paper over the underlying issue. We now know the
existing insn patch infra doesn't scale to the million level, so we could work
on improving it in the next step.

At the same time, the issue exposed by hi32 poisoning does raise the alarm
that there could be the same issue for lo32 zext; therefore this patch set
doesn't scale if there are lots of insns to be zero extended, even though
this may not happen in a real world bpf prog.

IMHO, avoiding insn patching when possible might always be better. So,
if option 1 turns out to also generate a clean patch set and introduce small
changes, I am going to follow it in the updated version.

Please let me know if you have a different opinion.

Regards,
Jiong


* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-26 13:06         ` Jiong Wang
@ 2019-04-26 14:50           ` Edward Cree
  2019-04-27  3:11             ` Alexei Starovoitov
  2019-04-27  3:05           ` Alexei Starovoitov
  1 sibling, 1 reply; 18+ messages in thread
From: Edward Cree @ 2019-04-26 14:50 UTC (permalink / raw)
  To: Jiong Wang, Alexei Starovoitov
  Cc: Alexei Starovoitov, daniel, netdev, bpf, Jakub Kicinski, oss-drivers

On 26/04/2019 14:06, Jiong Wang wrote:
> Alexei Starovoitov writes:
>> Note that bpf_patch_insn_single() is calling bpf_adj_branches() twice too.
>> And dead_code + convert_ctx + fixup_bpf_calls are calling
>> bpf_patch_insn_single() a lot.
>> How about before dead_code pass we convert the program into basic-block
>> format, patch it all, and then convert from bb back to offsets.
>> Patching will become very cheap, since no loop over program will be
>> necessary. A jump from bb-N to bb-M will stay as-is regardless
>> of amount of patching was done inside each bb.
>> The loops inside these patching passes will be converted from:
>> for (i = 0; i < insn_cnt; i++, insn++)
>> into:
>> for each bb
>>   for each insn in bb
> Interesting. If I am understanding correctly, BB then needs to support
> dynamic insn buffer resize. And after all insn patching finished, all BBs
> are finalized, we then linearized BBs (in a best order) to generate the
> final bpf image.
Does any verifier metadata ever take the index of an insn that was added by
 patching?  If not, then we could have, instead of an array of insns, an
 array of list_heads, each of which is a list of insns (plus aux data etc.).
At entry the original program is converted into an array of 1-element lists.
On patching we edit the list, which doesn't change the index of any insn.
Then after all insn patching finishes, we linearise as above.
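
Something like this, purely a sketch (none of these types exist in the verifier):

  struct bpf_insn_node {
          struct list_head node;
          struct bpf_insn insn;
          /* per-insn aux data could hang off here too */
  };

  /* the prog becomes an array of orig_len list_heads, each starting out
   * as a single-node list; patching insn i just list_add()s new nodes
   * on lists[i], so no insn index ever changes.
   *
   * linearising once at the end:
   */
  k = 0;
  for (i = 0; i < orig_len; i++)
          list_for_each_entry(n, &lists[i], node)
                  out[k++] = n->insn;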

Just a thought,
-Ed


* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-26 13:06         ` Jiong Wang
  2019-04-26 14:50           ` Edward Cree
@ 2019-04-27  3:05           ` Alexei Starovoitov
  2019-04-28 22:11             ` Jiong Wang
  1 sibling, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2019-04-27  3:05 UTC (permalink / raw)
  To: Jiong Wang
  Cc: Alexei Starovoitov, daniel, netdev, bpf, Jakub Kicinski, oss-drivers

On Fri, Apr 26, 2019 at 02:06:33PM +0100, Jiong Wang wrote:
> 
> Alexei Starovoitov writes:
> 
> > On Thu, Apr 25, 2019 at 08:25:44AM +0100, Jiong Wang wrote:
> >> 
> >> Alexei Starovoitov writes:
> >> 
> >> > On Thu, Apr 25, 2019 at 12:07:06AM +0100, Jiong Wang wrote:
> >> >> 
> >> >> Alexei Starovoitov writes:
> >> >> 
> >> >> > Add two tests to check that sequence of 1024 jumps is verifiable.
> >> >> >
> >> >> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> >> >> > ---
> >> >> >  tools/testing/selftests/bpf/test_verifier.c  | 70 ++++++++++++++++++++
> >> >> >  tools/testing/selftests/bpf/verifier/scale.c | 18 +++++
> >> >> 
> >> >> I am rebasing 32-bit opt pass on top of latest bpf-next and found these new
> >> >> tests take more than 20 minutes to run and had not finished after that.
> >> >> 
> >> >> The reason the following insn filling insde bpf_fill_scale1 is generating
> >> >> nearly 1M insn whose results are recognized as safe to be poisoned.
> >> >> 
> >> >>   bpf_fill_scale1:
> >> >>     while (i < MAX_TEST_INSNS - 1025)
> >> >>       insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
> >> >> 
> >> >> For each hi32 poisoning, there will be one call to "bpf_patch_insn_data"
> >> >> which actually is not cheap (adjust jump insns, insn aux info etc). Now,
> >> >> 1M call to it has exhausted server resources as described, 20minutes running
> >> >> still not finished.
> >> >> 
> >> >> For real world applications, we don't do hi32 poisoning, and there isn't much
> >> >> lo32 zext. Benchmarking those bpf programs inside Cilium shows the final
> >> >> zext pass adds about 8% ~ 15% verification time.
> >> >> 
> >> >> The zext pass based on top of "bpf_patch_insn_data" looks more and more is
> >> >> not the best approach to utilize the read32 analysis results.
> >> >> 
> >> >> Previously, in v1 cover letter, I listed some of my other thoughts on how to
> >> >> utilize the liveness analysis results:
> >> >> 
> >> >>    1 Minor change on back-end JIT hook, also pass aux_insn information to
> >> >>      back-ends so they could have per insn information and they could do
> >> >>      zero extension for the marked insn themselves using the most
> >> >>      efficient native insn.
> >> >> 
> >> >>    2 Introduce zero extension insn for eBPF. Then verifier could insert
> >> >>      the new zext insn instead of lshift + rshift. zext could be JITed
> >> >>      more efficiently.
> >> >> 
> >> >>    3 Otherwise JIT back-ends need to do peephole to catch lshift + rshift
> >> >>      and turn them into native zext.
> >> >
> >> > all options sounds like hacks to workaround inefficient bpf_patch_insn_data().
> >> > Especially option 2 will work only because single insn is replaced
> >> > with another insn ?
> >> 
> >> Option 1 should be a generic solution. It is passing verifier analysis
> >> results generated by insn walk down to JIT back-ends. The information
> >> passed down could be any analysis result useful for JIT code-gen.
> >> 
> >> > Let's fix the algo of bpf_patch_insn_data() instead, so that 1 insn -> 2+ insn
> >> > is also fast.
> >> 
> >> The issue with 1 insn -> 2+ insn should be calling of bpf_adj_branches
> >> which is doing another for_each_insn_in_prog traversal, so the zext
> >> insertion becomes something like:
> >> 
> >>   for_each_insn_in_prog
> >>   ...
> >>      if (zext)
> >>      ...
> >>        for_each_insn_in_prog
> >> 
> >> which is quadratic. One solution is we chain all branch insns during
> >> previous insn traversal in for example cfg check, and keep the information
> >> somewhere info bpf_prog (env->insn_aux_data is a good place to keep such
> >> information, but insn patch helpers are supposed to work with bpf_prog)
> >> then bpf_adj_branches could traversal this chain instead of iterating
> >> through all insns.
> 
> Thanks very much for the feedbacks.
> 
> > I don't think it will make it much faster. There could be just as many
> > jumps as there are instructions.
> 
> Benchmarked a basic implemention on a couple of bpf programs in Cilium repo
> (the impl is a relocation list kept in a array built as by-product of
> check_subprogs. during patch, walk this relocation list instead of all
> insns. Then calculate time by 10 times run and take average)
> 
>  - The time spent on zero-extension pass is ~30% less
>  - The time spent on convert_ctx_accesses + fixup_bpf_call +
>    fixup_call_args is ~15% less
>  - The time spent on dead code elim pass is not changed which makes sense
>    as dead code is rare so patch infra is not triggered much.
> 
> So, looks like could help a little bit on daily program, but agree no
> fundamental improvements, it is still N(insn_needs_patch) * N(branch), and
> both N could go very big.
> 
> > Note that bpf_patch_insn_single() is calling bpf_adj_branches() twice too.
> > And dead_code + convert_ctx + fixup_bpf_calls are calling
> > bpf_patch_insn_single() a lot.
> > How about before dead_code pass we convert the program into basic-block
> > format, patch it all, and then convert from bb back to offsets.
> > Patching will become very cheap, since no loop over program will be
> > necessary. A jump from bb-N to bb-M will stay as-is regardless
> > of amount of patching was done inside each bb.
> > The loops inside these patching passes will be converted from:
> > for (i = 0; i < insn_cnt; i++, insn++)
> > into:
> > for each bb
> >   for each insn in bb
> 
> Interesting. If I am understanding correctly, BB then needs to support
> dynamic insn buffer resize. And after all insn patching finished, all BBs
> are finalized, we then linearized BBs (in a best order) to generate the
> final bpf image.

dynamic BB resize could be done similarly to the existing prog resize.
It grows in page increments.

> > As far as option 1 "also pass aux_insn information to JITs"...
> > in theory it's fine, but looks like big refactoring to existing code.
> 
> Will do quick explore, might turn out to be small change.
> 
> > So if you want to make this bb conversion as step 2 and unblock the
> > current patch set faster I suggest to go with option 2 "Introduce zero
> > extension insn".
> 
> A second think, even zero extension insn introduced, it is inserted after
> the sub-register write insn, so we are still doing insert *not*
> replacement, insn_delta inside bpf_patch_insn_single will be 1, so the slow
> path will always be taken (memmove + bpf_prog_realloc + 2 x
> bpf_adj_branches).

ahh. right.

> For the 1M-scale test, bpf_patch_insn_single is triggered because of hi32
> poisoning, not lo32 zext. So we just need to change MOV32 to MOV64 in the
> testcase which doesn't break the initial testing purpose of this testcase
> from my understanding. This then could avoid 1M call to
> bpf_patch_insn_single and pass the test after 32-bit opt patch set applied.
> 
> Without this change and with hi32 randomization enabled, scale tests will
> still hang before insn patch infra improved.
> 
> @@ -228,7 +228,7 @@ static void bpf_fill_scale1(struct bpf_test *self)           
> -             insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42);
> +             insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
> 
> This is change is not to paperover the underlying issue. We now know the
> existing insn patch infra doesn't scale to million level, so could work on
> improving it in the next step.

I'm hesitant to step back.
Do you see a program that can hit this patch_insn issue already?
(I mean without your hi32/lo32 zext changes).

> At the same time the issue exposed from hi32 poisoning does raise alarm
> that there could be the same issue for lo32 zext, therefore this patch set
> doesn't scale if there are lots of insns to be zero extended, even though
> this may not happen in real world bpf prog.
> 
> IMHO, avoid using insn patching when possible might always be better. So,
> if option 1 turns out to also generate clean patch set and introduce small
> changes, I am going to follow it in the update version.
> 
> Please let me know if you have different opinion.

if you can craft a test that shows the patch_insn issue before your set,
then it's ok to hack bpf_fill_scale1 to use alu64.
I would also prefer to go with option 2 (new zext insn) for the JITs.
I still don't have a good feeling about option 1.
Exposing all of aux_data to JITs may become a headache
in verifier development. It needs to be done carefully.



* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-26 14:50           ` Edward Cree
@ 2019-04-27  3:11             ` Alexei Starovoitov
  2019-04-29 10:43               ` Edward Cree
  0 siblings, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2019-04-27  3:11 UTC (permalink / raw)
  To: Edward Cree
  Cc: Jiong Wang, Alexei Starovoitov, daniel, netdev, bpf,
	Jakub Kicinski, oss-drivers

On Fri, Apr 26, 2019 at 03:50:33PM +0100, Edward Cree wrote:
> On 26/04/2019 14:06, Jiong Wang wrote:
> > Alexei Starovoitov writes:
> >> Note that bpf_patch_insn_single() is calling bpf_adj_branches() twice too.
> >> And dead_code + convert_ctx + fixup_bpf_calls are calling
> >> bpf_patch_insn_single() a lot.
> >> How about before dead_code pass we convert the program into basic-block
> >> format, patch it all, and then convert from bb back to offsets.
> >> Patching will become very cheap, since no loop over program will be
> >> necessary. A jump from bb-N to bb-M will stay as-is regardless
> >> of amount of patching was done inside each bb.
> >> The loops inside these patching passes will be converted from:
> >> for (i = 0; i < insn_cnt; i++, insn++)
> >> into:
> >> for each bb
> >>   for each insn in bb
> > Interesting. If I am understanding correctly, BB then needs to support
> > dynamic insn buffer resize. And after all insn patching finished, all BBs
> > are finalized, we then linearized BBs (in a best order) to generate the
> > final bpf image.
> Does any verifier metadata ever take the index of an insn that was added by
>  patching?  If not, then we could have, instead of an array of insns, an
>  array of list_heads, each of which is a list of insns (plus aux data etc.).
> At entry the original program is converted into an array of 1-element lists.
> On patching we edit the list, which doesn't change the index of any insn.
> Then after all insn patching finishes, we linearise as above.

Interesting idea. It can be optimized further:
instead of converting all insns into lists of 1 before all patching,
it can be done on demand:
convert from insn to list only when patching is needed.
A patched insn becomes a pointer to a block of new insns.
We have reserved opcodes to recognize such a situation.
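
To make it concrete, the per-slot representation could look roughly like this
(every name here is invented):

  struct bpf_patch_blk {
          struct bpf_insn *insns;
          u32 cnt;
  };

  /* patching insn i in place: */
  prog->aux->patch_blks[n] = blk;        /* side table of expansions */
  prog->insnsi[i].code = BPF_PATCHED;    /* reserved opcode          */
  prog->insnsi[i].imm  = n;              /* which expansion block    */
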
The question is how to linearise it once at the end?



* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-27  3:05           ` Alexei Starovoitov
@ 2019-04-28 22:11             ` Jiong Wang
  2019-05-01 14:59               ` Jiong Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Jiong Wang @ 2019-04-28 22:11 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, daniel, netdev, bpf, Jakub Kicinski, oss-drivers


> On 27 Apr 2019, at 04:05, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Fri, Apr 26, 2019 at 02:06:33PM +0100, Jiong Wang wrote:

<snip>

>> 
>>> Note that bpf_patch_insn_single() is calling bpf_adj_branches() twice too.
>>> And dead_code + convert_ctx + fixup_bpf_calls are calling
>>> bpf_patch_insn_single() a lot.
>>> How about before dead_code pass we convert the program into basic-block
>>> format, patch it all, and then convert from bb back to offsets.
>>> Patching will become very cheap, since no loop over program will be
>>> necessary. A jump from bb-N to bb-M will stay as-is regardless
>>> of amount of patching was done inside each bb.
>>> The loops inside these patching passes will be converted from:
>>> for (i = 0; i < insn_cnt; i++, insn++)
>>> into:
>>> for each bb
>>>  for each insn in bb
>> 
>> Interesting. If I am understanding correctly, BB then needs to support
>> dynamic insn buffer resize. And after all insn patching finished, all BBs
>> are finalized, we then linearized BBs (in a best order) to generate the
>> final bpf image.
> 
> dynamic BB resize could be done similar to existing prog resize.
> It grows in page increments.
> 
>>> As far as option 1 "also pass aux_insn information to JITs"...
>>> in theory it's fine, but looks like big refactoring to existing code.
>> 
>> Will do quick explore, might turn out to be small change.
>> 
>>> So if you want to make this bb conversion as step 2 and unblock the
>>> current patch set faster I suggest to go with option 2 "Introduce zero
>>> extension insn".
>> 
>> A second think, even zero extension insn introduced, it is inserted after
>> the sub-register write insn, so we are still doing insert *not*
>> replacement, insn_delta inside bpf_patch_insn_single will be 1, so the slow
>> path will always be taken (memmove + bpf_prog_realloc + 2 x
>> bpf_adj_branches).
> 
> ahh. right.
> 
>> For the 1M-scale test, bpf_patch_insn_single is triggered because of hi32
>> poisoning, not lo32 zext. So we just need to change MOV32 to MOV64 in the
>> testcase which doesn't break the initial testing purpose of this testcase
>> from my understanding. This then could avoid 1M call to
>> bpf_patch_insn_single and pass the test after 32-bit opt patch set applied.
>> 
>> Without this change and with hi32 randomization enabled, scale tests will
>> still hang before insn patch infra improved.
>> 
>> @@ -228,7 +228,7 @@ static void bpf_fill_scale1(struct bpf_test *self)           
>> -             insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42);
>> +             insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
>> 
>> This is change is not to paperover the underlying issue. We now know the
>> existing insn patch infra doesn't scale to million level, so could work on
>> improving it in the next step.
> 
> I'm hesitant to step back.
> Do you see a program that can hit this patch_insn issue already?
> (I mean without your hi32/lo32 zext changes).

Not really on a real-world program, which I feel won't do that much insn
patching.

But with test_verifier it can be reproduced by enabling jit blinding,
because the "scale" tests contain imm insns.

For example, on bpf-next master, just run:

  sysctl net/core/bpf_jit_enable=1
  sysctl net/core/bpf_jit_harden=2
  test_verifier 732 (732 is the test number for “scale: scale test1” on my env)

This enables constant blinding, which also needs insn patching. Test 732
contains nearly 1M BPF_MOV_IMM insns to be blinded, so it has a similar
effect to hi32 poisoning.
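
For reference, blinding turns each immediate-carrying insn into a
three-insn sequence via BPF_REG_AX, which is what drives one
bpf_patch_insn_single() call per blinded insn. Roughly (a sketch of the
idea only; the real logic is bpf_jit_blind_insn() in kernel/bpf/core.c):

  /* one BPF_ALU32_IMM(BPF_MOV, dst, imm) becomes, with a fresh per-insn
   * random value imm_rnd:
   */
  BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ imm), /* load blinded constant  */
  BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd),       /* un-blind it at runtime */
  BPF_ALU32_REG(BPF_MOV, dst, BPF_REG_AX),           /* move into the real dst */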
                                                                                 
Benchmarking shows that once the insn patch helper has been called more
than ~15000 times the user can feel the load delay, and once it is called
around 50000 times verification takes about half a minute to finish:

  patch calls : verification time
        15000 :   3s
        45000 :  29s
        95000 : 125s
       195000 : 712s

> 
>> At the same time the issue exposed from hi32 poisoning does raise alarm
>> that there could be the same issue for lo32 zext, therefore this patch set
>> doesn't scale if there are lots of insns to be zero extended, even though
>> this may not happen in real world bpf prog.
>> 
>> IMHO, avoid using insn patching when possible might always be better. So,
>> if option 1 turns out to also generate clean patch set and introduce small
>> changes, I am going to follow it in the update version.
>> 
>> Please let me know if you have different opinion.
> 
> if you can craft a test that shows patch_insn issue before your set,
> then it's ok to hack bpf_fill_scale1 to use alu64.

As described above, does the test_verifier 732 + jit blinding case look convincing?

> I would also prefer to go with option 2 (new zext insn) for JITs.

Got it.

> I still don't have good feeling about option 1.
> Exposing all of aux_data to JITs may become a headache
> in the verifier development. It needs to be done carefully.

OK, understood.

Just for clarification, the idea was to add a single field, something like
"bool *zext_dst", inside bpf_prog_aux, then copy only
"env->insn_aux_data[*].zext_dst" into bpf_prog->aux->zext_dst rather than
all the aux_data generated by the verifier. The field in bpf_prog could
later be extended into a structure containing whatever analysis info the
verifier wants to push down to the JIT back-ends, and the back-ends would
free it as soon as JIT compilation is done.
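
A minimal sketch of what I mean (not actual kernel code; the field and
helper names are illustrative only):

struct bpf_prog_aux {
	/* ... existing fields ... */
	bool *zext_dst;		/* one flag per insn, freed by the JIT */
};

/* in the verifier, after analysis */
static int push_zext_info(struct bpf_verifier_env *env)
{
	struct bpf_prog *prog = env->prog;
	int i;

	prog->aux->zext_dst = kcalloc(prog->len, sizeof(bool), GFP_KERNEL);
	if (!prog->aux->zext_dst)
		return -ENOMEM;
	for (i = 0; i < prog->len; i++)
		prog->aux->zext_dst[i] = env->insn_aux_data[i].zext_dst;
	return 0;
}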

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-27  3:11             ` Alexei Starovoitov
@ 2019-04-29 10:43               ` Edward Cree
  0 siblings, 0 replies; 18+ messages in thread
From: Edward Cree @ 2019-04-29 10:43 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jiong Wang, Alexei Starovoitov, daniel, netdev, bpf,
	Jakub Kicinski, oss-drivers

On 27/04/2019 04:11, Alexei Starovoitov wrote:
> instead of converting all insns into lists of 1 before all patching
> it can be done on demand:
> convert from insn to list only when patching is needed.
Makes sense.
> Patched insn becomes a pointer to a block of new insns.
> We have reserved opcodes to recognize such situation.
It's not clear to me where you can fit everything though.  The pointer
 is 64 bits, which is the same as struct bpf_insn.  Are you suggesting
 relying on kernel pointers always starting 0xff?
> The question is how to linearise it once at the end?
Walk the old prog once to calculate out_insn_idx for each in_insn
 (since we will only ever be jumping to the first insn of a list (or
 to a non-list insn), that's all we need), as well as out_len.
Allocate enough pages for out_len (let's not try to do any of this
 in-place, that would be painful), then walk the old prog to copy it
 insn-by-insn into the new one, recalculating any jump offsets by
 looking up the dest insn's out_insn_idx and subtracting our own
 out_insn_idx (plus an offset if we're not the first insn in the list
 of course).  While we're at it we can also fix up e.g.
 linfo[].insn_off: if in_insn_idx matches linfo[li_idx].insn_off,
 then set linfo[li_idx++].insn_off = out_insn_idx.  If we still need
 aux_data at this point we can copy that across too.
Runtime O(out_len), and gets rid of all the adjusts on
 patch_insn_single — branches, linfo, subprog_starts, aux_data.
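In (untested) pseudo-C, with all helper names made up for illustration
 (insn_list_of()/insn_list_len() return the patch list attached to an old
 insn, or the insn itself with length 1), and ignoring pseudo-call imm
 fixups and jumps internal to a patch list:

static struct bpf_prog *linearise(struct bpf_prog *old)
{
	struct bpf_prog *new = NULL;
	u32 *out_idx, i, j, out_len = 0, li = 0;

	out_idx = kcalloc(old->len, sizeof(*out_idx), GFP_KERNEL);
	if (!out_idx)
		return NULL;

	/* pass 1: out_insn_idx for every old insn, plus total out_len */
	for (i = 0; i < old->len; i++) {
		out_idx[i] = out_len;
		out_len += insn_list_len(old, i);
	}

	new = bpf_prog_alloc(bpf_prog_size(out_len), 0);
	if (!new)
		goto out;
	new->len = out_len;

	/* pass 2: copy insn by insn, rewriting jump offsets and linfo */
	for (i = 0; i < old->len; i++) {
		const struct bpf_insn *src = insn_list_of(old, i);
		u32 n = insn_list_len(old, i);

		for (j = 0; j < n; j++) {
			struct bpf_insn insn = src[j];

			if (insn_is_jump(&insn))
				/* old target was i + off + 1; jumps only
				 * ever land on the first insn of a list
				 */
				insn.off = out_idx[i + insn.off + 1] -
					   (out_idx[i] + j) - 1;
			new->insnsi[out_idx[i] + j] = insn;
		}
		if (li < old->aux->nr_linfo &&
		    old->aux->linfo[li].insn_off == i)
			old->aux->linfo[li++].insn_off = out_idx[i];
	}
out:
	kfree(out_idx);
	return new;
}
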
Have I missed anything?  If I have time I'll put together an RFC
 patch in the next few days.

-Ed

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-04-28 22:11             ` Jiong Wang
@ 2019-05-01 14:59               ` Jiong Wang
  2019-05-03 17:12                 ` 32-bit zext JIT efficiency " Jiong Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Jiong Wang @ 2019-05-01 14:59 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Daniel Borkmann, Networking, bpf,
	Jakub Kicinski, oss-drivers

> > if you can craft a test that shows patch_insn issue before your set,
> > then it's ok to hack bpf_fill_scale1 to use alu64.
>
> As described above, does the test_verifier 732 + jit blinding case look convincing?
>
> > I would also prefer to go with option 2 (new zext insn) for JITs.
>
> Got it.

I followed option 2 and have sent out v5 with the latest changes/fixes:

The major changes are:
  - introduced BPF_ZEXT; even though it doesn't resolve the insn patching
    inefficiency, it lets JIT back-ends do optimal code-gen, and the change
    is small, so it seems better to just support it in this set.
  - while looking at the insn patch code, I feel patched insns need to be
    conservatively marked if any insn inside the patch buffer defines a
    sub-register.
  - also fixed a helper function return value handling bug. I think helper
    functions should have an accurate return value type description,
    otherwise there could be bugs. For example, the arm32 back-end just
    executes the native helper functions and doesn't do anything special
    with the return value, so a function returning u32 only sets native
    reg r0, not r1 in the pair. Then if the surrounding eBPF insn casts it
    into u64, a zext is needed (see the small illustration after this list).
  - adjusted test_verifier to make sure it passes on hosts with and without
    hw zext.
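
To illustrate the arm32 helper return value point above at the BPF level
(illustrative insn sequence only, not taken from the patches):

	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
		     BPF_FUNC_get_prandom_u32),	/* helper returns u32 */
	BPF_MOV32_REG(BPF_REG_0, BPF_REG_0),	/* zext: on arm32 the native call
						 * only set the low half of the
						 * R0 register pair */
	BPF_JMP_IMM(BPF_JGT, BPF_REG_0, 10, 2),	/* now safe to treat R0 as u64 */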

For more info, please see the cover letter and patch description at v5.

Thanks.
Regards,
Jiong

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: 32-bit zext JIT efficiency (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests)
  2019-05-01 14:59               ` Jiong Wang
@ 2019-05-03 17:12                 ` Jiong Wang
  0 siblings, 0 replies; 18+ messages in thread
From: Jiong Wang @ 2019-05-03 17:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, Networking, bpf, Jakub Kicinski, oss-drivers


Jiong Wang writes:

>> > if you can craft a test that shows patch_insn issue before your set,
>> > then it's ok to hack bpf_fill_scale1 to use alu64.
>>
>> As described above, does the test_verifier 732 + jit blinding case look convincing?
>>
>> > I would also prefer to go with option 2 (new zext insn) for JITs.
>>
>> Got it.
>
> I followed option 2 and have sent out v5 with the latest changes/fixes:

I have taken a second look at the various back-ends and noticed one new
issue: arches are not consistent about implicit zext. For example, on s390
an alu32 move can be JITed as a single instruction "llgfr" which does an
implicit zext, but an alu32 move on PowerPC needs an explicit zext. On
riscv, all BPF_ALU | BPF_K insns need zext, but some BPF_ALU | BPF_X do
not. So while these arches are generally better off with verifier zext
insertion enabled, they do get unnecessary zexts inserted by the verifier
case by case.

Also, 64-bit arches like PowerPC, s390 etc. normally have zero-extending
loads, so a narrowed load doesn't need an extra zext insn, but on 32-bit
arches like arm a narrowed load always needs an explicit zext.

All these differences arise because BPF_ALU32 or BPF_LDX + B | H | W insns
are eventually mapped to diverse back-ends which do not have consistent
ISA semantics.

Given all this, it looks like passing the analysis info down to the
back-ends and letting them make the decision becomes the preferred option
again?
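
Something along these lines inside each JIT, with all names being
illustrative rather than existing kernel or JIT APIs:

/* alu32 mov, with the verifier's per-insn analysis pushed down */
static void emit_alu32_mov(struct jit_ctx *ctx, const struct bpf_insn *insn,
			   int idx, const bool *needs_zext)
{
	emit_native_mov32(ctx, insn->dst_reg, insn->src_reg);

	/* s390's llgfr already zero-extends, PowerPC's 32-bit move does not;
	 * only arches without implicit zext emit the extra insn, and only
	 * when the analysis says the upper half is actually observed
	 */
	if (needs_zext[idx] && !ARCH_HAS_IMPLICIT_ALU32_ZEXT)
		emit_native_zext32(ctx, insn->dst_reg);
}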

Regards,
Jiong

> The major changes are:
>   - introduced BPF_ZEXT; even though it doesn't resolve the insn patching
>     inefficiency, it lets JIT back-ends do optimal code-gen, and the change
>     is small, so it seems better to just support it in this set.
>   - while looking at the insn patch code, I feel patched insns need to be
>     conservatively marked if any insn inside the patch buffer defines a
>     sub-register.
>   - also fixed a helper function return value handling bug. I think helper
>     functions should have an accurate return value type description,
>     otherwise there could be bugs. For example, the arm32 back-end just
>     executes the native helper functions and doesn't do anything special
>     with the return value, so a function returning u32 only sets native
>     reg r0, not r1 in the pair. Then if the surrounding eBPF insn casts it
>     into u64, a zext is needed.
>   - adjusted test_verifier to make sure it passes on hosts with and without
>     hw zext.
>
> For more info, please see the cover letter and patch description at v5.
>
> Thanks.
> Regards,
> Jiong


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2019-05-03 17:12 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-12 21:41 [PATCH bpf-next] selftests/bpf: two scale tests Alexei Starovoitov
2019-04-12 23:24 ` Song Liu
2019-04-12 23:32   ` Alexei Starovoitov
2019-04-15  5:59     ` Song Liu
2019-04-16  8:30 ` Daniel Borkmann
2019-04-24 23:07 ` 32-bit zext time complexity (Was Re: [PATCH bpf-next] selftests/bpf: two scale tests) Jiong Wang
2019-04-25  4:33   ` Alexei Starovoitov
2019-04-25  7:25     ` Jiong Wang
2019-04-25  9:49       ` Jiong Wang
2019-04-25 22:10       ` Alexei Starovoitov
2019-04-26 13:06         ` Jiong Wang
2019-04-26 14:50           ` Edward Cree
2019-04-27  3:11             ` Alexei Starovoitov
2019-04-29 10:43               ` Edward Cree
2019-04-27  3:05           ` Alexei Starovoitov
2019-04-28 22:11             ` Jiong Wang
2019-05-01 14:59               ` Jiong Wang
2019-05-03 17:12                 ` 32-bit zext JIT efficiency " Jiong Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).