[4/4] arm64: bpf: Elide some moves to a0 after calls
diff mbox series

Message ID 20200128021145.36774-5-palmerdabbelt@google.com
State New
Headers show
Series
  • [1/4] selftests/bpf: Elide a check for LLVM versions that can't compile it
Related show

Commit Message

Palmer Dabbelt Jan. 28, 2020, 2:11 a.m. UTC
On arm64, the BPF function ABI doesn't match the C function ABI.  Specifically,
arm64 encodes calls as `a0 = f(a0, a1, ...)` while BPF encodes calls as
`BPF_REG_0 = f(BPF_REG_1, BPF_REG_2, ...)`.  This discrepancy results in
function calls being encoded as a two operations sequence that first does a C
ABI calls and then moves the return register into the right place.  This
results in one extra instruction for every function call.

This patch adds an optimization to the arm64 BPF JIT backend that aims to avoid
some of these moves.

I've done no benchmarking to determine if this is correct.  I ran the BPF
selftests before and after the change on arm64 in QEMU and found that I had a
single failure both before and after.  I'm not at all confident this code
actually works as it's my first time doing anything with both ARM64 and BPF and
I didn't even open the documentation for either of these.  I was particularly
surprised that the code didn't fail any tests -- I was kind of assuming this
would fail the tests, get put on the backburner, sit long enough for me to stop
caring, and then get deleted.

Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/arm64/net/bpf_jit_comp.c | 71 +++++++++++++++++++++++++++++++++--
 1 file changed, 68 insertions(+), 3 deletions(-)

Comments

Björn Töpel Feb. 4, 2020, 7:13 p.m. UTC | #1
On Tue, 28 Jan 2020 at 03:15, Palmer Dabbelt <palmerdabbelt@google.com> wrote:
>
> On arm64, the BPF function ABI doesn't match the C function ABI.  Specifically,
> arm64 encodes calls as `a0 = f(a0, a1, ...)` while BPF encodes calls as
> `BPF_REG_0 = f(BPF_REG_1, BPF_REG_2, ...)`.  This discrepancy results in
> function calls being encoded as a two operations sequence that first does a C
> ABI calls and then moves the return register into the right place.  This
> results in one extra instruction for every function call.
>

It's a lot of extra work for one reg-to-reg move, but it always
annoyed me in the RISC-V JIT. :-) So, if it *can* be avoided, why not.

[...]
>
> +static int dead_register(const struct jit_ctx *ctx, int offset, int bpf_reg)

Given that a lot of archs (RISC-V, arm?, MIPS?) might benefit from
this, it would be nice if it could be made generic (it already is
pretty much), and moved to kernel/bpf.

> +{
> +       const struct bpf_prog *prog = ctx->prog;
> +       int i;
> +
> +       for (i = offset; i < prog->len; ++i) {
> +               const struct bpf_insn *insn = &prog->insnsi[i];
> +               const u8 code = insn->code;
> +               const u8 bpf_dst = insn->dst_reg;
> +               const u8 bpf_src = insn->src_reg;
> +               const int writes_dst = !((code & BPF_ST) || (code & BPF_STX)
> +                                        || (code & BPF_JMP32) || (code & BPF_JMP));
> +               const int reads_dst  = !((code & BPF_LD));
> +               const int reads_src  = true;
> +
> +               /* Calls are a bit special in that they clobber a bunch of regisers. */
> +               if ((code & (BPF_JMP | BPF_CALL)) || (code & (BPF_JMP | BPF_TAIL_CALL)))
> +                       if ((bpf_reg >= BPF_REG_0) && (bpf_reg <= BPF_REG_5))
> +                               return false;
> +
> +               /* Registers that are read before they're written are alive.
> +                * Most opcodes are of the form DST = DEST op SRC, but there
> +                * are some exceptions.*/
> +               if (bpf_src == bpf_reg && reads_src)
> +                       return false;
> +
> +               if (bpf_dst == bpf_reg && reads_dst)
> +                       return false;
> +
> +               if (bpf_dst == bpf_reg && writes_dst)
> +                       return true;
> +
> +               /* Most BPF instructions are 8 bits long, but some ar 16 bits
> +                * long. */

A bunch of spelling errors above.


Cheers,
Björn
Alexei Starovoitov Feb. 11, 2020, 12:15 a.m. UTC | #2
On Mon, Jan 27, 2020 at 06:11:45PM -0800, Palmer Dabbelt wrote:
>  
> +	/* Handle BPF_REG_0, which may be in the wrong place because the ARM64
> +	 * ABI doesn't match the BPF ABI for function calls. */
> +	if (ctx->reg0_in_reg1) {
> +		/* If we're writing BPF_REG_0 then we don't need to do any
> +		 * extra work to get the registers back in their correct
> +		 * locations. */
> +		if (insn->dst_reg == BPF_REG_0)
> +			ctx->reg0_in_reg1 = false;
> +
> +		/* If we're writing to BPF_REG_1 then we need to save BPF_REG_0
> +		 * into the correct location if it's still alive, as otherwise
> +		 * it will be clobbered. */
> +		if (insn->dst_reg == BPF_REG_1) {
> +			if (!dead_register(ctx, off + 1, BPF_REG_0))
> +				emit(A64_MOV(1, A64_R(7), A64_R(0)), ctx);
> +			ctx->reg0_in_reg1 = false;
> +		}
> +	}

I'm not sure this is correct, since it processes insns as a linear code, but
there could be jumps in the middle. The logic should be following the control
flow of the program. The verifier is a better place to do such analysis.
I don't see how JITs can do it on their own.

Patch
diff mbox series

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index fba5b1b00cd7..48d900cc7258 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -58,10 +58,14 @@  struct jit_ctx {
 	int *offset;
 	__le32 *image;
 	u32 stack_size;
+	int reg0_in_reg1;
 };
 
 static inline int bpf2a64(struct jit_ctx *ctx, int bpf_reg)
 {
+	if (ctx->reg0_in_reg1 && bpf_reg == BPF_REG_0)
+		bpf_reg = BPF_REG_1;
+
 	return bpf2a64_default[bpf_reg];
 }
 
@@ -338,6 +342,47 @@  static void build_epilogue(struct jit_ctx *ctx)
 	emit(A64_RET(A64_LR), ctx);
 }
 
+static int dead_register(const struct jit_ctx *ctx, int offset, int bpf_reg)
+{
+	const struct bpf_prog *prog = ctx->prog;
+	int i;
+
+	for (i = offset; i < prog->len; ++i) {
+		const struct bpf_insn *insn = &prog->insnsi[i];
+		const u8 code = insn->code;
+		const u8 bpf_dst = insn->dst_reg;
+		const u8 bpf_src = insn->src_reg;
+		const int writes_dst = !((code & BPF_ST) || (code & BPF_STX)
+					 || (code & BPF_JMP32) || (code & BPF_JMP));
+		const int reads_dst  = !((code & BPF_LD));
+		const int reads_src  = true;
+
+		/* Calls are a bit special in that they clobber a bunch of regisers. */
+		if ((code & (BPF_JMP | BPF_CALL)) || (code & (BPF_JMP | BPF_TAIL_CALL)))
+			if ((bpf_reg >= BPF_REG_0) && (bpf_reg <= BPF_REG_5))
+				return false;
+
+		/* Registers that are read before they're written are alive.
+		 * Most opcodes are of the form DST = DEST op SRC, but there
+		 * are some exceptions.*/
+		if (bpf_src == bpf_reg && reads_src)
+			return false;
+
+		if (bpf_dst == bpf_reg && reads_dst)
+			return false;
+		
+		if (bpf_dst == bpf_reg && writes_dst)
+			return true;
+
+		/* Most BPF instructions are 8 bits long, but some ar 16 bits
+		 * long. */
+		if (code & (BPF_LD | BPF_IMM | BPF_DW))
+			++i;
+	}
+
+	return true;
+}
+
 /* JITs an eBPF instruction.
  * Returns:
  * 0  - successfully JITed an 8-byte eBPF instruction.
@@ -348,7 +393,7 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 		      bool extra_pass)
 {
 	const u8 code = insn->code;
-	const u8 dstw = bpf2a64(ctx, insn->dst_reg);
+	u8 dstw;
 	const u8 dstr = bpf2a64(ctx, insn->dst_reg);
 	const u8 src = bpf2a64(ctx, insn->src_reg);
 	const u8 tmp = bpf2a64(ctx, TMP_REG_1);
@@ -374,6 +419,27 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 #define check_imm19(imm) check_imm(19, imm)
 #define check_imm26(imm) check_imm(26, imm)
 
+	/* Handle BPF_REG_0, which may be in the wrong place because the ARM64
+	 * ABI doesn't match the BPF ABI for function calls. */
+	if (ctx->reg0_in_reg1) {
+		/* If we're writing BPF_REG_0 then we don't need to do any
+		 * extra work to get the registers back in their correct
+		 * locations. */
+		if (insn->dst_reg == BPF_REG_0)
+			ctx->reg0_in_reg1 = false;
+
+		/* If we're writing to BPF_REG_1 then we need to save BPF_REG_0
+		 * into the correct location if it's still alive, as otherwise
+		 * it will be clobbered. */
+		if (insn->dst_reg == BPF_REG_1) {
+			if (!dead_register(ctx, off + 1, BPF_REG_0))
+				emit(A64_MOV(1, A64_R(7), A64_R(0)), ctx);
+			ctx->reg0_in_reg1 = false;
+		}
+	}
+
+	dstw = bpf2a64(ctx, insn->dst_reg);
+
 	switch (code) {
 	/* dst = src */
 	case BPF_ALU | BPF_MOV | BPF_X:
@@ -640,7 +706,6 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	/* function call */
 	case BPF_JMP | BPF_CALL:
 	{
-		const u8 r0 = bpf2a64(ctx, BPF_REG_0);
 		bool func_addr_fixed;
 		u64 func_addr;
 		int ret;
@@ -651,7 +716,7 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 			return ret;
 		emit_addr_mov_i64(tmp, func_addr, ctx);
 		emit(A64_BLR(tmp), ctx);
-		emit(A64_MOV(1, r0, A64_R(0)), ctx);
+		ctx->reg0_in_reg1 = true;
 		break;
 	}
 	/* tail call */