From: "Björn Töpel" <bjorn.topel@gmail.com>
To: linux-riscv@lists.infradead.org
Cc: "Björn Töpel" <bjorn.topel@gmail.com>,
	daniel@iogearbox.net, palmer@sifive.com, davidlee@sifive.com,
	netdev@vger.kernel.org
Subject: [RFC PATCH 3/3] bpf, riscv: added eBPF JIT for RV64G
Date: Tue, 15 Jan 2019 09:35:18 +0100	[thread overview]
Message-ID: <20190115083518.10149-4-bjorn.topel@gmail.com> (raw)
In-Reply-To: <20190115083518.10149-1-bjorn.topel@gmail.com>

This commit adds an eBPF JIT for RV64G.

Codewise, it needs some refactoring. Currently there's a bit too much
copy-and-paste going on, and I know some places where I could optimize
the code generation a bit (mostly BPF_K type of instructions, dealing
with immediates).

From a features perspective, two things are missing:

* tail calls
* "far-branches", i.e. conditional branches that reach beyond 13b.

All tests in test_bpf.ko pass.

Signed-off-by: Björn Töpel <bjorn.topel@gmail.com>
---
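Note (not part of the patch): for readers unfamiliar with the
"far-branch" limitation mentioned above, RV64 conditional branches only
reach +/- 4 KiB (a 13-bit offset). One common way to lift that limit is
to invert the condition and branch over an unconditional jal, which
reaches +/- 1 MiB. The helper below is purely an illustrative sketch
built on the emitters this patch introduces; emit_far_beq() is
hypothetical and is not what this RFC implements.

	/* Illustrative sketch only: emit a BPF_JEQ whose RV offset does
	 * not fit in the 13-bit branch range by inverting the condition
	 * and branching over a jal. rvoff is assumed to already be
	 * relative to the jal instruction.
	 */
	static void emit_far_beq(u8 rd, u8 rs, int rvoff,
				 struct rv_jit_context *ctx)
	{
		emit(rv_bne(rd, rs, 8 >> 1), ctx); /* skip jal when rd != rs */
		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
	}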
 arch/riscv/net/bpf_jit_comp.c | 1608 +++++++++++++++++++++++++++++++++
 1 file changed, 1608 insertions(+)

diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c
index 7e359d3249ee..562d56eb8d23 100644
--- a/arch/riscv/net/bpf_jit_comp.c
+++ b/arch/riscv/net/bpf_jit_comp.c
@@ -1,4 +1,1612 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF JIT compiler for RV64G
+ *
+ * Copyright(c) 2019 Björn Töpel <bjorn.topel@gmail.com>
+ *
+ */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <asm/cacheflush.h>
+
+#define TMP_REG_0 (MAX_BPF_JIT_REG + 0)
+#define TMP_REG_1 (MAX_BPF_JIT_REG + 1)
+#define TAIL_CALL_REG (MAX_BPF_JIT_REG + 2)
+
+enum rv_register {
+	RV_REG_ZERO =	0,	/* The constant value 0 */
+	RV_REG_RA =	1,	/* Return address */
+	RV_REG_SP =	2,	/* Stack pointer */
+	RV_REG_GP =	3,	/* Global pointer */
+	RV_REG_TP =	4,	/* Thread pointer */
+	RV_REG_T0 =	5,	/* Temporaries */
+	RV_REG_T1 =	6,
+	RV_REG_T2 =	7,
+	RV_REG_FP =	8,
+	RV_REG_S1 =	9,	/* Saved registers */
+	RV_REG_A0 =	10,	/* Function argument/return values */
+	RV_REG_A1 =	11,	/* Function arguments */
+	RV_REG_A2 =	12,
+	RV_REG_A3 =	13,
+	RV_REG_A4 =	14,
+	RV_REG_A5 =	15,
+	RV_REG_A6 =	16,
+	RV_REG_A7 =	17,
+	RV_REG_S2 =	18,	/* Saved registers */
+	RV_REG_S3 =	19,
+	RV_REG_S4 =	20,
+	RV_REG_S5 =	21,
+	RV_REG_S6 =	22,
+	RV_REG_S7 =	23,
+	RV_REG_S8 =	24,
+	RV_REG_S9 =	25,
+	RV_REG_S10 =	26,
+	RV_REG_S11 =	27,
+	RV_REG_T3 =	28,	/* Temporaries */
+	RV_REG_T4 =	29,
+	RV_REG_T5 =	30,
+	RV_REG_T6 =	31,
+};
+
+struct rv_jit_context {
+	struct bpf_prog *prog;
+	u32 *insns; /* RV insns */
+	int ninsns;
+	int epilogue_offset;
+	int *offset; /* BPF to RV */
+	unsigned long seen_reg_bits;
+	int stack_size;
+};
+
+struct rv_jit_data {
+	struct bpf_binary_header *header;
+	u8 *image;
+	struct rv_jit_context ctx;
+};
+
+static u8 bpf_to_rv_reg(int bpf_reg, struct rv_jit_context *ctx)
+{
+	switch (bpf_reg) {
+	/* Return value */
+	case BPF_REG_0:
+		__set_bit(RV_REG_A5, &ctx->seen_reg_bits);
+		return RV_REG_A5;
+	/* Function arguments */
+	case BPF_REG_1:
+		__set_bit(RV_REG_A0, &ctx->seen_reg_bits);
+		return RV_REG_A0;
+	case BPF_REG_2:
+		__set_bit(RV_REG_A1, &ctx->seen_reg_bits);
+		return RV_REG_A1;
+	case BPF_REG_3:
+		__set_bit(RV_REG_A2, &ctx->seen_reg_bits);
+		return RV_REG_A2;
+	case BPF_REG_4:
+		__set_bit(RV_REG_A3, &ctx->seen_reg_bits);
+		return RV_REG_A3;
+	case BPF_REG_5:
+		__set_bit(RV_REG_A4, &ctx->seen_reg_bits);
+		return RV_REG_A4;
+	/* Callee saved registers */
+	case BPF_REG_6:
+		__set_bit(RV_REG_S1, &ctx->seen_reg_bits);
+		return RV_REG_S1;
+	case BPF_REG_7:
+		__set_bit(RV_REG_S2, &ctx->seen_reg_bits);
+		return RV_REG_S2;
+	case BPF_REG_8:
+		__set_bit(RV_REG_S3, &ctx->seen_reg_bits);
+		return RV_REG_S3;
+	case BPF_REG_9:
+		__set_bit(RV_REG_S4, &ctx->seen_reg_bits);
+		return RV_REG_S4;
+	/* Stack read-only frame pointer to access stack */
+	case BPF_REG_FP:
+		__set_bit(RV_REG_S5, &ctx->seen_reg_bits);
+		return RV_REG_S5;
+	/* Temporary register */
+	case BPF_REG_AX:
+		__set_bit(RV_REG_T0, &ctx->seen_reg_bits);
+		return RV_REG_T0;
+	/* Tail call counter */
+	case TAIL_CALL_REG:
+		__set_bit(RV_REG_S6, &ctx->seen_reg_bits);
+		return RV_REG_S6;
+	default:
+		return 0;
+	}
+}
+
+static void seen_call(struct rv_jit_context *ctx)
+{
+	__set_bit(RV_REG_RA, &ctx->seen_reg_bits);
+}
+
+static bool seen_reg(int rv_reg, struct rv_jit_context *ctx)
+{
+	return test_bit(rv_reg, &ctx->seen_reg_bits);
+}
+
+static void emit(const u32 insn, struct rv_jit_context *ctx)
+{
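+	/* ctx->insns is NULL during the first pass, so this only counts
+	 * instructions; the second pass writes into the real image.
+	 */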
+	if (ctx->insns)
+		ctx->insns[ctx->ninsns] = insn;
+
+	ctx->ninsns++;
+}
+
+static u32 rv_r_insn(u8 funct7, u8 rs2, u8 rs1, u8 funct3, u8 rd, u8 opcode)
+{
+	return (funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(rd << 7) | opcode;
+}
+
+static u32 rv_i_insn(u16 imm11_0, u8 rs1, u8 funct3, u8 rd, u8 opcode)
+{
+	return (imm11_0 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) |
+		opcode;
+}
+
+static u32 rv_s_insn(u16 imm11_0, u8 rs2, u8 rs1, u8 funct3, u8 opcode)
+{
+	u8 imm11_5 = imm11_0 >> 5, imm4_0 = imm11_0 & 0x1f;
+
+	return (imm11_5 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(imm4_0 << 7) | opcode;
+}
+
+static u32 rv_sb_insn(u16 imm12_1, u8 rs2, u8 rs1, u8 funct3, u8 opcode)
+{
+	u8 imm12 = ((imm12_1 & 0x800) >> 5) | ((imm12_1 & 0x3f0) >> 4);
+	u8 imm4_1 = ((imm12_1 & 0xf) << 1) | ((imm12_1 & 0x400) >> 10);
+
+	return (imm12 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(imm4_1 << 7) | opcode;
+}
+
+static u32 rv_u_insn(u32 imm31_12, u8 rd, u8 opcode)
+{
+	return (imm31_12 << 12) | (rd << 7) | opcode;
+}
+
+static u32 rv_uj_insn(u32 imm20_1, u8 rd, u8 opcode)
+{
+	u32 imm;
+
+	imm = (imm20_1 & 0x80000) |  ((imm20_1 & 0x3ff) << 9) |
+	      ((imm20_1 & 0x400) >> 2) | ((imm20_1 & 0x7f800) >> 11);
+
+	return (imm << 12) | (rd << 7) | opcode;
+}
+
+static u32 rv_amo_insn(u8 funct5, u8 aq, u8 rl, u8 rs2, u8 rs1,
+		       u8 funct3, u8 rd, u8 opcode)
+{
+	u8 funct7 = (funct5 << 2) | (aq << 1) | rl;
+
+	return rv_r_insn(funct7, rs2, rs1, funct3, rd, opcode);
+}
+
+static u32 rv_addiw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x1b);
+}
+
+static u32 rv_addi(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x13);
+}
+
+static u32 rv_addw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_add(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_subw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_sub(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_and(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 7, rd, 0x33);
+}
+
+static u32 rv_or(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 6, rd, 0x33);
+}
+
+static u32 rv_xor(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 4, rd, 0x33);
+}
+
+static u32 rv_mulw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_mul(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_divuw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_divu(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_remuw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 7, rd, 0x3b);
+}
+
+static u32 rv_remu(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 7, rd, 0x33);
+}
+
+static u32 rv_sllw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 1, rd, 0x3b);
+}
+
+static u32 rv_sll(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 1, rd, 0x33);
+}
+
+static u32 rv_srlw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_srl(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_sraw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_sra(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_lui(u8 rd, u32 imm31_12)
+{
+	return rv_u_insn(imm31_12, rd, 0x37);
+}
+
+static u32 rv_slli(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x13);
+}
+
+static u32 rv_andi(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 7, rd, 0x13);
+}
+
+static u32 rv_ori(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 6, rd, 0x13);
+}
+
+static u32 rv_xori(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 4, rd, 0x13);
+}
+
+static u32 rv_slliw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x1b);
+}
+
+static u32 rv_srliw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x1b);
+}
+
+static u32 rv_srli(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x13);
+}
+
+static u32 rv_sraiw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(0x400 | imm11_0, rs1, 5, rd, 0x1b);
+}
+
+static u32 rv_srai(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(0x400 | imm11_0, rs1, 5, rd, 0x13);
+}
+
+#if 0
+static u32 rv_auipc(u8 rd, u32 imm31_12)
+{
+	return rv_u_insn(imm31_12, rd, 0x17);
+}
+#endif
+
+static u32 rv_jal(u8 rd, u32 imm20_1)
+{
+	return rv_uj_insn(imm20_1, rd, 0x6f);
+}
+
+static u32 rv_jalr(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x67);
+}
+
+static u32 rv_beq(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 0, 0x63);
+}
+
+static u32 rv_bltu(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 6, 0x63);
+}
+
+static u32 rv_bgeu(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 7, 0x63);
+}
+
+static u32 rv_bne(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 1, 0x63);
+}
+
+static u32 rv_blt(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 4, 0x63);
+}
+
+static u32 rv_bge(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 5, 0x63);
+}
+
+static u32 rv_sb(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 0, 0x23);
+}
+
+static u32 rv_sh(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 1, 0x23);
+}
+
+static u32 rv_sw(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 2, 0x23);
+}
+
+static u32 rv_sd(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 3, 0x23);
+}
+
+#if 0
+static u32 rv_lb(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x03);
+}
+#endif
+
+static u32 rv_lbu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 4, rd, 0x03);
+}
+
+#if 0
+static u32 rv_lh(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x03);
+}
+#endif
+
+static u32 rv_lhu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x03);
+}
+
+#if 0
+static u32 rv_lw(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 2, rd, 0x03);
+}
+#endif
+
+static u32 rv_lwu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 6, rd, 0x03);
+}
+
+static u32 rv_ld(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 3, rd, 0x03);
+}
+
+static u32 rv_amoadd_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
+{
+	return rv_amo_insn(0, aq, rl, rs2, rs1, 2, rd, 0x2f);
+}
+
+static u32 rv_amoadd_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
+{
+	return rv_amo_insn(0, aq, rl, rs2, rs1, 3, rd, 0x2f);
+}
+
+static bool is_12b_int(s64 val)
+{
+	return -(1 << 11) <= val && val < (1 << 11);
+}
+
+static bool is_32b_int(s64 val)
+{
+	return -(1L << 31) <= val && val < (1L << 31);
+}
+
+/* jumps */
+static bool is_21b_int(s64 val)
+{
+	return -(1L << 20) <= val && val < (1L << 20);
+}
+
+/* conditional branches */
+static bool is_13b_int(s64 val)
+{
+	return -(1 << 12) <= val && val < (1 << 12);
+}
+
+static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
+{
+	/* Note that the immediate from the add is sign-extended,
+	 * which means that we need to compensate this by adding 2^12,
+	 * when the 12th bit is set. A simpler way of doing this, and
+	 * getting rid of the check, is to just add 2**11 before the
+	 * shift. The "Loading a 32-Bit constant" example from the
+	 * "Computer Organization and Design, RISC-V edition" book by
+	 * Patterson/Hennessy highlights this fact.
+	 *
+	 * This also means that we need to process LSB to MSB.
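+	 *
+	 * Worked example (added for illustration): for val = 0x1800,
+	 * lower = 0x800 is sign-extended to -2048 by the addiw, and
+	 * upper = (0x1800 + 0x800) >> 12 = 2, so "lui rd, 2" (0x2000)
+	 * followed by "addiw rd, rd, -2048" yields 0x1800.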
+	 */
+	s64 upper = (val + (1 << 11)) >> 12, lower = val & 0xfff;
+	int shift;
+
+	if (is_32b_int(val)) {
+		if (upper)
+			emit(rv_lui(rd, upper), ctx);
+
+		if (!upper) {
+			emit(rv_addi(rd, RV_REG_ZERO, lower), ctx);
+			return;
+		}
+
+		emit(rv_addiw(rd, rd, lower), ctx);
+		return;
+	}
+
+	shift = __ffs(upper);
+	upper >>= shift;
+	shift += 12;
+
+	emit_imm(rd, upper, ctx);
+
+	emit(rv_slli(rd, rd, shift), ctx);
+	if (lower)
+		emit(rv_addi(rd, rd, lower), ctx);
+}
+
+static int rv_offset(int bpf_to, int bpf_from, struct rv_jit_context *ctx)
+{
+	int from = ctx->offset[bpf_from] - 1, to = ctx->offset[bpf_to];
+
+	return (to - from) << 2;
+}
+
+static int epilogue_offset(struct rv_jit_context *ctx)
+{
+	int to = ctx->epilogue_offset, from = ctx->ninsns;
+
+	return (to - from) << 2;
+}
+
+static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
+		     bool extra_pass)
+{
+	bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+	int rvoff, i = insn - ctx->prog->insnsi;
+	u8 rd, rs, code = insn->code;
+	s16 off = insn->off;
+	s32 imm = insn->imm;
+
+	switch (code) {
+	/* dst = src */
+	case BPF_ALU | BPF_MOV | BPF_X:
+	case BPF_ALU64 | BPF_MOV | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_addi(rd, rs, 0) : rv_addiw(rd, rs, 0), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+
+	/* dst = dst OP src */
+	case BPF_ALU | BPF_ADD | BPF_X:
+	case BPF_ALU64 | BPF_ADD | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_add(rd, rd, rs) : rv_addw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_SUB | BPF_X:
+	case BPF_ALU64 | BPF_SUB | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sub(rd, rd, rs) : rv_subw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_AND | BPF_X:
+	case BPF_ALU64 | BPF_AND | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_and(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_OR | BPF_X:
+	case BPF_ALU64 | BPF_OR | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_or(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_XOR | BPF_X:
+	case BPF_ALU64 | BPF_XOR | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_xor(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_MUL | BPF_X:
+	case BPF_ALU64 | BPF_MUL | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_mul(rd, rd, rs) : rv_mulw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_DIV | BPF_X:
+	case BPF_ALU64 | BPF_DIV | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_divu(rd, rd, rs) : rv_divuw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MOD | BPF_X:
+	case BPF_ALU64 | BPF_MOD | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_remu(rd, rd, rs) : rv_remuw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_LSH | BPF_X:
+	case BPF_ALU64 | BPF_LSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sll(rd, rd, rs) : rv_sllw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_RSH | BPF_X:
+	case BPF_ALU64 | BPF_RSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srl(rd, rd, rs) : rv_srlw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_ARSH | BPF_X:
+	case BPF_ALU64 | BPF_ARSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sra(rd, rd, rs) : rv_sraw(rd, rd, rs), ctx);
+		break;
+
+	/* dst = -dst */
+	case BPF_ALU | BPF_NEG:
+	case BPF_ALU64 | BPF_NEG:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ?
+		     rv_sub(rd, RV_REG_ZERO, rd) :
+		     rv_subw(rd, RV_REG_ZERO, rd),
+		     ctx);
+		break;
+
+	/* dst = BSWAP##imm(dst) */
+	case BPF_ALU | BPF_END | BPF_FROM_LE:
+	{
+		int shift = 64 - imm;
+
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_slli(rd, rd, shift), ctx);
+		emit(rv_srli(rd, rd, shift), ctx);
+		break;
+	}
+	case BPF_ALU | BPF_END | BPF_FROM_BE:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+
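+		/* Byte-swap by repeatedly taking the low byte of rd into
+		 * T1, adding it into T2 (which is shifted left by 8 each
+		 * round) and shifting rd right by 8; imm (16/32/64)
+		 * selects how many rounds run before out_be.
+		 */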
+		emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+		if (imm == 16)
+			goto out_be;
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+		if (imm == 32)
+			goto out_be;
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+	out_be:
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+
+		emit(rv_addi(rd, RV_REG_T2, 0), ctx);
+		break;
+
+	/* dst = imm */
+	case BPF_ALU | BPF_MOV | BPF_K:
+	case BPF_ALU64 | BPF_MOV | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(rd, imm, ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+
+	/* dst = dst OP imm */
+	case BPF_ALU | BPF_ADD | BPF_K:
+	case BPF_ALU64 | BPF_ADD | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(is64 ? rv_addi(rd, rd, imm) :
+			     rv_addiw(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_add(rd, rd, RV_REG_T1) :
+		     rv_addw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_SUB | BPF_K:
+	case BPF_ALU64 | BPF_SUB | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(-imm)) {
+			emit(is64 ? rv_addi(rd, rd, -imm) :
+			     rv_addiw(rd, rd, -imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_sub(rd, rd, RV_REG_T1) :
+		     rv_subw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_AND | BPF_K:
+	case BPF_ALU64 | BPF_AND | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_andi(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_and(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_OR | BPF_K:
+	case BPF_ALU64 | BPF_OR | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_ori(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_or(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_XOR | BPF_K:
+	case BPF_ALU64 | BPF_XOR | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_xori(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_xor(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MUL | BPF_K:
+	case BPF_ALU64 | BPF_MUL | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_mul(rd, rd, RV_REG_T1) :
+		     rv_mulw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_DIV | BPF_K:
+	case BPF_ALU64 | BPF_DIV | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_divu(rd, rd, RV_REG_T1) :
+		     rv_divuw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MOD | BPF_K:
+	case BPF_ALU64 | BPF_MOD | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_remu(rd, rd, RV_REG_T1) :
+		     rv_remuw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_LSH | BPF_K:
+	case BPF_ALU64 | BPF_LSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_slli(rd, rd, imm) :
+		     rv_slliw(rd, rd, imm),  ctx);
+		break;
+	case BPF_ALU | BPF_RSH | BPF_K:
+	case BPF_ALU64 | BPF_RSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srli(rd, rd, imm) :
+		     rv_srliw(rd, rd, imm),  ctx);
+		break;
+	case BPF_ALU | BPF_ARSH | BPF_K:
+	case BPF_ALU64 | BPF_ARSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srai(rd, rd, imm) :
+		     rv_sraiw(rd, rd, imm),  ctx);
+		break;
+
+	/* JUMP off */
+	case BPF_JMP | BPF_JA:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_21b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, rvoff);
+			return -1;
+		}
+
+		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* IF (dst COND src) JUMP off */
+	case BPF_JMP | BPF_JEQ | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_beq(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bltu(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bltu(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bgeu(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bgeu(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JNE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bne(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_blt(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_blt(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bge(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bge(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSET | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_and(RV_REG_T1, rd, rs), ctx);
+		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* IF (dst COND imm) JUMP off */
+	case BPF_JMP | BPF_JEQ | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_beq(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bltu(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bltu(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bgeu(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bgeu(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JNE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bne(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_blt(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_blt(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bge(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bge(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSET | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T2, imm, ctx);
+		emit(rv_and(RV_REG_T1, rd, RV_REG_T2), ctx);
+		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* function call */
+	case BPF_JMP | BPF_CALL:
+	{
+		bool fixed;
+		int i, ret;
+		u64 addr;
+
+		seen_call(ctx);
+		ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &addr,
+					    &fixed);
+		if (ret < 0)
+			return ret;
+		if (fixed) {
+			emit_imm(RV_REG_T1, addr, ctx);
+		} else {
+			i = ctx->ninsns;
+			emit_imm(RV_REG_T1, addr, ctx);
+			for (i = ctx->ninsns - i; i < 8; i++) {
+				/* nop */
+				emit(rv_addi(RV_REG_ZERO, RV_REG_ZERO, 0),
+				     ctx);
+			}
+		}
+		emit(rv_jalr(RV_REG_RA, RV_REG_T1, 0), ctx);
+		rd = bpf_to_rv_reg(BPF_REG_0, ctx);
+		emit(rv_addi(rd, RV_REG_A0, 0), ctx);
+		break;
+	}
+	/* tail call */
+	case BPF_JMP | BPF_TAIL_CALL:
+		rd = bpf_to_rv_reg(TAIL_CALL_REG, ctx);
+		pr_err("bpf-jit: tail call not supported yet!\n");
+		return -1;
+
+	/* function return */
+	case BPF_JMP | BPF_EXIT:
+		if (i == ctx->prog->len - 1)
+			break;
+
+		rvoff = epilogue_offset(ctx);
+		if (!is_21b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, rvoff);
+			return -1;
+		}
+
+		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* dst = imm64 */
+	case BPF_LD | BPF_IMM | BPF_DW:
+	{
+		struct bpf_insn insn1 = insn[1];
+		u64 imm64;
+
+		imm64 = (u64)insn1.imm << 32 | (u32)imm;
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(rd, imm64, ctx);
+		return 1;
+	}
+
+	/* LDX: dst = *(size *)(src + off) */
+	case BPF_LDX | BPF_MEM | BPF_B:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lbu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lbu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_H:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lhu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lhu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_W:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lwu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lwu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_ld(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_ld(rd, 0, RV_REG_T1), ctx);
+		break;
+
+	/* ST: *(size *)(dst + off) = imm */
+	case BPF_ST | BPF_MEM | BPF_B:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sb(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+
+	case BPF_ST | BPF_MEM | BPF_H:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sh(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_ST | BPF_MEM | BPF_W:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sw(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sw(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_ST | BPF_MEM | BPF_DW:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sd(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sd(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+
+	/* STX: *(size *)(dst + off) = src */
+	case BPF_STX | BPF_MEM | BPF_B:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sb(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sb(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_H:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sh(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sh(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_W:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sw(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sw(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sd(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sd(RV_REG_T1, 0, rs), ctx);
+		break;
+	/* STX XADD: lock *(u32 *)(dst + off) += src */
+	case BPF_STX | BPF_XADD | BPF_W:
+	/* STX XADD: lock *(u64 *)(dst + off) += src */
+	case BPF_STX | BPF_XADD | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (off) {
+			if (is_12b_int(off)) {
+				emit(rv_addi(RV_REG_T1, rd, off), ctx);
+			} else {
+				emit_imm(RV_REG_T1, off, ctx);
+				emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+			}
+
+			rd = RV_REG_T1;
+		}
+
+		emit(BPF_SIZE(code) == BPF_W ?
+		     rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0) :
+		     rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0), ctx);
+		break;
+	default:
+		pr_err("bpf-jit: unknown opcode %02x\n", code);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void build_prologue(struct rv_jit_context *ctx)
+{
+	int stack_adjust = 0, store_offset, bpf_stack_adjust;
+
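+	/* Make room for the callee-saved registers that are actually
+	 * used, plus the BPF program's own stack (stack_depth). Further
+	 * down, BPF_REG_FP (s5) is pointed at the top of the BPF stack
+	 * area, just below the saved registers.
+	 */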
+	if (seen_reg(RV_REG_RA, ctx))
+		stack_adjust += 8;
+	stack_adjust += 8; /* RV_REG_FP */
+	if (seen_reg(RV_REG_S1, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S2, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S3, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S4, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S5, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S6, ctx))
+		stack_adjust += 8;
+
+	stack_adjust = round_up(stack_adjust, 16);
+	bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16);
+	stack_adjust += bpf_stack_adjust;
+
+	store_offset = stack_adjust - 8;
+
+	emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx);
+
+	if (seen_reg(RV_REG_RA, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_RA), ctx);
+		store_offset -= 8;
+	}
+	emit(rv_sd(RV_REG_SP, store_offset, RV_REG_FP), ctx);
+	store_offset -= 8;
+	if (seen_reg(RV_REG_S1, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S1), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S2, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S2), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S3, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S3), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S4, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S4), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S5, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S5), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S6, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S6), ctx);
+		store_offset -= 8;
+	}
+
+	emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx);
+
+	if (bpf_stack_adjust) {
+		if (!seen_reg(RV_REG_S5, ctx))
+			pr_warn("bpf-jit: BPF_REG_FP not seen, but stack depth is %d\n",
+				bpf_stack_adjust);
+		emit(rv_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust), ctx);
+	}
+
+	ctx->stack_size = stack_adjust;
+}
+
+static void build_epilogue(struct rv_jit_context *ctx)
+{
+	int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8;
+
+	if (seen_reg(RV_REG_RA, ctx)) {
+		emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx);
+	store_offset -= 8;
+	if (seen_reg(RV_REG_S1, ctx)) {
+		emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S2, ctx)) {
+		emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S3, ctx)) {
+		emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S4, ctx)) {
+		emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S5, ctx)) {
+		emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S6, ctx)) {
+		emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+
+	emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx);
+	/* Set return value. */
+	emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx);
+	emit(rv_jalr(RV_REG_ZERO, RV_REG_RA, 0), ctx);
+}
+
+static int build_body(struct rv_jit_context *ctx, bool extra_pass)
+{
+	const struct bpf_prog *prog = ctx->prog;
+	int i;
+
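+	/* ctx->offset[i] is only recorded on the first pass
+	 * (ctx->insns == NULL): it holds the number of RV instructions
+	 * emitted up to and including BPF instruction i, and is later
+	 * used by rv_offset() to resolve branch targets.
+	 */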
+	for (i = 0; i < prog->len; i++) {
+		const struct bpf_insn *insn = &prog->insnsi[i];
+		int ret;
+
+		ret = emit_insn(insn, ctx, extra_pass);
+		if (ret > 0) {
+			i++;
+			if (ctx->insns == NULL)
+				ctx->offset[i] = ctx->ninsns;
+			continue;
+		}
+		if (ctx->insns == NULL)
+			ctx->offset[i] = ctx->ninsns;
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static void bpf_fill_ill_insns(void *area, unsigned int size)
+{
+	memset(area, 0, size);
+}
+
+static void bpf_flush_icache(void *start, void *end)
+{
+	flush_icache_range((unsigned long)start, (unsigned long)end);
+}
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	bool tmp_blinded = false, extra_pass = false;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	struct rv_jit_data *jit_data;
+	struct rv_jit_context *ctx;
+	unsigned int image_size;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	jit_data = prog->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			prog = orig_prog;
+			goto out;
+		}
+		prog->aux->jit_data = jit_data;
+	}
+
+	ctx = &jit_data->ctx;
+
+	if (ctx->offset) {
+		extra_pass = true;
+		image_size = sizeof(u32) * ctx->ninsns;
+		goto skip_init_ctx;
+	}
+
+	ctx->prog = prog;
+	ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
+	if (!ctx->offset) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+
+	/* First pass generates the ctx->offset, but does not emit an image. */
+	if (build_body(ctx, extra_pass)) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+	build_prologue(ctx);
+	ctx->epilogue_offset = ctx->ninsns;
+	build_epilogue(ctx);
+
+	/* Allocate image, now that we know the size. */
+	image_size = sizeof(u32) * ctx->ninsns;
+	jit_data->header = bpf_jit_binary_alloc(image_size, &jit_data->image,
+						sizeof(u32),
+						bpf_fill_ill_insns);
+	if (!jit_data->header) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+
+	/* Second, real pass, which actually emits the image. */
+	ctx->insns = (u32 *)jit_data->image;
+skip_init_ctx:
+	ctx->ninsns = 0;
+
+	build_prologue(ctx);
+	if (build_body(ctx, extra_pass)) {
+		bpf_jit_binary_free(jit_data->header);
+		prog = orig_prog;
+		goto out_offset;
+	}
+	build_epilogue(ctx);
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, image_size, 2, ctx->insns);
+
+	prog->bpf_func = (void *)ctx->insns;
+	prog->jited = 1;
+	prog->jited_len = image_size;
+
+	bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
+
+	if (!prog->is_func || extra_pass) {
+out_offset:
+		kfree(ctx->offset);
+		kfree(jit_data);
+		prog->aux->jit_data = NULL;
+	}
+out:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
 	return prog;
 }
-- 
2.19.1

WARNING: multiple messages have this Message-ID (diff)
From: "Björn Töpel" <bjorn.topel@gmail.com>
To: linux-riscv@lists.infradead.org
Cc: "Björn Töpel" <bjorn.topel@gmail.com>,
	daniel@iogearbox.net, palmer@sifive.com, davidlee@sifive.com,
	netdev@vger.kernel.org
Subject: [RFC PATCH 3/3] bpf, riscv: added eBPF JIT for RV64G
Date: Tue, 15 Jan 2019 09:35:18 +0100	[thread overview]
Message-ID: <20190115083518.10149-4-bjorn.topel@gmail.com> (raw)
Message-ID: <20190115083518.XFYQCIH0fHyu9yRgZB0qSwvpzIoA4bZBaPtkHr2UufQ@z> (raw)
In-Reply-To: <20190115083518.10149-1-bjorn.topel@gmail.com>

This commit adds eBPF JIT for RV64G.

Codewise, it needs some refactoring. Currently there's a bit too much
copy-and-paste going on, and I know some places where I could optimize
the code generation a bit (mostly BPF_K type of instructions, dealing
with immediates).

From a features perspective, two things are missing:

* tail calls
* "far-branches", i.e. conditional branches that reach beyond 13b.

The test_bpf.ko passes all tests.

Signed-off-by: Björn Töpel <bjorn.topel@gmail.com>
---
 arch/riscv/net/bpf_jit_comp.c | 1608 +++++++++++++++++++++++++++++++++
 1 file changed, 1608 insertions(+)

diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c
index 7e359d3249ee..562d56eb8d23 100644
--- a/arch/riscv/net/bpf_jit_comp.c
+++ b/arch/riscv/net/bpf_jit_comp.c
@@ -1,4 +1,1612 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF JIT compiler for RV64G
+ *
+ * Copyright(c) 2019 Björn Töpel <bjorn.topel@gmail.com>
+ *
+ */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <asm/cacheflush.h>
+
+#define TMP_REG_0 (MAX_BPF_JIT_REG + 0)
+#define TMP_REG_1 (MAX_BPF_JIT_REG + 1)
+#define TAIL_CALL_REG (MAX_BPF_JIT_REG + 2)
+
+enum rv_register {
+	RV_REG_ZERO =	0,	/* The constant value 0 */
+	RV_REG_RA =	1,	/* Return address */
+	RV_REG_SP =	2,	/* Stack pointer */
+	RV_REG_GP =	3,	/* Global pointer */
+	RV_REG_TP =	4,	/* Thread pointer */
+	RV_REG_T0 =	5,	/* Temporaries */
+	RV_REG_T1 =	6,
+	RV_REG_T2 =	7,
+	RV_REG_FP =	8,
+	RV_REG_S1 =	9,	/* Saved registers */
+	RV_REG_A0 =	10,	/* Function argument/return values */
+	RV_REG_A1 =	11,	/* Function arguments */
+	RV_REG_A2 =	12,
+	RV_REG_A3 =	13,
+	RV_REG_A4 =	14,
+	RV_REG_A5 =	15,
+	RV_REG_A6 =	16,
+	RV_REG_A7 =	17,
+	RV_REG_S2 =	18,	/* Saved registers */
+	RV_REG_S3 =	19,
+	RV_REG_S4 =	20,
+	RV_REG_S5 =	21,
+	RV_REG_S6 =	22,
+	RV_REG_S7 =	23,
+	RV_REG_S8 =	24,
+	RV_REG_S9 =	25,
+	RV_REG_S10 =	26,
+	RV_REG_S11 =	27,
+	RV_REG_T3 =	28,	/* Temporaries */
+	RV_REG_T4 =	29,
+	RV_REG_T5 =	30,
+	RV_REG_T6 =	31,
+};
+
+struct rv_jit_context {
+	struct bpf_prog *prog;
+	u32 *insns; /* RV insns */
+	int ninsns;
+	int epilogue_offset;
+	int *offset; /* BPF to RV */
+	unsigned long seen_reg_bits;
+	int stack_size;
+};
+
+struct rv_jit_data {
+	struct bpf_binary_header *header;
+	u8 *image;
+	struct rv_jit_context ctx;
+};
+
+static u8 bpf_to_rv_reg(int bpf_reg, struct rv_jit_context *ctx)
+{
+	switch (bpf_reg) {
+	/* Return value */
+	case BPF_REG_0:
+		__set_bit(RV_REG_A5, &ctx->seen_reg_bits);
+		return RV_REG_A5;
+	/* Function arguments */
+	case BPF_REG_1:
+		__set_bit(RV_REG_A0, &ctx->seen_reg_bits);
+		return RV_REG_A0;
+	case BPF_REG_2:
+		__set_bit(RV_REG_A1, &ctx->seen_reg_bits);
+		return RV_REG_A1;
+	case BPF_REG_3:
+		__set_bit(RV_REG_A2, &ctx->seen_reg_bits);
+		return RV_REG_A2;
+	case BPF_REG_4:
+		__set_bit(RV_REG_A3, &ctx->seen_reg_bits);
+		return RV_REG_A3;
+	case BPF_REG_5:
+		__set_bit(RV_REG_A4, &ctx->seen_reg_bits);
+		return RV_REG_A4;
+	/* Callee saved registers */
+	case BPF_REG_6:
+		__set_bit(RV_REG_S1, &ctx->seen_reg_bits);
+		return RV_REG_S1;
+	case BPF_REG_7:
+		__set_bit(RV_REG_S2, &ctx->seen_reg_bits);
+		return RV_REG_S2;
+	case BPF_REG_8:
+		__set_bit(RV_REG_S3, &ctx->seen_reg_bits);
+		return RV_REG_S3;
+	case BPF_REG_9:
+		__set_bit(RV_REG_S4, &ctx->seen_reg_bits);
+		return RV_REG_S4;
+	/* Stack read-only frame pointer to access stack */
+	case BPF_REG_FP:
+		__set_bit(RV_REG_S5, &ctx->seen_reg_bits);
+		return RV_REG_S5;
+	/* Temporary register */
+	case BPF_REG_AX:
+		__set_bit(RV_REG_T0, &ctx->seen_reg_bits);
+		return RV_REG_T0;
+	/* Tail call counter */
+	case TAIL_CALL_REG:
+		__set_bit(RV_REG_S6, &ctx->seen_reg_bits);
+		return RV_REG_S6;
+	default:
+		return 0;
+	}
+};
+
+static void seen_call(struct rv_jit_context *ctx)
+{
+	__set_bit(RV_REG_RA, &ctx->seen_reg_bits);
+}
+
+static bool seen_reg(int rv_reg, struct rv_jit_context *ctx)
+{
+	return test_bit(rv_reg, &ctx->seen_reg_bits);
+}
+
+static void emit(const u32 insn, struct rv_jit_context *ctx)
+{
+	if (ctx->insns)
+		ctx->insns[ctx->ninsns] = insn;
+
+	ctx->ninsns++;
+}
+
+static u32 rv_r_insn(u8 funct7, u8 rs2, u8 rs1, u8 funct3, u8 rd, u8 opcode)
+{
+	return (funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(rd << 7) | opcode;
+}
+
+static u32 rv_i_insn(u16 imm11_0, u8 rs1, u8 funct3, u8 rd, u8 opcode)
+{
+	return (imm11_0 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) |
+		opcode;
+}
+
+static u32 rv_s_insn(u16 imm11_0, u8 rs2, u8 rs1, u8 funct3, u8 opcode)
+{
+	u8 imm11_5 = imm11_0 >> 5, imm4_0 = imm11_0 & 0x1f;
+
+	return (imm11_5 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(imm4_0 << 7) | opcode;
+}
+
+static u32 rv_sb_insn(u16 imm12_1, u8 rs2, u8 rs1, u8 funct3, u8 opcode)
+{
+	u8 imm12 = ((imm12_1 & 0x800) >> 5) | ((imm12_1 & 0x3f0) >> 4);
+	u8 imm4_1 = ((imm12_1 & 0xf) << 1) | ((imm12_1 & 0x400) >> 10);
+
+	return (imm12 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(imm4_1 << 7) | opcode;
+}
+
+static u32 rv_u_insn(u32 imm31_12, u8 rd, u8 opcode)
+{
+	return (imm31_12 << 12) | (rd << 7) | opcode;
+}
+
+static u32 rv_uj_insn(u32 imm20_1, u8 rd, u8 opcode)
+{
+	u32 imm;
+
+	imm = (imm20_1 & 0x80000) |  ((imm20_1 & 0x3ff) << 9) |
+	      ((imm20_1 & 0x400) >> 2) | ((imm20_1 & 0x7f800) >> 11);
+
+	return (imm << 12) | (rd << 7) | opcode;
+}
+
+static u32 rv_amo_insn(u8 funct5, u8 aq, u8 rl, u8 rs2, u8 rs1,
+		       u8 funct3, u8 rd, u8 opcode)
+{
+	u8 funct7 = (funct5 << 2) | (aq << 1) | rl;
+
+	return rv_r_insn(funct7, rs2, rs1, funct3, rd, opcode);
+}
+
+static u32 rv_addiw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x1b);
+}
+
+static u32 rv_addi(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x13);
+}
+
+static u32 rv_addw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_add(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_subw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_sub(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_and(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 7, rd, 0x33);
+}
+
+static u32 rv_or(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 6, rd, 0x33);
+}
+
+static u32 rv_xor(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 4, rd, 0x33);
+}
+
+static u32 rv_mulw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_mul(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_divuw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_divu(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_remuw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 7, rd, 0x3b);
+}
+
+static u32 rv_remu(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 7, rd, 0x33);
+}
+
+static u32 rv_sllw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 1, rd, 0x3b);
+}
+
+static u32 rv_sll(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 1, rd, 0x33);
+}
+
+static u32 rv_srlw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_srl(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_sraw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_sra(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_lui(u8 rd, u32 imm31_12)
+{
+	return rv_u_insn(imm31_12, rd, 0x37);
+}
+
+static u32 rv_slli(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x13);
+}
+
+static u32 rv_andi(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 7, rd, 0x13);
+}
+
+static u32 rv_ori(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 6, rd, 0x13);
+}
+
+static u32 rv_xori(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 4, rd, 0x13);
+}
+
+static u32 rv_slliw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x1b);
+}
+
+static u32 rv_srliw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x1b);
+}
+
+static u32 rv_srli(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x13);
+}
+
+static u32 rv_sraiw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(0x400 | imm11_0, rs1, 5, rd, 0x1b);
+}
+
+static u32 rv_srai(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(0x400 | imm11_0, rs1, 5, rd, 0x13);
+}
+
+#if 0
+static u32 rv_auipc(u8 rd, u32 imm31_12)
+{
+	return rv_u_insn(imm31_12, rd, 0x17);
+}
+#endif
+
+static u32 rv_jal(u8 rd, u32 imm20_1)
+{
+	return rv_uj_insn(imm20_1, rd, 0x6f);
+}
+
+static u32 rv_jalr(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x67);
+}
+
+static u32 rv_beq(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 0, 0x63);
+}
+
+static u32 rv_bltu(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 6, 0x63);
+}
+
+static u32 rv_bgeu(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 7, 0x63);
+}
+
+static u32 rv_bne(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 1, 0x63);
+}
+
+static u32 rv_blt(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 4, 0x63);
+}
+
+static u32 rv_bge(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 5, 0x63);
+}
+
+static u32 rv_sb(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 0, 0x23);
+}
+
+static u32 rv_sh(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 1, 0x23);
+}
+
+static u32 rv_sw(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 2, 0x23);
+}
+
+static u32 rv_sd(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 3, 0x23);
+}
+
+#if 0
+static u32 rv_lb(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x03);
+}
+#endif
+
+static u32 rv_lbu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 4, rd, 0x03);
+}
+
+#if 0
+static u32 rv_lh(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x03);
+}
+#endif
+
+static u32 rv_lhu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x03);
+}
+
+#if 0
+static u32 rv_lw(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 2, rd, 0x03);
+}
+#endif
+
+static u32 rv_lwu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 6, rd, 0x03);
+}
+
+static u32 rv_ld(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 3, rd, 0x03);
+}
+
+static u32 rv_amoadd_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
+{
+	return rv_amo_insn(0, aq, rl, rs2, rs1, 2, rd, 0x2f);
+}
+
+static u32 rv_amoadd_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
+{
+	return rv_amo_insn(0, aq, rl, rs2, rs1, 3, rd, 0x2f);
+}
+
+static bool is_12b_int(s64 val)
+{
+	return -(1 << 11) <= val && val < (1 << 11);
+}
+
+static bool is_32b_int(s64 val)
+{
+	return -(1L << 31) <= val && val < (1L << 31);
+}
+
+/* jumps */
+static bool is_21b_int(s64 val)
+{
+	return -(1L << 20) <= val && val < (1L << 20);
+
+}
+
+/* conditional branches */
+static bool is_13b_int(s64 val)
+{
+	return -(1 << 12) <= val && val < (1 << 12);
+}
+
+static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
+{
+	/* Note that the immediate from the add is sign-extended,
+	 * which means that we need to compensate this by adding 2^12,
+	 * when the 12th bit is set. A simpler way of doing this, and
+	 * getting rid of the check, is to just add 2**11 before the
+	 * shift. The "Loading a 32-Bit constant" example from the
+	 * "Computer Organization and Design, RISC-V edition" book by
+	 * Patterson/Hennessy highlights this fact.
+	 *
+	 * This also means that we need to process LSB to MSB.
+	 */
+	s64 upper = (val + (1 << 11)) >> 12, lower = val & 0xfff;
+	int shift;
+
+	if (is_32b_int(val)) {
+		if (upper)
+			emit(rv_lui(rd, upper), ctx);
+
+		if (!upper) {
+			emit(rv_addi(rd, RV_REG_ZERO, lower), ctx);
+			return;
+		}
+
+		emit(rv_addiw(rd, rd, lower), ctx);
+		return;
+	}
+
+	shift = __ffs(upper);
+	upper >>= shift;
+	shift += 12;
+
+	emit_imm(rd, upper, ctx);
+
+	emit(rv_slli(rd, rd, shift), ctx);
+	if (lower)
+		emit(rv_addi(rd, rd, lower), ctx);
+}
+
+static int rv_offset(int bpf_to, int bpf_from, struct rv_jit_context *ctx)
+{
+	int from = ctx->offset[bpf_from] - 1, to = ctx->offset[bpf_to];
+
+	return (to - from) << 2;
+}
+
+static int epilogue_offset(struct rv_jit_context *ctx)
+{
+	int to = ctx->epilogue_offset, from = ctx->ninsns;
+
+	return (to - from) << 2;
+}
+
+static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
+		     bool extra_pass)
+{
+	bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+	int rvoff, i = insn - ctx->prog->insnsi;
+	u8 rd, rs, code = insn->code;
+	s16 off = insn->off;
+	s32 imm = insn->imm;
+
+	switch (code) {
+	/* dst = src */
+	case BPF_ALU | BPF_MOV | BPF_X:
+	case BPF_ALU64 | BPF_MOV | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_addi(rd, rs, 0) : rv_addiw(rd, rs, 0), ctx);
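+		/* BPF ALU32 operations zero-extend their 32-bit result;
+		 * the RV64 W-form instructions sign-extend instead, so
+		 * clear the upper 32 bits explicitly.
+		 */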
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+
+	/* dst = dst OP src */
+	case BPF_ALU | BPF_ADD | BPF_X:
+	case BPF_ALU64 | BPF_ADD | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_add(rd, rd, rs) : rv_addw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_SUB | BPF_X:
+	case BPF_ALU64 | BPF_SUB | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sub(rd, rd, rs) : rv_subw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_AND | BPF_X:
+	case BPF_ALU64 | BPF_AND | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_and(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_OR | BPF_X:
+	case BPF_ALU64 | BPF_OR | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_or(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_XOR | BPF_X:
+	case BPF_ALU64 | BPF_XOR | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_xor(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_MUL | BPF_X:
+	case BPF_ALU64 | BPF_MUL | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_mul(rd, rd, rs) : rv_mulw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_DIV | BPF_X:
+	case BPF_ALU64 | BPF_DIV | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_divu(rd, rd, rs) : rv_divuw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MOD | BPF_X:
+	case BPF_ALU64 | BPF_MOD | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_remu(rd, rd, rs) : rv_remuw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_LSH | BPF_X:
+	case BPF_ALU64 | BPF_LSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sll(rd, rd, rs) : rv_sllw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_RSH | BPF_X:
+	case BPF_ALU64 | BPF_RSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srl(rd, rd, rs) : rv_srlw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_ARSH | BPF_X:
+	case BPF_ALU64 | BPF_ARSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sra(rd, rd, rs) : rv_sraw(rd, rd, rs), ctx);
+		break;
+
+	/* dst = -dst */
+	case BPF_ALU | BPF_NEG:
+	case BPF_ALU64 | BPF_NEG:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ?
+		     rv_sub(rd, RV_REG_ZERO, rd) :
+		     rv_subw(rd, RV_REG_ZERO, rd),
+		     ctx);
+		break;
+
+	/* dst = BSWAP##imm(dst) */
+	case BPF_ALU | BPF_END | BPF_FROM_LE:
+	{
+		int shift = 64 - imm;
+
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_slli(rd, rd, shift), ctx);
+		emit(rv_srli(rd, rd, shift), ctx);
+		break;
+	}
+	case BPF_ALU | BPF_END | BPF_FROM_BE:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+
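+		/* Byte-swap the low imm bits of rd: repeatedly take the
+		 * lowest byte of rd into T1, accumulate it into T2 while
+		 * shifting T2 up one byte, which reverses the byte order.
+		 */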
+		emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+		if (imm == 16)
+			goto out_be;
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+		if (imm == 32)
+			goto out_be;
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+	out_be:
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+
+		emit(rv_addi(rd, RV_REG_T2, 0), ctx);
+		break;
+
+	/* dst = imm */
+	case BPF_ALU | BPF_MOV | BPF_K:
+	case BPF_ALU64 | BPF_MOV | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(rd, imm, ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+
+	/* dst = dst OP imm */
+	case BPF_ALU | BPF_ADD | BPF_K:
+	case BPF_ALU64 | BPF_ADD | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(is64 ? rv_addi(rd, rd, imm) :
+			     rv_addiw(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_add(rd, rd, RV_REG_T1) :
+		     rv_addw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_SUB | BPF_K:
+	case BPF_ALU64 | BPF_SUB | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(-imm)) {
+			emit(is64 ? rv_addi(rd, rd, -imm) :
+			     rv_addiw(rd, rd, -imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_sub(rd, rd, RV_REG_T1) :
+		     rv_subw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_AND | BPF_K:
+	case BPF_ALU64 | BPF_AND | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_andi(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_and(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_OR | BPF_K:
+	case BPF_ALU64 | BPF_OR | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_ori(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_or(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_XOR | BPF_K:
+	case BPF_ALU64 | BPF_XOR | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_xori(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_xor(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MUL | BPF_K:
+	case BPF_ALU64 | BPF_MUL | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_mul(rd, rd, RV_REG_T1) :
+		     rv_mulw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_DIV | BPF_K:
+	case BPF_ALU64 | BPF_DIV | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_divu(rd, rd, RV_REG_T1) :
+		     rv_divuw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MOD | BPF_K:
+	case BPF_ALU64 | BPF_MOD | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_remu(rd, rd, RV_REG_T1) :
+		     rv_remuw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_LSH | BPF_K:
+	case BPF_ALU64 | BPF_LSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_slli(rd, rd, imm) :
+		     rv_slliw(rd, rd, imm),  ctx);
+		break;
+	case BPF_ALU | BPF_RSH | BPF_K:
+	case BPF_ALU64 | BPF_RSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srli(rd, rd, imm) :
+		     rv_srliw(rd, rd, imm),  ctx);
+		break;
+	case BPF_ALU | BPF_ARSH | BPF_K:
+	case BPF_ALU64 | BPF_ARSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srai(rd, rd, imm) :
+		     rv_sraiw(rd, rd, imm),  ctx);
+		break;
+
+	/* JUMP off */
+	case BPF_JMP | BPF_JA:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_21b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, rvoff);
+			return -1;
+		}
+
+		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* IF (dst COND src) JUMP off */
+	case BPF_JMP | BPF_JEQ | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_beq(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bltu(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bltu(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bgeu(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bgeu(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JNE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bne(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_blt(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_blt(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bge(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bge(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSET | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_and(RV_REG_T1, rd, rs), ctx);
+		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* IF (dst COND imm) JUMP off */
+	case BPF_JMP | BPF_JEQ | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_beq(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bltu(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bltu(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bgeu(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bgeu(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JNE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bne(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_blt(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_blt(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bge(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bge(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSET | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T2, imm, ctx);
+		emit(rv_and(RV_REG_T1, rd, RV_REG_T2), ctx);
+		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* function call */
+	case BPF_JMP | BPF_CALL:
+	{
+		bool fixed;
+		int i, ret;
+		u64 addr;
+
+		seen_call(ctx);
+		ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &addr,
+					    &fixed);
+		if (ret < 0)
+			return ret;
+		if (fixed) {
+			emit_imm(RV_REG_T1, addr, ctx);
+		} else {
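+			/* The helper address is only known in the extra
+			 * pass, so pad the immediate load with nops up to
+			 * a fixed length; this keeps the emitted size, and
+			 * hence the recorded offsets, stable between passes.
+			 */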
+			i = ctx->ninsns;
+			emit_imm(RV_REG_T1, addr, ctx);
+			for (i = ctx->ninsns - i; i < 8; i++) {
+				/* nop */
+				emit(rv_addi(RV_REG_ZERO, RV_REG_ZERO, 0),
+				     ctx);
+			}
+		}
+		emit(rv_jalr(RV_REG_RA, RV_REG_T1, 0), ctx);
+		rd = bpf_to_rv_reg(BPF_REG_0, ctx);
+		emit(rv_addi(rd, RV_REG_A0, 0), ctx);
+		break;
+	}
+	/* tail call */
+	case BPF_JMP | BPF_TAIL_CALL:
+		rd = bpf_to_rv_reg(TAIL_CALL_REG, ctx);
+		pr_err("bpf-jit: tail call not supported yet!\n");
+		return -1;
+
+	/* function return */
+	case BPF_JMP | BPF_EXIT:
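+		/* The last instruction falls through to the epilogue;
+		 * earlier exits jump to it.
+		 */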
+		if (i == ctx->prog->len - 1)
+			break;
+
+		rvoff = epilogue_offset(ctx);
+		if (!is_21b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, rvoff);
+			return -1;
+		}
+
+		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* dst = imm64 */
+	case BPF_LD | BPF_IMM | BPF_DW:
+	{
+		struct bpf_insn insn1 = insn[1];
+		u64 imm64;
+
+		imm64 = (u64)insn1.imm << 32 | (u32)imm;
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(rd, imm64, ctx);
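+		/* Two BPF instructions were consumed; returning 1 lets
+		 * build_body skip the second half.
+		 */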
+		return 1;
+	}
+
+	/* LDX: dst = *(size *)(src + off) */
+	case BPF_LDX | BPF_MEM | BPF_B:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lbu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lbu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_H:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lhu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lhu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_W:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lwu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lwu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_ld(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_ld(rd, 0, RV_REG_T1), ctx);
+		break;
+
+	/* ST: *(size *)(dst + off) = imm */
+	case BPF_ST | BPF_MEM | BPF_B:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sb(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+
+	case BPF_ST | BPF_MEM | BPF_H:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sh(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_ST | BPF_MEM | BPF_W:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sw(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sw(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_ST | BPF_MEM | BPF_DW:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sd(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sd(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+
+	/* STX: *(size *)(dst + off) = src */
+	case BPF_STX | BPF_MEM | BPF_B:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sb(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sb(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_H:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sh(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sh(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_W:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sw(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sw(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sd(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sd(RV_REG_T1, 0, rs), ctx);
+		break;
+	/* STX XADD: lock *(u32 *)(dst + off) += src */
+	case BPF_STX | BPF_XADD | BPF_W:
+	/* STX XADD: lock *(u64 *)(dst + off) += src */
+	case BPF_STX | BPF_XADD | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (off) {
+			if (is_12b_int(off)) {
+				emit(rv_addi(RV_REG_T1, rd, off), ctx);
+			} else {
+				emit_imm(RV_REG_T1, off, ctx);
+				emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+			}
+
+			rd = RV_REG_T1;
+		}
+
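+		/* amoadd with rd == zero atomically adds src to memory
+		 * and discards the old value.
+		 */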
+		emit(BPF_SIZE(code) == BPF_W ?
+		     rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0) :
+		     rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0), ctx);
+		break;
+	default:
+		pr_err("bpf-jit: unknown opcode %02x\n", code);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void build_prologue(struct rv_jit_context *ctx)
+{
+	int stack_adjust = 0, store_offset, bpf_stack_adjust;
+
+	if (seen_reg(RV_REG_RA, ctx))
+		stack_adjust += 8;
+	stack_adjust += 8; /* RV_REG_FP */
+	if (seen_reg(RV_REG_S1, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S2, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S3, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S4, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S5, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S6, ctx))
+		stack_adjust += 8;
+
+	stack_adjust = round_up(stack_adjust, 16);
+	bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16);
+	stack_adjust += bpf_stack_adjust;
+
+	store_offset = stack_adjust - 8;
+
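+	/* Frame layout, from high to low addresses, after the sp
+	 * adjustment below: saved ra (if needed), saved fp, saved
+	 * s1-s6 (as used), and finally bpf_stack_adjust bytes for the
+	 * BPF program's own stack; s5 (BPF_REG_FP) is pointed at the
+	 * top of that area further down.
+	 */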
+	emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx);
+
+	if (seen_reg(RV_REG_RA, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_RA), ctx);
+		store_offset -= 8;
+	}
+	emit(rv_sd(RV_REG_SP, store_offset, RV_REG_FP), ctx);
+	store_offset -= 8;
+	if (seen_reg(RV_REG_S1, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S1), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S2, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S2), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S3, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S3), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S4, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S4), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S5, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S5), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S6, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S6), ctx);
+		store_offset -= 8;
+	}
+
+	emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx);
+
+	if (bpf_stack_adjust) {
+		if (!seen_reg(RV_REG_S5, ctx))
+			pr_warn("bpf-jit: BPF_REG_FP unused, stack depth %d\n",
+				bpf_stack_adjust);
+		emit(rv_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust), ctx);
+	}
+
+	ctx->stack_size = stack_adjust;
+}
+
+static void build_epilogue(struct rv_jit_context *ctx)
+{
+	int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8;
+
+	if (seen_reg(RV_REG_RA, ctx)) {
+		emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx);
+	store_offset -= 8;
+	if (seen_reg(RV_REG_S1, ctx)) {
+		emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S2, ctx)) {
+		emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S3, ctx)) {
+		emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S4, ctx)) {
+		emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S5, ctx)) {
+		emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S6, ctx)) {
+		emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+
+	emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx);
+	/* Set return value. */
+	emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx);
+	emit(rv_jalr(RV_REG_ZERO, RV_REG_RA, 0), ctx);
+}
+
+static int build_body(struct rv_jit_context *ctx, bool extra_pass)
+{
+	const struct bpf_prog *prog = ctx->prog;
+	int i;
+
+	for (i = 0; i < prog->len; i++) {
+		const struct bpf_insn *insn = &prog->insnsi[i];
+		int ret;
+
+		ret = emit_insn(insn, ctx, extra_pass);
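+		/* A positive return means a 16-byte BPF_LD | BPF_IMM |
+		 * BPF_DW instruction was handled; record an offset for
+		 * its second half as well and skip it.
+		 */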
+		if (ret > 0) {
+			i++;
+			if (ctx->insns == NULL)
+				ctx->offset[i] = ctx->ninsns;
+			continue;
+		}
+		if (ctx->insns == NULL)
+			ctx->offset[i] = ctx->ninsns;
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static void bpf_fill_ill_insns(void *area, unsigned int size)
+{
+	memset(area, 0, size);
+}
+
+static void bpf_flush_icache(void *start, void *end)
+{
+	flush_icache_range((unsigned long)start, (unsigned long)end);
+}
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	bool tmp_blinded = false, extra_pass = false;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	struct rv_jit_data *jit_data;
+	struct rv_jit_context *ctx;
+	unsigned int image_size;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	jit_data = prog->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			prog = orig_prog;
+			goto out;
+		}
+		prog->aux->jit_data = jit_data;
+	}
+
+	ctx = &jit_data->ctx;
+
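+	/* If offsets are already present, this is the extra pass (used
+	 * for bpf-to-bpf calls): the image is already allocated, so
+	 * just re-emit it with the now-resolved call addresses.
+	 */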
+	if (ctx->offset) {
+		extra_pass = true;
+		image_size = sizeof(u32) * ctx->ninsns;
+		goto skip_init_ctx;
+	}
+
+	ctx->prog = prog;
+	ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
+	if (!ctx->offset) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+
+	/* First pass generates the ctx->offset, but does not emit an image. */
+	if (build_body(ctx, extra_pass)) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+	build_prologue(ctx);
+	ctx->epilogue_offset = ctx->ninsns;
+	build_epilogue(ctx);
+
+	/* Allocate image, now that we know the size. */
+	image_size = sizeof(u32) * ctx->ninsns;
+	jit_data->header = bpf_jit_binary_alloc(image_size, &jit_data->image,
+						sizeof(u32),
+						bpf_fill_ill_insns);
+	if (!jit_data->header) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+
+	/* Second, real pass, which actually emits the image. */
+	ctx->insns = (u32 *)jit_data->image;
+skip_init_ctx:
+	ctx->ninsns = 0;
+
+	build_prologue(ctx);
+	if (build_body(ctx, extra_pass)) {
+		bpf_jit_binary_free(jit_data->header);
+		prog = orig_prog;
+		goto out_offset;
+	}
+	build_epilogue(ctx);
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, image_size, 2, ctx->insns);
+
+	prog->bpf_func = (void *)ctx->insns;
+	prog->jited = 1;
+	prog->jited_len = image_size;
+
+	bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
+
+	if (!prog->is_func || extra_pass) {
+out_offset:
+		kfree(ctx->offset);
+		kfree(jit_data);
+		prog->aux->jit_data = NULL;
+	}
+out:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
 	return prog;
 }
-- 
2.19.1


WARNING: multiple messages have this Message-ID (diff)
From: "Björn Töpel" <bjorn.topel@gmail.com>
To: linux-riscv@lists.infradead.org
Cc: "Björn Töpel" <bjorn.topel@gmail.com>,
	palmer@sifive.com, davidlee@sifive.com, daniel@iogearbox.net,
	netdev@vger.kernel.org
Subject: [RFC PATCH 3/3] bpf, riscv: added eBPF JIT for RV64G
Date: Tue, 15 Jan 2019 09:35:18 +0100	[thread overview]
Message-ID: <20190115083518.10149-4-bjorn.topel@gmail.com> (raw)
In-Reply-To: <20190115083518.10149-1-bjorn.topel@gmail.com>

This commit adds eBPF JIT for RV64G.

Codewise, it needs some refactoring. Currently there's a bit too much
copy-and-paste going on, and I know some places where I could optimize
the code generation a bit (mostly BPF_K type of instructions, dealing
with immediates).

From a features perspective, two things are missing:

* tail calls
* "far-branches", i.e. conditional branches that reach beyond 13b.

The test_bpf.ko passes all tests.

Signed-off-by: Björn Töpel <bjorn.topel@gmail.com>
---
 arch/riscv/net/bpf_jit_comp.c | 1608 +++++++++++++++++++++++++++++++++
 1 file changed, 1608 insertions(+)

diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c
index 7e359d3249ee..562d56eb8d23 100644
--- a/arch/riscv/net/bpf_jit_comp.c
+++ b/arch/riscv/net/bpf_jit_comp.c
@@ -1,4 +1,1612 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF JIT compiler for RV64G
+ *
+ * Copyright(c) 2019 Björn Töpel <bjorn.topel@gmail.com>
+ *
+ */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <asm/cacheflush.h>
+
+#define TMP_REG_0 (MAX_BPF_JIT_REG + 0)
+#define TMP_REG_1 (MAX_BPF_JIT_REG + 1)
+#define TAIL_CALL_REG (MAX_BPF_JIT_REG + 2)
+
+enum rv_register {
+	RV_REG_ZERO =	0,	/* The constant value 0 */
+	RV_REG_RA =	1,	/* Return address */
+	RV_REG_SP =	2,	/* Stack pointer */
+	RV_REG_GP =	3,	/* Global pointer */
+	RV_REG_TP =	4,	/* Thread pointer */
+	RV_REG_T0 =	5,	/* Temporaries */
+	RV_REG_T1 =	6,
+	RV_REG_T2 =	7,
+	RV_REG_FP =	8,
+	RV_REG_S1 =	9,	/* Saved registers */
+	RV_REG_A0 =	10,	/* Function argument/return values */
+	RV_REG_A1 =	11,	/* Function arguments */
+	RV_REG_A2 =	12,
+	RV_REG_A3 =	13,
+	RV_REG_A4 =	14,
+	RV_REG_A5 =	15,
+	RV_REG_A6 =	16,
+	RV_REG_A7 =	17,
+	RV_REG_S2 =	18,	/* Saved registers */
+	RV_REG_S3 =	19,
+	RV_REG_S4 =	20,
+	RV_REG_S5 =	21,
+	RV_REG_S6 =	22,
+	RV_REG_S7 =	23,
+	RV_REG_S8 =	24,
+	RV_REG_S9 =	25,
+	RV_REG_S10 =	26,
+	RV_REG_S11 =	27,
+	RV_REG_T3 =	28,	/* Temporaries */
+	RV_REG_T4 =	29,
+	RV_REG_T5 =	30,
+	RV_REG_T6 =	31,
+};
+
+struct rv_jit_context {
+	struct bpf_prog *prog;
+	u32 *insns; /* RV insns */
+	int ninsns;
+	int epilogue_offset;
+	int *offset; /* BPF to RV */
+	unsigned long seen_reg_bits;
+	int stack_size;
+};
+
+struct rv_jit_data {
+	struct bpf_binary_header *header;
+	u8 *image;
+	struct rv_jit_context ctx;
+};
+
+static u8 bpf_to_rv_reg(int bpf_reg, struct rv_jit_context *ctx)
+{
+	switch (bpf_reg) {
+	/* Return value */
+	case BPF_REG_0:
+		__set_bit(RV_REG_A5, &ctx->seen_reg_bits);
+		return RV_REG_A5;
+	/* Function arguments */
+	case BPF_REG_1:
+		__set_bit(RV_REG_A0, &ctx->seen_reg_bits);
+		return RV_REG_A0;
+	case BPF_REG_2:
+		__set_bit(RV_REG_A1, &ctx->seen_reg_bits);
+		return RV_REG_A1;
+	case BPF_REG_3:
+		__set_bit(RV_REG_A2, &ctx->seen_reg_bits);
+		return RV_REG_A2;
+	case BPF_REG_4:
+		__set_bit(RV_REG_A3, &ctx->seen_reg_bits);
+		return RV_REG_A3;
+	case BPF_REG_5:
+		__set_bit(RV_REG_A4, &ctx->seen_reg_bits);
+		return RV_REG_A4;
+	/* Callee saved registers */
+	case BPF_REG_6:
+		__set_bit(RV_REG_S1, &ctx->seen_reg_bits);
+		return RV_REG_S1;
+	case BPF_REG_7:
+		__set_bit(RV_REG_S2, &ctx->seen_reg_bits);
+		return RV_REG_S2;
+	case BPF_REG_8:
+		__set_bit(RV_REG_S3, &ctx->seen_reg_bits);
+		return RV_REG_S3;
+	case BPF_REG_9:
+		__set_bit(RV_REG_S4, &ctx->seen_reg_bits);
+		return RV_REG_S4;
+	/* Stack read-only frame pointer to access stack */
+	case BPF_REG_FP:
+		__set_bit(RV_REG_S5, &ctx->seen_reg_bits);
+		return RV_REG_S5;
+	/* Temporary register */
+	case BPF_REG_AX:
+		__set_bit(RV_REG_T0, &ctx->seen_reg_bits);
+		return RV_REG_T0;
+	/* Tail call counter */
+	case TAIL_CALL_REG:
+		__set_bit(RV_REG_S6, &ctx->seen_reg_bits);
+		return RV_REG_S6;
+	default:
+		return 0;
+	}
+};
+
+static void seen_call(struct rv_jit_context *ctx)
+{
+	__set_bit(RV_REG_RA, &ctx->seen_reg_bits);
+}
+
+static bool seen_reg(int rv_reg, struct rv_jit_context *ctx)
+{
+	return test_bit(rv_reg, &ctx->seen_reg_bits);
+}
+
+static void emit(const u32 insn, struct rv_jit_context *ctx)
+{
+	if (ctx->insns)
+		ctx->insns[ctx->ninsns] = insn;
+
+	ctx->ninsns++;
+}
+
+static u32 rv_r_insn(u8 funct7, u8 rs2, u8 rs1, u8 funct3, u8 rd, u8 opcode)
+{
+	return (funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(rd << 7) | opcode;
+}
+
+static u32 rv_i_insn(u16 imm11_0, u8 rs1, u8 funct3, u8 rd, u8 opcode)
+{
+	return (imm11_0 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) |
+		opcode;
+}
+
+static u32 rv_s_insn(u16 imm11_0, u8 rs2, u8 rs1, u8 funct3, u8 opcode)
+{
+	u8 imm11_5 = imm11_0 >> 5, imm4_0 = imm11_0 & 0x1f;
+
+	return (imm11_5 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(imm4_0 << 7) | opcode;
+}
+
+static u32 rv_sb_insn(u16 imm12_1, u8 rs2, u8 rs1, u8 funct3, u8 opcode)
+{
+	u8 imm12 = ((imm12_1 & 0x800) >> 5) | ((imm12_1 & 0x3f0) >> 4);
+	u8 imm4_1 = ((imm12_1 & 0xf) << 1) | ((imm12_1 & 0x400) >> 10);
+
+	return (imm12 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) |
+		(imm4_1 << 7) | opcode;
+}
+
+static u32 rv_u_insn(u32 imm31_12, u8 rd, u8 opcode)
+{
+	return (imm31_12 << 12) | (rd << 7) | opcode;
+}
+
+static u32 rv_uj_insn(u32 imm20_1, u8 rd, u8 opcode)
+{
+	u32 imm;
+
+	imm = (imm20_1 & 0x80000) |  ((imm20_1 & 0x3ff) << 9) |
+	      ((imm20_1 & 0x400) >> 2) | ((imm20_1 & 0x7f800) >> 11);
+
+	return (imm << 12) | (rd << 7) | opcode;
+}
+
+static u32 rv_amo_insn(u8 funct5, u8 aq, u8 rl, u8 rs2, u8 rs1,
+		       u8 funct3, u8 rd, u8 opcode)
+{
+	u8 funct7 = (funct5 << 2) | (aq << 1) | rl;
+
+	return rv_r_insn(funct7, rs2, rs1, funct3, rd, opcode);
+}
+
+static u32 rv_addiw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x1b);
+}
+
+static u32 rv_addi(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x13);
+}
+
+static u32 rv_addw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_add(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_subw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_sub(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_and(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 7, rd, 0x33);
+}
+
+static u32 rv_or(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 6, rd, 0x33);
+}
+
+static u32 rv_xor(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 4, rd, 0x33);
+}
+
+static u32 rv_mulw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 0, rd, 0x3b);
+}
+
+static u32 rv_mul(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 0, rd, 0x33);
+}
+
+static u32 rv_divuw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_divu(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_remuw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 7, rd, 0x3b);
+}
+
+static u32 rv_remu(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(1, rs2, rs1, 7, rd, 0x33);
+}
+
+static u32 rv_sllw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 1, rd, 0x3b);
+}
+
+static u32 rv_sll(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 1, rd, 0x33);
+}
+
+static u32 rv_srlw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_srl(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_sraw(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 5, rd, 0x3b);
+}
+
+static u32 rv_sra(u8 rd, u8 rs1, u8 rs2)
+{
+	return rv_r_insn(0x20, rs2, rs1, 5, rd, 0x33);
+}
+
+static u32 rv_lui(u8 rd, u32 imm31_12)
+{
+	return rv_u_insn(imm31_12, rd, 0x37);
+}
+
+static u32 rv_slli(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x13);
+}
+
+static u32 rv_andi(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 7, rd, 0x13);
+}
+
+static u32 rv_ori(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 6, rd, 0x13);
+}
+
+static u32 rv_xori(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 4, rd, 0x13);
+}
+
+static u32 rv_slliw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x1b);
+}
+
+static u32 rv_srliw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x1b);
+}
+
+static u32 rv_srli(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x13);
+}
+
+static u32 rv_sraiw(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(0x400 | imm11_0, rs1, 5, rd, 0x1b);
+}
+
+static u32 rv_srai(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(0x400 | imm11_0, rs1, 5, rd, 0x13);
+}
+
+#if 0
+static u32 rv_auipc(u8 rd, u32 imm31_12)
+{
+	return rv_u_insn(imm31_12, rd, 0x17);
+}
+#endif
+
+static u32 rv_jal(u8 rd, u32 imm20_1)
+{
+	return rv_uj_insn(imm20_1, rd, 0x6f);
+}
+
+static u32 rv_jalr(u8 rd, u8 rs1, u16 imm11_0)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x67);
+}
+
+static u32 rv_beq(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 0, 0x63);
+}
+
+static u32 rv_bltu(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 6, 0x63);
+}
+
+static u32 rv_bgeu(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 7, 0x63);
+}
+
+static u32 rv_bne(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 1, 0x63);
+}
+
+static u32 rv_blt(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 4, 0x63);
+}
+
+static u32 rv_bge(u8 rs1, u8 rs2, u16 imm12_1)
+{
+	return rv_sb_insn(imm12_1, rs2, rs1, 5, 0x63);
+}
+
+static u32 rv_sb(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 0, 0x23);
+}
+
+static u32 rv_sh(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 1, 0x23);
+}
+
+static u32 rv_sw(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 2, 0x23);
+}
+
+static u32 rv_sd(u8 rs1, u16 imm11_0, u8 rs2)
+{
+	return rv_s_insn(imm11_0, rs2, rs1, 3, 0x23);
+}
+
+#if 0
+static u32 rv_lb(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 0, rd, 0x03);
+}
+#endif
+
+static u32 rv_lbu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 4, rd, 0x03);
+}
+
+#if 0
+static u32 rv_lh(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 1, rd, 0x03);
+}
+#endif
+
+static u32 rv_lhu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 5, rd, 0x03);
+}
+
+#if 0
+static u32 rv_lw(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 2, rd, 0x03);
+}
+#endif
+
+static u32 rv_lwu(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 6, rd, 0x03);
+}
+
+static u32 rv_ld(u8 rd, u16 imm11_0, u8 rs1)
+{
+	return rv_i_insn(imm11_0, rs1, 3, rd, 0x03);
+}
+
+static u32 rv_amoadd_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
+{
+	return rv_amo_insn(0, aq, rl, rs2, rs1, 2, rd, 0x2f);
+}
+
+static u32 rv_amoadd_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
+{
+	return rv_amo_insn(0, aq, rl, rs2, rs1, 3, rd, 0x2f);
+}
+
+static bool is_12b_int(s64 val)
+{
+	return -(1 << 11) <= val && val < (1 << 11);
+}
+
+static bool is_32b_int(s64 val)
+{
+	return -(1L << 31) <= val && val < (1L << 31);
+}
+
+/* jumps */
+static bool is_21b_int(s64 val)
+{
+	return -(1L << 20) <= val && val < (1L << 20);
+
+}
+
+/* conditional branches */
+static bool is_13b_int(s64 val)
+{
+	return -(1 << 12) <= val && val < (1 << 12);
+}
+
+static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
+{
+	/* Note that the immediate from the add is sign-extended,
+	 * which means that we need to compensate this by adding 2^12,
+	 * when the 12th bit is set. A simpler way of doing this, and
+	 * getting rid of the check, is to just add 2**11 before the
+	 * shift. The "Loading a 32-Bit constant" example from the
+	 * "Computer Organization and Design, RISC-V edition" book by
+	 * Patterson/Hennessy highlights this fact.
+	 *
+	 * This also means that we need to process LSB to MSB.
+	 */
+	s64 upper = (val + (1 << 11)) >> 12, lower = val & 0xfff;
+	int shift;
+
+	if (is_32b_int(val)) {
+		if (upper)
+			emit(rv_lui(rd, upper), ctx);
+
+		if (!upper) {
+			emit(rv_addi(rd, RV_REG_ZERO, lower), ctx);
+			return;
+		}
+
+		emit(rv_addiw(rd, rd, lower), ctx);
+		return;
+	}
+
+	shift = __ffs(upper);
+	upper >>= shift;
+	shift += 12;
+
+	emit_imm(rd, upper, ctx);
+
+	emit(rv_slli(rd, rd, shift), ctx);
+	if (lower)
+		emit(rv_addi(rd, rd, lower), ctx);
+}
+
+static int rv_offset(int bpf_to, int bpf_from, struct rv_jit_context *ctx)
+{
+	int from = ctx->offset[bpf_from] - 1, to = ctx->offset[bpf_to];
+
+	return (to - from) << 2;
+}
+
+static int epilogue_offset(struct rv_jit_context *ctx)
+{
+	int to = ctx->epilogue_offset, from = ctx->ninsns;
+
+	return (to - from) << 2;
+}
+
+static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
+		     bool extra_pass)
+{
+	bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+	int rvoff, i = insn - ctx->prog->insnsi;
+	u8 rd, rs, code = insn->code;
+	s16 off = insn->off;
+	s32 imm = insn->imm;
+
+	switch (code) {
+	/* dst = src */
+	case BPF_ALU | BPF_MOV | BPF_X:
+	case BPF_ALU64 | BPF_MOV | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_addi(rd, rs, 0) : rv_addiw(rd, rs, 0), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+
+	/* dst = dst OP src */
+	case BPF_ALU | BPF_ADD | BPF_X:
+	case BPF_ALU64 | BPF_ADD | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_add(rd, rd, rs) : rv_addw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_SUB | BPF_X:
+	case BPF_ALU64 | BPF_SUB | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sub(rd, rd, rs) : rv_subw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_AND | BPF_X:
+	case BPF_ALU64 | BPF_AND | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_and(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_OR | BPF_X:
+	case BPF_ALU64 | BPF_OR | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_or(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_XOR | BPF_X:
+	case BPF_ALU64 | BPF_XOR | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_xor(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_MUL | BPF_X:
+	case BPF_ALU64 | BPF_MUL | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_mul(rd, rd, rs) : rv_mulw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_DIV | BPF_X:
+	case BPF_ALU64 | BPF_DIV | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_divu(rd, rd, rs) : rv_divuw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MOD | BPF_X:
+	case BPF_ALU64 | BPF_MOD | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_remu(rd, rd, rs) : rv_remuw(rd, rd, rs), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_LSH | BPF_X:
+	case BPF_ALU64 | BPF_LSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sll(rd, rd, rs) : rv_sllw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_RSH | BPF_X:
+	case BPF_ALU64 | BPF_RSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srl(rd, rd, rs) : rv_srlw(rd, rd, rs), ctx);
+		break;
+	case BPF_ALU | BPF_ARSH | BPF_X:
+	case BPF_ALU64 | BPF_ARSH | BPF_X:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_sra(rd, rd, rs) : rv_sraw(rd, rd, rs), ctx);
+		break;
+
+	/* dst = -dst */
+	case BPF_ALU | BPF_NEG:
+	case BPF_ALU64 | BPF_NEG:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ?
+		     rv_sub(rd, RV_REG_ZERO, rd) :
+		     rv_subw(rd, RV_REG_ZERO, rd),
+		     ctx);
+		break;
+
+	/* dst = BSWAP##imm(dst) */
+	case BPF_ALU | BPF_END | BPF_FROM_LE:
+	{
+		int shift = 64 - imm;
+
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_slli(rd, rd, shift), ctx);
+		emit(rv_srli(rd, rd, shift), ctx);
+		break;
+	}
+	case BPF_ALU | BPF_END | BPF_FROM_BE:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+
+		emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+		if (imm == 16)
+			goto out_be;
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+		if (imm == 32)
+			goto out_be;
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+		emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
+		emit(rv_srli(rd, rd, 8), ctx);
+	out_be:
+		emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+
+		emit(rv_addi(rd, RV_REG_T2, 0), ctx);
+		break;
+
+	/* dst = imm */
+	case BPF_ALU | BPF_MOV | BPF_K:
+	case BPF_ALU64 | BPF_MOV | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(rd, imm, ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+
+	/* dst = dst OP imm */
+	case BPF_ALU | BPF_ADD | BPF_K:
+	case BPF_ALU64 | BPF_ADD | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(is64 ? rv_addi(rd, rd, imm) :
+			     rv_addiw(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_add(rd, rd, RV_REG_T1) :
+		     rv_addw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_SUB | BPF_K:
+	case BPF_ALU64 | BPF_SUB | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(-imm)) {
+			emit(is64 ? rv_addi(rd, rd, -imm) :
+			     rv_addiw(rd, rd, -imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_sub(rd, rd, RV_REG_T1) :
+		     rv_subw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_AND | BPF_K:
+	case BPF_ALU64 | BPF_AND | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_andi(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_and(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_OR | BPF_K:
+	case BPF_ALU64 | BPF_OR | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_ori(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_or(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_XOR | BPF_K:
+	case BPF_ALU64 | BPF_XOR | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(imm)) {
+			emit(rv_xori(rd, rd, imm), ctx);
+			if (!is64) {
+				emit(rv_slli(rd, rd, 32), ctx);
+				emit(rv_srli(rd, rd, 32), ctx);
+			}
+			break;
+		}
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_xor(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MUL | BPF_K:
+	case BPF_ALU64 | BPF_MUL | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_mul(rd, rd, RV_REG_T1) :
+		     rv_mulw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_DIV | BPF_K:
+	case BPF_ALU64 | BPF_DIV | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_divu(rd, rd, RV_REG_T1) :
+		     rv_divuw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_MOD | BPF_K:
+	case BPF_ALU64 | BPF_MOD | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(is64 ? rv_remu(rd, rd, RV_REG_T1) :
+		     rv_remuw(rd, rd, RV_REG_T1), ctx);
+		if (!is64) {
+			emit(rv_slli(rd, rd, 32), ctx);
+			emit(rv_srli(rd, rd, 32), ctx);
+		}
+		break;
+	case BPF_ALU | BPF_LSH | BPF_K:
+	case BPF_ALU64 | BPF_LSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_slli(rd, rd, imm) :
+		     rv_slliw(rd, rd, imm),  ctx);
+		break;
+	case BPF_ALU | BPF_RSH | BPF_K:
+	case BPF_ALU64 | BPF_RSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srli(rd, rd, imm) :
+		     rv_srliw(rd, rd, imm),  ctx);
+		break;
+	case BPF_ALU | BPF_ARSH | BPF_K:
+	case BPF_ALU64 | BPF_ARSH | BPF_K:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(is64 ? rv_srai(rd, rd, imm) :
+		     rv_sraiw(rd, rd, imm),  ctx);
+		break;
+
+	/* JUMP off */
+	case BPF_JMP | BPF_JA:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_21b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, rvoff);
+			return -1;
+		}
+
+		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* IF (dst COND src) JUMP off */
+	case BPF_JMP | BPF_JEQ | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_beq(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bltu(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bltu(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bgeu(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bgeu(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JNE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bne(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_blt(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLT | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_blt(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bge(rd, rs, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLE | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit(rv_bge(rs, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSET | BPF_X:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
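+		/* JSET: branch if (dst & src) != 0. */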
+		emit(rv_and(RV_REG_T1, rd, rs), ctx);
+		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* IF (dst COND imm) JUMP off */
+	case BPF_JMP | BPF_JEQ | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_beq(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bltu(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bltu(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JGE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bgeu(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JLE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bgeu(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JNE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bne(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_blt(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLT | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_blt(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSGE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bge(rd, RV_REG_T1, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSLE | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		emit(rv_bge(RV_REG_T1, rd, rvoff >> 1), ctx);
+		break;
+	case BPF_JMP | BPF_JSET | BPF_K:
+		rvoff = rv_offset(i + off, i, ctx);
+		if (!is_13b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, (int)rvoff);
+			return -1;
+		}
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T2, imm, ctx);
+		emit(rv_and(RV_REG_T1, rd, RV_REG_T2), ctx);
+		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* function call */
+	case BPF_JMP | BPF_CALL:
+	{
+		bool fixed;
+		int i, ret;
+		u64 addr;
+
+		seen_call(ctx);
+		ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &addr,
+					    &fixed);
+		if (ret < 0)
+			return ret;
+		if (fixed) {
+			emit_imm(RV_REG_T1, addr, ctx);
+		} else {
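+			/*
+			 * The address is not final in this pass, so pad
+			 * the immediate load with nops up to a fixed
+			 * length; the extra pass can then patch in the
+			 * real address without changing any offsets.
+			 */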
+			i = ctx->ninsns;
+			emit_imm(RV_REG_T1, addr, ctx);
+			for (i = ctx->ninsns - i; i < 8; i++) {
+				/* nop */
+				emit(rv_addi(RV_REG_ZERO, RV_REG_ZERO, 0),
+				     ctx);
+			}
+		}
+		emit(rv_jalr(RV_REG_RA, RV_REG_T1, 0), ctx);
+		rd = bpf_to_rv_reg(BPF_REG_0, ctx);
+		emit(rv_addi(rd, RV_REG_A0, 0), ctx);
+		break;
+	}
+	/* tail call */
+	case BPF_JMP | BPF_TAIL_CALL:
+		rd = bpf_to_rv_reg(TAIL_CALL_REG, ctx);
+		pr_err("bpf-jit: tail call not supported yet!\n");
+		return -1;
+
+	/* function return */
+	case BPF_JMP | BPF_EXIT:
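+		/* The epilogue follows directly after the last insn. */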
+		if (i == ctx->prog->len - 1)
+			break;
+
+		rvoff = epilogue_offset(ctx);
+		if (!is_21b_int(rvoff)) {
+			pr_err("bpf-jit: %d offset=%d not supported yet!\n",
+			       __LINE__, rvoff);
+			return -1;
+		}
+
+		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		break;
+
+	/* dst = imm64 */
+	case BPF_LD | BPF_IMM | BPF_DW:
+	{
+		struct bpf_insn insn1 = insn[1];
+		u64 imm64;
+
+		imm64 = (u64)insn1.imm << 32 | (u32)imm;
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(rd, imm64, ctx);
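+		/*
+		 * A 64-bit immediate occupies two BPF instructions;
+		 * returning 1 makes build_body() skip the second one.
+		 */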
+		return 1;
+	}
+
+	/* LDX: dst = *(size *)(src + off) */
+	case BPF_LDX | BPF_MEM | BPF_B:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lbu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lbu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_H:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lhu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lhu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_W:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_lwu(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_lwu(rd, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_LDX | BPF_MEM | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_ld(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+		emit(rv_ld(rd, 0, RV_REG_T1), ctx);
+		break;
+
+	/* ST: *(size *)(dst + off) = imm */
+	case BPF_ST | BPF_MEM | BPF_B:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sb(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+
+	case BPF_ST | BPF_MEM | BPF_H:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sh(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_ST | BPF_MEM | BPF_W:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sw(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sw(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+	case BPF_ST | BPF_MEM | BPF_DW:
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		emit_imm(RV_REG_T1, imm, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sd(rd, off, RV_REG_T1), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T2, off, ctx);
+		emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+		emit(rv_sd(RV_REG_T2, 0, RV_REG_T1), ctx);
+		break;
+
+	/* STX: *(size *)(dst + off) = src */
+	case BPF_STX | BPF_MEM | BPF_B:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sb(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sb(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_H:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sh(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sh(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_W:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sw(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sw(RV_REG_T1, 0, rs), ctx);
+		break;
+	case BPF_STX | BPF_MEM | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (is_12b_int(off)) {
+			emit(rv_sd(rd, off, rs), ctx);
+			break;
+		}
+
+		emit_imm(RV_REG_T1, off, ctx);
+		emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+		emit(rv_sd(RV_REG_T1, 0, rs), ctx);
+		break;
+	/* STX XADD: lock *(u32 *)(dst + off) += src */
+	case BPF_STX | BPF_XADD | BPF_W:
+	/* STX XADD: lock *(u64 *)(dst + off) += src */
+	case BPF_STX | BPF_XADD | BPF_DW:
+		rs = bpf_to_rv_reg(insn->src_reg, ctx);
+		rd = bpf_to_rv_reg(insn->dst_reg, ctx);
+		if (off) {
+			if (is_12b_int(off)) {
+				emit(rv_addi(RV_REG_T1, rd, off), ctx);
+			} else {
+				emit_imm(RV_REG_T1, off, ctx);
+				emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+			}
+
+			rd = RV_REG_T1;
+		}
+
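+		/* The fetched old value is not needed; discard it into x0. */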
+		emit(BPF_SIZE(code) == BPF_W ?
+		     rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0) :
+		     rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0), ctx);
+		break;
+	default:
+		pr_err("bpf-jit: unknown opcode %02x\n", code);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void build_prologue(struct rv_jit_context *ctx)
+{
+	int stack_adjust = 0, store_offset, bpf_stack_adjust;
+
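+	/* Only spill the callee-saved registers that are actually used. */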
+	if (seen_reg(RV_REG_RA, ctx))
+		stack_adjust += 8;
+	stack_adjust += 8; /* RV_REG_FP */
+	if (seen_reg(RV_REG_S1, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S2, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S3, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S4, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S5, ctx))
+		stack_adjust += 8;
+	if (seen_reg(RV_REG_S6, ctx))
+		stack_adjust += 8;
+
+	stack_adjust = round_up(stack_adjust, 16);
+	bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16);
+	stack_adjust += bpf_stack_adjust;
+
+	store_offset = stack_adjust - 8;
+
+	emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx);
+
+	if (seen_reg(RV_REG_RA, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_RA), ctx);
+		store_offset -= 8;
+	}
+	emit(rv_sd(RV_REG_SP, store_offset, RV_REG_FP), ctx);
+	store_offset -= 8;
+	if (seen_reg(RV_REG_S1, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S1), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S2, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S2), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S3, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S3), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S4, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S4), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S5, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S5), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S6, ctx)) {
+		emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S6), ctx);
+		store_offset -= 8;
+	}
+
+	emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx);
+
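+	/*
+	 * If the program uses the stack, point BPF_REG_FP (s5) at the
+	 * top of the program's stack area.
+	 */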
+	if (bpf_stack_adjust) {
+		if (!seen_reg(RV_REG_S5, ctx))
+			pr_warn("bpf-jit: BPF_REG_FP not seen, but stack depth is %d\n",
+				bpf_stack_adjust);
+		emit(rv_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust), ctx);
+	}
+
+	ctx->stack_size = stack_adjust;
+}
+
+static void build_epilogue(struct rv_jit_context *ctx)
+{
+	int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8;
+
+	if (seen_reg(RV_REG_RA, ctx)) {
+		emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx);
+	store_offset -= 8;
+	if (seen_reg(RV_REG_S1, ctx)) {
+		emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S2, ctx)) {
+		emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S3, ctx)) {
+		emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S4, ctx)) {
+		emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S5, ctx)) {
+		emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+	if (seen_reg(RV_REG_S6, ctx)) {
+		emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx);
+		store_offset -= 8;
+	}
+
+	emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx);
+	/* Set return value. */
+	emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx);
+	emit(rv_jalr(RV_REG_ZERO, RV_REG_RA, 0), ctx);
+}
+
+static int build_body(struct rv_jit_context *ctx, bool extra_pass)
+{
+	const struct bpf_prog *prog = ctx->prog;
+	int i;
+
+	for (i = 0; i < prog->len; i++) {
+		const struct bpf_insn *insn = &prog->insnsi[i];
+		int ret;
+
+		ret = emit_insn(insn, ctx, extra_pass);
+		if (ret > 0) {
+			i++;
+			if (ctx->insns == NULL)
+				ctx->offset[i] = ctx->ninsns;
+			continue;
+		}
+		if (ctx->insns == NULL)
+			ctx->offset[i] = ctx->ninsns;
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static void bpf_fill_ill_insns(void *area, unsigned int size)
+{
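+	/* The all-zeroes pattern is a defined illegal instruction on RISC-V. */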
+	memset(area, 0, size);
+}
+
+static void bpf_flush_icache(void *start, void *end)
+{
+	flush_icache_range((unsigned long)start, (unsigned long)end);
+}
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	bool tmp_blinded = false, extra_pass = false;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	struct rv_jit_data *jit_data;
+	struct rv_jit_context *ctx;
+	unsigned int image_size;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	jit_data = prog->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			prog = orig_prog;
+			goto out;
+		}
+		prog->aux->jit_data = jit_data;
+	}
+
+	ctx = &jit_data->ctx;
+
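+	/*
+	 * An already populated ctx->offset means that this is the extra
+	 * pass, which re-emits the image with the final call addresses.
+	 */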
+	if (ctx->offset) {
+		extra_pass = true;
+		image_size = sizeof(u32) * ctx->ninsns;
+		goto skip_init_ctx;
+	}
+
+	ctx->prog = prog;
+	ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
+	if (!ctx->offset) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+
+	/* First pass generates the ctx->offset, but does not emit an image. */
+	if (build_body(ctx, extra_pass)) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+	build_prologue(ctx);
+	ctx->epilogue_offset = ctx->ninsns;
+	build_epilogue(ctx);
+
+	/* Allocate image, now that we know the size. */
+	image_size = sizeof(u32) * ctx->ninsns;
+	jit_data->header = bpf_jit_binary_alloc(image_size, &jit_data->image,
+						sizeof(u32),
+						bpf_fill_ill_insns);
+	if (!jit_data->header) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+
+	/* Second, real pass, which actually emits the image. */
+	ctx->insns = (u32 *)jit_data->image;
+skip_init_ctx:
+	ctx->ninsns = 0;
+
+	build_prologue(ctx);
+	if (build_body(ctx, extra_pass)) {
+		bpf_jit_binary_free(jit_data->header);
+		prog = orig_prog;
+		goto out_offset;
+	}
+	build_epilogue(ctx);
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, image_size, 2, ctx->insns);
+
+	prog->bpf_func = (void *)ctx->insns;
+	prog->jited = 1;
+	prog->jited_len = image_size;
+
+	bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
+
+	if (!prog->is_func || extra_pass) {
+out_offset:
+		kfree(ctx->offset);
+		kfree(jit_data);
+		prog->aux->jit_data = NULL;
+	}
+out:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
 	return prog;
 }
-- 
2.19.1



Thread overview: 44+ messages
2019-01-15  8:35 [RFC PATCH 0/3] RV64G eBPF JIT Björn Töpel
2019-01-15  8:35 ` [RFC PATCH 1/3] riscv: set HAVE_EFFICIENT_UNALIGNED_ACCESS Björn Töpel
2019-01-15 15:39   ` Christoph Hellwig
2019-01-15 16:06     ` Björn Töpel
2019-01-25 20:21       ` Palmer Dabbelt
2019-01-26  1:33         ` Jim Wilson
2019-01-29  2:43           ` Palmer Dabbelt
2019-01-15  8:35 ` [RFC PATCH 2/3] riscv: add build infra for JIT compiler Björn Töpel
2019-01-15 15:43   ` Christoph Hellwig
2019-01-15 16:09     ` Björn Töpel
2019-01-15  8:35 ` [RFC PATCH 3/3] bpf, riscv: added eBPF JIT for RV64G Björn Töpel [this message]
2019-01-15 23:49   ` Daniel Borkmann
2019-01-16  7:23     ` Björn Töpel
2019-01-16 15:41       ` Daniel Borkmann
2019-01-16 19:06         ` Björn Töpel
2019-01-15 15:40 ` [RFC PATCH 0/3] RV64G eBPF JIT Christoph Hellwig
2019-01-15 16:03   ` Björn Töpel
2019-01-25 19:02     ` Palmer Dabbelt
2019-01-25 19:54 ` Paul Walmsley
2019-01-27 12:28   ` Björn Töpel
2019-01-30  2:02 ` Palmer Dabbelt
