All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19  8:40 ` Mircea Gherzan
  0 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-19  8:40 UTC (permalink / raw)
  To: linux-arm-kernel; +Cc: mgherzan, netdev

Based of Matt Evans's PPC64 implementation.

Supports only ARM mode with EABI.

Supports both little and big endian. Depends on the support for
unaligned loads on ARMv7. Does not support all the BPF opcodes
that deal with ancillary data. The scratch memory of the filter
lives on the stack.

Enabled in the same way as for x86-64 and PPC64:

	echo 1 > /proc/sys/net/core/bpf_jit_enable

A value greater than 1 enables opcode output.

Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
---

Changes in v2:
 * enable the compiler ony for ARMv5+ because of the BLX instruction
 * use the same comparison for the ARM version checks
 * use misaligned accesses on ARMv6
 * fix the SEEN_MEM
 * fix the mem_words_used()

 arch/arm/Kconfig          |    1 +
 arch/arm/Makefile         |    1 +
 arch/arm/net/Makefile     |    3 +
 arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/net/bpf_jit_32.h |  174 ++++++++++
 5 files changed, 1017 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/net/Makefile
 create mode 100644 arch/arm/net/bpf_jit_32.c
 create mode 100644 arch/arm/net/bpf_jit_32.h

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index abba5b8..ea65c41 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -30,6 +30,7 @@ config ARM
 	select HAVE_SPARSE_IRQ
 	select GENERIC_IRQ_SHOW
 	select CPU_PM if (SUSPEND || CPU_IDLE)
+	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index dfcf3b0..8810a10 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -255,6 +255,7 @@ core-$(CONFIG_VFP)		+= arch/arm/vfp/
 
 # If we have a machine-specific directory, then include it in the build.
 core-y				+= arch/arm/kernel/ arch/arm/mm/ arch/arm/common/
+core-y				+= arch/arm/net/
 core-y				+= $(machdirs) $(platdirs)
 
 drivers-$(CONFIG_OPROFILE)      += arch/arm/oprofile/
diff --git a/arch/arm/net/Makefile b/arch/arm/net/Makefile
new file mode 100644
index 0000000..c2c1084
--- /dev/null
+++ b/arch/arm/net/Makefile
@@ -0,0 +1,3 @@
+# ARM-specific networking code
+
+obj-$(CONFIG_BPF_JIT) += bpf_jit_32.o
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
new file mode 100644
index 0000000..d2b86fa
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.c
@@ -0,0 +1,838 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/filter.h>
+#include <linux/moduleloader.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+
+#include "bpf_jit_32.h"
+
+/* BLX is available on ARMv5 or higher */
+#if __LINUX_ARM_ARCH__ < 5
+# error "The BPF JIT compiler is only available on ARMv5 or higher!"
+#endif
+
+/*
+ * ABI:
+ *
+ * r0	scratch register
+ * r4	BPF register A
+ * r5	BPF register X
+ * r6	pointer to the skb
+ * r7	skb->data
+ * r8	skb_headlen(skb)
+ */
+
+#define r_scratch	ARM_R0
+/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */
+#define r_off		ARM_R1
+#define r_A		ARM_R4
+#define r_X		ARM_R5
+#define r_skb		ARM_R6
+#define r_skb_data	ARM_R7
+#define r_skb_hl	ARM_R8
+
+#define SCRATCH_SP_OFFSET	0
+#define SCRATCH_OFF(k)		(SCRATCH_SP_OFFSET + (k))
+
+#define SEEN_MEM		((1 << BPF_MEMWORDS) - 1)
+#define SEEN_MEM_WORD(k)	(1 << (k))
+#define SEEN_X			(1 << BPF_MEMWORDS)
+#define SEEN_CALL		(1 << (BPF_MEMWORDS + 1))
+#define SEEN_DATA		(1 << (BPF_MEMWORDS + 2))
+#define SEEN_LEN		(1 << (BPF_MEMWORDS + 3))
+
+struct jit_ctx {
+	const struct sk_filter *skf;
+	unsigned idx;
+	unsigned prologue_bytes;
+	int ret0_fp_idx;
+	u32 seen;
+	u32 *offsets;
+	u32 *target;
+#if __LINUX_ARM_ARCH__ < 7
+	u16 epilogue_bytes;
+	u16 imm_count;
+	u32 *imms;
+#endif
+};
+
+int bpf_jit_enable __read_mostly;
+
+static u8 jit_get_skb_b(struct sk_buff *skb, unsigned offset)
+{
+	u8 ret;
+	skb_copy_bits(skb, offset, &ret, 1);
+	return ret;
+}
+
+static u16 jit_get_skb_h(struct sk_buff *skb, unsigned offset)
+{
+	u16 ret;
+	skb_copy_bits(skb, offset, &ret, 2);
+	return ntohs(ret);
+}
+
+static u32 jit_get_skb_w(struct sk_buff *skb, unsigned offset)
+{
+	u32 ret;
+	skb_copy_bits(skb, offset, &ret, 4);
+	return ntohl(ret);
+}
+
+static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
+{
+	if (ctx->target != NULL)
+		ctx->target[ctx->idx] = inst | (cond << 28);
+
+	ctx->idx++;
+}
+
+/*
+ * Emit an instruction that will be executed unconditionally.
+ */
+static inline void emit(u32 inst, struct jit_ctx *ctx)
+{
+	_emit(ARM_COND_AL, inst, ctx);
+}
+
+static u16 saved_regs(struct jit_ctx *ctx)
+{
+	u16 ret = 0;
+
+	if (ctx->skf->len > 1)
+		ret |= 1 << r_A;
+
+#ifdef CONFIG_FRAME_POINTER
+	ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC);
+#else
+	if (ctx->seen & SEEN_CALL)
+		ret |= 1 << ARM_LR;
+#endif
+	if (ctx->seen & (SEEN_DATA | SEEN_LEN))
+		ret |= 1 << r_skb;
+	if (ctx->seen & SEEN_DATA)
+		ret |= (1 << r_skb_data) | (1 << r_skb_hl);
+	if (ctx->seen & SEEN_X)
+		ret |= 1 << r_X;
+
+	return ret;
+}
+
+static inline int mem_words_used(struct jit_ctx *ctx)
+{
+	u32 words = ctx->seen & SEEN_MEM;
+	/* yes, we do waste some stack space IF there are "holes" in the set" */
+	return 32 - __builtin_clz(words);
+}
+
+static inline bool is_load_to_a(u16 inst)
+{
+	switch (inst) {
+	case BPF_S_LD_W_LEN:
+	case BPF_S_LD_W_ABS:
+	case BPF_S_LD_H_ABS:
+	case BPF_S_LD_B_ABS:
+	case BPF_S_ANC_CPU:
+	case BPF_S_ANC_IFINDEX:
+	case BPF_S_ANC_MARK:
+	case BPF_S_ANC_PROTOCOL:
+	case BPF_S_ANC_RXHASH:
+	case BPF_S_ANC_QUEUE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void build_prologue(struct jit_ctx *ctx)
+{
+	u16 reg_set = saved_regs(ctx);
+	u16 first_inst = ctx->skf->insns[0].code;
+	u16 off;
+
+#ifdef CONFIG_FRAME_POINTER
+	emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
+	emit(ARM_PUSH(reg_set), ctx);
+	emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
+#else
+	if (reg_set)
+		emit(ARM_PUSH(reg_set), ctx);
+#endif
+
+	if (ctx->seen & (SEEN_DATA | SEEN_LEN))
+		emit(ARM_MOV_R(r_skb, ARM_R0), ctx);
+
+	if (ctx->seen & SEEN_DATA) {
+		off = offsetof(struct sk_buff, data);
+		emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx);
+		/* headlen = len - data_len */
+		off = offsetof(struct sk_buff, len);
+		emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx);
+		off = offsetof(struct sk_buff, data_len);
+		emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+		emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx);
+	}
+
+	if (ctx->seen & SEEN_X)
+		emit(ARM_MOV_I(r_X, 0), ctx);
+
+	/* do not leak kernel data to userspace */
+	if ((first_inst != BPF_S_RET_K) && !(is_load_to_a(first_inst)))
+		emit(ARM_MOV_I(r_A, 0), ctx);
+
+	/* stack space for the BPF_MEM words */
+	if (ctx->seen & SEEN_MEM)
+		emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+}
+
+static void build_epilogue(struct jit_ctx *ctx)
+{
+	u16 reg_set = saved_regs(ctx);
+
+	if (ctx->seen & SEEN_MEM)
+		emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+
+	reg_set &= ~(1 << ARM_LR);
+
+#ifdef CONFIG_FRAME_POINTER
+	/* the first instruction of the prologue was: mov ip, sp */
+	reg_set &= ~(1 << ARM_IP);
+	reg_set |= (1 << ARM_SP);
+	emit(ARM_LDM(ARM_SP, reg_set), ctx);
+#else
+	if (ctx->seen)
+		emit(ARM_POP(reg_set | (1 << ARM_PC)), ctx);
+	else
+		emit(ARM_BX(ARM_LR), ctx);
+#endif
+}
+
+static int16_t imm8m(u32 x)
+{
+	u32 rot;
+
+	for (rot = 0; rot < 16; rot++)
+		if ((x & ~ror32(0xff, 2 * rot)) == 0)
+			return rol32(x, 2 * rot) | (rot << 8);
+
+	return -1;
+}
+
+#if __LINUX_ARM_ARCH__ < 7
+
+static u16 imm_offset(u32 k, struct jit_ctx *ctx)
+{
+	unsigned i = 0, offset;
+	u16 imm;
+
+	/* on the "fake" run we just count them (duplicates included) */
+	if (ctx->target == NULL) {
+		ctx->imm_count++;
+		return 0;
+	}
+
+	while ((i < ctx->imm_count) && ctx->imms[i]) {
+		if (ctx->imms[i] == k)
+			break;
+		i++;
+	}
+
+	if (ctx->imms[i] == 0)
+		ctx->imms[i] = k;
+
+	/* constants go just after the epilogue */
+	offset =  ctx->offsets[ctx->skf->len];
+	offset += ctx->prologue_bytes;
+	offset += ctx->epilogue_bytes;
+	offset += i * 4;
+
+	ctx->target[offset / 4] = k;
+
+	/* PC in ARM mode == address of the instruction + 8 */
+	imm = offset - (8 + ctx->idx * 4);
+
+	return imm;
+}
+
+#endif /* __LINUX_ARM_ARCH__ */
+
+/*
+ * Move an immediate that's not an imm8m to a core register.
+ */
+static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ < 7
+	emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
+#else
+	emit(ARM_MOVW(rd, val & 0xffff), ctx);
+	if (val > 0xffff)
+		emit(ARM_MOVT(rd, val >> 16), ctx);
+#endif
+}
+
+static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx)
+{
+	int imm12 = imm8m(val);
+
+	if (imm12 >= 0)
+		emit(ARM_MOV_I(rd, imm12), ctx);
+	else
+		emit_mov_i_no8m(rd, val, ctx);
+}
+
+#if __LINUX_ARM_ARCH__ < 6
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx);
+	_emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx);
+	_emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx);
+	_emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx);
+	_emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx);
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx);
+	_emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx);
+}
+
+static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx)
+{
+	emit(ARM_LSL_R(ARM_R1, r_src, 8), ctx);
+	emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSL, 8), ctx);
+	emit(ARM_LSL_I(r_dst, r_dst, 8), ctx);
+	emit(ARM_LSL_R(r_dst, r_dst, 8), ctx);
+}
+
+#else  /* ARMv6+ */
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+	_emit(cond, ARM_REV(r_res, r_res), ctx);
+#endif
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+	_emit(cond, ARM_REV16(r_res, r_res), ctx);
+#endif
+}
+
+static inline void emit_swap16(u8 r_dst __maybe_unused,
+			       u8 r_src __maybe_unused,
+			       struct jit_ctx *ctx __maybe_unused)
+{
+#ifdef __LITTLE_ENDIAN
+	emit(ARM_REV16(r_dst, r_src), ctx);
+#endif
+}
+
+#endif /* __LINUX_ARM_ARCH__ < 6 */
+
+
+/* Compute the immediate value for a PC-relative branch. */
+static inline u32 b_imm(unsigned tgt, struct jit_ctx *ctx)
+{
+	u32 imm;
+
+	if (ctx->target == NULL)
+		return 0;
+	/*
+	 * BPF allows only forward jumps and the offset of the target is
+	 * still the one computed during the first pass.
+	 */
+	imm  = ctx->offsets[tgt] + ctx->prologue_bytes - (ctx->idx * 4 + 8);
+
+	return imm >> 2;
+}
+
+#define OP_IMM3(op, r1, r2, imm_val, ctx)				\
+	do {								\
+		imm12 = imm8m(imm_val);					\
+		if (imm12 < 0) {					\
+			emit_mov_i_no8m(r_scratch, imm_val, ctx);	\
+			emit(op ## _R((r1), (r2), r_scratch), ctx);	\
+		} else {						\
+			emit(op ## _I((r1), (r2), imm12), ctx);		\
+		}							\
+	} while (0)
+
+static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx)
+{
+	if (ctx->ret0_fp_idx >= 0) {
+		_emit(cond, ARM_B(b_imm(ctx->ret0_fp_idx, ctx)), ctx);
+		/* NOP to keep the size constant between passes */
+		emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx);
+	} else {
+		_emit(cond, ARM_MOV_I(ARM_R0, 0), ctx);
+		_emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx);
+	}
+}
+
+static int build_body(struct jit_ctx *ctx)
+{
+	void  *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w};
+	const struct sk_filter *prog = ctx->skf;
+	const struct sock_filter *inst;
+	unsigned i, load_order, off, condt, condf;
+	int imm12;
+	u32 k;
+
+	for (i = 0; i < prog->len; i++) {
+		inst = &(prog->insns[i]);
+		/* K as an immediate value operand */
+		k = inst->k;
+
+		/* compute offsets only in the fake pass */
+		if (ctx->target == NULL)
+			ctx->offsets[i] = ctx->idx * 4;
+
+		switch (inst->code) {
+		case BPF_S_LD_IMM:
+			emit_mov_i(r_A, k, ctx);
+			break;
+		case BPF_S_LD_W_LEN:
+			ctx->seen |= SEEN_LEN;
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+			emit(ARM_LDR_I(r_A, r_skb,
+				       offsetof(struct sk_buff, len)), ctx);
+			break;
+		case BPF_S_LD_MEM:
+			/* A = scratch[k] */
+			ctx->seen |= SEEN_MEM_WORD(k);
+			emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_LD_W_ABS:
+			load_order = 2;
+			goto load;
+		case BPF_S_LD_H_ABS:
+			load_order = 1;
+			goto load;
+		case BPF_S_LD_B_ABS:
+			load_order = 0;
+load:
+			emit_mov_i(r_off, k, ctx);
+load_common:
+			ctx->seen |= SEEN_DATA | SEEN_CALL;
+
+			if (load_order > 0) {
+				emit(ARM_SUB_I(r_scratch, r_skb_hl,
+					       1 << load_order), ctx);
+				emit(ARM_CMP_R(r_scratch, r_off), ctx);
+				condt = ARM_COND_HS;
+			} else {
+				emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+				condt = ARM_COND_HI;
+			}
+
+			_emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data),
+			      ctx);
+
+			if (load_order == 0)
+				_emit(condt, ARM_LDRB_I(r_A, r_scratch, 0),
+				      ctx);
+			else if (load_order == 1)
+				emit_load_be16(condt, r_A, r_scratch, ctx);
+			else if (load_order == 2)
+				emit_load_be32(condt, r_A, r_scratch, ctx);
+
+			_emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx);
+
+			/* the slowpath */
+			emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx);
+			emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+			/* the offset is already in R1 */
+			emit(ARM_BLX_R(ARM_R3), ctx);
+			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+			break;
+		case BPF_S_LD_W_IND:
+			load_order = 2;
+			goto load_ind;
+		case BPF_S_LD_H_IND:
+			load_order = 1;
+			goto load_ind;
+		case BPF_S_LD_B_IND:
+			load_order = 0;
+load_ind:
+			OP_IMM3(ARM_ADD, r_off, r_X, k, ctx);
+			goto load_common;
+		case BPF_S_LDX_IMM:
+			ctx->seen |= SEEN_X;
+			emit_mov_i(r_X, k, ctx);
+			break;
+		case BPF_S_LDX_W_LEN:
+			ctx->seen |= SEEN_X;
+			emit(ARM_LDR_I(r_X, r_skb,
+				       offsetof(struct sk_buff, len)), ctx);
+			break;
+		case BPF_S_LDX_MEM:
+			ctx->seen |= SEEN_X | SEEN_MEM_WORD(k);
+			emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_LDX_B_MSH:
+			/* x = ((*(frame + k)) & 0xf) << 2; */
+			ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL;
+			/* offset in r1: we might have to take the slow path */
+			emit_mov_i(r_off, k, ctx);
+			emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+
+			/* load in r0: common with the slowpath */
+			_emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data,
+						      ARM_R1), ctx);
+			_emit(ARM_COND_HI, ARM_B((3 * 4 - 8) >> 2), ctx);
+
+			emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+			/* r_off is r1 */
+			emit(ARM_MOV_I(ARM_R3, (u32)jit_get_skb_b), ctx);
+			emit(ARM_BLX_R(ARM_R3), ctx);
+
+			emit(ARM_ORR_I(r_X, ARM_R0, 0x00f), ctx);
+			emit(ARM_LSL_I(r_X, r_X, 2), ctx);
+			break;
+		case BPF_S_ST:
+			ctx->seen |= SEEN_MEM_WORD(k);
+			emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_STX:
+			ctx->seen |= SEEN_MEM_WORD(k) | SEEN_X;
+			emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_ALU_ADD_K:
+			/* A += K */
+			OP_IMM3(ARM_ADD, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_ADD_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_ADD_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_SUB_K:
+			/* A -= K */
+			OP_IMM3(ARM_SUB, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_SUB_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_SUB_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_MUL_K:
+			/* A *= K */
+			emit_mov_i(r_scratch, k, ctx);
+			emit(ARM_MUL(r_A, r_A, r_scratch), ctx);
+			break;
+		case BPF_S_ALU_MUL_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_MUL(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_DIV_K:
+			/* K is not zero, it was previously checked */
+			emit_mov_i(ARM_R1, k, ctx);
+			goto div;
+		case BPF_S_ALU_DIV_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_CMP_I(r_X, 0), ctx);
+			emit_err_ret(ARM_COND_EQ, ctx);
+			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
+div:
+			ctx->seen |= SEEN_CALL;
+
+			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
+			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
+			emit(ARM_BLX_R(r_scratch), ctx);
+			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+			break;
+		case BPF_S_ALU_OR_K:
+			/* A |= K */
+			OP_IMM3(ARM_ORR, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_OR_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_ORR_I(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_AND_K:
+			/* A &= K */
+			OP_IMM3(ARM_AND, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_AND_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_AND_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_LSH_K:
+			if (unlikely(k > 31))
+				return -1;
+			emit(ARM_LSL_I(r_A, r_A, k), ctx);
+			break;
+		case BPF_S_ALU_LSH_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_LSL_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_RSH_K:
+			if (unlikely(k > 31))
+				return -1;
+			emit(ARM_LSR_I(r_A, r_A, k), ctx);
+			break;
+		case BPF_S_ALU_RSH_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_LSR_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_JMP_JA:
+			/* pc += K */
+			emit(ARM_B(b_imm(i + k, ctx)), ctx);
+			break;
+		case BPF_S_JMP_JEQ_K:
+			/* pc += (A == K) ? pc->jt : pc->jf */
+			condt   = ARM_COND_EQ;
+			condf  = ARM_COND_NE;
+			goto cmp_imm;
+		case BPF_S_JMP_JGT_K:
+			/* pc += (A > K) ? pc->jt : pc->jf */
+			condt   = ARM_COND_HI;
+			condf  = ARM_COND_LS;
+			goto cmp_imm;
+		case BPF_S_JMP_JGE_K:
+			/* pc += (A >= K) ? pc->jt : pc->jf */
+			condt   = ARM_COND_HI;
+			condf  = ARM_COND_LS;
+			/* (x >= y) IFF (x > y - 1) */
+			if (unlikely(k == 0))
+				return -1;
+			k--;
+cmp_imm:
+			imm12 = imm8m(k);
+			if (imm12 < 0) {
+				emit_mov_i_no8m(r_scratch, k, ctx);
+				emit(ARM_CMP_R(r_A, r_scratch), ctx);
+			} else {
+				emit(ARM_CMP_I(r_A, imm12), ctx);
+			}
+cond_jump:
+			if (inst->jt)
+				_emit(condt, ARM_B(b_imm(i + inst->jt + 1,
+						   ctx)), ctx);
+			if (inst->jf)
+				_emit(condf, ARM_B(b_imm(i + inst->jf + 1,
+						   ctx)), ctx);
+			break;
+		case BPF_S_JMP_JEQ_X:
+			/* pc += (A == X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_EQ;
+			condf  = ARM_COND_NE;
+			goto cmp_x;
+		case BPF_S_JMP_JGT_X:
+			/* pc += (A > X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_HI;
+			condf  = ARM_COND_LS;
+			goto cmp_x;
+		case BPF_S_JMP_JGE_X:
+			/* pc += (A >= X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_CS;
+			condf  = ARM_COND_CC;
+cmp_x:
+			ctx->seen |= SEEN_X;
+			emit(ARM_CMP_R(r_A, r_X), ctx);
+			goto cond_jump;
+		case BPF_S_JMP_JSET_K:
+			/* pc += (A & K) ? pc->jt : pc->jf */
+			condt  = ARM_COND_NE;
+			/* not set iff all zeroes iff Z==1 iff EQ */
+			condf  = ARM_COND_EQ;
+
+			imm12 = imm8m(k);
+			if (imm12 < 0) {
+				emit_mov_i_no8m(r_scratch, k, ctx);
+				emit(ARM_TST_R(r_A, r_scratch), ctx);
+			} else {
+				emit(ARM_TST_I(r_A, imm12), ctx);
+			}
+			goto cond_jump;
+		case BPF_S_JMP_JSET_X:
+			/* pc += (A & X) ? pc->jt : pc->jf */
+			condt  = ARM_COND_NE;
+			condf  = ARM_COND_EQ;
+			emit(ARM_TST_R(r_A, r_X), ctx);
+			goto cond_jump;
+		case BPF_S_RET_A:
+			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
+			goto b_epilogue;
+		case BPF_S_RET_K:
+			if ((k == 0) && (ctx->ret0_fp_idx < 0))
+				ctx->ret0_fp_idx = i;
+			emit_mov_i(ARM_R0, k, ctx);
+b_epilogue:
+			if (i != ctx->skf->len - 1)
+				emit(ARM_B(b_imm(prog->len, ctx)), ctx);
+			break;
+		case BPF_S_MISC_TAX:
+			/* X = A */
+			emit(ARM_MOV_R(r_X, r_A), ctx);
+			break;
+		case BPF_S_MISC_TXA:
+			/* A = X */
+			emit(ARM_MOV_R(r_A, r_X), ctx);
+			break;
+		case BPF_S_ANC_PROTOCOL:
+			/* A = ntohs(skb->protocol) */
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+						  protocol) != 2);
+			off = offsetof(struct sk_buff, protocol);
+			emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx);
+			emit_swap16(r_A, r_scratch, ctx);
+			break;
+		case BPF_S_ANC_CPU:
+			/* A = current_thread_info()->cpu */
+			emit_mov_i(r_scratch, (~(THREAD_SIZE - 1)), ctx);
+			emit(ARM_ADD_R(r_scratch, ARM_SP, r_scratch), ctx);
+
+			BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4);
+			off = offsetof(struct thread_info, cpu);
+			emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+			break;
+		case BPF_S_ANC_IFINDEX:
+			/* A = skb->dev->ifindex */
+			off = offsetof(struct sk_buff, dev);
+			emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+
+			emit(ARM_CMP_I(r_scratch, 0), ctx);
+			emit_err_ret(ARM_COND_EQ, ctx);
+
+			BUILD_BUG_ON(FIELD_SIZEOF(struct net_device,
+						  ifindex) != 4);
+			off = offsetof(struct net_device, ifindex);
+			emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+			break;
+		case BPF_S_ANC_MARK:
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+			off = offsetof(struct sk_buff, mark);
+			emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+			break;
+		case BPF_S_ANC_RXHASH:
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
+			off = offsetof(struct sk_buff, rxhash);
+			emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+			break;
+		case BPF_S_ANC_QUEUE:
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+						  queue_mapping) != 2);
+			BUILD_BUG_ON(offsetof(struct sk_buff,
+					      queue_mapping) > 0xff);
+			off = offsetof(struct sk_buff, queue_mapping);
+			emit(ARM_LDRH_I(r_A, r_skb, off), ctx);
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	/* compute offsets only during the first pass */
+	if (ctx->target == NULL)
+		ctx->offsets[i] = ctx->idx * 4;
+
+	return 0;
+}
+
+
+void bpf_jit_compile(struct sk_filter *fp)
+{
+	struct jit_ctx ctx;
+	unsigned tmp_idx;
+	unsigned alloc_size;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.skf		= fp;
+	ctx.ret0_fp_idx = -1;
+
+	ctx.offsets = kzalloc(GFP_KERNEL, 4 * (ctx.skf->len + 1));
+	if (ctx.offsets == NULL)
+		return;
+
+	/* fake pass to fill in the ctx->seen */
+	if (unlikely(build_body(&ctx)))
+		goto out;
+
+	tmp_idx = ctx.idx;
+	build_prologue(&ctx);
+	ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;
+
+#if __LINUX_ARM_ARCH__ < 7
+	tmp_idx = ctx.idx;
+	build_epilogue(&ctx);
+	ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;
+
+	ctx.idx += ctx.imm_count;
+	if (ctx.imm_count) {
+		ctx.imms = kzalloc(GFP_KERNEL, 4 * ctx.imm_count);
+		if (ctx.imms == NULL)
+			goto out;
+	}
+#else
+	/* there's nothing after the epilogue on ARMv7 */
+	build_epilogue(&ctx);
+#endif
+
+	alloc_size = 4 * ctx.idx;
+	ctx.target = module_alloc(alloc_size > sizeof(struct work_struct) ?
+				  alloc_size : sizeof(struct work_struct));
+	if (unlikely(ctx.target == NULL))
+		goto out;
+
+	ctx.idx = 0;
+	build_prologue(&ctx);
+	build_body(&ctx);
+	build_epilogue(&ctx);
+
+	flush_icache_range((u32)ctx.target, (u32)(ctx.target + ctx.idx));
+
+#if __LINUX_ARM_ARCH__ < 7
+	if (ctx.imm_count)
+		kfree(ctx.imms);
+#endif
+
+	if (bpf_jit_enable > 1)
+		print_hex_dump(KERN_INFO, "BPF JIT code: ",
+			       DUMP_PREFIX_ADDRESS, 16, 4, ctx.target,
+			       alloc_size, false);
+
+	fp->bpf_func = (void *)ctx.target;
+
+out:
+	kfree(ctx.offsets);
+	return;
+}
+
+static void bpf_jit_free_worker(struct work_struct *work)
+{
+	module_free(NULL, work);
+}
+
+void bpf_jit_free(struct sk_filter *fp)
+{
+	struct work_struct *work;
+
+	if (fp->bpf_func != sk_run_filter) {
+		work = (struct work_struct *)fp->bpf_func;
+
+		INIT_WORK(work, bpf_jit_free_worker);
+		schedule_work(work);
+	}
+}
+
diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h
new file mode 100644
index 0000000..de68a51
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.h
@@ -0,0 +1,174 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#ifndef PFILTER_OPCODES_ARM_H
+#define PFILTER_OPCODES_ARM_H
+
+extern void *__aeabi_uidiv;
+
+#define ARM_R0	0
+#define ARM_R1	1
+#define ARM_R2	2
+#define ARM_R3	3
+#define ARM_R4	4
+#define ARM_R5	5
+#define ARM_R6	6
+#define ARM_R7	7
+#define ARM_R8	8
+#define ARM_R9	9
+#define ARM_R10	10
+#define ARM_FP	11
+#define ARM_IP	12
+#define ARM_SP	13
+#define ARM_LR	14
+#define ARM_PC	15
+
+#define ARM_COND_EQ		0x0
+#define ARM_COND_NE		0x1
+#define ARM_COND_CS		0x2
+#define ARM_COND_HS		ARM_COND_CS
+#define ARM_COND_CC		0x3
+#define ARM_COND_LO		ARM_COND_CC
+#define ARM_COND_MI		0x4
+#define ARM_COND_PL		0x5
+#define ARM_COND_VS		0x6
+#define ARM_COND_VC		0x7
+#define ARM_COND_HI		0x8
+#define ARM_COND_LS		0x9
+#define ARM_COND_GE		0xa
+#define ARM_COND_LT		0xb
+#define ARM_COND_GT		0xc
+#define ARM_COND_LE		0xd
+#define ARM_COND_AL		0xe
+
+/* register shift types */
+#define SRTYPE_LSL		0
+#define SRTYPE_LSR		1
+#define SRTYPE_ASR		2
+#define SRTYPE_ROR		3
+
+#define ARM_INST_ADD_R		0x00800000
+#define ARM_INST_ADD_I		0x02800000
+
+#define ARM_INST_AND_R		0x00000000
+#define ARM_INST_AND_I		0x02000000
+
+#define ARM_INST_B		0x0a000000
+#define ARM_INST_BX		0x012FFF10
+#define ARM_INST_BLX_R		0x012fff30
+
+#define ARM_INST_CMP_R		0x01500000
+#define ARM_INST_CMP_I		0x03500000
+
+#define ARM_INST_LDRB_I		0x05d00000
+#define ARM_INST_LDRB_R		0x07d00000
+#define ARM_INST_LDRH_I		0x01d000b0
+#define ARM_INST_LDR_I		0x05900000
+
+#define ARM_INST_LDM		0x08900000
+
+#define ARM_INST_LSL_I		0x01a00000
+#define ARM_INST_LSL_R		0x01a00010
+
+#define ARM_INST_LSR_I		0x01a00020
+#define ARM_INST_LSR_R		0x01a00030
+
+#define ARM_INST_MOV_R		0x01a00000
+#define ARM_INST_MOV_I		0x03a00000
+#define ARM_INST_MOVW		0x03000000
+#define ARM_INST_MOVT		0x03400000
+
+#define ARM_INST_MUL		0x00000090
+
+#define ARM_INST_POP		0x08bd0000
+#define ARM_INST_PUSH		0x092d0000
+
+#define ARM_INST_REV		0x06bf0f30
+#define ARM_INST_REV16		0x06bf0fb0
+
+#define ARM_INST_ORR_R		0x01800000
+#define ARM_INST_ORR_I		0x03800000
+
+#define ARM_INST_SUB_R		0x00400000
+#define ARM_INST_SUB_I		0x02400000
+
+#define ARM_INST_STR_I		0x05800000
+
+#define ARM_INST_TST_R		0x01100000
+#define ARM_INST_TST_I		0x03100000
+
+
+/* register */
+#define _AL3_R(op, rd, rn, rm)	((op ## _R) | (rd) << 12 | (rn) << 16 | (rm))
+/* immediate */
+#define _AL3_I(op, rd, rn, imm)	((op ## _I) | (rd) << 12 | (rn) << 16 | (imm))
+
+#define ARM_ADD_R(rd, rn, rm)	_AL3_R(ARM_INST_ADD, rd, rn, rm)
+#define ARM_ADD_I(rd, rn, imm)	_AL3_I(ARM_INST_ADD, rd, rn, imm)
+
+#define ARM_AND_R(rd, rn, rm)	_AL3_R(ARM_INST_AND, rd, rn, rm)
+#define ARM_AND_I(rd, rn, imm)	_AL3_I(ARM_INST_AND, rd, rn, imm)
+
+#define ARM_B(imm24)		(ARM_INST_B | (imm24))
+#define ARM_BX(rm)		(ARM_INST_BX | (rm))
+#define ARM_BLX_R(rm)		(ARM_INST_BLX_R | (rm))
+
+#define ARM_CMP_R(rn, rm)	_AL3_R(ARM_INST_CMP, 0, rn, rm)
+#define ARM_CMP_I(rn, imm)	_AL3_I(ARM_INST_CMP, 0, rn, imm)
+
+#define ARM_LDR_I(rt, rn, off)	(ARM_INST_LDR_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+#define ARM_LDRB_I(rt, rn, off)	(ARM_INST_LDRB_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+#define ARM_LDRB_R(rt, rn, rm)	(ARM_INST_LDRB_R | (rt) << 12 | (rn) << 16 \
+				 | (rm))
+#define ARM_LDRH_I(rt, rn, off)	(ARM_INST_LDRH_I | (rt) << 12 | (rn) << 16 \
+				 | (((off) & 0xf0) << 4) | ((off) & 0xf))
+
+#define ARM_LDM(rn, regs)	(ARM_INST_LDM | (rn) << 16 | (regs))
+
+#define ARM_LSL_R(rd, rn, rm)	(_AL3_R(ARM_INST_LSL, rd, 0, rn) | (rm) << 8)
+#define ARM_LSL_I(rd, rn, imm)	(_AL3_R(ARM_INST_LSL, rd, 0, rn) | (imm) << 7)
+
+#define ARM_LSR_R(rd, rn, rm)	(_AL3_R(ARM_INST_LSR, rd, 0, rn) | (rm) << 8)
+#define ARM_LSR_I(rd, rn, imm)	(_AL3_R(ARM_INST_LSR, rd, 0, rn) | (imm) << 7)
+
+#define ARM_MOV_R(rd, rm)	_AL3_R(ARM_INST_MOV, rd, 0, rm)
+#define ARM_MOV_I(rd, imm)	_AL3_I(ARM_INST_MOV, rd, 0, imm)
+
+#define ARM_MOVW(rd, imm)	\
+	(ARM_INST_MOVW | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MOVT(rd, imm)	\
+	(ARM_INST_MOVT | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MUL(rd, rm, rn)	(ARM_INST_MUL | (rd) << 16 | (rm) << 8 | (rn))
+
+#define ARM_POP(regs)		(ARM_INST_POP | (regs))
+#define ARM_PUSH(regs)		(ARM_INST_PUSH | (regs))
+
+#define ARM_ORR_R(rd, rn, rm)	_AL3_R(ARM_INST_ORR, rd, rn, rm)
+#define ARM_ORR_I(rd, rn, imm)	_AL3_I(ARM_INST_ORR, rd, rn, imm)
+#define ARM_ORR_S(rd, rn, rm, type, rs)	\
+	(ARM_ORR_R(rd, rn, rm) | (type) << 5 | (rs) << 7)
+
+#define ARM_REV(rd, rm)		(ARM_INST_REV | (rd) << 12 | (rm))
+#define ARM_REV16(rd, rm)	(ARM_INST_REV16 | (rd) << 12 | (rm))
+
+#define ARM_SUB_R(rd, rn, rm)	_AL3_R(ARM_INST_SUB, rd, rn, rm)
+#define ARM_SUB_I(rd, rn, imm)	_AL3_I(ARM_INST_SUB, rd, rn, imm)
+
+#define ARM_STR_I(rt, rn, off)	(ARM_INST_STR_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+
+#define ARM_TST_R(rn, rm)	_AL3_R(ARM_INST_TST, 0, rn, rm)
+#define ARM_TST_I(rn, imm)	_AL3_I(ARM_INST_TST, 0, rn, imm)
+
+#endif /* PFILTER_OPCODES_ARM_H */
-- 
1.7.7.3

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19  8:40 ` Mircea Gherzan
  0 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-19  8:40 UTC (permalink / raw)
  To: linux-arm-kernel

Based of Matt Evans's PPC64 implementation.

Supports only ARM mode with EABI.

Supports both little and big endian. Depends on the support for
unaligned loads on ARMv7. Does not support all the BPF opcodes
that deal with ancillary data. The scratch memory of the filter
lives on the stack.

Enabled in the same way as for x86-64 and PPC64:

	echo 1 > /proc/sys/net/core/bpf_jit_enable

A value greater than 1 enables opcode output.

Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
---

Changes in v2:
 * enable the compiler ony for ARMv5+ because of the BLX instruction
 * use the same comparison for the ARM version checks
 * use misaligned accesses on ARMv6
 * fix the SEEN_MEM
 * fix the mem_words_used()

 arch/arm/Kconfig          |    1 +
 arch/arm/Makefile         |    1 +
 arch/arm/net/Makefile     |    3 +
 arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/net/bpf_jit_32.h |  174 ++++++++++
 5 files changed, 1017 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/net/Makefile
 create mode 100644 arch/arm/net/bpf_jit_32.c
 create mode 100644 arch/arm/net/bpf_jit_32.h

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index abba5b8..ea65c41 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -30,6 +30,7 @@ config ARM
 	select HAVE_SPARSE_IRQ
 	select GENERIC_IRQ_SHOW
 	select CPU_PM if (SUSPEND || CPU_IDLE)
+	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index dfcf3b0..8810a10 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -255,6 +255,7 @@ core-$(CONFIG_VFP)		+= arch/arm/vfp/
 
 # If we have a machine-specific directory, then include it in the build.
 core-y				+= arch/arm/kernel/ arch/arm/mm/ arch/arm/common/
+core-y				+= arch/arm/net/
 core-y				+= $(machdirs) $(platdirs)
 
 drivers-$(CONFIG_OPROFILE)      += arch/arm/oprofile/
diff --git a/arch/arm/net/Makefile b/arch/arm/net/Makefile
new file mode 100644
index 0000000..c2c1084
--- /dev/null
+++ b/arch/arm/net/Makefile
@@ -0,0 +1,3 @@
+# ARM-specific networking code
+
+obj-$(CONFIG_BPF_JIT) += bpf_jit_32.o
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
new file mode 100644
index 0000000..d2b86fa
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.c
@@ -0,0 +1,838 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/filter.h>
+#include <linux/moduleloader.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+
+#include "bpf_jit_32.h"
+
+/* BLX is available on ARMv5 or higher */
+#if __LINUX_ARM_ARCH__ < 5
+# error "The BPF JIT compiler is only available on ARMv5 or higher!"
+#endif
+
+/*
+ * ABI:
+ *
+ * r0	scratch register
+ * r4	BPF register A
+ * r5	BPF register X
+ * r6	pointer to the skb
+ * r7	skb->data
+ * r8	skb_headlen(skb)
+ */
+
+#define r_scratch	ARM_R0
+/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */
+#define r_off		ARM_R1
+#define r_A		ARM_R4
+#define r_X		ARM_R5
+#define r_skb		ARM_R6
+#define r_skb_data	ARM_R7
+#define r_skb_hl	ARM_R8
+
+#define SCRATCH_SP_OFFSET	0
+#define SCRATCH_OFF(k)		(SCRATCH_SP_OFFSET + (k))
+
+#define SEEN_MEM		((1 << BPF_MEMWORDS) - 1)
+#define SEEN_MEM_WORD(k)	(1 << (k))
+#define SEEN_X			(1 << BPF_MEMWORDS)
+#define SEEN_CALL		(1 << (BPF_MEMWORDS + 1))
+#define SEEN_DATA		(1 << (BPF_MEMWORDS + 2))
+#define SEEN_LEN		(1 << (BPF_MEMWORDS + 3))
+
+struct jit_ctx {
+	const struct sk_filter *skf;
+	unsigned idx;
+	unsigned prologue_bytes;
+	int ret0_fp_idx;
+	u32 seen;
+	u32 *offsets;
+	u32 *target;
+#if __LINUX_ARM_ARCH__ < 7
+	u16 epilogue_bytes;
+	u16 imm_count;
+	u32 *imms;
+#endif
+};
+
+int bpf_jit_enable __read_mostly;
+
+static u8 jit_get_skb_b(struct sk_buff *skb, unsigned offset)
+{
+	u8 ret;
+	skb_copy_bits(skb, offset, &ret, 1);
+	return ret;
+}
+
+static u16 jit_get_skb_h(struct sk_buff *skb, unsigned offset)
+{
+	u16 ret;
+	skb_copy_bits(skb, offset, &ret, 2);
+	return ntohs(ret);
+}
+
+static u32 jit_get_skb_w(struct sk_buff *skb, unsigned offset)
+{
+	u32 ret;
+	skb_copy_bits(skb, offset, &ret, 4);
+	return ntohl(ret);
+}
+
+static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
+{
+	if (ctx->target != NULL)
+		ctx->target[ctx->idx] = inst | (cond << 28);
+
+	ctx->idx++;
+}
+
+/*
+ * Emit an instruction that will be executed unconditionally.
+ */
+static inline void emit(u32 inst, struct jit_ctx *ctx)
+{
+	_emit(ARM_COND_AL, inst, ctx);
+}
+
+static u16 saved_regs(struct jit_ctx *ctx)
+{
+	u16 ret = 0;
+
+	if (ctx->skf->len > 1)
+		ret |= 1 << r_A;
+
+#ifdef CONFIG_FRAME_POINTER
+	ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC);
+#else
+	if (ctx->seen & SEEN_CALL)
+		ret |= 1 << ARM_LR;
+#endif
+	if (ctx->seen & (SEEN_DATA | SEEN_LEN))
+		ret |= 1 << r_skb;
+	if (ctx->seen & SEEN_DATA)
+		ret |= (1 << r_skb_data) | (1 << r_skb_hl);
+	if (ctx->seen & SEEN_X)
+		ret |= 1 << r_X;
+
+	return ret;
+}
+
+static inline int mem_words_used(struct jit_ctx *ctx)
+{
+	u32 words = ctx->seen & SEEN_MEM;
+	/* yes, we do waste some stack space IF there are "holes" in the set" */
+	return 32 - __builtin_clz(words);
+}
+
+static inline bool is_load_to_a(u16 inst)
+{
+	switch (inst) {
+	case BPF_S_LD_W_LEN:
+	case BPF_S_LD_W_ABS:
+	case BPF_S_LD_H_ABS:
+	case BPF_S_LD_B_ABS:
+	case BPF_S_ANC_CPU:
+	case BPF_S_ANC_IFINDEX:
+	case BPF_S_ANC_MARK:
+	case BPF_S_ANC_PROTOCOL:
+	case BPF_S_ANC_RXHASH:
+	case BPF_S_ANC_QUEUE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void build_prologue(struct jit_ctx *ctx)
+{
+	u16 reg_set = saved_regs(ctx);
+	u16 first_inst = ctx->skf->insns[0].code;
+	u16 off;
+
+#ifdef CONFIG_FRAME_POINTER
+	emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
+	emit(ARM_PUSH(reg_set), ctx);
+	emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
+#else
+	if (reg_set)
+		emit(ARM_PUSH(reg_set), ctx);
+#endif
+
+	if (ctx->seen & (SEEN_DATA | SEEN_LEN))
+		emit(ARM_MOV_R(r_skb, ARM_R0), ctx);
+
+	if (ctx->seen & SEEN_DATA) {
+		off = offsetof(struct sk_buff, data);
+		emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx);
+		/* headlen = len - data_len */
+		off = offsetof(struct sk_buff, len);
+		emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx);
+		off = offsetof(struct sk_buff, data_len);
+		emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+		emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx);
+	}
+
+	if (ctx->seen & SEEN_X)
+		emit(ARM_MOV_I(r_X, 0), ctx);
+
+	/* do not leak kernel data to userspace */
+	if ((first_inst != BPF_S_RET_K) && !(is_load_to_a(first_inst)))
+		emit(ARM_MOV_I(r_A, 0), ctx);
+
+	/* stack space for the BPF_MEM words */
+	if (ctx->seen & SEEN_MEM)
+		emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+}
+
+static void build_epilogue(struct jit_ctx *ctx)
+{
+	u16 reg_set = saved_regs(ctx);
+
+	if (ctx->seen & SEEN_MEM)
+		emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+
+	reg_set &= ~(1 << ARM_LR);
+
+#ifdef CONFIG_FRAME_POINTER
+	/* the first instruction of the prologue was: mov ip, sp */
+	reg_set &= ~(1 << ARM_IP);
+	reg_set |= (1 << ARM_SP);
+	emit(ARM_LDM(ARM_SP, reg_set), ctx);
+#else
+	if (ctx->seen)
+		emit(ARM_POP(reg_set | (1 << ARM_PC)), ctx);
+	else
+		emit(ARM_BX(ARM_LR), ctx);
+#endif
+}
+
+static int16_t imm8m(u32 x)
+{
+	u32 rot;
+
+	for (rot = 0; rot < 16; rot++)
+		if ((x & ~ror32(0xff, 2 * rot)) == 0)
+			return rol32(x, 2 * rot) | (rot << 8);
+
+	return -1;
+}
+
+#if __LINUX_ARM_ARCH__ < 7
+
+static u16 imm_offset(u32 k, struct jit_ctx *ctx)
+{
+	unsigned i = 0, offset;
+	u16 imm;
+
+	/* on the "fake" run we just count them (duplicates included) */
+	if (ctx->target == NULL) {
+		ctx->imm_count++;
+		return 0;
+	}
+
+	while ((i < ctx->imm_count) && ctx->imms[i]) {
+		if (ctx->imms[i] == k)
+			break;
+		i++;
+	}
+
+	if (ctx->imms[i] == 0)
+		ctx->imms[i] = k;
+
+	/* constants go just after the epilogue */
+	offset =  ctx->offsets[ctx->skf->len];
+	offset += ctx->prologue_bytes;
+	offset += ctx->epilogue_bytes;
+	offset += i * 4;
+
+	ctx->target[offset / 4] = k;
+
+	/* PC in ARM mode == address of the instruction + 8 */
+	imm = offset - (8 + ctx->idx * 4);
+
+	return imm;
+}
+
+#endif /* __LINUX_ARM_ARCH__ */
+
+/*
+ * Move an immediate that's not an imm8m to a core register.
+ */
+static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ < 7
+	emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
+#else
+	emit(ARM_MOVW(rd, val & 0xffff), ctx);
+	if (val > 0xffff)
+		emit(ARM_MOVT(rd, val >> 16), ctx);
+#endif
+}
+
+static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx)
+{
+	int imm12 = imm8m(val);
+
+	if (imm12 >= 0)
+		emit(ARM_MOV_I(rd, imm12), ctx);
+	else
+		emit_mov_i_no8m(rd, val, ctx);
+}
+
+#if __LINUX_ARM_ARCH__ < 6
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx);
+	_emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx);
+	_emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx);
+	_emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx);
+	_emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx);
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx);
+	_emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx);
+}
+
+static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx)
+{
+	emit(ARM_LSL_R(ARM_R1, r_src, 8), ctx);
+	emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSL, 8), ctx);
+	emit(ARM_LSL_I(r_dst, r_dst, 8), ctx);
+	emit(ARM_LSL_R(r_dst, r_dst, 8), ctx);
+}
+
+#else  /* ARMv6+ */
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+	_emit(cond, ARM_REV(r_res, r_res), ctx);
+#endif
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+	_emit(cond, ARM_REV16(r_res, r_res), ctx);
+#endif
+}
+
+static inline void emit_swap16(u8 r_dst __maybe_unused,
+			       u8 r_src __maybe_unused,
+			       struct jit_ctx *ctx __maybe_unused)
+{
+#ifdef __LITTLE_ENDIAN
+	emit(ARM_REV16(r_dst, r_src), ctx);
+#endif
+}
+
+#endif /* __LINUX_ARM_ARCH__ < 6 */
+
+
+/* Compute the immediate value for a PC-relative branch. */
+static inline u32 b_imm(unsigned tgt, struct jit_ctx *ctx)
+{
+	u32 imm;
+
+	if (ctx->target == NULL)
+		return 0;
+	/*
+	 * BPF allows only forward jumps and the offset of the target is
+	 * still the one computed during the first pass.
+	 */
+	imm  = ctx->offsets[tgt] + ctx->prologue_bytes - (ctx->idx * 4 + 8);
+
+	return imm >> 2;
+}
+
+#define OP_IMM3(op, r1, r2, imm_val, ctx)				\
+	do {								\
+		imm12 = imm8m(imm_val);					\
+		if (imm12 < 0) {					\
+			emit_mov_i_no8m(r_scratch, imm_val, ctx);	\
+			emit(op ## _R((r1), (r2), r_scratch), ctx);	\
+		} else {						\
+			emit(op ## _I((r1), (r2), imm12), ctx);		\
+		}							\
+	} while (0)
+
+static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx)
+{
+	if (ctx->ret0_fp_idx >= 0) {
+		_emit(cond, ARM_B(b_imm(ctx->ret0_fp_idx, ctx)), ctx);
+		/* NOP to keep the size constant between passes */
+		emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx);
+	} else {
+		_emit(cond, ARM_MOV_I(ARM_R0, 0), ctx);
+		_emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx);
+	}
+}
+
+static int build_body(struct jit_ctx *ctx)
+{
+	void  *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w};
+	const struct sk_filter *prog = ctx->skf;
+	const struct sock_filter *inst;
+	unsigned i, load_order, off, condt, condf;
+	int imm12;
+	u32 k;
+
+	for (i = 0; i < prog->len; i++) {
+		inst = &(prog->insns[i]);
+		/* K as an immediate value operand */
+		k = inst->k;
+
+		/* compute offsets only in the fake pass */
+		if (ctx->target == NULL)
+			ctx->offsets[i] = ctx->idx * 4;
+
+		switch (inst->code) {
+		case BPF_S_LD_IMM:
+			emit_mov_i(r_A, k, ctx);
+			break;
+		case BPF_S_LD_W_LEN:
+			ctx->seen |= SEEN_LEN;
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+			emit(ARM_LDR_I(r_A, r_skb,
+				       offsetof(struct sk_buff, len)), ctx);
+			break;
+		case BPF_S_LD_MEM:
+			/* A = scratch[k] */
+			ctx->seen |= SEEN_MEM_WORD(k);
+			emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_LD_W_ABS:
+			load_order = 2;
+			goto load;
+		case BPF_S_LD_H_ABS:
+			load_order = 1;
+			goto load;
+		case BPF_S_LD_B_ABS:
+			load_order = 0;
+load:
+			emit_mov_i(r_off, k, ctx);
+load_common:
+			ctx->seen |= SEEN_DATA | SEEN_CALL;
+
+			if (load_order > 0) {
+				emit(ARM_SUB_I(r_scratch, r_skb_hl,
+					       1 << load_order), ctx);
+				emit(ARM_CMP_R(r_scratch, r_off), ctx);
+				condt = ARM_COND_HS;
+			} else {
+				emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+				condt = ARM_COND_HI;
+			}
+
+			_emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data),
+			      ctx);
+
+			if (load_order == 0)
+				_emit(condt, ARM_LDRB_I(r_A, r_scratch, 0),
+				      ctx);
+			else if (load_order == 1)
+				emit_load_be16(condt, r_A, r_scratch, ctx);
+			else if (load_order == 2)
+				emit_load_be32(condt, r_A, r_scratch, ctx);
+
+			_emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx);
+
+			/* the slowpath */
+			emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx);
+			emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+			/* the offset is already in R1 */
+			emit(ARM_BLX_R(ARM_R3), ctx);
+			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+			break;
+		case BPF_S_LD_W_IND:
+			load_order = 2;
+			goto load_ind;
+		case BPF_S_LD_H_IND:
+			load_order = 1;
+			goto load_ind;
+		case BPF_S_LD_B_IND:
+			load_order = 0;
+load_ind:
+			OP_IMM3(ARM_ADD, r_off, r_X, k, ctx);
+			goto load_common;
+		case BPF_S_LDX_IMM:
+			ctx->seen |= SEEN_X;
+			emit_mov_i(r_X, k, ctx);
+			break;
+		case BPF_S_LDX_W_LEN:
+			ctx->seen |= SEEN_X;
+			emit(ARM_LDR_I(r_X, r_skb,
+				       offsetof(struct sk_buff, len)), ctx);
+			break;
+		case BPF_S_LDX_MEM:
+			ctx->seen |= SEEN_X | SEEN_MEM_WORD(k);
+			emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_LDX_B_MSH:
+			/* x = ((*(frame + k)) & 0xf) << 2; */
+			ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL;
+			/* offset in r1: we might have to take the slow path */
+			emit_mov_i(r_off, k, ctx);
+			emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+
+			/* load in r0: common with the slowpath */
+			_emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data,
+						      ARM_R1), ctx);
+			_emit(ARM_COND_HI, ARM_B((3 * 4 - 8) >> 2), ctx);
+
+			emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+			/* r_off is r1 */
+			emit(ARM_MOV_I(ARM_R3, (u32)jit_get_skb_b), ctx);
+			emit(ARM_BLX_R(ARM_R3), ctx);
+
+			emit(ARM_ORR_I(r_X, ARM_R0, 0x00f), ctx);
+			emit(ARM_LSL_I(r_X, r_X, 2), ctx);
+			break;
+		case BPF_S_ST:
+			ctx->seen |= SEEN_MEM_WORD(k);
+			emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_STX:
+			ctx->seen |= SEEN_MEM_WORD(k) | SEEN_X;
+			emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_ALU_ADD_K:
+			/* A += K */
+			OP_IMM3(ARM_ADD, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_ADD_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_ADD_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_SUB_K:
+			/* A -= K */
+			OP_IMM3(ARM_SUB, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_SUB_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_SUB_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_MUL_K:
+			/* A *= K */
+			emit_mov_i(r_scratch, k, ctx);
+			emit(ARM_MUL(r_A, r_A, r_scratch), ctx);
+			break;
+		case BPF_S_ALU_MUL_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_MUL(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_DIV_K:
+			/* K is not zero, it was previously checked */
+			emit_mov_i(ARM_R1, k, ctx);
+			goto div;
+		case BPF_S_ALU_DIV_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_CMP_I(r_X, 0), ctx);
+			emit_err_ret(ARM_COND_EQ, ctx);
+			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
+div:
+			ctx->seen |= SEEN_CALL;
+
+			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
+			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
+			emit(ARM_BLX_R(r_scratch), ctx);
+			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+			break;
+		case BPF_S_ALU_OR_K:
+			/* A |= K */
+			OP_IMM3(ARM_ORR, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_OR_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_ORR_I(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_AND_K:
+			/* A &= K */
+			OP_IMM3(ARM_AND, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_AND_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_AND_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_LSH_K:
+			if (unlikely(k > 31))
+				return -1;
+			emit(ARM_LSL_I(r_A, r_A, k), ctx);
+			break;
+		case BPF_S_ALU_LSH_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_LSL_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_RSH_K:
+			if (unlikely(k > 31))
+				return -1;
+			emit(ARM_LSR_I(r_A, r_A, k), ctx);
+			break;
+		case BPF_S_ALU_RSH_X:
+			ctx->seen |= SEEN_X;
+			emit(ARM_LSR_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_JMP_JA:
+			/* pc += K */
+			emit(ARM_B(b_imm(i + k, ctx)), ctx);
+			break;
+		case BPF_S_JMP_JEQ_K:
+			/* pc += (A == K) ? pc->jt : pc->jf */
+			condt   = ARM_COND_EQ;
+			condf  = ARM_COND_NE;
+			goto cmp_imm;
+		case BPF_S_JMP_JGT_K:
+			/* pc += (A > K) ? pc->jt : pc->jf */
+			condt   = ARM_COND_HI;
+			condf  = ARM_COND_LS;
+			goto cmp_imm;
+		case BPF_S_JMP_JGE_K:
+			/* pc += (A >= K) ? pc->jt : pc->jf */
+			condt   = ARM_COND_HI;
+			condf  = ARM_COND_LS;
+			/* (x >= y) IFF (x > y - 1) */
+			if (unlikely(k == 0))
+				return -1;
+			k--;
+cmp_imm:
+			imm12 = imm8m(k);
+			if (imm12 < 0) {
+				emit_mov_i_no8m(r_scratch, k, ctx);
+				emit(ARM_CMP_R(r_A, r_scratch), ctx);
+			} else {
+				emit(ARM_CMP_I(r_A, imm12), ctx);
+			}
+cond_jump:
+			if (inst->jt)
+				_emit(condt, ARM_B(b_imm(i + inst->jt + 1,
+						   ctx)), ctx);
+			if (inst->jf)
+				_emit(condf, ARM_B(b_imm(i + inst->jf + 1,
+						   ctx)), ctx);
+			break;
+		case BPF_S_JMP_JEQ_X:
+			/* pc += (A == X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_EQ;
+			condf  = ARM_COND_NE;
+			goto cmp_x;
+		case BPF_S_JMP_JGT_X:
+			/* pc += (A > X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_HI;
+			condf  = ARM_COND_LS;
+			goto cmp_x;
+		case BPF_S_JMP_JGE_X:
+			/* pc += (A >= X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_CS;
+			condf  = ARM_COND_CC;
+cmp_x:
+			ctx->seen |= SEEN_X;
+			emit(ARM_CMP_R(r_A, r_X), ctx);
+			goto cond_jump;
+		case BPF_S_JMP_JSET_K:
+			/* pc += (A & K) ? pc->jt : pc->jf */
+			condt  = ARM_COND_NE;
+			/* not set iff all zeroes iff Z==1 iff EQ */
+			condf  = ARM_COND_EQ;
+
+			imm12 = imm8m(k);
+			if (imm12 < 0) {
+				emit_mov_i_no8m(r_scratch, k, ctx);
+				emit(ARM_TST_R(r_A, r_scratch), ctx);
+			} else {
+				emit(ARM_TST_I(r_A, imm12), ctx);
+			}
+			goto cond_jump;
+		case BPF_S_JMP_JSET_X:
+			/* pc += (A & X) ? pc->jt : pc->jf */
+			condt  = ARM_COND_NE;
+			condf  = ARM_COND_EQ;
+			emit(ARM_TST_R(r_A, r_X), ctx);
+			goto cond_jump;
+		case BPF_S_RET_A:
+			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
+			goto b_epilogue;
+		case BPF_S_RET_K:
+			if ((k == 0) && (ctx->ret0_fp_idx < 0))
+				ctx->ret0_fp_idx = i;
+			emit_mov_i(ARM_R0, k, ctx);
+b_epilogue:
+			if (i != ctx->skf->len - 1)
+				emit(ARM_B(b_imm(prog->len, ctx)), ctx);
+			break;
+		case BPF_S_MISC_TAX:
+			/* X = A */
+			emit(ARM_MOV_R(r_X, r_A), ctx);
+			break;
+		case BPF_S_MISC_TXA:
+			/* A = X */
+			emit(ARM_MOV_R(r_A, r_X), ctx);
+			break;
+		case BPF_S_ANC_PROTOCOL:
+			/* A = ntohs(skb->protocol) */
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+						  protocol) != 2);
+			off = offsetof(struct sk_buff, protocol);
+			emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx);
+			emit_swap16(r_A, r_scratch, ctx);
+			break;
+		case BPF_S_ANC_CPU:
+			/* A = current_thread_info()->cpu */
+			emit_mov_i(r_scratch, (~(THREAD_SIZE - 1)), ctx);
+			emit(ARM_ADD_R(r_scratch, ARM_SP, r_scratch), ctx);
+
+			BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4);
+			off = offsetof(struct thread_info, cpu);
+			emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+			break;
+		case BPF_S_ANC_IFINDEX:
+			/* A = skb->dev->ifindex */
+			off = offsetof(struct sk_buff, dev);
+			emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+
+			emit(ARM_CMP_I(r_scratch, 0), ctx);
+			emit_err_ret(ARM_COND_EQ, ctx);
+
+			BUILD_BUG_ON(FIELD_SIZEOF(struct net_device,
+						  ifindex) != 4);
+			off = offsetof(struct net_device, ifindex);
+			emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+			break;
+		case BPF_S_ANC_MARK:
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+			off = offsetof(struct sk_buff, mark);
+			emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+			break;
+		case BPF_S_ANC_RXHASH:
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
+			off = offsetof(struct sk_buff, rxhash);
+			emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+			break;
+		case BPF_S_ANC_QUEUE:
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+						  queue_mapping) != 2);
+			BUILD_BUG_ON(offsetof(struct sk_buff,
+					      queue_mapping) > 0xff);
+			off = offsetof(struct sk_buff, queue_mapping);
+			emit(ARM_LDRH_I(r_A, r_skb, off), ctx);
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	/* compute offsets only during the first pass */
+	if (ctx->target == NULL)
+		ctx->offsets[i] = ctx->idx * 4;
+
+	return 0;
+}
+
+
+void bpf_jit_compile(struct sk_filter *fp)
+{
+	struct jit_ctx ctx;
+	unsigned tmp_idx;
+	unsigned alloc_size;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.skf		= fp;
+	ctx.ret0_fp_idx = -1;
+
+	ctx.offsets = kzalloc(GFP_KERNEL, 4 * (ctx.skf->len + 1));
+	if (ctx.offsets == NULL)
+		return;
+
+	/* fake pass to fill in the ctx->seen */
+	if (unlikely(build_body(&ctx)))
+		goto out;
+
+	tmp_idx = ctx.idx;
+	build_prologue(&ctx);
+	ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;
+
+#if __LINUX_ARM_ARCH__ < 7
+	tmp_idx = ctx.idx;
+	build_epilogue(&ctx);
+	ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;
+
+	ctx.idx += ctx.imm_count;
+	if (ctx.imm_count) {
+		ctx.imms = kzalloc(GFP_KERNEL, 4 * ctx.imm_count);
+		if (ctx.imms == NULL)
+			goto out;
+	}
+#else
+	/* there's nothing after the epilogue on ARMv7 */
+	build_epilogue(&ctx);
+#endif
+
+	alloc_size = 4 * ctx.idx;
+	ctx.target = module_alloc(alloc_size > sizeof(struct work_struct) ?
+				  alloc_size : sizeof(struct work_struct));
+	if (unlikely(ctx.target == NULL))
+		goto out;
+
+	ctx.idx = 0;
+	build_prologue(&ctx);
+	build_body(&ctx);
+	build_epilogue(&ctx);
+
+	flush_icache_range((u32)ctx.target, (u32)(ctx.target + ctx.idx));
+
+#if __LINUX_ARM_ARCH__ < 7
+	if (ctx.imm_count)
+		kfree(ctx.imms);
+#endif
+
+	if (bpf_jit_enable > 1)
+		print_hex_dump(KERN_INFO, "BPF JIT code: ",
+			       DUMP_PREFIX_ADDRESS, 16, 4, ctx.target,
+			       alloc_size, false);
+
+	fp->bpf_func = (void *)ctx.target;
+
+out:
+	kfree(ctx.offsets);
+	return;
+}
+
+static void bpf_jit_free_worker(struct work_struct *work)
+{
+	module_free(NULL, work);
+}
+
+void bpf_jit_free(struct sk_filter *fp)
+{
+	struct work_struct *work;
+
+	if (fp->bpf_func != sk_run_filter) {
+		work = (struct work_struct *)fp->bpf_func;
+
+		INIT_WORK(work, bpf_jit_free_worker);
+		schedule_work(work);
+	}
+}
+
diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h
new file mode 100644
index 0000000..de68a51
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.h
@@ -0,0 +1,174 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#ifndef PFILTER_OPCODES_ARM_H
+#define PFILTER_OPCODES_ARM_H
+
+extern void *__aeabi_uidiv;
+
+#define ARM_R0	0
+#define ARM_R1	1
+#define ARM_R2	2
+#define ARM_R3	3
+#define ARM_R4	4
+#define ARM_R5	5
+#define ARM_R6	6
+#define ARM_R7	7
+#define ARM_R8	8
+#define ARM_R9	9
+#define ARM_R10	10
+#define ARM_FP	11
+#define ARM_IP	12
+#define ARM_SP	13
+#define ARM_LR	14
+#define ARM_PC	15
+
+#define ARM_COND_EQ		0x0
+#define ARM_COND_NE		0x1
+#define ARM_COND_CS		0x2
+#define ARM_COND_HS		ARM_COND_CS
+#define ARM_COND_CC		0x3
+#define ARM_COND_LO		ARM_COND_CC
+#define ARM_COND_MI		0x4
+#define ARM_COND_PL		0x5
+#define ARM_COND_VS		0x6
+#define ARM_COND_VC		0x7
+#define ARM_COND_HI		0x8
+#define ARM_COND_LS		0x9
+#define ARM_COND_GE		0xa
+#define ARM_COND_LT		0xb
+#define ARM_COND_GT		0xc
+#define ARM_COND_LE		0xd
+#define ARM_COND_AL		0xe
+
+/* register shift types */
+#define SRTYPE_LSL		0
+#define SRTYPE_LSR		1
+#define SRTYPE_ASR		2
+#define SRTYPE_ROR		3
+
+#define ARM_INST_ADD_R		0x00800000
+#define ARM_INST_ADD_I		0x02800000
+
+#define ARM_INST_AND_R		0x00000000
+#define ARM_INST_AND_I		0x02000000
+
+#define ARM_INST_B		0x0a000000
+#define ARM_INST_BX		0x012FFF10
+#define ARM_INST_BLX_R		0x012fff30
+
+#define ARM_INST_CMP_R		0x01500000
+#define ARM_INST_CMP_I		0x03500000
+
+#define ARM_INST_LDRB_I		0x05d00000
+#define ARM_INST_LDRB_R		0x07d00000
+#define ARM_INST_LDRH_I		0x01d000b0
+#define ARM_INST_LDR_I		0x05900000
+
+#define ARM_INST_LDM		0x08900000
+
+#define ARM_INST_LSL_I		0x01a00000
+#define ARM_INST_LSL_R		0x01a00010
+
+#define ARM_INST_LSR_I		0x01a00020
+#define ARM_INST_LSR_R		0x01a00030
+
+#define ARM_INST_MOV_R		0x01a00000
+#define ARM_INST_MOV_I		0x03a00000
+#define ARM_INST_MOVW		0x03000000
+#define ARM_INST_MOVT		0x03400000
+
+#define ARM_INST_MUL		0x00000090
+
+#define ARM_INST_POP		0x08bd0000
+#define ARM_INST_PUSH		0x092d0000
+
+#define ARM_INST_REV		0x06bf0f30
+#define ARM_INST_REV16		0x06bf0fb0
+
+#define ARM_INST_ORR_R		0x01800000
+#define ARM_INST_ORR_I		0x03800000
+
+#define ARM_INST_SUB_R		0x00400000
+#define ARM_INST_SUB_I		0x02400000
+
+#define ARM_INST_STR_I		0x05800000
+
+#define ARM_INST_TST_R		0x01100000
+#define ARM_INST_TST_I		0x03100000
+
+
+/* register */
+#define _AL3_R(op, rd, rn, rm)	((op ## _R) | (rd) << 12 | (rn) << 16 | (rm))
+/* immediate */
+#define _AL3_I(op, rd, rn, imm)	((op ## _I) | (rd) << 12 | (rn) << 16 | (imm))
+
+#define ARM_ADD_R(rd, rn, rm)	_AL3_R(ARM_INST_ADD, rd, rn, rm)
+#define ARM_ADD_I(rd, rn, imm)	_AL3_I(ARM_INST_ADD, rd, rn, imm)
+
+#define ARM_AND_R(rd, rn, rm)	_AL3_R(ARM_INST_AND, rd, rn, rm)
+#define ARM_AND_I(rd, rn, imm)	_AL3_I(ARM_INST_AND, rd, rn, imm)
+
+#define ARM_B(imm24)		(ARM_INST_B | (imm24))
+#define ARM_BX(rm)		(ARM_INST_BX | (rm))
+#define ARM_BLX_R(rm)		(ARM_INST_BLX_R | (rm))
+
+#define ARM_CMP_R(rn, rm)	_AL3_R(ARM_INST_CMP, 0, rn, rm)
+#define ARM_CMP_I(rn, imm)	_AL3_I(ARM_INST_CMP, 0, rn, imm)
+
+#define ARM_LDR_I(rt, rn, off)	(ARM_INST_LDR_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+#define ARM_LDRB_I(rt, rn, off)	(ARM_INST_LDRB_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+#define ARM_LDRB_R(rt, rn, rm)	(ARM_INST_LDRB_R | (rt) << 12 | (rn) << 16 \
+				 | (rm))
+#define ARM_LDRH_I(rt, rn, off)	(ARM_INST_LDRH_I | (rt) << 12 | (rn) << 16 \
+				 | (((off) & 0xf0) << 4) | ((off) & 0xf))
+
+#define ARM_LDM(rn, regs)	(ARM_INST_LDM | (rn) << 16 | (regs))
+
+#define ARM_LSL_R(rd, rn, rm)	(_AL3_R(ARM_INST_LSL, rd, 0, rn) | (rm) << 8)
+#define ARM_LSL_I(rd, rn, imm)	(_AL3_R(ARM_INST_LSL, rd, 0, rn) | (imm) << 7)
+
+#define ARM_LSR_R(rd, rn, rm)	(_AL3_R(ARM_INST_LSR, rd, 0, rn) | (rm) << 8)
+#define ARM_LSR_I(rd, rn, imm)	(_AL3_R(ARM_INST_LSR, rd, 0, rn) | (imm) << 7)
+
+#define ARM_MOV_R(rd, rm)	_AL3_R(ARM_INST_MOV, rd, 0, rm)
+#define ARM_MOV_I(rd, imm)	_AL3_I(ARM_INST_MOV, rd, 0, imm)
+
+#define ARM_MOVW(rd, imm)	\
+	(ARM_INST_MOVW | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MOVT(rd, imm)	\
+	(ARM_INST_MOVT | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MUL(rd, rm, rn)	(ARM_INST_MUL | (rd) << 16 | (rm) << 8 | (rn))
+
+#define ARM_POP(regs)		(ARM_INST_POP | (regs))
+#define ARM_PUSH(regs)		(ARM_INST_PUSH | (regs))
+
+#define ARM_ORR_R(rd, rn, rm)	_AL3_R(ARM_INST_ORR, rd, rn, rm)
+#define ARM_ORR_I(rd, rn, imm)	_AL3_I(ARM_INST_ORR, rd, rn, imm)
+#define ARM_ORR_S(rd, rn, rm, type, rs)	\
+	(ARM_ORR_R(rd, rn, rm) | (type) << 5 | (rs) << 7)
+
+#define ARM_REV(rd, rm)		(ARM_INST_REV | (rd) << 12 | (rm))
+#define ARM_REV16(rd, rm)	(ARM_INST_REV16 | (rd) << 12 | (rm))
+
+#define ARM_SUB_R(rd, rn, rm)	_AL3_R(ARM_INST_SUB, rd, rn, rm)
+#define ARM_SUB_I(rd, rn, imm)	_AL3_I(ARM_INST_SUB, rd, rn, imm)
+
+#define ARM_STR_I(rt, rn, off)	(ARM_INST_STR_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+
+#define ARM_TST_R(rn, rm)	_AL3_R(ARM_INST_TST, 0, rn, rm)
+#define ARM_TST_I(rn, imm)	_AL3_I(ARM_INST_TST, 0, rn, imm)
+
+#endif /* PFILTER_OPCODES_ARM_H */
-- 
1.7.7.3

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19  8:40 ` Mircea Gherzan
@ 2011-12-19 12:50   ` Dave Martin
  -1 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 12:50 UTC (permalink / raw)
  To: Mircea Gherzan; +Cc: linux-arm-kernel, netdev

On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> Based of Matt Evans's PPC64 implementation.
> 
> Supports only ARM mode with EABI.
> 
> Supports both little and big endian. Depends on the support for
> unaligned loads on ARMv7. Does not support all the BPF opcodes
> that deal with ancillary data. The scratch memory of the filter
> lives on the stack.
> 
> Enabled in the same way as for x86-64 and PPC64:
> 
> 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> 
> A value greater than 1 enables opcode output.
> 
> Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> ---

Interesting patch... I haven't reviewed in detail, but I have a few
quick comments.

> 
> Changes in v2:
>  * enable the compiler ony for ARMv5+ because of the BLX instruction
>  * use the same comparison for the ARM version checks
>  * use misaligned accesses on ARMv6

You probably want to change the commit message now to reflect this.

>  * fix the SEEN_MEM
>  * fix the mem_words_used()
> 
>  arch/arm/Kconfig          |    1 +
>  arch/arm/Makefile         |    1 +
>  arch/arm/net/Makefile     |    3 +
>  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
>  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
>  5 files changed, 1017 insertions(+), 0 deletions(-)
>  create mode 100644 arch/arm/net/Makefile
>  create mode 100644 arch/arm/net/bpf_jit_32.c
>  create mode 100644 arch/arm/net/bpf_jit_32.h
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index abba5b8..ea65c41 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -30,6 +30,7 @@ config ARM
>  	select HAVE_SPARSE_IRQ
>  	select GENERIC_IRQ_SHOW
>  	select CPU_PM if (SUSPEND || CPU_IDLE)
> +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)

Have to tried your code with a Thumb-2 kernel?

Quickly skimming though your patch, I don't see an obvious reason why we
can't have that working, though I haven't tried it yet.

Note that it's fine to have the JIT generating ARM code, even if the rest
if the kernel is Thumb-2.  This would only start to cause problems if we
want to do things like set kprobes in the JITted code, or unwind out of
the JITted code.

It's just necessary to make sure that calls/returns into/out of the
JITted code are handled correctly.  You don't seem to do any scary
arithmetic or mov to or from pc or lr, and it doesn't look like you ever
call back into the kernel from JITted code, so the implementation is
probably safe for ARM/Thumb interworking already (if I've understood
correctly).

It doesn't look hard to port the JIT to generate Thumb-2 code directly
either -- but I suggest not to worry about that initially.  So long as
the ARM-based JIT works in a Thumb-2 kernel, it will be useful.

[...]

> diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
> new file mode 100644
> index 0000000..d2b86fa
> --- /dev/null
> +++ b/arch/arm/net/bpf_jit_32.c

[...]

> +static int build_body(struct jit_ctx *ctx)
> +{

[...]

> +		case BPF_S_ALU_DIV_K:
> +			/* K is not zero, it was previously checked */
> +			emit_mov_i(ARM_R1, k, ctx);
> +			goto div;
> +		case BPF_S_ALU_DIV_X:
> +			ctx->seen |= SEEN_X;
> +			emit(ARM_CMP_I(r_X, 0), ctx);
> +			emit_err_ret(ARM_COND_EQ, ctx);
> +			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
> +div:
> +			ctx->seen |= SEEN_CALL;
> +
> +			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
> +			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
> +			emit(ARM_BLX_R(r_scratch), ctx);
> +			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
> +			break;

I don't know how much division is used by the packet filter JIT.  If
it gets used a significant amount, you might want to support hardware
divide for CPUs that have it:

Cortex-A15 and later processors may have hardware integer divide
support.  You can check for its availability at runtime using by testing
the HWCAP_IDIVA (for ARM) or HWCAP_IDIVT (for Thumb) bits in elf_hwcap
(see arch/arm/include/asm/hwcap.h).

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 12:50   ` Dave Martin
  0 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 12:50 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> Based of Matt Evans's PPC64 implementation.
> 
> Supports only ARM mode with EABI.
> 
> Supports both little and big endian. Depends on the support for
> unaligned loads on ARMv7. Does not support all the BPF opcodes
> that deal with ancillary data. The scratch memory of the filter
> lives on the stack.
> 
> Enabled in the same way as for x86-64 and PPC64:
> 
> 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> 
> A value greater than 1 enables opcode output.
> 
> Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> ---

Interesting patch... I haven't reviewed in detail, but I have a few
quick comments.

> 
> Changes in v2:
>  * enable the compiler ony for ARMv5+ because of the BLX instruction
>  * use the same comparison for the ARM version checks
>  * use misaligned accesses on ARMv6

You probably want to change the commit message now to reflect this.

>  * fix the SEEN_MEM
>  * fix the mem_words_used()
> 
>  arch/arm/Kconfig          |    1 +
>  arch/arm/Makefile         |    1 +
>  arch/arm/net/Makefile     |    3 +
>  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
>  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
>  5 files changed, 1017 insertions(+), 0 deletions(-)
>  create mode 100644 arch/arm/net/Makefile
>  create mode 100644 arch/arm/net/bpf_jit_32.c
>  create mode 100644 arch/arm/net/bpf_jit_32.h
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index abba5b8..ea65c41 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -30,6 +30,7 @@ config ARM
>  	select HAVE_SPARSE_IRQ
>  	select GENERIC_IRQ_SHOW
>  	select CPU_PM if (SUSPEND || CPU_IDLE)
> +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)

Have to tried your code with a Thumb-2 kernel?

Quickly skimming though your patch, I don't see an obvious reason why we
can't have that working, though I haven't tried it yet.

Note that it's fine to have the JIT generating ARM code, even if the rest
if the kernel is Thumb-2.  This would only start to cause problems if we
want to do things like set kprobes in the JITted code, or unwind out of
the JITted code.

It's just necessary to make sure that calls/returns into/out of the
JITted code are handled correctly.  You don't seem to do any scary
arithmetic or mov to or from pc or lr, and it doesn't look like you ever
call back into the kernel from JITted code, so the implementation is
probably safe for ARM/Thumb interworking already (if I've understood
correctly).

It doesn't look hard to port the JIT to generate Thumb-2 code directly
either -- but I suggest not to worry about that initially.  So long as
the ARM-based JIT works in a Thumb-2 kernel, it will be useful.

[...]

> diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
> new file mode 100644
> index 0000000..d2b86fa
> --- /dev/null
> +++ b/arch/arm/net/bpf_jit_32.c

[...]

> +static int build_body(struct jit_ctx *ctx)
> +{

[...]

> +		case BPF_S_ALU_DIV_K:
> +			/* K is not zero, it was previously checked */
> +			emit_mov_i(ARM_R1, k, ctx);
> +			goto div;
> +		case BPF_S_ALU_DIV_X:
> +			ctx->seen |= SEEN_X;
> +			emit(ARM_CMP_I(r_X, 0), ctx);
> +			emit_err_ret(ARM_COND_EQ, ctx);
> +			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
> +div:
> +			ctx->seen |= SEEN_CALL;
> +
> +			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
> +			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
> +			emit(ARM_BLX_R(r_scratch), ctx);
> +			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
> +			break;

I don't know how much division is used by the packet filter JIT.  If
it gets used a significant amount, you might want to support hardware
divide for CPUs that have it:

Cortex-A15 and later processors may have hardware integer divide
support.  You can check for its availability at runtime using by testing
the HWCAP_IDIVA (for ARM) or HWCAP_IDIVT (for Thumb) bits in elf_hwcap
(see arch/arm/include/asm/hwcap.h).

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 12:50   ` Dave Martin
@ 2011-12-19 15:19     ` Uwe Kleine-König
  -1 siblings, 0 replies; 30+ messages in thread
From: Uwe Kleine-König @ 2011-12-19 15:19 UTC (permalink / raw)
  To: Dave Martin; +Cc: Mircea Gherzan, netdev, linux-arm-kernel

On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > Based of Matt Evans's PPC64 implementation.
> > 
> > Supports only ARM mode with EABI.
> > 
> > Supports both little and big endian. Depends on the support for
> > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > that deal with ancillary data. The scratch memory of the filter
> > lives on the stack.
> > 
> > Enabled in the same way as for x86-64 and PPC64:
> > 
> > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > 
> > A value greater than 1 enables opcode output.
> > 
> > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > ---
> 
> Interesting patch... I haven't reviewed in detail, but I have a few
> quick comments.
> 
> > 
> > Changes in v2:
> >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> >  * use the same comparison for the ARM version checks
> >  * use misaligned accesses on ARMv6
> 
> You probably want to change the commit message now to reflect this.
> 
> >  * fix the SEEN_MEM
> >  * fix the mem_words_used()
> > 
> >  arch/arm/Kconfig          |    1 +
> >  arch/arm/Makefile         |    1 +
> >  arch/arm/net/Makefile     |    3 +
> >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> >  5 files changed, 1017 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/arm/net/Makefile
> >  create mode 100644 arch/arm/net/bpf_jit_32.c
> >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > 
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index abba5b8..ea65c41 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -30,6 +30,7 @@ config ARM
> >  	select HAVE_SPARSE_IRQ
> >  	select GENERIC_IRQ_SHOW
> >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> 
> Have to tried your code with a Thumb-2 kernel?
> 
> Quickly skimming though your patch, I don't see an obvious reason why we
> can't have that working, though I haven't tried it yet.
> 
> Note that it's fine to have the JIT generating ARM code, even if the rest
> if the kernel is Thumb-2.  This would only start to cause problems if we
> want to do things like set kprobes in the JITted code, or unwind out of
> the JITted code.
or use a CPU that doesn't speak ARM (e.g. v7M)

> It's just necessary to make sure that calls/returns into/out of the
> JITted code are handled correctly.  You don't seem to do any scary
> arithmetic or mov to or from pc or lr, and it doesn't look like you ever
> call back into the kernel from JITted code, so the implementation is
> probably safe for ARM/Thumb interworking already (if I've understood
> correctly).
> 
> It doesn't look hard to port the JIT to generate Thumb-2 code directly
> either -- but I suggest not to worry about that initially.  So long as
> the ARM-based JIT works in a Thumb-2 kernel, it will be useful.

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 15:19     ` Uwe Kleine-König
  0 siblings, 0 replies; 30+ messages in thread
From: Uwe Kleine-König @ 2011-12-19 15:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > Based of Matt Evans's PPC64 implementation.
> > 
> > Supports only ARM mode with EABI.
> > 
> > Supports both little and big endian. Depends on the support for
> > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > that deal with ancillary data. The scratch memory of the filter
> > lives on the stack.
> > 
> > Enabled in the same way as for x86-64 and PPC64:
> > 
> > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > 
> > A value greater than 1 enables opcode output.
> > 
> > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > ---
> 
> Interesting patch... I haven't reviewed in detail, but I have a few
> quick comments.
> 
> > 
> > Changes in v2:
> >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> >  * use the same comparison for the ARM version checks
> >  * use misaligned accesses on ARMv6
> 
> You probably want to change the commit message now to reflect this.
> 
> >  * fix the SEEN_MEM
> >  * fix the mem_words_used()
> > 
> >  arch/arm/Kconfig          |    1 +
> >  arch/arm/Makefile         |    1 +
> >  arch/arm/net/Makefile     |    3 +
> >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> >  5 files changed, 1017 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/arm/net/Makefile
> >  create mode 100644 arch/arm/net/bpf_jit_32.c
> >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > 
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index abba5b8..ea65c41 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -30,6 +30,7 @@ config ARM
> >  	select HAVE_SPARSE_IRQ
> >  	select GENERIC_IRQ_SHOW
> >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> 
> Have to tried your code with a Thumb-2 kernel?
> 
> Quickly skimming though your patch, I don't see an obvious reason why we
> can't have that working, though I haven't tried it yet.
> 
> Note that it's fine to have the JIT generating ARM code, even if the rest
> if the kernel is Thumb-2.  This would only start to cause problems if we
> want to do things like set kprobes in the JITted code, or unwind out of
> the JITted code.
or use a CPU that doesn't speak ARM (e.g. v7M)

> It's just necessary to make sure that calls/returns into/out of the
> JITted code are handled correctly.  You don't seem to do any scary
> arithmetic or mov to or from pc or lr, and it doesn't look like you ever
> call back into the kernel from JITted code, so the implementation is
> probably safe for ARM/Thumb interworking already (if I've understood
> correctly).
> 
> It doesn't look hard to port the JIT to generate Thumb-2 code directly
> either -- but I suggest not to worry about that initially.  So long as
> the ARM-based JIT works in a Thumb-2 kernel, it will be useful.

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-K?nig            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 15:19     ` Uwe Kleine-König
@ 2011-12-19 15:24       ` Dave Martin
  -1 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 15:24 UTC (permalink / raw)
  To: Uwe Kleine-K?nig; +Cc: Mircea Gherzan, netdev, linux-arm-kernel

On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > Based of Matt Evans's PPC64 implementation.
> > > 
> > > Supports only ARM mode with EABI.
> > > 
> > > Supports both little and big endian. Depends on the support for
> > > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > > that deal with ancillary data. The scratch memory of the filter
> > > lives on the stack.
> > > 
> > > Enabled in the same way as for x86-64 and PPC64:
> > > 
> > > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > > 
> > > A value greater than 1 enables opcode output.
> > > 
> > > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > > ---
> > 
> > Interesting patch... I haven't reviewed in detail, but I have a few
> > quick comments.
> > 
> > > 
> > > Changes in v2:
> > >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> > >  * use the same comparison for the ARM version checks
> > >  * use misaligned accesses on ARMv6
> > 
> > You probably want to change the commit message now to reflect this.
> > 
> > >  * fix the SEEN_MEM
> > >  * fix the mem_words_used()
> > > 
> > >  arch/arm/Kconfig          |    1 +
> > >  arch/arm/Makefile         |    1 +
> > >  arch/arm/net/Makefile     |    3 +
> > >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> > >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> > >  5 files changed, 1017 insertions(+), 0 deletions(-)
> > >  create mode 100644 arch/arm/net/Makefile
> > >  create mode 100644 arch/arm/net/bpf_jit_32.c
> > >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > > 
> > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > > index abba5b8..ea65c41 100644
> > > --- a/arch/arm/Kconfig
> > > +++ b/arch/arm/Kconfig
> > > @@ -30,6 +30,7 @@ config ARM
> > >  	select HAVE_SPARSE_IRQ
> > >  	select GENERIC_IRQ_SHOW
> > >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> > 
> > Have to tried your code with a Thumb-2 kernel?
> > 
> > Quickly skimming though your patch, I don't see an obvious reason why we
> > can't have that working, though I haven't tried it yet.
> > 
> > Note that it's fine to have the JIT generating ARM code, even if the rest
> > if the kernel is Thumb-2.  This would only start to cause problems if we
> > want to do things like set kprobes in the JITted code, or unwind out of
> > the JITted code.
> or use a CPU that doesn't speak ARM (e.g. v7M)

Indeed... I was assuming though that that would be out of scope for the
first iteration of this code.

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 15:24       ` Dave Martin
  0 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 15:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > Based of Matt Evans's PPC64 implementation.
> > > 
> > > Supports only ARM mode with EABI.
> > > 
> > > Supports both little and big endian. Depends on the support for
> > > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > > that deal with ancillary data. The scratch memory of the filter
> > > lives on the stack.
> > > 
> > > Enabled in the same way as for x86-64 and PPC64:
> > > 
> > > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > > 
> > > A value greater than 1 enables opcode output.
> > > 
> > > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > > ---
> > 
> > Interesting patch... I haven't reviewed in detail, but I have a few
> > quick comments.
> > 
> > > 
> > > Changes in v2:
> > >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> > >  * use the same comparison for the ARM version checks
> > >  * use misaligned accesses on ARMv6
> > 
> > You probably want to change the commit message now to reflect this.
> > 
> > >  * fix the SEEN_MEM
> > >  * fix the mem_words_used()
> > > 
> > >  arch/arm/Kconfig          |    1 +
> > >  arch/arm/Makefile         |    1 +
> > >  arch/arm/net/Makefile     |    3 +
> > >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> > >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> > >  5 files changed, 1017 insertions(+), 0 deletions(-)
> > >  create mode 100644 arch/arm/net/Makefile
> > >  create mode 100644 arch/arm/net/bpf_jit_32.c
> > >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > > 
> > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > > index abba5b8..ea65c41 100644
> > > --- a/arch/arm/Kconfig
> > > +++ b/arch/arm/Kconfig
> > > @@ -30,6 +30,7 @@ config ARM
> > >  	select HAVE_SPARSE_IRQ
> > >  	select GENERIC_IRQ_SHOW
> > >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> > 
> > Have to tried your code with a Thumb-2 kernel?
> > 
> > Quickly skimming though your patch, I don't see an obvious reason why we
> > can't have that working, though I haven't tried it yet.
> > 
> > Note that it's fine to have the JIT generating ARM code, even if the rest
> > if the kernel is Thumb-2.  This would only start to cause problems if we
> > want to do things like set kprobes in the JITted code, or unwind out of
> > the JITted code.
> or use a CPU that doesn't speak ARM (e.g. v7M)

Indeed... I was assuming though that that would be out of scope for the
first iteration of this code.

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 15:24       ` Dave Martin
@ 2011-12-19 15:33         ` Uwe Kleine-König
  -1 siblings, 0 replies; 30+ messages in thread
From: Uwe Kleine-König @ 2011-12-19 15:33 UTC (permalink / raw)
  To: Dave Martin; +Cc: Mircea Gherzan, netdev, linux-arm-kernel

Hello,

On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > > Based of Matt Evans's PPC64 implementation.
> > > Note that it's fine to have the JIT generating ARM code, even if the rest
> > > if the kernel is Thumb-2.  This would only start to cause problems if we
> > > want to do things like set kprobes in the JITted code, or unwind out of
> > > the JITted code.
> > or use a CPU that doesn't speak ARM (e.g. v7M)
> 
> Indeed... I was assuming though that that would be out of scope for the
> first iteration of this code.
Right, depending on !THUMB2 is fine. Only generating ARM code with
THUMB2=y is not.

Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 15:33         ` Uwe Kleine-König
  0 siblings, 0 replies; 30+ messages in thread
From: Uwe Kleine-König @ 2011-12-19 15:33 UTC (permalink / raw)
  To: linux-arm-kernel

Hello,

On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > > Based of Matt Evans's PPC64 implementation.
> > > Note that it's fine to have the JIT generating ARM code, even if the rest
> > > if the kernel is Thumb-2.  This would only start to cause problems if we
> > > want to do things like set kprobes in the JITted code, or unwind out of
> > > the JITted code.
> > or use a CPU that doesn't speak ARM (e.g. v7M)
> 
> Indeed... I was assuming though that that would be out of scope for the
> first iteration of this code.
Right, depending on !THUMB2 is fine. Only generating ARM code with
THUMB2=y is not.

Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-K?nig            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 15:33         ` Uwe Kleine-König
@ 2011-12-19 15:52           ` Dave Martin
  -1 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 15:52 UTC (permalink / raw)
  To: Uwe Kleine-K?nig; +Cc: Mircea Gherzan, netdev, linux-arm-kernel

On Mon, Dec 19, 2011 at 04:33:38PM +0100, Uwe Kleine-K?nig wrote:
> Hello,
> 
> On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> > > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > > > Based of Matt Evans's PPC64 implementation.
> > > > Note that it's fine to have the JIT generating ARM code, even if the rest
> > > > if the kernel is Thumb-2.  This would only start to cause problems if we
> > > > want to do things like set kprobes in the JITted code, or unwind out of
> > > > the JITted code.
> > > or use a CPU that doesn't speak ARM (e.g. v7M)
> > 
> > Indeed... I was assuming though that that would be out of scope for the
> > first iteration of this code.
> Right, depending on !THUMB2 is fine. Only generating ARM code with
> THUMB2=y is not.

The kernel doesn't support v7-M upstream though.  CONFIG_THUMB2_KERNEL=y
doesn't mean that there is no ARM code present -- there _is_ ARM code in
the kernel image even with this option (though not very much).

IMHO, CONFIG_THUMB2_KERNEL should be viewed as similiar to an optimisation
option or other code gen options -- it's a hint to use the Thumb-2
instruction set in preference to ARM, but doesn't make a 100% guarantee.
Some parts of the kernel do contain ARM code even in this configuration.


The real problem we have here is that we do not distinguish between v7-A
and v7-M in Kconfig (this is not the same thing as CONFIG_MMU, since
there's no special reason why it should be impossible to build a ucLinux
kernel for v7-A -- in which case the ARM instruction is available)

In the meantime, we could temporarily make the JIT dependent on
MMU || !THUMB2_KERNEL, but that doesn't seem strictly correct.


If we had dedicated Kconfig variables describing the (non-) availability
of the two instruction sets, that might enable us to get this right.

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 15:52           ` Dave Martin
  0 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 15:52 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 19, 2011 at 04:33:38PM +0100, Uwe Kleine-K?nig wrote:
> Hello,
> 
> On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> > > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > > > Based of Matt Evans's PPC64 implementation.
> > > > Note that it's fine to have the JIT generating ARM code, even if the rest
> > > > if the kernel is Thumb-2.  This would only start to cause problems if we
> > > > want to do things like set kprobes in the JITted code, or unwind out of
> > > > the JITted code.
> > > or use a CPU that doesn't speak ARM (e.g. v7M)
> > 
> > Indeed... I was assuming though that that would be out of scope for the
> > first iteration of this code.
> Right, depending on !THUMB2 is fine. Only generating ARM code with
> THUMB2=y is not.

The kernel doesn't support v7-M upstream though.  CONFIG_THUMB2_KERNEL=y
doesn't mean that there is no ARM code present -- there _is_ ARM code in
the kernel image even with this option (though not very much).

IMHO, CONFIG_THUMB2_KERNEL should be viewed as similiar to an optimisation
option or other code gen options -- it's a hint to use the Thumb-2
instruction set in preference to ARM, but doesn't make a 100% guarantee.
Some parts of the kernel do contain ARM code even in this configuration.


The real problem we have here is that we do not distinguish between v7-A
and v7-M in Kconfig (this is not the same thing as CONFIG_MMU, since
there's no special reason why it should be impossible to build a ucLinux
kernel for v7-A -- in which case the ARM instruction is available)

In the meantime, we could temporarily make the JIT dependent on
MMU || !THUMB2_KERNEL, but that doesn't seem strictly correct.


If we had dedicated Kconfig variables describing the (non-) availability
of the two instruction sets, that might enable us to get this right.

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 12:50   ` Dave Martin
@ 2011-12-19 16:45     ` Mircea Gherzan
  -1 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-19 16:45 UTC (permalink / raw)
  To: Dave Martin; +Cc: linux-arm-kernel, netdev

Hi,

On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > Based of Matt Evans's PPC64 implementation.
> > 
> > Supports only ARM mode with EABI.
> > 
> > Supports both little and big endian. Depends on the support for
> > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > that deal with ancillary data. The scratch memory of the filter
> > lives on the stack.
> > 
> > Enabled in the same way as for x86-64 and PPC64:
> > 
> > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > 
> > A value greater than 1 enables opcode output.
> > 
> > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > ---
> 
> Interesting patch... I haven't reviewed in detail, but I have a few
> quick comments.
> 
> > 
> > Changes in v2:
> >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> >  * use the same comparison for the ARM version checks
> >  * use misaligned accesses on ARMv6
> 
> You probably want to change the commit message now to reflect this.

Will do in the next version.

> 
> >  * fix the SEEN_MEM
> >  * fix the mem_words_used()
> > 
> >  arch/arm/Kconfig          |    1 +
> >  arch/arm/Makefile         |    1 +
> >  arch/arm/net/Makefile     |    3 +
> >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> >  5 files changed, 1017 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/arm/net/Makefile
> >  create mode 100644 arch/arm/net/bpf_jit_32.c
> >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > 
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index abba5b8..ea65c41 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -30,6 +30,7 @@ config ARM
> >  	select HAVE_SPARSE_IRQ
> >  	select GENERIC_IRQ_SHOW
> >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> 
> Have to tried your code with a Thumb-2 kernel?

Not yet.

> Quickly skimming though your patch, I don't see an obvious reason why we
> can't have that working, though I haven't tried it yet.
> 
> Note that it's fine to have the JIT generating ARM code, even if the rest
> if the kernel is Thumb-2.  This would only start to cause problems if we
> want to do things like set kprobes in the JITted code, or unwind out of
> the JITted code.
> 
> It's just necessary to make sure that calls/returns into/out of the
> JITted code are handled correctly.  You don't seem to do any scary
> arithmetic or mov to or from pc or lr, and it doesn't look like you ever
> call back into the kernel from JITted code, so the implementation is
> probably safe for ARM/Thumb interworking already (if I've understood
> correctly).

The JITed code calls back to the kernel for the load helpers. So setting
bit 0 is required.

> It doesn't look hard to port the JIT to generate Thumb-2 code directly
> either -- but I suggest not to worry about that initially.  So long as
> the ARM-based JIT works in a Thumb-2 kernel, it will be useful.
> 
> [...]
> 
> > diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
> > new file mode 100644
> > index 0000000..d2b86fa
> > --- /dev/null
> > +++ b/arch/arm/net/bpf_jit_32.c
> 
> [...]
> 
> > +static int build_body(struct jit_ctx *ctx)
> > +{
> 
> [...]
> 
> > +		case BPF_S_ALU_DIV_K:
> > +			/* K is not zero, it was previously checked */
> > +			emit_mov_i(ARM_R1, k, ctx);
> > +			goto div;
> > +		case BPF_S_ALU_DIV_X:
> > +			ctx->seen |= SEEN_X;
> > +			emit(ARM_CMP_I(r_X, 0), ctx);
> > +			emit_err_ret(ARM_COND_EQ, ctx);
> > +			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
> > +div:
> > +			ctx->seen |= SEEN_CALL;
> > +
> > +			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
> > +			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
> > +			emit(ARM_BLX_R(r_scratch), ctx);
> > +			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
> > +			break;
> 
> I don't know how much division is used by the packet filter JIT.  If
> it gets used a significant amount, you might want to support hardware
> divide for CPUs that have it:

Division rarely appears in "normal" BPF filters: it must be an explicit
part of the human-readable filter expression (the BPF compiler does not
generate division opcodes in other cases, AFAICT). Nonetheless, support
for hardware division would spare a bit of stack space for filters like
"len / 100 == 1".

> Cortex-A15 and later processors may have hardware integer divide
> support.  You can check for its availability at runtime using by testing
> the HWCAP_IDIVA (for ARM) or HWCAP_IDIVT (for Thumb) bits in elf_hwcap
> (see arch/arm/include/asm/hwcap.h).

I will include this in the next version of the patch.

Cheers,
Mircea

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 16:45     ` Mircea Gherzan
  0 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-19 16:45 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > Based of Matt Evans's PPC64 implementation.
> > 
> > Supports only ARM mode with EABI.
> > 
> > Supports both little and big endian. Depends on the support for
> > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > that deal with ancillary data. The scratch memory of the filter
> > lives on the stack.
> > 
> > Enabled in the same way as for x86-64 and PPC64:
> > 
> > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > 
> > A value greater than 1 enables opcode output.
> > 
> > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > ---
> 
> Interesting patch... I haven't reviewed in detail, but I have a few
> quick comments.
> 
> > 
> > Changes in v2:
> >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> >  * use the same comparison for the ARM version checks
> >  * use misaligned accesses on ARMv6
> 
> You probably want to change the commit message now to reflect this.

Will do in the next version.

> 
> >  * fix the SEEN_MEM
> >  * fix the mem_words_used()
> > 
> >  arch/arm/Kconfig          |    1 +
> >  arch/arm/Makefile         |    1 +
> >  arch/arm/net/Makefile     |    3 +
> >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> >  5 files changed, 1017 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/arm/net/Makefile
> >  create mode 100644 arch/arm/net/bpf_jit_32.c
> >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > 
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index abba5b8..ea65c41 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -30,6 +30,7 @@ config ARM
> >  	select HAVE_SPARSE_IRQ
> >  	select GENERIC_IRQ_SHOW
> >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> 
> Have to tried your code with a Thumb-2 kernel?

Not yet.

> Quickly skimming though your patch, I don't see an obvious reason why we
> can't have that working, though I haven't tried it yet.
> 
> Note that it's fine to have the JIT generating ARM code, even if the rest
> if the kernel is Thumb-2.  This would only start to cause problems if we
> want to do things like set kprobes in the JITted code, or unwind out of
> the JITted code.
> 
> It's just necessary to make sure that calls/returns into/out of the
> JITted code are handled correctly.  You don't seem to do any scary
> arithmetic or mov to or from pc or lr, and it doesn't look like you ever
> call back into the kernel from JITted code, so the implementation is
> probably safe for ARM/Thumb interworking already (if I've understood
> correctly).

The JITed code calls back to the kernel for the load helpers. So setting
bit 0 is required.

> It doesn't look hard to port the JIT to generate Thumb-2 code directly
> either -- but I suggest not to worry about that initially.  So long as
> the ARM-based JIT works in a Thumb-2 kernel, it will be useful.
> 
> [...]
> 
> > diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
> > new file mode 100644
> > index 0000000..d2b86fa
> > --- /dev/null
> > +++ b/arch/arm/net/bpf_jit_32.c
> 
> [...]
> 
> > +static int build_body(struct jit_ctx *ctx)
> > +{
> 
> [...]
> 
> > +		case BPF_S_ALU_DIV_K:
> > +			/* K is not zero, it was previously checked */
> > +			emit_mov_i(ARM_R1, k, ctx);
> > +			goto div;
> > +		case BPF_S_ALU_DIV_X:
> > +			ctx->seen |= SEEN_X;
> > +			emit(ARM_CMP_I(r_X, 0), ctx);
> > +			emit_err_ret(ARM_COND_EQ, ctx);
> > +			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
> > +div:
> > +			ctx->seen |= SEEN_CALL;
> > +
> > +			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
> > +			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
> > +			emit(ARM_BLX_R(r_scratch), ctx);
> > +			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
> > +			break;
> 
> I don't know how much division is used by the packet filter JIT.  If
> it gets used a significant amount, you might want to support hardware
> divide for CPUs that have it:

Division rarely appears in "normal" BPF filters: it must be an explicit
part of the human-readable filter expression (the BPF compiler does not
generate division opcodes in other cases, AFAICT). Nonetheless, support
for hardware division would spare a bit of stack space for filters like
"len / 100 == 1".

> Cortex-A15 and later processors may have hardware integer divide
> support.  You can check for its availability at runtime using by testing
> the HWCAP_IDIVA (for ARM) or HWCAP_IDIVT (for Thumb) bits in elf_hwcap
> (see arch/arm/include/asm/hwcap.h).

I will include this in the next version of the patch.

Cheers,
Mircea

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 15:52           ` Dave Martin
@ 2011-12-19 17:14             ` Mircea Gherzan
  -1 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-19 17:14 UTC (permalink / raw)
  To: Dave Martin; +Cc: linux-arm-kernel, netdev

Hi,

On Mon, Dec 19, 2011 at 03:52:52PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 04:33:38PM +0100, Uwe Kleine-K?nig wrote:
> > Hello,
> > 
> > On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
> > > On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> > > > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > > > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > > > > Based of Matt Evans's PPC64 implementation.
> > > > > Note that it's fine to have the JIT generating ARM code, even if the rest
> > > > > if the kernel is Thumb-2.  This would only start to cause problems if we
> > > > > want to do things like set kprobes in the JITted code, or unwind out of
> > > > > the JITted code.
> > > > or use a CPU that doesn't speak ARM (e.g. v7M)
> > > 
> > > Indeed... I was assuming though that that would be out of scope for the
> > > first iteration of this code.
> > Right, depending on !THUMB2 is fine. Only generating ARM code with
> > THUMB2=y is not.
> 
> The kernel doesn't support v7-M upstream though.  CONFIG_THUMB2_KERNEL=y
> doesn't mean that there is no ARM code present -- there _is_ ARM code in
> the kernel image even with this option (though not very much).

What about relying at runtime on bits[3:0] of ID_PFR0? And this only for
ARMv7.

Are there any other ARM cores of interest that do _not_ support the ARM
instruction set?

Thanks,
Mircea

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 17:14             ` Mircea Gherzan
  0 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-19 17:14 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

On Mon, Dec 19, 2011 at 03:52:52PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 04:33:38PM +0100, Uwe Kleine-K?nig wrote:
> > Hello,
> > 
> > On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
> > > On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
> > > > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > > > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > > > > Based of Matt Evans's PPC64 implementation.
> > > > > Note that it's fine to have the JIT generating ARM code, even if the rest
> > > > > if the kernel is Thumb-2.  This would only start to cause problems if we
> > > > > want to do things like set kprobes in the JITted code, or unwind out of
> > > > > the JITted code.
> > > > or use a CPU that doesn't speak ARM (e.g. v7M)
> > > 
> > > Indeed... I was assuming though that that would be out of scope for the
> > > first iteration of this code.
> > Right, depending on !THUMB2 is fine. Only generating ARM code with
> > THUMB2=y is not.
> 
> The kernel doesn't support v7-M upstream though.  CONFIG_THUMB2_KERNEL=y
> doesn't mean that there is no ARM code present -- there _is_ ARM code in
> the kernel image even with this option (though not very much).

What about relying at runtime on bits[3:0] of ID_PFR0? And this only for
ARMv7.

Are there any other ARM cores of interest that do _not_ support the ARM
instruction set?

Thanks,
Mircea

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 15:52           ` Dave Martin
@ 2011-12-19 17:39             ` Catalin Marinas
  -1 siblings, 0 replies; 30+ messages in thread
From: Catalin Marinas @ 2011-12-19 17:39 UTC (permalink / raw)
  To: Dave Martin; +Cc: Uwe Kleine-K?nig, Mircea Gherzan, linux-arm-kernel, netdev

On 19 December 2011 15:52, Dave Martin <dave.martin@linaro.org> wrote:
> On Mon, Dec 19, 2011 at 04:33:38PM +0100, Uwe Kleine-K?nig wrote:
>> On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
>> > On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
>> > > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
>> > > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
>> > > > > Based of Matt Evans's PPC64 implementation.
>> > > > Note that it's fine to have the JIT generating ARM code, even if the rest
>> > > > if the kernel is Thumb-2.  This would only start to cause problems if we
>> > > > want to do things like set kprobes in the JITted code, or unwind out of
>> > > > the JITted code.
>> > > or use a CPU that doesn't speak ARM (e.g. v7M)
>> >
>> > Indeed... I was assuming though that that would be out of scope for the
>> > first iteration of this code.
>> Right, depending on !THUMB2 is fine. Only generating ARM code with
>> THUMB2=y is not.
>
> The kernel doesn't support v7-M upstream though.  CONFIG_THUMB2_KERNEL=y
> doesn't mean that there is no ARM code present -- there _is_ ARM code in
> the kernel image even with this option (though not very much).
>
> IMHO, CONFIG_THUMB2_KERNEL should be viewed as similiar to an optimisation
> option or other code gen options -- it's a hint to use the Thumb-2
> instruction set in preference to ARM, but doesn't make a 100% guarantee.
> Some parts of the kernel do contain ARM code even in this configuration.
>
>
> The real problem we have here is that we do not distinguish between v7-A
> and v7-M in Kconfig (this is not the same thing as CONFIG_MMU, since
> there's no special reason why it should be impossible to build a ucLinux
> kernel for v7-A -- in which case the ARM instruction is available)

If we ever take the time to push ARMv7-M into mainline, my patches
actually introduce CONFIG_CPU_V7M:

http://www.linux-arm.org/git?p=linux-2.6-stable.git;a=commitdiff;h=53a9ca6eccb3e14c7d9045870464cbfb37b0bb3b

-- 
Catalin

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 17:39             ` Catalin Marinas
  0 siblings, 0 replies; 30+ messages in thread
From: Catalin Marinas @ 2011-12-19 17:39 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 December 2011 15:52, Dave Martin <dave.martin@linaro.org> wrote:
> On Mon, Dec 19, 2011 at 04:33:38PM +0100, Uwe Kleine-K?nig wrote:
>> On Mon, Dec 19, 2011 at 03:24:18PM +0000, Dave Martin wrote:
>> > On Mon, Dec 19, 2011 at 04:19:58PM +0100, Uwe Kleine-K?nig wrote:
>> > > On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
>> > > > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
>> > > > > Based of Matt Evans's PPC64 implementation.
>> > > > Note that it's fine to have the JIT generating ARM code, even if the rest
>> > > > if the kernel is Thumb-2. ?This would only start to cause problems if we
>> > > > want to do things like set kprobes in the JITted code, or unwind out of
>> > > > the JITted code.
>> > > or use a CPU that doesn't speak ARM (e.g. v7M)
>> >
>> > Indeed... I was assuming though that that would be out of scope for the
>> > first iteration of this code.
>> Right, depending on !THUMB2 is fine. Only generating ARM code with
>> THUMB2=y is not.
>
> The kernel doesn't support v7-M upstream though. ?CONFIG_THUMB2_KERNEL=y
> doesn't mean that there is no ARM code present -- there _is_ ARM code in
> the kernel image even with this option (though not very much).
>
> IMHO, CONFIG_THUMB2_KERNEL should be viewed as similiar to an optimisation
> option or other code gen options -- it's a hint to use the Thumb-2
> instruction set in preference to ARM, but doesn't make a 100% guarantee.
> Some parts of the kernel do contain ARM code even in this configuration.
>
>
> The real problem we have here is that we do not distinguish between v7-A
> and v7-M in Kconfig (this is not the same thing as CONFIG_MMU, since
> there's no special reason why it should be impossible to build a ucLinux
> kernel for v7-A -- in which case the ARM instruction is available)

If we ever take the time to push ARMv7-M into mainline, my patches
actually introduce CONFIG_CPU_V7M:

http://www.linux-arm.org/git?p=linux-2.6-stable.git;a=commitdiff;h=53a9ca6eccb3e14c7d9045870464cbfb37b0bb3b

-- 
Catalin

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 16:45     ` Mircea Gherzan
@ 2011-12-19 18:18       ` Dave Martin
  -1 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 18:18 UTC (permalink / raw)
  To: Mircea Gherzan; +Cc: linux-arm-kernel, netdev

On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:
> Hi,
> 
> On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > Based of Matt Evans's PPC64 implementation.
> > > 
> > > Supports only ARM mode with EABI.
> > > 
> > > Supports both little and big endian. Depends on the support for
> > > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > > that deal with ancillary data. The scratch memory of the filter
> > > lives on the stack.
> > > 
> > > Enabled in the same way as for x86-64 and PPC64:
> > > 
> > > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > > 
> > > A value greater than 1 enables opcode output.
> > > 
> > > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > > ---
> > 
> > Interesting patch... I haven't reviewed in detail, but I have a few
> > quick comments.
> > 
> > > 
> > > Changes in v2:
> > >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> > >  * use the same comparison for the ARM version checks
> > >  * use misaligned accesses on ARMv6
> > 
> > You probably want to change the commit message now to reflect this.
> 
> Will do in the next version.
> 
> > 
> > >  * fix the SEEN_MEM
> > >  * fix the mem_words_used()
> > > 
> > >  arch/arm/Kconfig          |    1 +
> > >  arch/arm/Makefile         |    1 +
> > >  arch/arm/net/Makefile     |    3 +
> > >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> > >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> > >  5 files changed, 1017 insertions(+), 0 deletions(-)
> > >  create mode 100644 arch/arm/net/Makefile
> > >  create mode 100644 arch/arm/net/bpf_jit_32.c
> > >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > > 
> > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > > index abba5b8..ea65c41 100644
> > > --- a/arch/arm/Kconfig
> > > +++ b/arch/arm/Kconfig
> > > @@ -30,6 +30,7 @@ config ARM
> > >  	select HAVE_SPARSE_IRQ
> > >  	select GENERIC_IRQ_SHOW
> > >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> > 
> > Have to tried your code with a Thumb-2 kernel?
> 
> Not yet.
> 
> > Quickly skimming though your patch, I don't see an obvious reason why we
> > can't have that working, though I haven't tried it yet.
> > 
> > Note that it's fine to have the JIT generating ARM code, even if the rest
> > if the kernel is Thumb-2.  This would only start to cause problems if we
> > want to do things like set kprobes in the JITted code, or unwind out of
> > the JITted code.
> > 
> > It's just necessary to make sure that calls/returns into/out of the
> > JITted code are handled correctly.  You don't seem to do any scary
> > arithmetic or mov to or from pc or lr, and it doesn't look like you ever
> > call back into the kernel from JITted code, so the implementation is
> > probably safe for ARM/Thumb interworking already (if I've understood
> > correctly).
> 
> The JITed code calls back to the kernel for the load helpers. So setting
> bit 0 is required.

When you take the address of a link-time external function symbol,
bit[0] in the address will automatically be set appropriately by the
linker to indicate the target instruction set -- you already use BX/BLX
to jump to such symbols, so you should switch correctly when calling
_to_ the kernel.

Returns should also work, except for old-style "mov pc,lr" returns made
in Thumb code (from ARM code, this magically works for >= v7).  Such returns
only happen in hand-written assembler: for C code, the compiler always
generates proper AEABI-compliant return sequences.

So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
there should be no problem.

I think the only code which you call from the JIT output but which does
not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.


I have a quick hacked-up patch (below) which attempts to fix this;
I'd be interested if this works for you  -- but finalising your ARM-only
version of the patch should still be the priority.

If this fix does work, I'll turn it into a proper patch, as we can maybe
use it more widely.

[...]

> > > +		case BPF_S_ALU_DIV_X:
> > > +			ctx->seen |= SEEN_X;
> > > +			emit(ARM_CMP_I(r_X, 0), ctx);
> > > +			emit_err_ret(ARM_COND_EQ, ctx);
> > > +			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
> > > +div:
> > > +			ctx->seen |= SEEN_CALL;
> > > +
> > > +			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
> > > +			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
> > > +			emit(ARM_BLX_R(r_scratch), ctx);
> > > +			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
> > > +			break;
> > 
> > I don't know how much division is used by the packet filter JIT.  If
> > it gets used a significant amount, you might want to support hardware
> > divide for CPUs that have it:
> 
> Division rarely appears in "normal" BPF filters: it must be an explicit
> part of the human-readable filter expression (the BPF compiler does not
> generate division opcodes in other cases, AFAICT). Nonetheless, support
> for hardware division would spare a bit of stack space for filters like
> "len / 100 == 1".
> 
> > Cortex-A15 and later processors may have hardware integer divide
> > support.  You can check for its availability at runtime using by testing
> > the HWCAP_IDIVA (for ARM) or HWCAP_IDIVT (for Thumb) bits in elf_hwcap
> > (see arch/arm/include/asm/hwcap.h).
> 
> I will include this in the next version of the patch.

Ok, cool

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 18:18       ` Dave Martin
  0 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 18:18 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:
> Hi,
> 
> On Mon, Dec 19, 2011 at 12:50:21PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 09:40:30AM +0100, Mircea Gherzan wrote:
> > > Based of Matt Evans's PPC64 implementation.
> > > 
> > > Supports only ARM mode with EABI.
> > > 
> > > Supports both little and big endian. Depends on the support for
> > > unaligned loads on ARMv7. Does not support all the BPF opcodes
> > > that deal with ancillary data. The scratch memory of the filter
> > > lives on the stack.
> > > 
> > > Enabled in the same way as for x86-64 and PPC64:
> > > 
> > > 	echo 1 > /proc/sys/net/core/bpf_jit_enable
> > > 
> > > A value greater than 1 enables opcode output.
> > > 
> > > Signed-off-by: Mircea Gherzan <mgherzan@gmail.com>
> > > ---
> > 
> > Interesting patch... I haven't reviewed in detail, but I have a few
> > quick comments.
> > 
> > > 
> > > Changes in v2:
> > >  * enable the compiler ony for ARMv5+ because of the BLX instruction
> > >  * use the same comparison for the ARM version checks
> > >  * use misaligned accesses on ARMv6
> > 
> > You probably want to change the commit message now to reflect this.
> 
> Will do in the next version.
> 
> > 
> > >  * fix the SEEN_MEM
> > >  * fix the mem_words_used()
> > > 
> > >  arch/arm/Kconfig          |    1 +
> > >  arch/arm/Makefile         |    1 +
> > >  arch/arm/net/Makefile     |    3 +
> > >  arch/arm/net/bpf_jit_32.c |  838 +++++++++++++++++++++++++++++++++++++++++++++
> > >  arch/arm/net/bpf_jit_32.h |  174 ++++++++++
> > >  5 files changed, 1017 insertions(+), 0 deletions(-)
> > >  create mode 100644 arch/arm/net/Makefile
> > >  create mode 100644 arch/arm/net/bpf_jit_32.c
> > >  create mode 100644 arch/arm/net/bpf_jit_32.h
> > > 
> > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > > index abba5b8..ea65c41 100644
> > > --- a/arch/arm/Kconfig
> > > +++ b/arch/arm/Kconfig
> > > @@ -30,6 +30,7 @@ config ARM
> > >  	select HAVE_SPARSE_IRQ
> > >  	select GENERIC_IRQ_SHOW
> > >  	select CPU_PM if (SUSPEND || CPU_IDLE)
> > > +	select HAVE_BPF_JIT if (!THUMB2_KERNEL && AEABI)
> > 
> > Have to tried your code with a Thumb-2 kernel?
> 
> Not yet.
> 
> > Quickly skimming though your patch, I don't see an obvious reason why we
> > can't have that working, though I haven't tried it yet.
> > 
> > Note that it's fine to have the JIT generating ARM code, even if the rest
> > if the kernel is Thumb-2.  This would only start to cause problems if we
> > want to do things like set kprobes in the JITted code, or unwind out of
> > the JITted code.
> > 
> > It's just necessary to make sure that calls/returns into/out of the
> > JITted code are handled correctly.  You don't seem to do any scary
> > arithmetic or mov to or from pc or lr, and it doesn't look like you ever
> > call back into the kernel from JITted code, so the implementation is
> > probably safe for ARM/Thumb interworking already (if I've understood
> > correctly).
> 
> The JITed code calls back to the kernel for the load helpers. So setting
> bit 0 is required.

When you take the address of a link-time external function symbol,
bit[0] in the address will automatically be set appropriately by the
linker to indicate the target instruction set -- you already use BX/BLX
to jump to such symbols, so you should switch correctly when calling
_to_ the kernel.

Returns should also work, except for old-style "mov pc,lr" returns made
in Thumb code (from ARM code, this magically works for >= v7).  Such returns
only happen in hand-written assembler: for C code, the compiler always
generates proper AEABI-compliant return sequences.

So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
there should be no problem.

I think the only code which you call from the JIT output but which does
not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.


I have a quick hacked-up patch (below) which attempts to fix this;
I'd be interested if this works for you  -- but finalising your ARM-only
version of the patch should still be the priority.

If this fix does work, I'll turn it into a proper patch, as we can maybe
use it more widely.

[...]

> > > +		case BPF_S_ALU_DIV_X:
> > > +			ctx->seen |= SEEN_X;
> > > +			emit(ARM_CMP_I(r_X, 0), ctx);
> > > +			emit_err_ret(ARM_COND_EQ, ctx);
> > > +			emit(ARM_MOV_R(ARM_R1, r_X), ctx);
> > > +div:
> > > +			ctx->seen |= SEEN_CALL;
> > > +
> > > +			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
> > > +			emit_mov_i(r_scratch, (u32)__aeabi_uidiv, ctx);
> > > +			emit(ARM_BLX_R(r_scratch), ctx);
> > > +			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
> > > +			break;
> > 
> > I don't know how much division is used by the packet filter JIT.  If
> > it gets used a significant amount, you might want to support hardware
> > divide for CPUs that have it:
> 
> Division rarely appears in "normal" BPF filters: it must be an explicit
> part of the human-readable filter expression (the BPF compiler does not
> generate division opcodes in other cases, AFAICT). Nonetheless, support
> for hardware division would spare a bit of stack space for filters like
> "len / 100 == 1".
> 
> > Cortex-A15 and later processors may have hardware integer divide
> > support.  You can check for its availability at runtime using by testing
> > the HWCAP_IDIVA (for ARM) or HWCAP_IDIVT (for Thumb) bits in elf_hwcap
> > (see arch/arm/include/asm/hwcap.h).
> 
> I will include this in the next version of the patch.

Ok, cool

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 18:18       ` Dave Martin
@ 2011-12-19 18:29         ` Dave Martin
  -1 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 18:29 UTC (permalink / raw)
  To: Mircea Gherzan; +Cc: linux-arm-kernel, netdev

On Mon, Dec 19, 2011 at 06:18:39PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:

[...]

> > The JITed code calls back to the kernel for the load helpers. So setting
> > bit 0 is required.
> 
> When you take the address of a link-time external function symbol,
> bit[0] in the address will automatically be set appropriately by the
> linker to indicate the target instruction set -- you already use BX/BLX
> to jump to such symbols, so you should switch correctly when calling
> _to_ the kernel.
> 
> Returns should also work, except for old-style "mov pc,lr" returns made
> in Thumb code (from ARM code, this magically works for >= v7).  Such returns
> only happen in hand-written assembler: for C code, the compiler always
> generates proper AEABI-compliant return sequences.
> 
> So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
> there should be no problem.
> 
> I think the only code which you call from the JIT output but which does
> not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.
> 
> 
> I have a quick hacked-up patch (below) which attempts to fix this;
> I'd be interested if this works for you  -- but finalising your ARM-only
> version of the patch should still be the priority.
> 
> If this fix does work, I'll turn it into a proper patch, as we can maybe
> use it more widely.

Oops, I forgot to paste in my patch ... here it is.

Cheers
---Dave

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index b6e65de..013bfaf 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -313,4 +313,39 @@
 	.size \name , . - \name
 	.endm
 
+/*
+ * Helper macro to abstract a function return, where the return address is
+ * held in a register:
+ */
+.macro __def_bret cc
+	.macro bret\cc Rm:req
+#if __LINUX_ARM_ARCH__ < 7
+		mov\cc	pc, \Rm
+#else
+		bx\cc	\Rm
+#endif
+	
+	.endm
+.endm
+
+__def_bret
+__def_bret eq
+__def_bret ne
+__def_bret hs
+__def_bret cs	
+__def_bret lo
+__def_bret cc
+__def_bret vs
+__def_bret vc
+__def_bret mi
+__def_bret pl
+__def_bret hi
+__def_bret ls
+__def_bret ge
+__def_bret lt
+__def_bret gt
+__def_bret le
+
+.purgem __def_bret
+
 #endif /* __ASM_ASSEMBLER_H__ */
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index c562f64..c3a4dbf 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -210,7 +210,7 @@ ENTRY(__aeabi_uidiv)
 UNWIND(.fnstart)
 
 	subs	r2, r1, #1
-	moveq	pc, lr
+	breteq	lr
 	bcc	Ldiv0
 	cmp	r0, r1
 	bls	11f
@@ -220,16 +220,16 @@ UNWIND(.fnstart)
 	ARM_DIV_BODY r0, r1, r2, r3
 
 	mov	r0, r2
-	mov	pc, lr
+	bret	lr
 
 11:	moveq	r0, #1
 	movne	r0, #0
-	mov	pc, lr
+	bret	lr
 
 12:	ARM_DIV2_ORDER r1, r2
 
 	mov	r0, r0, lsr r2
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__udivsi3)
@@ -244,11 +244,11 @@ UNWIND(.fnstart)
 	moveq   r0, #0
 	tsthi	r1, r2				@ see if divisor is power of 2
 	andeq	r0, r0, r2
-	movls	pc, lr
+	bretls	lr
 
 	ARM_MOD_BODY r0, r1, r2, r3
 
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
@@ -274,23 +274,23 @@ UNWIND(.fnstart)
 
 	cmp	ip, #0
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 10:	teq	ip, r0				@ same sign ?
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 11:	movlo	r0, #0
 	moveq	r0, ip, asr #31
 	orreq	r0, r0, #1
-	mov	pc, lr
+	bret	lr
 
 12:	ARM_DIV2_ORDER r1, r2
 
 	cmp	ip, #0
 	mov	r0, r3, lsr r2
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__divsi3)
@@ -315,7 +315,7 @@ UNWIND(.fnstart)
 
 10:	cmp	ip, #0
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__modsi3)
@@ -331,7 +331,7 @@ UNWIND(.save {r0, r1, ip, lr}	)
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_uidivmod)
@@ -344,7 +344,7 @@ UNWIND(.save {r0, r1, ip, lr}	)
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_idivmod)

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-19 18:29         ` Dave Martin
  0 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-19 18:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 19, 2011 at 06:18:39PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:

[...]

> > The JITed code calls back to the kernel for the load helpers. So setting
> > bit 0 is required.
> 
> When you take the address of a link-time external function symbol,
> bit[0] in the address will automatically be set appropriately by the
> linker to indicate the target instruction set -- you already use BX/BLX
> to jump to such symbols, so you should switch correctly when calling
> _to_ the kernel.
> 
> Returns should also work, except for old-style "mov pc,lr" returns made
> in Thumb code (from ARM code, this magically works for >= v7).  Such returns
> only happen in hand-written assembler: for C code, the compiler always
> generates proper AEABI-compliant return sequences.
> 
> So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
> there should be no problem.
> 
> I think the only code which you call from the JIT output but which does
> not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.
> 
> 
> I have a quick hacked-up patch (below) which attempts to fix this;
> I'd be interested if this works for you  -- but finalising your ARM-only
> version of the patch should still be the priority.
> 
> If this fix does work, I'll turn it into a proper patch, as we can maybe
> use it more widely.

Oops, I forgot to paste in my patch ... here it is.

Cheers
---Dave

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index b6e65de..013bfaf 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -313,4 +313,39 @@
 	.size \name , . - \name
 	.endm
 
+/*
+ * Helper macro to abstract a function return, where the return address is
+ * held in a register:
+ */
+.macro __def_bret cc
+	.macro bret\cc Rm:req
+#if __LINUX_ARM_ARCH__ < 7
+		mov\cc	pc, \Rm
+#else
+		bx\cc	\Rm
+#endif
+	
+	.endm
+.endm
+
+__def_bret
+__def_bret eq
+__def_bret ne
+__def_bret hs
+__def_bret cs	
+__def_bret lo
+__def_bret cc
+__def_bret vs
+__def_bret vc
+__def_bret mi
+__def_bret pl
+__def_bret hi
+__def_bret ls
+__def_bret ge
+__def_bret lt
+__def_bret gt
+__def_bret le
+
+.purgem __def_bret
+
 #endif /* __ASM_ASSEMBLER_H__ */
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index c562f64..c3a4dbf 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -210,7 +210,7 @@ ENTRY(__aeabi_uidiv)
 UNWIND(.fnstart)
 
 	subs	r2, r1, #1
-	moveq	pc, lr
+	breteq	lr
 	bcc	Ldiv0
 	cmp	r0, r1
 	bls	11f
@@ -220,16 +220,16 @@ UNWIND(.fnstart)
 	ARM_DIV_BODY r0, r1, r2, r3
 
 	mov	r0, r2
-	mov	pc, lr
+	bret	lr
 
 11:	moveq	r0, #1
 	movne	r0, #0
-	mov	pc, lr
+	bret	lr
 
 12:	ARM_DIV2_ORDER r1, r2
 
 	mov	r0, r0, lsr r2
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__udivsi3)
@@ -244,11 +244,11 @@ UNWIND(.fnstart)
 	moveq   r0, #0
 	tsthi	r1, r2				@ see if divisor is power of 2
 	andeq	r0, r0, r2
-	movls	pc, lr
+	bretls	lr
 
 	ARM_MOD_BODY r0, r1, r2, r3
 
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
@@ -274,23 +274,23 @@ UNWIND(.fnstart)
 
 	cmp	ip, #0
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 10:	teq	ip, r0				@ same sign ?
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 11:	movlo	r0, #0
 	moveq	r0, ip, asr #31
 	orreq	r0, r0, #1
-	mov	pc, lr
+	bret	lr
 
 12:	ARM_DIV2_ORDER r1, r2
 
 	cmp	ip, #0
 	mov	r0, r3, lsr r2
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__divsi3)
@@ -315,7 +315,7 @@ UNWIND(.fnstart)
 
 10:	cmp	ip, #0
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__modsi3)
@@ -331,7 +331,7 @@ UNWIND(.save {r0, r1, ip, lr}	)
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_uidivmod)
@@ -344,7 +344,7 @@ UNWIND(.save {r0, r1, ip, lr}	)
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	bret	lr
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_idivmod)

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-19 18:29         ` Dave Martin
@ 2011-12-21 14:59           ` Mircea Gherzan
  -1 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-21 14:59 UTC (permalink / raw)
  To: Dave Martin; +Cc: linux-arm-kernel, netdev, linux

On Mon, Dec 19, 2011 at 06:29:08PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 06:18:39PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:
> 
> [...]
> 
> > > The JITed code calls back to the kernel for the load helpers. So setting
> > > bit 0 is required.
> > 
> > When you take the address of a link-time external function symbol,
> > bit[0] in the address will automatically be set appropriately by the
> > linker to indicate the target instruction set -- you already use BX/BLX
> > to jump to such symbols, so you should switch correctly when calling
> > _to_ the kernel.
> > 
> > Returns should also work, except for old-style "mov pc,lr" returns made
> > in Thumb code (from ARM code, this magically works for >= v7).  Such returns
> > only happen in hand-written assembler: for C code, the compiler always
> > generates proper AEABI-compliant return sequences.
> > 
> > So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
> > there should be no problem.
> > 
> > I think the only code which you call from the JIT output but which does
> > not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.
> > 
> > 
> > I have a quick hacked-up patch (below) which attempts to fix this;
> > I'd be interested if this works for you  -- but finalising your ARM-only
> > version of the patch should still be the priority.
> > 
> > If this fix does work, I'll turn it into a proper patch, as we can maybe
> > use it more widely.
> 
> Oops, I forgot to paste in my patch ... here it is.
> 
> Cheers
> ---Dave
> 
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index b6e65de..013bfaf 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -313,4 +313,39 @@
>  	.size \name , . - \name
>  	.endm
>  
> +/*
> + * Helper macro to abstract a function return, where the return address is
> + * held in a register:
> + */
> +.macro __def_bret cc
> +	.macro bret\cc Rm:req
> +#if __LINUX_ARM_ARCH__ < 7
> +		mov\cc	pc, \Rm
> +#else
> +		bx\cc	\Rm
> +#endif

This patch worked for me, because I'm using ARMv7 exclusively. But I'm
inclined to think that your patch will fail on ARMv6T (for example) with
a Thumb2 kernel, because the "mov pc, Rm" is a simple branch on pre-v7
cores.

Note however that v3 of my patch no longer requires any changes to the
assembly code: the C trampoline/wrapper for integer division ensures a
proper return. And this one is used only for the DIV_X BPF opcode, which
appears also quite rarely.

[...]

Cheers,
Mircea

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-21 14:59           ` Mircea Gherzan
  0 siblings, 0 replies; 30+ messages in thread
From: Mircea Gherzan @ 2011-12-21 14:59 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Dec 19, 2011 at 06:29:08PM +0000, Dave Martin wrote:
> On Mon, Dec 19, 2011 at 06:18:39PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:
> 
> [...]
> 
> > > The JITed code calls back to the kernel for the load helpers. So setting
> > > bit 0 is required.
> > 
> > When you take the address of a link-time external function symbol,
> > bit[0] in the address will automatically be set appropriately by the
> > linker to indicate the target instruction set -- you already use BX/BLX
> > to jump to such symbols, so you should switch correctly when calling
> > _to_ the kernel.
> > 
> > Returns should also work, except for old-style "mov pc,lr" returns made
> > in Thumb code (from ARM code, this magically works for >= v7).  Such returns
> > only happen in hand-written assembler: for C code, the compiler always
> > generates proper AEABI-compliant return sequences.
> > 
> > So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
> > there should be no problem.
> > 
> > I think the only code which you call from the JIT output but which does
> > not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.
> > 
> > 
> > I have a quick hacked-up patch (below) which attempts to fix this;
> > I'd be interested if this works for you  -- but finalising your ARM-only
> > version of the patch should still be the priority.
> > 
> > If this fix does work, I'll turn it into a proper patch, as we can maybe
> > use it more widely.
> 
> Oops, I forgot to paste in my patch ... here it is.
> 
> Cheers
> ---Dave
> 
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index b6e65de..013bfaf 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -313,4 +313,39 @@
>  	.size \name , . - \name
>  	.endm
>  
> +/*
> + * Helper macro to abstract a function return, where the return address is
> + * held in a register:
> + */
> +.macro __def_bret cc
> +	.macro bret\cc Rm:req
> +#if __LINUX_ARM_ARCH__ < 7
> +		mov\cc	pc, \Rm
> +#else
> +		bx\cc	\Rm
> +#endif

This patch worked for me, because I'm using ARMv7 exclusively. But I'm
inclined to think that your patch will fail on ARMv6T (for example) with
a Thumb2 kernel, because the "mov pc, Rm" is a simple branch on pre-v7
cores.

Note however that v3 of my patch no longer requires any changes to the
assembly code: the C trampoline/wrapper for integer division ensures a
proper return. And this one is used only for the DIV_X BPF opcode, which
appears also quite rarely.

[...]

Cheers,
Mircea

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-21 14:59           ` Mircea Gherzan
@ 2011-12-22 17:00             ` Dave Martin
  -1 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-22 17:00 UTC (permalink / raw)
  To: Mircea Gherzan; +Cc: linux-arm-kernel, netdev, linux

On Wed, Dec 21, 2011 at 04:59:13PM +0200, Mircea Gherzan wrote:
> On Mon, Dec 19, 2011 at 06:29:08PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 06:18:39PM +0000, Dave Martin wrote:
> > > On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:
> > 
> > [...]
> > 
> > > > The JITed code calls back to the kernel for the load helpers. So setting
> > > > bit 0 is required.
> > > 
> > > When you take the address of a link-time external function symbol,
> > > bit[0] in the address will automatically be set appropriately by the
> > > linker to indicate the target instruction set -- you already use BX/BLX
> > > to jump to such symbols, so you should switch correctly when calling
> > > _to_ the kernel.
> > > 
> > > Returns should also work, except for old-style "mov pc,lr" returns made
> > > in Thumb code (from ARM code, this magically works for >= v7).  Such returns
> > > only happen in hand-written assembler: for C code, the compiler always
> > > generates proper AEABI-compliant return sequences.
> > > 
> > > So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
> > > there should be no problem.
> > > 
> > > I think the only code which you call from the JIT output but which does
> > > not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.
> > > 
> > > 
> > > I have a quick hacked-up patch (below) which attempts to fix this;
> > > I'd be interested if this works for you  -- but finalising your ARM-only
> > > version of the patch should still be the priority.
> > > 
> > > If this fix does work, I'll turn it into a proper patch, as we can maybe
> > > use it more widely.
> > 
> > Oops, I forgot to paste in my patch ... here it is.
> > 
> > Cheers
> > ---Dave
> > 
> > diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> > index b6e65de..013bfaf 100644
> > --- a/arch/arm/include/asm/assembler.h
> > +++ b/arch/arm/include/asm/assembler.h
> > @@ -313,4 +313,39 @@
> >  	.size \name , . - \name
> >  	.endm
> >  
> > +/*
> > + * Helper macro to abstract a function return, where the return address is
> > + * held in a register:
> > + */
> > +.macro __def_bret cc
> > +	.macro bret\cc Rm:req
> > +#if __LINUX_ARM_ARCH__ < 7
> > +		mov\cc	pc, \Rm
> > +#else
> > +		bx\cc	\Rm
> > +#endif
> 
> This patch worked for me, because I'm using ARMv7 exclusively. But I'm
> inclined to think that your patch will fail on ARMv6T (for example) with
> a Thumb2 kernel, because the "mov pc, Rm" is a simple branch on pre-v7
> cores.

Thumb-2 kernels are not currently supported on ARMv6T2, and that's
probably not going to change for the foreseeable future.  There are
not many v6T2 CPUs out there.

In principle, the condition could be __LINUX_ARM_ARCH__ < 5, since the
bx instruction is guaranteed to be available on all ARMv5 CPUs and
later (ARMv5T is not required).

For this sketch of the patch though, I just went for the principle of
least impact, and changed the behaviour only for CPUs where it's
definitely needed (i.e., v7+).

Thanks for trying it out, though.

> Note however that v3 of my patch no longer requires any changes to the
> assembly code: the C trampoline/wrapper for integer division ensures a
> proper return. And this one is used only for the DIV_X BPF opcode, which
> appears also quite rarely.

I may try to push my patch independently -- this way we might make
a lot of the existing assembler files properly EABI compliant at low
effort; as and when that happens, your C helper wouldn't be needed
any more, but it makes sense for you to keep if for now.

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-22 17:00             ` Dave Martin
  0 siblings, 0 replies; 30+ messages in thread
From: Dave Martin @ 2011-12-22 17:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Dec 21, 2011 at 04:59:13PM +0200, Mircea Gherzan wrote:
> On Mon, Dec 19, 2011 at 06:29:08PM +0000, Dave Martin wrote:
> > On Mon, Dec 19, 2011 at 06:18:39PM +0000, Dave Martin wrote:
> > > On Mon, Dec 19, 2011 at 06:45:13PM +0200, Mircea Gherzan wrote:
> > 
> > [...]
> > 
> > > > The JITed code calls back to the kernel for the load helpers. So setting
> > > > bit 0 is required.
> > > 
> > > When you take the address of a link-time external function symbol,
> > > bit[0] in the address will automatically be set appropriately by the
> > > linker to indicate the target instruction set -- you already use BX/BLX
> > > to jump to such symbols, so you should switch correctly when calling
> > > _to_ the kernel.
> > > 
> > > Returns should also work, except for old-style "mov pc,lr" returns made
> > > in Thumb code (from ARM code, this magically works for >= v7).  Such returns
> > > only happen in hand-written assembler: for C code, the compiler always
> > > generates proper AEABI-compliant return sequences.
> > > 
> > > So, for calling load_func[], jit_get_skb_b etc. (which are C functions),
> > > there should be no problem.
> > > 
> > > I think the only code which you call from the JIT output but which does
> > > not return compliantly is __aeabi_uidiv() in arch/arm/lib/lib1funcs.S.
> > > 
> > > 
> > > I have a quick hacked-up patch (below) which attempts to fix this;
> > > I'd be interested if this works for you  -- but finalising your ARM-only
> > > version of the patch should still be the priority.
> > > 
> > > If this fix does work, I'll turn it into a proper patch, as we can maybe
> > > use it more widely.
> > 
> > Oops, I forgot to paste in my patch ... here it is.
> > 
> > Cheers
> > ---Dave
> > 
> > diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> > index b6e65de..013bfaf 100644
> > --- a/arch/arm/include/asm/assembler.h
> > +++ b/arch/arm/include/asm/assembler.h
> > @@ -313,4 +313,39 @@
> >  	.size \name , . - \name
> >  	.endm
> >  
> > +/*
> > + * Helper macro to abstract a function return, where the return address is
> > + * held in a register:
> > + */
> > +.macro __def_bret cc
> > +	.macro bret\cc Rm:req
> > +#if __LINUX_ARM_ARCH__ < 7
> > +		mov\cc	pc, \Rm
> > +#else
> > +		bx\cc	\Rm
> > +#endif
> 
> This patch worked for me, because I'm using ARMv7 exclusively. But I'm
> inclined to think that your patch will fail on ARMv6T (for example) with
> a Thumb2 kernel, because the "mov pc, Rm" is a simple branch on pre-v7
> cores.

Thumb-2 kernels are not currently supported on ARMv6T2, and that's
probably not going to change for the foreseeable future.  There are
not many v6T2 CPUs out there.

In principle, the condition could be __LINUX_ARM_ARCH__ < 5, since the
bx instruction is guaranteed to be available on all ARMv5 CPUs and
later (ARMv5T is not required).

For this sketch of the patch though, I just went for the principle of
least impact, and changed the behaviour only for CPUs where it's
definitely needed (i.e., v7+).

Thanks for trying it out, though.

> Note however that v3 of my patch no longer requires any changes to the
> assembly code: the C trampoline/wrapper for integer division ensures a
> proper return. And this one is used only for the DIV_X BPF opcode, which
> appears also quite rarely.

I may try to push my patch independently -- this way we might make
a lot of the existing assembler files properly EABI compliant at low
effort; as and when that happens, your C helper wouldn't be needed
any more, but it makes sense for you to keep if for now.

Cheers
---Dave

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-22 17:00             ` Dave Martin
@ 2011-12-22 17:34               ` David Laight
  -1 siblings, 0 replies; 30+ messages in thread
From: David Laight @ 2011-12-22 17:34 UTC (permalink / raw)
  To: Dave Martin, Mircea Gherzan; +Cc: linux-arm-kernel, netdev, linux

> In principle, the condition could be __LINUX_ARM_ARCH__ < 5, since the
> bx instruction is guaranteed to be available on all ARMv5 CPUs and
> later (ARMv5T is not required).

Can't this code look at the actual cpu being used, and
generate the relevant 'return' instruction?
(Instead of trying to use some compile-time value.)

	David

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-22 17:34               ` David Laight
  0 siblings, 0 replies; 30+ messages in thread
From: David Laight @ 2011-12-22 17:34 UTC (permalink / raw)
  To: linux-arm-kernel

> In principle, the condition could be __LINUX_ARM_ARCH__ < 5, since the
> bx instruction is guaranteed to be available on all ARMv5 CPUs and
> later (ARMv5T is not required).

Can't this code look at the actual cpu being used, and
generate the relevant 'return' instruction?
(Instead of trying to use some compile-time value.)

	David

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH v2] ARM: net: JIT compiler for packet filters
  2011-12-22 17:34               ` David Laight
@ 2011-12-23  2:26                 ` Nicolas Pitre
  -1 siblings, 0 replies; 30+ messages in thread
From: Nicolas Pitre @ 2011-12-23  2:26 UTC (permalink / raw)
  To: David Laight; +Cc: Dave Martin, Mircea Gherzan, netdev, linux, linux-arm-kernel

On Thu, 22 Dec 2011, David Laight wrote:

> > In principle, the condition could be __LINUX_ARM_ARCH__ < 5, since the
> > bx instruction is guaranteed to be available on all ARMv5 CPUs and
> > later (ARMv5T is not required).
> 
> Can't this code look at the actual cpu being used, and
> generate the relevant 'return' instruction?
> (Instead of trying to use some compile-time value.)

There is really no point doing so.  We intend to ultimately have a 
kernel that may support ARMv5 and lower, and another kernel to support 
ARMv6 and ARMv7.  So in a given context, either bx is always available 
(ARMv6+) or never required (ARMv5-).


Nicolas

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2] ARM: net: JIT compiler for packet filters
@ 2011-12-23  2:26                 ` Nicolas Pitre
  0 siblings, 0 replies; 30+ messages in thread
From: Nicolas Pitre @ 2011-12-23  2:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 22 Dec 2011, David Laight wrote:

> > In principle, the condition could be __LINUX_ARM_ARCH__ < 5, since the
> > bx instruction is guaranteed to be available on all ARMv5 CPUs and
> > later (ARMv5T is not required).
> 
> Can't this code look at the actual cpu being used, and
> generate the relevant 'return' instruction?
> (Instead of trying to use some compile-time value.)

There is really no point doing so.  We intend to ultimately have a 
kernel that may support ARMv5 and lower, and another kernel to support 
ARMv6 and ARMv7.  So in a given context, either bx is always available 
(ARMv6+) or never required (ARMv5-).


Nicolas

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2011-12-23  2:26 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-12-19  8:40 [PATCH v2] ARM: net: JIT compiler for packet filters Mircea Gherzan
2011-12-19  8:40 ` Mircea Gherzan
2011-12-19 12:50 ` Dave Martin
2011-12-19 12:50   ` Dave Martin
2011-12-19 15:19   ` Uwe Kleine-König
2011-12-19 15:19     ` Uwe Kleine-König
2011-12-19 15:24     ` Dave Martin
2011-12-19 15:24       ` Dave Martin
2011-12-19 15:33       ` Uwe Kleine-König
2011-12-19 15:33         ` Uwe Kleine-König
2011-12-19 15:52         ` Dave Martin
2011-12-19 15:52           ` Dave Martin
2011-12-19 17:14           ` Mircea Gherzan
2011-12-19 17:14             ` Mircea Gherzan
2011-12-19 17:39           ` Catalin Marinas
2011-12-19 17:39             ` Catalin Marinas
2011-12-19 16:45   ` Mircea Gherzan
2011-12-19 16:45     ` Mircea Gherzan
2011-12-19 18:18     ` Dave Martin
2011-12-19 18:18       ` Dave Martin
2011-12-19 18:29       ` Dave Martin
2011-12-19 18:29         ` Dave Martin
2011-12-21 14:59         ` Mircea Gherzan
2011-12-21 14:59           ` Mircea Gherzan
2011-12-22 17:00           ` Dave Martin
2011-12-22 17:00             ` Dave Martin
2011-12-22 17:34             ` David Laight
2011-12-22 17:34               ` David Laight
2011-12-23  2:26               ` Nicolas Pitre
2011-12-23  2:26                 ` Nicolas Pitre

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.