linux-kernel.vger.kernel.org archive mirror
* [PATCH] MIPS: MSA: misaligned support
@ 2015-03-18  1:16 Leonid Yegoshin
  2015-03-18 11:27 ` James Hogan
  2015-03-18 11:41 ` James Hogan
  0 siblings, 2 replies; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-18  1:16 UTC (permalink / raw)
  To: linux-mips, wangr, peterz, qais.yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

The MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address
exception behaviour on unaligned MSA load/store operations - from no
unaligned support at all up to full support in HW. In practice, HW is
expected to occasionally trigger AdE for unaligned data accesses
(misalignment). This is most likely on page boundaries, because HW
handling of two TLB entries in a single data access operation may be
complicated and expensive.

So, this patch handles MSA LD.df and ST.df Address Error exceptions.

It handles two cases separately - MSA still owned by the thread, and MSA
registers already saved in current->thread.fpu. If the thread still owns
the MSA unit, the load or store goes directly through the MSA unit and
touches only the single affected MSA register: saving and restoring the
full MSA context (512 bytes) on each misalignment exception would be
expensive! Preemption is disabled, of course.

Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
---
 arch/mips/include/asm/processor.h |    2 +
 arch/mips/include/uapi/asm/inst.h |   21 +++++
 arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
 arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
 4 files changed, 273 insertions(+)

diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index f1df4cb4a286..af2675060244 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -104,6 +104,8 @@ extern unsigned int vced_count, vcei_count;
 #endif
 
 union fpureg {
+	__u8    val8[FPU_REG_WIDTH / 8];
+	__u16   val16[FPU_REG_WIDTH / 16];
 	__u32	val32[FPU_REG_WIDTH / 32];
 	__u64	val64[FPU_REG_WIDTH / 64];
 };
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 89c22433b1c6..7ab6987cb7d5 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -58,6 +58,7 @@ enum spec_op {
 	dsll_op, spec7_unused_op, dsrl_op, dsra_op,
 	dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op
 };
+#define msa_op  mdmx_op
 
 /*
  * func field of spec2 opcode.
@@ -217,6 +218,14 @@ enum bshfl_func {
 };
 
 /*
+ * func field for MSA MI10 format
+ */
+enum msa_mi10_func {
+	msa_ld_op = 8,
+	msa_st_op = 9,
+};
+
+/*
  * (microMIPS) Major opcodes.
  */
 enum mm_major_op {
@@ -616,6 +625,17 @@ struct spec3_format {   /* SPEC3 */
 	;)))))
 };
 
+struct msa_mi10_format {        /* MSA */
+	__BITFIELD_FIELD(unsigned int opcode : 6,
+	__BITFIELD_FIELD(signed int s10 : 10,
+	__BITFIELD_FIELD(unsigned int rs : 5,
+	__BITFIELD_FIELD(unsigned int wd : 5,
+	__BITFIELD_FIELD(unsigned int func : 4,
+	__BITFIELD_FIELD(unsigned int df : 2,
+	;))))))
+};
+
+
 /*
  * microMIPS instruction formats (32-bit length)
  *
@@ -884,6 +904,7 @@ union mips_instruction {
 	struct p_format p_format;
 	struct f_format f_format;
 	struct ma_format ma_format;
+	struct msa_mi10_format msa_mi10_format;
 	struct b_format b_format;
 	struct ps_format ps_format;
 	struct v_format v_format;
diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
index 6c160c67984c..5f48f45f81e7 100644
--- a/arch/mips/kernel/r4k_fpu.S
+++ b/arch/mips/kernel/r4k_fpu.S
@@ -13,6 +13,7 @@
  * Copyright (C) 1999, 2001 Silicon Graphics, Inc.
  */
 #include <asm/asm.h>
+#include <asm/asmmacro.h>
 #include <asm/errno.h>
 #include <asm/fpregdef.h>
 #include <asm/mipsregs.h>
@@ -268,6 +269,112 @@ LEAF(_restore_fp_context32)
 	END(_restore_fp_context32)
 #endif
 
+#ifdef CONFIG_CPU_HAS_MSA
+
+	.macro  msa_ld_d    wd, base
+	ld_d    \wd, 0, \base
+	jalr    $0, $31
+	  nop
+	.align  4
+	.endm
+
+	.macro  msa_st_d    wd, base
+	st_d    \wd, 0, \base
+	jalr    $0, $31
+	  nop
+	.align  4
+	.endm
+
+LEAF(msa_to_wd)
+	.set    push
+	.set    noreorder
+	sll         t0, a0, 4
+	PTR_LA      t1, Lmsa_to
+	PTR_ADDU    t0, t0, t1
+	jalr        $0, t0
+	  nop
+Lmsa_to:
+	msa_ld_d    0, a1
+	msa_ld_d    1, a1
+	msa_ld_d    2, a1
+	msa_ld_d    3, a1
+	msa_ld_d    4, a1
+	msa_ld_d    5, a1
+	msa_ld_d    6, a1
+	msa_ld_d    7, a1
+	msa_ld_d    8, a1
+	msa_ld_d    9, a1
+	msa_ld_d    10, a1
+	msa_ld_d    11, a1
+	msa_ld_d    12, a1
+	msa_ld_d    13, a1
+	msa_ld_d    14, a1
+	msa_ld_d    15, a1
+	msa_ld_d    16, a1
+	msa_ld_d    17, a1
+	msa_ld_d    18, a1
+	msa_ld_d    19, a1
+	msa_ld_d    20, a1
+	msa_ld_d    21, a1
+	msa_ld_d    22, a1
+	msa_ld_d    23, a1
+	msa_ld_d    24, a1
+	msa_ld_d    25, a1
+	msa_ld_d    26, a1
+	msa_ld_d    27, a1
+	msa_ld_d    28, a1
+	msa_ld_d    29, a1
+	msa_ld_d    30, a1
+	msa_ld_d    31, a1
+	.set    pop
+	END(msa_to_wd)
+
+LEAF(msa_from_wd)
+	.set    push
+	.set    noreorder
+	sll         t0, a0, 4
+	PTR_LA      t1, Lmsa_from
+	PTR_ADDU    t0, t0, t1
+	jalr        $0, t0
+	  nop
+Lmsa_from:
+	msa_st_d    0, a1
+	msa_st_d    1, a1
+	msa_st_d    2, a1
+	msa_st_d    3, a1
+	msa_st_d    4, a1
+	msa_st_d    5, a1
+	msa_st_d    6, a1
+	msa_st_d    7, a1
+	msa_st_d    8, a1
+	msa_st_d    9, a1
+	msa_st_d    10, a1
+	msa_st_d    11, a1
+	msa_st_d    12, a1
+	msa_st_d    13, a1
+	msa_st_d    14, a1
+	msa_st_d    15, a1
+	msa_st_d    16, a1
+	msa_st_d    17, a1
+	msa_st_d    18, a1
+	msa_st_d    19, a1
+	msa_st_d    20, a1
+	msa_st_d    21, a1
+	msa_st_d    22, a1
+	msa_st_d    23, a1
+	msa_st_d    24, a1
+	msa_st_d    25, a1
+	msa_st_d    26, a1
+	msa_st_d    27, a1
+	msa_st_d    28, a1
+	msa_st_d    29, a1
+	msa_st_d    30, a1
+	msa_st_d    31, a1
+	.set    pop
+	END(msa_from_wd)
+
+#endif /* CONFIG_CPU_HAS_MSA */
+
 	.set	reorder
 
 	.type	fault@function
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index e11906dff885..558f41fa93c5 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -108,6 +108,11 @@ static u32 unaligned_action;
 #endif
 extern void show_registers(struct pt_regs *regs);
 
+#ifdef CONFIG_CPU_HAS_MSA
+void msa_to_wd(unsigned int wd, union fpureg *from);
+void msa_from_wd(unsigned int wd, union fpureg *to);
+#endif
+
 #ifdef __BIG_ENDIAN
 #define     LoadHW(addr, value, res)  \
 		__asm__ __volatile__ (".set\tnoat\n"        \
@@ -422,6 +427,64 @@ extern void show_registers(struct pt_regs *regs);
 		: "r" (value), "r" (addr), "i" (-EFAULT));
 #endif
 
+#ifdef CONFIG_CPU_HAS_MSA
+#ifdef __BIG_ENDIAN
+/*
+ * MSA data format conversion.
+ * Only for BIG ENDIAN - LITTLE ENDIAN has register format which matches memory
+ * layout contiguously.
+ *
+ * Conversion is done between two Double words and other formats (W/H/B)
+ * because kernel uses LD.D and ST.D to load/store MSA registers and keeps
+ * MSA registers in this format in current->thread.fpu.fpr
+ */
+static void msa_convert(union fpureg *to, union fpureg *from, int fmt)
+{
+	switch (fmt) {
+	case 0: /* byte */
+		to->val8[0] = from->val8[7];
+		to->val8[1] = from->val8[6];
+		to->val8[2] = from->val8[5];
+		to->val8[3] = from->val8[4];
+		to->val8[4] = from->val8[3];
+		to->val8[5] = from->val8[2];
+		to->val8[6] = from->val8[1];
+		to->val8[7] = from->val8[0];
+		to->val8[8] = from->val8[15];
+		to->val8[9] = from->val8[14];
+		to->val8[10] = from->val8[13];
+		to->val8[11] = from->val8[12];
+		to->val8[12] = from->val8[11];
+		to->val8[13] = from->val8[10];
+		to->val8[14] = from->val8[9];
+		to->val8[15] = from->val8[8];
+		break;
+
+	case 1: /* halfword */
+		to->val16[0] = from->val16[3];
+		to->val16[1] = from->val16[2];
+		to->val16[2] = from->val16[1];
+		to->val16[3] = from->val16[0];
+		to->val16[4] = from->val16[7];
+		to->val16[5] = from->val16[6];
+		to->val16[6] = from->val16[5];
+		to->val16[7] = from->val16[4];
+		break;
+
+	case 2: /* word */
+		to->val32[0] = from->val32[1];
+		to->val32[1] = from->val32[0];
+		to->val32[2] = from->val32[3];
+		to->val32[3] = from->val32[2];
+		break;
+
+	case 3: /* doubleword, no conversion */
+		break;
+	}
+}
+#endif
+#endif
+
 static void emulate_load_store_insn(struct pt_regs *regs,
 	void __user *addr, unsigned int __user *pc)
 {
@@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 #ifdef	CONFIG_EVA
 	mm_segment_t seg;
 #endif
+#ifdef CONFIG_CPU_HAS_MSA
+	union fpureg msadatabase[2], *msadata;
+	unsigned int func, df, rs, wd;
+#endif
 	origpc = (unsigned long)pc;
 	orig31 = regs->regs[31];
 
@@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			break;
 		return;
 
+#ifdef CONFIG_CPU_HAS_MSA
+	case msa_op:
+		if (cpu_has_mdmx)
+			goto sigill;
+
+		func = insn.msa_mi10_format.func;
+		switch (func) {
+		default:
+			goto sigbus;
+
+		case msa_ld_op:
+		case msa_st_op:
+			;
+		}
+
+		if (!thread_msa_context_live())
+			goto sigbus;
+
+		df = insn.msa_mi10_format.df;
+		rs = insn.msa_mi10_format.rs;
+		wd = insn.msa_mi10_format.wd;
+		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));
+		/* align a working space in stack... */
+		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);
+		if (func == msa_ld_op) {
+			if (!access_ok(VERIFY_READ, addr, 16))
+				goto sigbus;
+			compute_return_epc(regs);
+			res = __copy_from_user_inatomic(msadata, addr, 16);
+			if (res)
+				goto fault;
+			preempt_disable();
+			if (test_thread_flag(TIF_USEDMSA)) {
+#ifdef __BIG_ENDIAN
+				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
+				msa_to_wd(wd, &current->thread.fpu.fpr[wd]);
+#else
+				msa_to_wd(wd, msadata);
+#endif
+				preempt_enable();
+			} else {
+				preempt_enable();
+#ifdef __BIG_ENDIAN
+				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
+#else
+				current->thread.fpu.fpr[wd] = *msadata;
+#endif
+			}
+		} else {
+			if (!access_ok(VERIFY_WRITE, addr, 16))
+				goto sigbus;
+			compute_return_epc(regs);
+			if (test_thread_flag(TIF_USEDMSA)) {
+#ifdef __BIG_ENDIAN
+				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
+				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
+#else
+				msa_from_wd(wd, msadata);
+#endif
+				preempt_enable();
+			} else {
+				preempt_enable();
+#ifdef __BIG_ENDIAN
+				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
+#else
+				*msadata = current->thread.fpu.fpr[wd];
+#endif
+			}
+			res = __copy_to_user_inatomic(addr, msadata, 16);
+			if (res)
+				goto fault;
+		}
+
+		break;
+#endif /* CONFIG_CPU_HAS_MSA */
+
 	/*
 	 * COP2 is available to implementor for application specific use.
 	 * It's up to applications to register a notifier chain and do



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18  1:16 [PATCH] MIPS: MSA: misaligned support Leonid Yegoshin
@ 2015-03-18 11:27 ` James Hogan
  2015-03-18 19:46   ` Leonid Yegoshin
  2015-03-18 11:41 ` James Hogan
  1 sibling, 1 reply; 8+ messages in thread
From: James Hogan @ 2015-03-18 11:27 UTC (permalink / raw)
  To: Leonid Yegoshin, linux-mips, wangr, peterz, qais.yousef,
	linux-kernel, ralf, davidlohr, chenhc, manuel.lauss, mingo

Hi Leonid,

On 18/03/15 01:16, Leonid Yegoshin wrote:
> MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address exception
> on unalaigned MSA load/store operations - from none unaligned up to

unaligned

> full support in HW. In practice, it is expected that HW can occasionally
> triggers AdE for non-aligned data access (misalignment). It is usually
> expected on page boundaries because HW handling of two TLBs in single
> data access operation may be complicated and expensive.
> 
> So, this patch handles MSA LD.df and ST.df Address Error exceptions.
> 
> It handles separately two cases - MSA owned by thread and MSA registers
> saved in current->thread.fpu. If thread still ownes MSA unit then it

owns

> loads and stores directly with MSA unit and only one MSA register. Saving
> and restoring the full MSA context (512bytes) on each misalign exception
> is expensive! Preemption is disabled, of course.
> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
> ---
>  arch/mips/include/asm/processor.h |    2 +
>  arch/mips/include/uapi/asm/inst.h |   21 +++++
>  arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
>  arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 273 insertions(+)
> 
> diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
> index f1df4cb4a286..af2675060244 100644
> --- a/arch/mips/include/asm/processor.h
> +++ b/arch/mips/include/asm/processor.h
> @@ -104,6 +104,8 @@ extern unsigned int vced_count, vcei_count;
>  #endif
>  
>  union fpureg {
> +	__u8    val8[FPU_REG_WIDTH / 8];
> +	__u16   val16[FPU_REG_WIDTH / 16];
>  	__u32	val32[FPU_REG_WIDTH / 32];
>  	__u64	val64[FPU_REG_WIDTH / 64];
>  };
> diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
> index 89c22433b1c6..7ab6987cb7d5 100644
> --- a/arch/mips/include/uapi/asm/inst.h
> +++ b/arch/mips/include/uapi/asm/inst.h
> @@ -58,6 +58,7 @@ enum spec_op {
>  	dsll_op, spec7_unused_op, dsrl_op, dsra_op,
>  	dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op
>  };
> +#define msa_op  mdmx_op
>  
>  /*
>   * func field of spec2 opcode.
> @@ -217,6 +218,14 @@ enum bshfl_func {
>  };
>  
>  /*
> + * func field for MSA MI10 format
> + */
> +enum msa_mi10_func {
> +	msa_ld_op = 8,
> +	msa_st_op = 9,

Most other opcode enumerations in this file are specified in hexadecimal.

> +};
> +
> +/*
>   * (microMIPS) Major opcodes.
>   */
>  enum mm_major_op {
> @@ -616,6 +625,17 @@ struct spec3_format {   /* SPEC3 */
>  	;)))))
>  };
>  
> +struct msa_mi10_format {        /* MSA */
> +	__BITFIELD_FIELD(unsigned int opcode : 6,
> +	__BITFIELD_FIELD(signed int s10 : 10,
> +	__BITFIELD_FIELD(unsigned int rs : 5,
> +	__BITFIELD_FIELD(unsigned int wd : 5,
> +	__BITFIELD_FIELD(unsigned int func : 4,
> +	__BITFIELD_FIELD(unsigned int df : 2,
> +	;))))))
> +};
> +
> +
>  /*
>   * microMIPS instruction formats (32-bit length)
>   *
> @@ -884,6 +904,7 @@ union mips_instruction {
>  	struct p_format p_format;
>  	struct f_format f_format;
>  	struct ma_format ma_format;
> +	struct msa_mi10_format msa_mi10_format;
>  	struct b_format b_format;
>  	struct ps_format ps_format;
>  	struct v_format v_format;
> diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
> index 6c160c67984c..5f48f45f81e7 100644
> --- a/arch/mips/kernel/r4k_fpu.S
> +++ b/arch/mips/kernel/r4k_fpu.S
> @@ -13,6 +13,7 @@
>   * Copyright (C) 1999, 2001 Silicon Graphics, Inc.
>   */
>  #include <asm/asm.h>
> +#include <asm/asmmacro.h>
>  #include <asm/errno.h>
>  #include <asm/fpregdef.h>
>  #include <asm/mipsregs.h>
> @@ -268,6 +269,112 @@ LEAF(_restore_fp_context32)
>  	END(_restore_fp_context32)
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +
> +	.macro  msa_ld_d    wd, base
> +	ld_d    \wd, 0, \base
> +	jalr    $0, $31

Why not just:
	jr	ra

like every other function in that file? I hope jr would be encoded
correctly on r6 automatically?

> +	  nop

I think a single extra space of indentation for delay slots is the
convention, rather than 2. Same below.

> +	.align  4

Doesn't this mean the first one & its label might not be suitably aligned?
Would it be better to put this before the ld_d (no need for it after the
$w31 case) and to put another .align 4 before the Lmsa_to and Lmsa_from
labels (so the label itself is aligned)?
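
Something like this, perhaps (untested sketch, also folding in the jr
suggestion above):

	.macro  msa_ld_d    wd, base
	.align  4
	ld_d    \wd, 0, \base
	jr      ra
	 nop
	.endm

	...

	.align  4
Lmsa_to:
	msa_ld_d    0, a1
	msa_ld_d    1, a1
	...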

> +	.endm
> +
> +	.macro  msa_st_d    wd, base
> +	st_d    \wd, 0, \base
> +	jalr    $0, $31
> +	  nop
> +	.align  4

same comments as above.

> +	.endm
> +
> +LEAF(msa_to_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_to
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0

Likewise here, "jr t0"? and same for msa_from_wd

> +	  nop
> +Lmsa_to:
> +	msa_ld_d    0, a1
> +	msa_ld_d    1, a1
> +	msa_ld_d    2, a1
> +	msa_ld_d    3, a1
> +	msa_ld_d    4, a1
> +	msa_ld_d    5, a1
> +	msa_ld_d    6, a1
> +	msa_ld_d    7, a1
> +	msa_ld_d    8, a1
> +	msa_ld_d    9, a1
> +	msa_ld_d    10, a1
> +	msa_ld_d    11, a1
> +	msa_ld_d    12, a1
> +	msa_ld_d    13, a1
> +	msa_ld_d    14, a1
> +	msa_ld_d    15, a1
> +	msa_ld_d    16, a1
> +	msa_ld_d    17, a1
> +	msa_ld_d    18, a1
> +	msa_ld_d    19, a1
> +	msa_ld_d    20, a1
> +	msa_ld_d    21, a1
> +	msa_ld_d    22, a1
> +	msa_ld_d    23, a1
> +	msa_ld_d    24, a1
> +	msa_ld_d    25, a1
> +	msa_ld_d    26, a1
> +	msa_ld_d    27, a1
> +	msa_ld_d    28, a1
> +	msa_ld_d    29, a1
> +	msa_ld_d    30, a1
> +	msa_ld_d    31, a1
> +	.set    pop
> +	END(msa_to_wd)
> +
> +LEAF(msa_from_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_from
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0
> +	  nop
> +Lmsa_from:
> +	msa_st_d    0, a1
> +	msa_st_d    1, a1
> +	msa_st_d    2, a1
> +	msa_st_d    3, a1
> +	msa_st_d    4, a1
> +	msa_st_d    5, a1
> +	msa_st_d    6, a1
> +	msa_st_d    7, a1
> +	msa_st_d    8, a1
> +	msa_st_d    9, a1
> +	msa_st_d    10, a1
> +	msa_st_d    11, a1
> +	msa_st_d    12, a1
> +	msa_st_d    13, a1
> +	msa_st_d    14, a1
> +	msa_st_d    15, a1
> +	msa_st_d    16, a1
> +	msa_st_d    17, a1
> +	msa_st_d    18, a1
> +	msa_st_d    19, a1
> +	msa_st_d    20, a1
> +	msa_st_d    21, a1
> +	msa_st_d    22, a1
> +	msa_st_d    23, a1
> +	msa_st_d    24, a1
> +	msa_st_d    25, a1
> +	msa_st_d    26, a1
> +	msa_st_d    27, a1
> +	msa_st_d    28, a1
> +	msa_st_d    29, a1
> +	msa_st_d    30, a1
> +	msa_st_d    31, a1
> +	.set    pop
> +	END(msa_from_wd)
> +
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	.set	reorder
>  
>  	.type	fault@function
> diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
> index e11906dff885..558f41fa93c5 100644
> --- a/arch/mips/kernel/unaligned.c
> +++ b/arch/mips/kernel/unaligned.c
> @@ -108,6 +108,11 @@ static u32 unaligned_action;
>  #endif
>  extern void show_registers(struct pt_regs *regs);
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +void msa_to_wd(unsigned int wd, union fpureg *from);
> +void msa_from_wd(unsigned int wd, union fpureg *to);
> +#endif
> +
>  #ifdef __BIG_ENDIAN
>  #define     LoadHW(addr, value, res)  \
>  		__asm__ __volatile__ (".set\tnoat\n"        \
> @@ -422,6 +427,64 @@ extern void show_registers(struct pt_regs *regs);
>  		: "r" (value), "r" (addr), "i" (-EFAULT));
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +#ifdef __BIG_ENDIAN
> +/*
> + * MSA data format conversion.
> + * Only for BIG ENDIAN - LITTLE ENDIAN has register format which matches memory
> + * layout contiguosly.

contiguously

> + *
> + * Conversion is done between two Double words and other formats (W/H/B)
> + * because kernel uses LD.D and ST.D to load/store MSA registers and keeps
> + * MSA registers in this format in current->thread.fpu.fpr
> + */
> +static void msa_convert(union fpureg *to, union fpureg *from, int fmt)
> +{
> +	switch (fmt) {
> +	case 0: /* byte */
> +		to->val8[0] = from->val8[7];
> +		to->val8[1] = from->val8[6];
> +		to->val8[2] = from->val8[5];
> +		to->val8[3] = from->val8[4];
> +		to->val8[4] = from->val8[3];
> +		to->val8[5] = from->val8[2];
> +		to->val8[6] = from->val8[1];
> +		to->val8[7] = from->val8[0];
> +		to->val8[8] = from->val8[15];
> +		to->val8[9] = from->val8[14];
> +		to->val8[10] = from->val8[13];
> +		to->val8[11] = from->val8[12];
> +		to->val8[12] = from->val8[11];
> +		to->val8[13] = from->val8[10];
> +		to->val8[14] = from->val8[9];
> +		to->val8[15] = from->val8[8];
> +		break;
> +
> +	case 1: /* halfword */
> +		to->val16[0] = from->val16[3];
> +		to->val16[1] = from->val16[2];
> +		to->val16[2] = from->val16[1];
> +		to->val16[3] = from->val16[0];
> +		to->val16[4] = from->val16[7];
> +		to->val16[5] = from->val16[6];
> +		to->val16[6] = from->val16[5];
> +		to->val16[7] = from->val16[4];
> +		break;
> +
> +	case 2: /* word */
> +		to->val32[0] = from->val32[1];
> +		to->val32[1] = from->val32[0];
> +		to->val32[2] = from->val32[3];
> +		to->val32[3] = from->val32[2];

FWIW since the FP/MSA patches that Paul submitted, there are also
working endian agnostic accessors created with BUILD_FPR_ACCESS, which
use the FPR_IDX macro (see http://patchwork.linux-mips.org/patch/9169/),
which should work for 8bit and 16bit sizes too.

I wonder if the compiler would unroll/optimise this sort of thing:
	for (i = 0; i < (FPU_REG_WIDTH / 8); ++i)
		to_val8[i] = from->val[FPR_IDX(8, i)];

No worries if not.
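
For the byte case it would reduce to something along these lines (rough
sketch only, assuming FPR_IDX() works for 8-bit widths and using the val8
member added above):

	int i;

	for (i = 0; i < (FPU_REG_WIDTH / 8); i++)
		to->val8[i] = from->val8[FPR_IDX(8, i)];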

> +		break;
> +
> +	case 3: /* doubleword, no conversion */
> +		break;

don't you still need to copy the value though?
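
i.e. presumably something like:

	case 3: /* doubleword */
		to->val64[0] = from->val64[0];
		to->val64[1] = from->val64[1];
		break;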

> +	}
> +}
> +#endif
> +#endif
> +
>  static void emulate_load_store_insn(struct pt_regs *regs,
>  	void __user *addr, unsigned int __user *pc)
>  {
> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  #ifdef	CONFIG_EVA
>  	mm_segment_t seg;
>  #endif
> +#ifdef CONFIG_CPU_HAS_MSA
> +	union fpureg msadatabase[2], *msadata;
> +	unsigned int func, df, rs, wd;
> +#endif
>  	origpc = (unsigned long)pc;
>  	orig31 = regs->regs[31];
>  
> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  			break;
>  		return;
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +	case msa_op:
> +		if (cpu_has_mdmx)
> +			goto sigill;
> +
> +		func = insn.msa_mi10_format.func;
> +		switch (func) {
> +		default:
> +			goto sigbus;
> +
> +		case msa_ld_op:
> +		case msa_st_op:
> +			;
> +		}
> +
> +		if (!thread_msa_context_live())
> +			goto sigbus;

Will this ever happen? (I can't see AdE handler enabling interrupts).

If the MSA context genuinely isn't live (i.e. it can be considered
UNPREDICTABLE), then surely a load operation should still succeed?

> +
> +		df = insn.msa_mi10_format.df;
> +		rs = insn.msa_mi10_format.rs;
> +		wd = insn.msa_mi10_format.wd;
> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));

"* (1 << df)"?
why not just "<< df"?
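
i.e.:

	addr = (unsigned long *)(regs->regs[rs] +
				 (insn.msa_mi10_format.s10 << df));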

> +		/* align a working space in stack... */
> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);

Maybe you could just use __aligned(16) on a single local union fpureg.
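
Something like (sketch - it would also let you drop the msadatabase[2]
trick, subject to the toolchain actually honouring it on the stack):

	union fpureg msadata __aligned(16);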

> +		if (func == msa_ld_op) {
> +			if (!access_ok(VERIFY_READ, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);
> +			res = __copy_from_user_inatomic(msadata, addr, 16);
> +			if (res)
> +				goto fault;
> +			preempt_disable();
> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +				msa_to_wd(wd, &current->thread.fpu.fpr[wd]);
> +#else
> +				msa_to_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +#else
> +				current->thread.fpu.fpr[wd] = *msadata;
> +#endif

I'm not a fan of the ifdefs, but I can see it's awkward to abstract
msa_convert without causing extra copies (although I don't think it's a
critical code path).

> +			}
> +		} else {
> +			if (!access_ok(VERIFY_WRITE, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);

forgot to preempt_disable()?
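
i.e. presumably the store path should mirror the load path here:

	compute_return_epc(regs);
	preempt_disable();
	if (test_thread_flag(TIF_USEDMSA)) {
		...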

> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				msa_from_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				*msadata = current->thread.fpu.fpr[wd];

hmm, you could cheat and change this to the following?:
				msadata = &current->thread.fpu.fpr[wd];

> +#endif
> +			}
> +			res = __copy_to_user_inatomic(addr, msadata, 16);
> +			if (res)
> +				goto fault;
> +		}
> +
> +		break;
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	/*
>  	 * COP2 is available to implementor for application specific use.
>  	 * It's up to applications to register a notifier chain and do
> 
> 

Cheers
James



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18  1:16 [PATCH] MIPS: MSA: misaligned support Leonid Yegoshin
  2015-03-18 11:27 ` James Hogan
@ 2015-03-18 11:41 ` James Hogan
  1 sibling, 0 replies; 8+ messages in thread
From: James Hogan @ 2015-03-18 11:41 UTC (permalink / raw)
  To: Leonid Yegoshin, linux-mips, wangr, peterz, qais.yousef,
	linux-kernel, ralf, davidlohr, chenhc, manuel.lauss, mingo

Hi Leonid,

On 18/03/15 01:16, Leonid Yegoshin wrote:
> MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address exception
> on unalaigned MSA load/store operations - from none unaligned up to
> full support in HW. In practice, it is expected that HW can occasionally
> triggers AdE for non-aligned data access (misalignment). It is usually
> expected on page boundaries because HW handling of two TLBs in single
> data access operation may be complicated and expensive.
> 
> So, this patch handles MSA LD.df and ST.df Address Error exceptions.
> 
> It handles separately two cases - MSA owned by thread and MSA registers
> saved in current->thread.fpu. If thread still ownes MSA unit then it
> loads and stores directly with MSA unit and only one MSA register. Saving
> and restoring the full MSA context (512bytes) on each misalign exception
> is expensive! Preemption is disabled, of course.
> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
> ---
>  arch/mips/include/asm/processor.h |    2 +
>  arch/mips/include/uapi/asm/inst.h |   21 +++++
>  arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
>  arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 273 insertions(+)

Please also run checkpatch (a few long lines) and rebase on v4.0-rcX
(minor conflict in unaligned.c).
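
e.g. (patch file name assumed):

	$ scripts/checkpatch.pl 0001-MIPS-MSA-misaligned-support.patch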

Thanks
James



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 11:27 ` James Hogan
@ 2015-03-18 19:46   ` Leonid Yegoshin
  2015-03-18 22:12     ` James Hogan
  0 siblings, 1 reply; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-18 19:46 UTC (permalink / raw)
  To: James Hogan, linux-mips, wangr, peterz, Qais Yousef,
	linux-kernel, ralf, davidlohr, chenhc, manuel.lauss, mingo

On 03/18/2015 04:27 AM, James Hogan wrote:
>
> +	.align  4
> doesn't this mean the first one & label might not be suitably aligned.
> Would it be better to put this before the ld_d (no need for it after
> $w31 case) and putting another .align 4 before the Lmsa_to and Lmsa_from
> labels (so the label itself is aligned)?

Good point! I will issue V2.

>
> +
> +	case 2: /* word */
> +		to->val32[0] = from->val32[1];
> +		to->val32[1] = from->val32[0];
> +		to->val32[2] = from->val32[3];
> +		to->val32[3] = from->val32[2];
> FWIW since the FP/MSA patches that Paul submitted, there are also
> working endian agnostic accessors created with BUILD_FPR_ACCESS, which
> use the FPR_IDX macro (see http://patchwork.linux-mips.org/patch/9169/),
> which should work for 8bit and 16bit sizes too.
>
> I wonder if the compiler would unroll/optimise this sort of thing:
> 	for (i = 0; i < (FPU_REG_WIDTH / 8); ++i)
> 		to_val8[i] = from->val[FPR_IDX(8, i)];
>
> No worries if not.

There is a simple logic behind it - rolling code into a macro/subroutine
makes sense if any of the following holds:

- the code can't be taken in at a single glance (more than half a page or
  window)
- there is a chance to reuse some part of it
- some code vars/limits/bit positions/etc. can change in the future
- a logical connection to a macro/subroutine is desirable to draw
  attention during code maintenance.

None of these besides the last one is true here. The logical connection to
union fpureg is already there. Changing the MSA register size would
definitely cause a lot of other changes in logic, and many assumptions
would be broken in multiple places, including process signal conventions.

Rolling the code into a macro (FPR_IDX) and turning a pretty short set of
assignments into a loop actually INCREASES code complexity for future
maintenance.


>
>> +		break;
>> +
>> +	case 3: /* doubleword, no conversion */
>> +		break;
> don't you still need to copy the value though?

Good point! I have never tested this subroutine yet - the HW team still
has to produce a BIG ENDIAN CPU+MSA variant.
I will issue V2.

>
>> +	}
>> +}
>> +#endif
>> +#endif
>> +
>>   static void emulate_load_store_insn(struct pt_regs *regs,
>>   	void __user *addr, unsigned int __user *pc)
>>   {
>> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>>   #ifdef	CONFIG_EVA
>>   	mm_segment_t seg;
>>   #endif
>> +#ifdef CONFIG_CPU_HAS_MSA
>> +	union fpureg msadatabase[2], *msadata;
>> +	unsigned int func, df, rs, wd;
>> +#endif
>>   	origpc = (unsigned long)pc;
>>   	orig31 = regs->regs[31];
>>   
>> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>>   			break;
>>   		return;
>>   
>> +#ifdef CONFIG_CPU_HAS_MSA
>> +	case msa_op:
>> +		if (cpu_has_mdmx)
>> +			goto sigill;
>> +
>> +		func = insn.msa_mi10_format.func;
>> +		switch (func) {
>> +		default:
>> +			goto sigbus;
>> +
>> +		case msa_ld_op:
>> +		case msa_st_op:
>> +			;
>> +		}
>> +
>> +		if (!thread_msa_context_live())
>> +			goto sigbus;
> Will this ever happen? (I can't see AdE handler enabling interrupts).
>
> If the MSA context genuinely isn't live (i.e. it can be considered
> UNPREDICTABLE), then surely a load operation should still succeed?

thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
an MSA context for the thread.
It is different from "MSA is owned by the thread"; it only says that the
thread has already initialized MSA.

Unfortunate choice of function name, I believe.
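
(For reference, the helper is essentially just the flag test - roughly:

	static inline int thread_msa_context_live(void)
	{
		return test_thread_flag(TIF_MSA_CTX_LIVE);
	}

modulo the cpu_has_msa short-circuit.)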

This is a guard against a bad choice of exception priorities in HW...
sometimes it happens.

>
>> +
>> +		df = insn.msa_mi10_format.df;
>> +		rs = insn.msa_mi10_format.rs;
>> +		wd = insn.msa_mi10_format.wd;
>> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));
> "* (1 << df)"?
> why not just "<< df"?
>
>> +		/* align a working space in stack... */
>> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);
> Maybe you could just use __aligned(16) on a single local union fpureg.

I am not sure that it works on the stack. I don't trust the toolchain
here - it is not even able to align a stack frame to 16 bytes.

>
>> +			}
>> +		} else {
>> +			if (!access_ok(VERIFY_WRITE, addr, 16))
>> +				goto sigbus;
>> +			compute_return_epc(regs);
> forgot to preempt_disable()?

Yes.

>
>> +			if (test_thread_flag(TIF_USEDMSA)) {
>> +#ifdef __BIG_ENDIAN
>> +				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
>> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
>> +#else
>> +				msa_from_wd(wd, msadata);
>> +#endif
>> +				preempt_enable();
>> +			} else {
>> +				preempt_enable();
>> +#ifdef __BIG_ENDIAN
>> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
>> +#else
>> +				*msadata = current->thread.fpu.fpr[wd];
> hmm, you could cheat and change this to the following?:
> 				msadata = &current->thread.fpu.fpr[wd];
Yes. But does it make sense? It just replaces two doubleword copies with
a single pointer assignment. However, if msadata is not changed, it gives
the compiler some room for optimization.

- Leonid.



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 19:46   ` Leonid Yegoshin
@ 2015-03-18 22:12     ` James Hogan
  2015-03-18 23:25       ` Leonid Yegoshin
  0 siblings, 1 reply; 8+ messages in thread
From: James Hogan @ 2015-03-18 22:12 UTC (permalink / raw)
  To: Leonid Yegoshin
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

Hi Leonid,

On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
> On 03/18/2015 04:27 AM, James Hogan wrote:
> >
> >> +		break;
> >> +
> >> +	case 3: /* doubleword, no conversion */
> >> +		break;
> > don't you still need to copy the value though?
> 
> Good point! Never test this subroutine yet, HW team still should produce 
> BIG ENDIAN CPU+MSA variant.

P5600 big endian should be usable on Malta (I've run it before).

Failing that, it should be pretty easy to force qemu to trigger an AdE
exception on all unaligned ld/st in order to test this.

> I will issue V2.
> 
> >
> >> +	}
> >> +}
> >> +#endif
> >> +#endif
> >> +
> >>   static void emulate_load_store_insn(struct pt_regs *regs,
> >>   	void __user *addr, unsigned int __user *pc)
> >>   {
> >> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
> >>   #ifdef	CONFIG_EVA
> >>   	mm_segment_t seg;
> >>   #endif
> >> +#ifdef CONFIG_CPU_HAS_MSA
> >> +	union fpureg msadatabase[2], *msadata;
> >> +	unsigned int func, df, rs, wd;
> >> +#endif
> >>   	origpc = (unsigned long)pc;
> >>   	orig31 = regs->regs[31];
> >>   
> >> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
> >>   			break;
> >>   		return;
> >>   
> >> +#ifdef CONFIG_CPU_HAS_MSA
> >> +	case msa_op:
> >> +		if (cpu_has_mdmx)
> >> +			goto sigill;
> >> +
> >> +		func = insn.msa_mi10_format.func;
> >> +		switch (func) {
> >> +		default:
> >> +			goto sigbus;
> >> +
> >> +		case msa_ld_op:
> >> +		case msa_st_op:
> >> +			;
> >> +		}
> >> +
> >> +		if (!thread_msa_context_live())
> >> +			goto sigbus;
> > Will this ever happen? (I can't see AdE handler enabling interrupts).
> >
> > If the MSA context genuinely isn't live (i.e. it can be considered
> > UNPREDICTABLE), then surely a load operation should still succeed?
> 
> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of 
> MSA context for thread.
> It differs from MSA is owned by thread, it just says that thread has 
> already initialized MSA.
> 
> Unfortunate choice of function name, I believe.

Right (I mis-read when it's cleared when I grepped). Still, that would
make it even harder to hit since lose_fpu wouldn't clear it, and you
already would've taken an MSA disabled exception first.

Anyway, my point was that there's nothing invalid about an unaligned
load being the first MSA instruction. You might use it to load the
initial vector state.
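
e.g. (hypothetical user code):

	ld.b	$w0, 1($a0)	# first MSA instruction executed, unaligned address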

> 
> This is a guard against bad selection of exception priorities in HW... 
> sometime it happens.
> 
> >
> >> +
> >> +		df = insn.msa_mi10_format.df;
> >> +		rs = insn.msa_mi10_format.rs;
> >> +		wd = insn.msa_mi10_format.wd;
> >> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));
> > "* (1 << df)"?
> > why not just "<< df"?
> >
> >> +		/* align a working space in stack... */
> >> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);
> > Maybe you could just use __aligned(16) on a single local union fpureg.
> 
> I am not sure that it works in stack. I don't trust toolchain here - 
> they even are not able to align a frame in stack to 16bytes.

I did wonder that, but found the following bug report which seems to
indicate that it was fixed generically in GCC 4.6:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=16660

Unfortunately Linux supports MSA built with a toolchain that doesn't, so
that may not be good enough, I don't know :-(.

Cheers
James


* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 22:12     ` James Hogan
@ 2015-03-18 23:25       ` Leonid Yegoshin
  2015-03-19  9:51         ` James Hogan
  0 siblings, 1 reply; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-18 23:25 UTC (permalink / raw)
  To: James Hogan
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

On 03/18/2015 03:12 PM, James Hogan wrote:
> Hi Leonid,
>
> On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
>
>> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
>> MSA context for thread.
>> It differs from MSA is owned by thread, it just says that thread has
>> already initialized MSA.
>>
>> Unfortunate choice of function name, I believe.
> Right (I mis-read when its cleared when i grepped). Still, that would
> make it even harder to hit since lose_fpu wouldn't clear it, and you
> already would've taken an MSA disabled exception first.
No, lose_fpu disables MSA now, saves MSA context and switches off 
TIF_USEDMSA. See 33c771ba5c5d067f85a5a6c4b11047219b5b8f4e, "MIPS: 
save/disable MSA in lose_fpu".
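
Roughly, the MSA part of lose_fpu() now does (conceptual outline only,
not the exact code):

	if (is_msa_enabled()) {
		if (save)
			save_msa(current);
		disable_msa();
		clear_thread_flag(TIF_USEDMSA);
	}
	/* TIF_MSA_CTX_LIVE is left set - the saved context stays valid */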

However, a process still has its MSA context initialized, and that is
indicated by TIF_MSA_CTX_LIVE.
It must have it before it can get any AdE exception on an MSA instruction.

>
> Anyway, my point was that there's nothing invalid about an unaligned
> load being the first MSA instruction. You might use it to load the
> initial vector state.

No, it is invalid. If MSA is disabled it should trigger an "MSA Disabled"
exception.

Unfortunately, some HW versions raise AdE first, and that may be logical
from a HW point of view (if the access is done before the instruction is
completely decoded). But that is wrong.




* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 23:25       ` Leonid Yegoshin
@ 2015-03-19  9:51         ` James Hogan
  2015-03-19 23:23           ` Leonid Yegoshin
  0 siblings, 1 reply; 8+ messages in thread
From: James Hogan @ 2015-03-19  9:51 UTC (permalink / raw)
  To: Leonid Yegoshin
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

On 18/03/15 23:25, Leonid Yegoshin wrote:
> On 03/18/2015 03:12 PM, James Hogan wrote:
>> Hi Leonid,
>>
>> On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
>>
>>> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
>>> MSA context for thread.
>>> It differs from MSA is owned by thread, it just says that thread has
>>> already initialized MSA.
>>>
>>> Unfortunate choice of function name, I believe.
>> Right (I mis-read when its cleared when i grepped). Still, that would
>> make it even harder to hit since lose_fpu wouldn't clear it, and you
>> already would've taken an MSA disabled exception first.
> No, lose_fpu disables MSA now, saves MSA context and switches off
> TIF_USEDMSA. See 33c771ba5c5d067f85a5a6c4b11047219b5b8f4e, "MIPS:
> save/disable MSA in lose_fpu".
> 
> However, a process still has MSA context initialized and it is indicated
> by TIF_MSA_CTX_LIVE.
> It should have it before it can get any AdE exception on MSA instruction.

Yes, exactly.

> 
>>
>> Anyway, my point was that there's nothing invalid about an unaligned
>> load being the first MSA instruction. You might use it to load the
>> initial vector state.
> 
> No, it is invalid. If MSA is disabled it should trigger "MSA Disabled"
> exception.

It's valid for the user to start their program with a ld.b.
As you say, it'll raise an MSA disabled exception first though. The
handler will own MSA, and set TIF_MSA_CTX_LIVE, which makes the check
pointless?

I suppose an AdE from a normal unaligned load could still race with
another thread modifying the instruction to an MSA ld.b, but even if it
did, I don't think it would do any harm?

> 
> Unfortunately, some HW versions had AdE first and it may be logical from
> some HW point (if access is done before instruction is completely
> decoded). But that is wrong.

Yes, MSA Disabled would clearly come under "Instruction Validity
Exceptions", which is very sensibly higher priority than "Address error
- Data access".

Anyway, at the very least it needs a comment to justify what it is
trying to catch and what harm it is trying to avoid, since it isn't
obvious, and tbh seems pointless.

Cheers
James



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-19  9:51         ` James Hogan
@ 2015-03-19 23:23           ` Leonid Yegoshin
  0 siblings, 0 replies; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-19 23:23 UTC (permalink / raw)
  To: James Hogan
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

On 03/19/2015 02:51 AM, James Hogan wrote:
> On 18/03/15 23:25, Leonid Yegoshin wrote:
>> On 03/18/2015 03:12 PM, James Hogan wrote:
>>> Hi Leonid,
>>>
>>> On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
>>>
>>>> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
>>>> MSA context for thread.
>>>> It differs from MSA is owned by thread, it just says that thread has
>>>> already initialized MSA.
>>>>
>>>> Unfortunate choice of function name, I believe.
>>> Right (I mis-read when its cleared when i grepped). Still, that would
>>> make it even harder to hit since lose_fpu wouldn't clear it, and you
>>> already would've taken an MSA disabled exception first.
>> No, lose_fpu disables MSA now, saves MSA context and switches off
>> TIF_USEDMSA. See 33c771ba5c5d067f85a5a6c4b11047219b5b8f4e, "MIPS:
>> save/disable MSA in lose_fpu".
>>
>> However, a process still has MSA context initialized and it is indicated
>> by TIF_MSA_CTX_LIVE.
>> It should have it before it can get any AdE exception on MSA instruction.
> Yes, exactly.
>
>>> Anyway, my point was that there's nothing invalid about an unaligned
>>> load being the first MSA instruction. You might use it to load the
>>> initial vector state.
>> No, it is invalid. If MSA is disabled it should trigger "MSA Disabled"
>> exception.
> It's valid for the user to start their program with a ld.b.
> As you say, it'll raise an MSA disabled exception first though. The
> handler will own MSA, and set TIF_MSA_CTX_LIVE, which makes the check
> pointless?

"Unfortunately, some HW versions had AdE first and it may be logical 
from some HW point (if access is done before instruction is completely 
decoded). But that is wrong."

>
> I suppose an AdE from a normal unaligned load could still race with
> another thread modifying the instruction to an MSA ld.b, but even if it
> did, I don't think it would do any harm?
>
>> Unfortunately, some HW versions had AdE first and it may be logical from
>> some HW point (if access is done before instruction is completely
>> decoded). But that is wrong.
> Yes, MSA Disabled would clearly come under "Instruction Validity
> Exceptions", which is very sensibly higher priority than "Address error
> - Data access".
>
> Anyway, at the very least it needs a comment to justify what it is
> trying to catch and what harm it is trying to avoid, since it isn't
> obvious, and tbh seems pointless.
>
> Cheers
> James
>


