linux-kernel.vger.kernel.org archive mirror
* [PATCH] MIPS: MSA: misaligned support
@ 2015-03-18  1:16 Leonid Yegoshin
  2015-03-18 11:27 ` James Hogan
  2015-03-18 11:41 ` James Hogan
  0 siblings, 2 replies; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-18  1:16 UTC (permalink / raw)
  To: linux-mips, wangr, peterz, qais.yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

The MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address
exception behaviour on unaligned MSA load/store operations - from no
unaligned support at all up to full support in HW. In practice, HW is
expected to occasionally trigger AdE for unaligned data accesses
(misalignment). This is most likely on page boundaries, because HW
handling of two TLB entries in a single data access operation may be
complicated and expensive.

So, this patch handles MSA LD.df and ST.df Address Error exceptions.

It handles two cases separately - MSA still owned by the thread, and MSA
registers already saved in current->thread.fpu. If the thread still owns
the MSA unit, the load or store goes directly through the MSA unit and
touches only the single affected MSA register: saving and restoring the
full MSA context (512 bytes) on each misalignment exception would be
expensive! Preemption is disabled, of course.

Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
---
 arch/mips/include/asm/processor.h |    2 +
 arch/mips/include/uapi/asm/inst.h |   21 +++++
 arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
 arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
 4 files changed, 273 insertions(+)

diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index f1df4cb4a286..af2675060244 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -104,6 +104,8 @@ extern unsigned int vced_count, vcei_count;
 #endif
 
 union fpureg {
+	__u8    val8[FPU_REG_WIDTH / 8];
+	__u16   val16[FPU_REG_WIDTH / 16];
 	__u32	val32[FPU_REG_WIDTH / 32];
 	__u64	val64[FPU_REG_WIDTH / 64];
 };
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 89c22433b1c6..7ab6987cb7d5 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -58,6 +58,7 @@ enum spec_op {
 	dsll_op, spec7_unused_op, dsrl_op, dsra_op,
 	dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op
 };
+#define msa_op  mdmx_op
 
 /*
  * func field of spec2 opcode.
@@ -217,6 +218,14 @@ enum bshfl_func {
 };
 
 /*
+ * func field for MSA MI10 format
+ */
+enum msa_mi10_func {
+	msa_ld_op = 8,
+	msa_st_op = 9,
+};
+
+/*
  * (microMIPS) Major opcodes.
  */
 enum mm_major_op {
@@ -616,6 +625,17 @@ struct spec3_format {   /* SPEC3 */
 	;)))))
 };
 
+struct msa_mi10_format {        /* MSA */
+	__BITFIELD_FIELD(unsigned int opcode : 6,
+	__BITFIELD_FIELD(signed int s10 : 10,
+	__BITFIELD_FIELD(unsigned int rs : 5,
+	__BITFIELD_FIELD(unsigned int wd : 5,
+	__BITFIELD_FIELD(unsigned int func : 4,
+	__BITFIELD_FIELD(unsigned int df : 2,
+	;))))))
+};
+
+
 /*
  * microMIPS instruction formats (32-bit length)
  *
@@ -884,6 +904,7 @@ union mips_instruction {
 	struct p_format p_format;
 	struct f_format f_format;
 	struct ma_format ma_format;
+	struct msa_mi10_format msa_mi10_format;
 	struct b_format b_format;
 	struct ps_format ps_format;
 	struct v_format v_format;
diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
index 6c160c67984c..5f48f45f81e7 100644
--- a/arch/mips/kernel/r4k_fpu.S
+++ b/arch/mips/kernel/r4k_fpu.S
@@ -13,6 +13,7 @@
  * Copyright (C) 1999, 2001 Silicon Graphics, Inc.
  */
 #include <asm/asm.h>
+#include <asm/asmmacro.h>
 #include <asm/errno.h>
 #include <asm/fpregdef.h>
 #include <asm/mipsregs.h>
@@ -268,6 +269,112 @@ LEAF(_restore_fp_context32)
 	END(_restore_fp_context32)
 #endif
 
+#ifdef CONFIG_CPU_HAS_MSA
+
+	.macro  msa_ld_d    wd, base
+	ld_d    \wd, 0, \base
+	jalr    $0, $31
+	  nop
+	.align  4
+	.endm
+
+	.macro  msa_st_d    wd, base
+	st_d    \wd, 0, \base
+	jalr    $0, $31
+	  nop
+	.align  4
+	.endm
+
+LEAF(msa_to_wd)
+	.set    push
+	.set    noreorder
+	sll         t0, a0, 4
+	PTR_LA      t1, Lmsa_to
+	PTR_ADDU    t0, t0, t1
+	jalr        $0, t0
+	  nop
+Lmsa_to:
+	msa_ld_d    0, a1
+	msa_ld_d    1, a1
+	msa_ld_d    2, a1
+	msa_ld_d    3, a1
+	msa_ld_d    4, a1
+	msa_ld_d    5, a1
+	msa_ld_d    6, a1
+	msa_ld_d    7, a1
+	msa_ld_d    8, a1
+	msa_ld_d    9, a1
+	msa_ld_d    10, a1
+	msa_ld_d    11, a1
+	msa_ld_d    12, a1
+	msa_ld_d    13, a1
+	msa_ld_d    14, a1
+	msa_ld_d    15, a1
+	msa_ld_d    16, a1
+	msa_ld_d    17, a1
+	msa_ld_d    18, a1
+	msa_ld_d    19, a1
+	msa_ld_d    20, a1
+	msa_ld_d    21, a1
+	msa_ld_d    22, a1
+	msa_ld_d    23, a1
+	msa_ld_d    24, a1
+	msa_ld_d    25, a1
+	msa_ld_d    26, a1
+	msa_ld_d    27, a1
+	msa_ld_d    28, a1
+	msa_ld_d    29, a1
+	msa_ld_d    30, a1
+	msa_ld_d    31, a1
+	.set    pop
+	END(msa_to_wd)
+
+LEAF(msa_from_wd)
+	.set    push
+	.set    noreorder
+	sll         t0, a0, 4
+	PTR_LA      t1, Lmsa_from
+	PTR_ADDU    t0, t0, t1
+	jalr        $0, t0
+	  nop
+Lmsa_from:
+	msa_st_d    0, a1
+	msa_st_d    1, a1
+	msa_st_d    2, a1
+	msa_st_d    3, a1
+	msa_st_d    4, a1
+	msa_st_d    5, a1
+	msa_st_d    6, a1
+	msa_st_d    7, a1
+	msa_st_d    8, a1
+	msa_st_d    9, a1
+	msa_st_d    10, a1
+	msa_st_d    11, a1
+	msa_st_d    12, a1
+	msa_st_d    13, a1
+	msa_st_d    14, a1
+	msa_st_d    15, a1
+	msa_st_d    16, a1
+	msa_st_d    17, a1
+	msa_st_d    18, a1
+	msa_st_d    19, a1
+	msa_st_d    20, a1
+	msa_st_d    21, a1
+	msa_st_d    22, a1
+	msa_st_d    23, a1
+	msa_st_d    24, a1
+	msa_st_d    25, a1
+	msa_st_d    26, a1
+	msa_st_d    27, a1
+	msa_st_d    28, a1
+	msa_st_d    29, a1
+	msa_st_d    30, a1
+	msa_st_d    31, a1
+	.set    pop
+	END(msa_from_wd)
+
+#endif /* CONFIG_CPU_HAS_MSA */
+
 	.set	reorder
 
 	.type	fault@function
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index e11906dff885..558f41fa93c5 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -108,6 +108,11 @@ static u32 unaligned_action;
 #endif
 extern void show_registers(struct pt_regs *regs);
 
+#ifdef CONFIG_CPU_HAS_MSA
+void msa_to_wd(unsigned int wd, union fpureg *from);
+void msa_from_wd(unsigned int wd, union fpureg *to);
+#endif
+
 #ifdef __BIG_ENDIAN
 #define     LoadHW(addr, value, res)  \
 		__asm__ __volatile__ (".set\tnoat\n"        \
@@ -422,6 +427,64 @@ extern void show_registers(struct pt_regs *regs);
 		: "r" (value), "r" (addr), "i" (-EFAULT));
 #endif
 
+#ifdef CONFIG_CPU_HAS_MSA
+#ifdef __BIG_ENDIAN
+/*
+ * MSA data format conversion.
+ * Only for BIG ENDIAN - LITTLE ENDIAN has register format which matches memory
+ * layout contiguously.
+ *
+ * Conversion is done between two Double words and other formats (W/H/B)
+ * because kernel uses LD.D and ST.D to load/store MSA registers and keeps
+ * MSA registers in this format in current->thread.fpu.fpr
+ */
+static void msa_convert(union fpureg *to, union fpureg *from, int fmt)
+{
+	switch (fmt) {
+	case 0: /* byte */
+		to->val8[0] = from->val8[7];
+		to->val8[1] = from->val8[6];
+		to->val8[2] = from->val8[5];
+		to->val8[3] = from->val8[4];
+		to->val8[4] = from->val8[3];
+		to->val8[5] = from->val8[2];
+		to->val8[6] = from->val8[1];
+		to->val8[7] = from->val8[0];
+		to->val8[8] = from->val8[15];
+		to->val8[9] = from->val8[14];
+		to->val8[10] = from->val8[13];
+		to->val8[11] = from->val8[12];
+		to->val8[12] = from->val8[11];
+		to->val8[13] = from->val8[10];
+		to->val8[14] = from->val8[9];
+		to->val8[15] = from->val8[8];
+		break;
+
+	case 1: /* halfword */
+		to->val16[0] = from->val16[3];
+		to->val16[1] = from->val16[2];
+		to->val16[2] = from->val16[1];
+		to->val16[3] = from->val16[0];
+		to->val16[4] = from->val16[7];
+		to->val16[5] = from->val16[6];
+		to->val16[6] = from->val16[5];
+		to->val16[7] = from->val16[4];
+		break;
+
+	case 2: /* word */
+		to->val32[0] = from->val32[1];
+		to->val32[1] = from->val32[0];
+		to->val32[2] = from->val32[3];
+		to->val32[3] = from->val32[2];
+		break;
+
+	case 3: /* doubleword, no conversion */
+		break;
+	}
+}
+#endif
+#endif
+
 static void emulate_load_store_insn(struct pt_regs *regs,
 	void __user *addr, unsigned int __user *pc)
 {
@@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 #ifdef	CONFIG_EVA
 	mm_segment_t seg;
 #endif
+#ifdef CONFIG_CPU_HAS_MSA
+	union fpureg msadatabase[2], *msadata;
+	unsigned int func, df, rs, wd;
+#endif
 	origpc = (unsigned long)pc;
 	orig31 = regs->regs[31];
 
@@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			break;
 		return;
 
+#ifdef CONFIG_CPU_HAS_MSA
+	case msa_op:
+		if (cpu_has_mdmx)
+			goto sigill;
+
+		func = insn.msa_mi10_format.func;
+		switch (func) {
+		default:
+			goto sigbus;
+
+		case msa_ld_op:
+		case msa_st_op:
+			;
+		}
+
+		if (!thread_msa_context_live())
+			goto sigbus;
+
+		df = insn.msa_mi10_format.df;
+		rs = insn.msa_mi10_format.rs;
+		wd = insn.msa_mi10_format.wd;
+		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));
+		/* align a working space in stack... */
+		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);
+		if (func == msa_ld_op) {
+			if (!access_ok(VERIFY_READ, addr, 16))
+				goto sigbus;
+			compute_return_epc(regs);
+			res = __copy_from_user_inatomic(msadata, addr, 16);
+			if (res)
+				goto fault;
+			preempt_disable();
+			if (test_thread_flag(TIF_USEDMSA)) {
+#ifdef __BIG_ENDIAN
+				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
+				msa_to_wd(wd, &current->thread.fpu.fpr[wd]);
+#else
+				msa_to_wd(wd, msadata);
+#endif
+				preempt_enable();
+			} else {
+				preempt_enable();
+#ifdef __BIG_ENDIAN
+				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
+#else
+				current->thread.fpu.fpr[wd] = *msadata;
+#endif
+			}
+		} else {
+			if (!access_ok(VERIFY_WRITE, addr, 16))
+				goto sigbus;
+			compute_return_epc(regs);
+			if (test_thread_flag(TIF_USEDMSA)) {
+#ifdef __BIG_ENDIAN
+				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
+				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
+#else
+				msa_from_wd(wd, msadata);
+#endif
+				preempt_enable();
+			} else {
+				preempt_enable();
+#ifdef __BIG_ENDIAN
+				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
+#else
+				*msadata = current->thread.fpu.fpr[wd];
+#endif
+			}
+			res = __copy_to_user_inatomic(addr, msadata, 16);
+			if (res)
+				goto fault;
+		}
+
+		break;
+#endif /* CONFIG_CPU_HAS_MSA */
+
 	/*
 	 * COP2 is available to implementor for application specific use.
 	 * It's up to applications to register a notifier chain and do



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18  1:16 [PATCH] MIPS: MSA: misaligned support Leonid Yegoshin
@ 2015-03-18 11:27 ` James Hogan
  2015-03-18 19:46   ` Leonid Yegoshin
  2015-03-18 11:41 ` James Hogan
  1 sibling, 1 reply; 8+ messages in thread
From: James Hogan @ 2015-03-18 11:27 UTC (permalink / raw)
  To: Leonid Yegoshin, linux-mips, wangr, peterz, qais.yousef,
	linux-kernel, ralf, davidlohr, chenhc, manuel.lauss, mingo

Hi Leonid,

On 18/03/15 01:16, Leonid Yegoshin wrote:
> MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address exception
> on unalaigned MSA load/store operations - from none unaligned up to

unaligned

> full support in HW. In practice, it is expected that HW can occasionally
> triggers AdE for non-aligned data access (misalignment). It is usually
> expected on page boundaries because HW handling of two TLBs in single
> data access operation may be complicated and expensive.
> 
> So, this patch handles MSA LD.df and ST.df Address Error exceptions.
> 
> It handles separately two cases - MSA owned by thread and MSA registers
> saved in current->thread.fpu. If thread still ownes MSA unit then it

owns

> loads and stores directly with MSA unit and only one MSA register. Saving
> and restoring the full MSA context (512bytes) on each misalign exception
> is expensive! Preemption is disabled, of course.
> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
> ---
>  arch/mips/include/asm/processor.h |    2 +
>  arch/mips/include/uapi/asm/inst.h |   21 +++++
>  arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
>  arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 273 insertions(+)
> 
> diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
> index f1df4cb4a286..af2675060244 100644
> --- a/arch/mips/include/asm/processor.h
> +++ b/arch/mips/include/asm/processor.h
> @@ -104,6 +104,8 @@ extern unsigned int vced_count, vcei_count;
>  #endif
>  
>  union fpureg {
> +	__u8    val8[FPU_REG_WIDTH / 8];
> +	__u16   val16[FPU_REG_WIDTH / 16];
>  	__u32	val32[FPU_REG_WIDTH / 32];
>  	__u64	val64[FPU_REG_WIDTH / 64];
>  };
> diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
> index 89c22433b1c6..7ab6987cb7d5 100644
> --- a/arch/mips/include/uapi/asm/inst.h
> +++ b/arch/mips/include/uapi/asm/inst.h
> @@ -58,6 +58,7 @@ enum spec_op {
>  	dsll_op, spec7_unused_op, dsrl_op, dsra_op,
>  	dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op
>  };
> +#define msa_op  mdmx_op
>  
>  /*
>   * func field of spec2 opcode.
> @@ -217,6 +218,14 @@ enum bshfl_func {
>  };
>  
>  /*
> + * func field for MSA MI10 format
> + */
> +enum msa_mi10_func {
> +	msa_ld_op = 8,
> +	msa_st_op = 9,

Most other opcode enumerations in this file are specified in hexadecimal.

> +};
> +
> +/*
>   * (microMIPS) Major opcodes.
>   */
>  enum mm_major_op {
> @@ -616,6 +625,17 @@ struct spec3_format {   /* SPEC3 */
>  	;)))))
>  };
>  
> +struct msa_mi10_format {        /* MSA */
> +	__BITFIELD_FIELD(unsigned int opcode : 6,
> +	__BITFIELD_FIELD(signed int s10 : 10,
> +	__BITFIELD_FIELD(unsigned int rs : 5,
> +	__BITFIELD_FIELD(unsigned int wd : 5,
> +	__BITFIELD_FIELD(unsigned int func : 4,
> +	__BITFIELD_FIELD(unsigned int df : 2,
> +	;))))))
> +};
> +
> +
>  /*
>   * microMIPS instruction formats (32-bit length)
>   *
> @@ -884,6 +904,7 @@ union mips_instruction {
>  	struct p_format p_format;
>  	struct f_format f_format;
>  	struct ma_format ma_format;
> +	struct msa_mi10_format msa_mi10_format;
>  	struct b_format b_format;
>  	struct ps_format ps_format;
>  	struct v_format v_format;
> diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
> index 6c160c67984c..5f48f45f81e7 100644
> --- a/arch/mips/kernel/r4k_fpu.S
> +++ b/arch/mips/kernel/r4k_fpu.S
> @@ -13,6 +13,7 @@
>   * Copyright (C) 1999, 2001 Silicon Graphics, Inc.
>   */
>  #include <asm/asm.h>
> +#include <asm/asmmacro.h>
>  #include <asm/errno.h>
>  #include <asm/fpregdef.h>
>  #include <asm/mipsregs.h>
> @@ -268,6 +269,112 @@ LEAF(_restore_fp_context32)
>  	END(_restore_fp_context32)
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +
> +	.macro  msa_ld_d    wd, base
> +	ld_d    \wd, 0, \base
> +	jalr    $0, $31

Why not just:
	jr	ra

like every other function in that file? I hope jr would be encoded
correctly on r6 automatically?

> +	  nop

I think a single extra space of indentation for delay slots is the
convention, rather than 2. Same below.

> +	.align  4

Doesn't this mean the first one & its label might not be suitably aligned?
Would it be better to put this before the ld_d (no need for it after the
$w31 case) and to put another .align 4 before the Lmsa_to and Lmsa_from
labels (so the label itself is aligned)?
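
Something like this, perhaps (untested sketch, also folding in the jr
suggestion above):

	.macro  msa_ld_d    wd, base
	.align  4
	ld_d    \wd, 0, \base
	jr      ra
	 nop
	.endm

	...

	.align  4
Lmsa_to:
	msa_ld_d    0, a1
	msa_ld_d    1, a1
	...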

> +	.endm
> +
> +	.macro  msa_st_d    wd, base
> +	st_d    \wd, 0, \base
> +	jalr    $0, $31
> +	  nop
> +	.align  4

same comments as above.

> +	.endm
> +
> +LEAF(msa_to_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_to
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0

Likewise here, "jr t0"? and same for msa_from_wd

> +	  nop
> +Lmsa_to:
> +	msa_ld_d    0, a1
> +	msa_ld_d    1, a1
> +	msa_ld_d    2, a1
> +	msa_ld_d    3, a1
> +	msa_ld_d    4, a1
> +	msa_ld_d    5, a1
> +	msa_ld_d    6, a1
> +	msa_ld_d    7, a1
> +	msa_ld_d    8, a1
> +	msa_ld_d    9, a1
> +	msa_ld_d    10, a1
> +	msa_ld_d    11, a1
> +	msa_ld_d    12, a1
> +	msa_ld_d    13, a1
> +	msa_ld_d    14, a1
> +	msa_ld_d    15, a1
> +	msa_ld_d    16, a1
> +	msa_ld_d    17, a1
> +	msa_ld_d    18, a1
> +	msa_ld_d    19, a1
> +	msa_ld_d    20, a1
> +	msa_ld_d    21, a1
> +	msa_ld_d    22, a1
> +	msa_ld_d    23, a1
> +	msa_ld_d    24, a1
> +	msa_ld_d    25, a1
> +	msa_ld_d    26, a1
> +	msa_ld_d    27, a1
> +	msa_ld_d    28, a1
> +	msa_ld_d    29, a1
> +	msa_ld_d    30, a1
> +	msa_ld_d    31, a1
> +	.set    pop
> +	END(msa_to_wd)
> +
> +LEAF(msa_from_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_from
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0
> +	  nop
> +Lmsa_from:
> +	msa_st_d    0, a1
> +	msa_st_d    1, a1
> +	msa_st_d    2, a1
> +	msa_st_d    3, a1
> +	msa_st_d    4, a1
> +	msa_st_d    5, a1
> +	msa_st_d    6, a1
> +	msa_st_d    7, a1
> +	msa_st_d    8, a1
> +	msa_st_d    9, a1
> +	msa_st_d    10, a1
> +	msa_st_d    11, a1
> +	msa_st_d    12, a1
> +	msa_st_d    13, a1
> +	msa_st_d    14, a1
> +	msa_st_d    15, a1
> +	msa_st_d    16, a1
> +	msa_st_d    17, a1
> +	msa_st_d    18, a1
> +	msa_st_d    19, a1
> +	msa_st_d    20, a1
> +	msa_st_d    21, a1
> +	msa_st_d    22, a1
> +	msa_st_d    23, a1
> +	msa_st_d    24, a1
> +	msa_st_d    25, a1
> +	msa_st_d    26, a1
> +	msa_st_d    27, a1
> +	msa_st_d    28, a1
> +	msa_st_d    29, a1
> +	msa_st_d    30, a1
> +	msa_st_d    31, a1
> +	.set    pop
> +	END(msa_from_wd)
> +
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	.set	reorder
>  
>  	.type	fault@function
> diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
> index e11906dff885..558f41fa93c5 100644
> --- a/arch/mips/kernel/unaligned.c
> +++ b/arch/mips/kernel/unaligned.c
> @@ -108,6 +108,11 @@ static u32 unaligned_action;
>  #endif
>  extern void show_registers(struct pt_regs *regs);
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +void msa_to_wd(unsigned int wd, union fpureg *from);
> +void msa_from_wd(unsigned int wd, union fpureg *to);
> +#endif
> +
>  #ifdef __BIG_ENDIAN
>  #define     LoadHW(addr, value, res)  \
>  		__asm__ __volatile__ (".set\tnoat\n"        \
> @@ -422,6 +427,64 @@ extern void show_registers(struct pt_regs *regs);
>  		: "r" (value), "r" (addr), "i" (-EFAULT));
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +#ifdef __BIG_ENDIAN
> +/*
> + * MSA data format conversion.
> + * Only for BIG ENDIAN - LITTLE ENDIAN has register format which matches memory
> + * layout contiguosly.

contiguously

> + *
> + * Conversion is done between two Double words and other formats (W/H/B)
> + * because kernel uses LD.D and ST.D to load/store MSA registers and keeps
> + * MSA registers in this format in current->thread.fpu.fpr
> + */
> +static void msa_convert(union fpureg *to, union fpureg *from, int fmt)
> +{
> +	switch (fmt) {
> +	case 0: /* byte */
> +		to->val8[0] = from->val8[7];
> +		to->val8[1] = from->val8[6];
> +		to->val8[2] = from->val8[5];
> +		to->val8[3] = from->val8[4];
> +		to->val8[4] = from->val8[3];
> +		to->val8[5] = from->val8[2];
> +		to->val8[6] = from->val8[1];
> +		to->val8[7] = from->val8[0];
> +		to->val8[8] = from->val8[15];
> +		to->val8[9] = from->val8[14];
> +		to->val8[10] = from->val8[13];
> +		to->val8[11] = from->val8[12];
> +		to->val8[12] = from->val8[11];
> +		to->val8[13] = from->val8[10];
> +		to->val8[14] = from->val8[9];
> +		to->val8[15] = from->val8[8];
> +		break;
> +
> +	case 1: /* halfword */
> +		to->val16[0] = from->val16[3];
> +		to->val16[1] = from->val16[2];
> +		to->val16[2] = from->val16[1];
> +		to->val16[3] = from->val16[0];
> +		to->val16[4] = from->val16[7];
> +		to->val16[5] = from->val16[6];
> +		to->val16[6] = from->val16[5];
> +		to->val16[7] = from->val16[4];
> +		break;
> +
> +	case 2: /* word */
> +		to->val32[0] = from->val32[1];
> +		to->val32[1] = from->val32[0];
> +		to->val32[2] = from->val32[3];
> +		to->val32[3] = from->val32[2];

FWIW since the FP/MSA patches that Paul submitted, there are also
working endian agnostic accessors created with BUILD_FPR_ACCESS, which
use the FPR_IDX macro (see http://patchwork.linux-mips.org/patch/9169/),
which should work for 8bit and 16bit sizes too.

I wonder if the compiler would unroll/optimise this sort of thing:
	for (i = 0; i < (FPU_REG_WIDTH / 8); ++i)
		to_val8[i] = from->val[FPR_IDX(8, i)];

No worries if not.
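
For the byte case it would reduce to something along these lines (rough
sketch only, assuming FPR_IDX() works for 8-bit widths and using the val8
member added above):

	int i;

	for (i = 0; i < (FPU_REG_WIDTH / 8); i++)
		to->val8[i] = from->val8[FPR_IDX(8, i)];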

> +		break;
> +
> +	case 3: /* doubleword, no conversion */
> +		break;

don't you still need to copy the value though?
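
i.e. presumably something like:

	case 3: /* doubleword */
		to->val64[0] = from->val64[0];
		to->val64[1] = from->val64[1];
		break;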

> +	}
> +}
> +#endif
> +#endif
> +
>  static void emulate_load_store_insn(struct pt_regs *regs,
>  	void __user *addr, unsigned int __user *pc)
>  {
> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  #ifdef	CONFIG_EVA
>  	mm_segment_t seg;
>  #endif
> +#ifdef CONFIG_CPU_HAS_MSA
> +	union fpureg msadatabase[2], *msadata;
> +	unsigned int func, df, rs, wd;
> +#endif
>  	origpc = (unsigned long)pc;
>  	orig31 = regs->regs[31];
>  
> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  			break;
>  		return;
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +	case msa_op:
> +		if (cpu_has_mdmx)
> +			goto sigill;
> +
> +		func = insn.msa_mi10_format.func;
> +		switch (func) {
> +		default:
> +			goto sigbus;
> +
> +		case msa_ld_op:
> +		case msa_st_op:
> +			;
> +		}
> +
> +		if (!thread_msa_context_live())
> +			goto sigbus;

Will this ever happen? (I can't see AdE handler enabling interrupts).

If the MSA context genuinely isn't live (i.e. it can be considered
UNPREDICTABLE), then surely a load operation should still succeed?

> +
> +		df = insn.msa_mi10_format.df;
> +		rs = insn.msa_mi10_format.rs;
> +		wd = insn.msa_mi10_format.wd;
> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));

"* (1 << df)"?
why not just "<< df"?
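
i.e.:

	addr = (unsigned long *)(regs->regs[rs] +
				 (insn.msa_mi10_format.s10 << df));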

> +		/* align a working space in stack... */
> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);

Maybe you could just use __aligned(16) on a single local union fpureg.
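
Something like (sketch - it would also let you drop the msadatabase[2]
trick, subject to the toolchain actually honouring it on the stack):

	union fpureg msadata __aligned(16);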

> +		if (func == msa_ld_op) {
> +			if (!access_ok(VERIFY_READ, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);
> +			res = __copy_from_user_inatomic(msadata, addr, 16);
> +			if (res)
> +				goto fault;
> +			preempt_disable();
> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +				msa_to_wd(wd, &current->thread.fpu.fpr[wd]);
> +#else
> +				msa_to_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +#else
> +				current->thread.fpu.fpr[wd] = *msadata;
> +#endif

I'm not a fan of the ifdefs, but I can see it's awkward to abstract
msa_convert without causing extra copies (although I don't think it's a
critical code path).

> +			}
> +		} else {
> +			if (!access_ok(VERIFY_WRITE, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);

forgot to preempt_disable()?
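
i.e. presumably the store path should mirror the load path here:

	compute_return_epc(regs);
	preempt_disable();
	if (test_thread_flag(TIF_USEDMSA)) {
		...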

> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				msa_from_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				*msadata = current->thread.fpu.fpr[wd];

hmm, you could cheat and change this to the following?:
				msadata = &current->thread.fpu.fpr[wd];

> +#endif
> +			}
> +			res = __copy_to_user_inatomic(addr, msadata, 16);
> +			if (res)
> +				goto fault;
> +		}
> +
> +		break;
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	/*
>  	 * COP2 is available to implementor for application specific use.
>  	 * It's up to applications to register a notifier chain and do
> 
> 

Cheers
James



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18  1:16 [PATCH] MIPS: MSA: misaligned support Leonid Yegoshin
  2015-03-18 11:27 ` James Hogan
@ 2015-03-18 11:41 ` James Hogan
  1 sibling, 0 replies; 8+ messages in thread
From: James Hogan @ 2015-03-18 11:41 UTC (permalink / raw)
  To: Leonid Yegoshin, linux-mips, wangr, peterz, qais.yousef,
	linux-kernel, ralf, davidlohr, chenhc, manuel.lauss, mingo

Hi Leonid,

On 18/03/15 01:16, Leonid Yegoshin wrote:
> MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address exception
> on unalaigned MSA load/store operations - from none unaligned up to
> full support in HW. In practice, it is expected that HW can occasionally
> triggers AdE for non-aligned data access (misalignment). It is usually
> expected on page boundaries because HW handling of two TLBs in single
> data access operation may be complicated and expensive.
> 
> So, this patch handles MSA LD.df and ST.df Address Error exceptions.
> 
> It handles separately two cases - MSA owned by thread and MSA registers
> saved in current->thread.fpu. If thread still ownes MSA unit then it
> loads and stores directly with MSA unit and only one MSA register. Saving
> and restoring the full MSA context (512bytes) on each misalign exception
> is expensive! Preemption is disabled, of course.
> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
> ---
>  arch/mips/include/asm/processor.h |    2 +
>  arch/mips/include/uapi/asm/inst.h |   21 +++++
>  arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
>  arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 273 insertions(+)

Please also run checkpatch (a few long lines) and rebase on v4.0-rcX
(minor conflict in unaligned.c).
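
e.g. (patch file name assumed):

	$ scripts/checkpatch.pl 0001-MIPS-MSA-misaligned-support.patch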

Thanks
James



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 11:27 ` James Hogan
@ 2015-03-18 19:46   ` Leonid Yegoshin
  2015-03-18 22:12     ` James Hogan
  0 siblings, 1 reply; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-18 19:46 UTC (permalink / raw)
  To: James Hogan, linux-mips, wangr, peterz, Qais Yousef,
	linux-kernel, ralf, davidlohr, chenhc, manuel.lauss, mingo

On 03/18/2015 04:27 AM, James Hogan wrote:
>
> +	.align  4
> doesn't this mean the first one & label might not be suitably aligned.
> Would it be better to put this before the ld_d (no need for it after
> $w31 case) and putting another .align 4 before the Lmsa_to and Lmsa_from
> labels (so the label itself is aligned)?

Good point! I will issue V2.

>
> +
> +	case 2: /* word */
> +		to->val32[0] = from->val32[1];
> +		to->val32[1] = from->val32[0];
> +		to->val32[2] = from->val32[3];
> +		to->val32[3] = from->val32[2];
> FWIW since the FP/MSA patches that Paul submitted, there are also
> working endian agnostic accessors created with BUILD_FPR_ACCESS, which
> use the FPR_IDX macro (see http://patchwork.linux-mips.org/patch/9169/),
> which should work for 8bit and 16bit sizes too.
>
> I wonder if the compiler would unroll/optimise this sort of thing:
> 	for (i = 0; i < (FPU_REG_WIDTH / 8); ++i)
> 		to_val8[i] = from->val[FPR_IDX(8, i)];
>
> No worries if not.

There is a simple logic behind it - rolling code into a macro/subroutine
makes sense if any of the following holds:

- the code can't be taken in at a single glance (more than half a page or
  window)
- there is a chance to reuse some part of it
- some code vars/limits/bit positions/etc. can change in the future
- a logical connection to a macro/subroutine is desirable to draw
  attention during code maintenance.

None of these besides the last one is true here. The logical connection to
union fpureg is already there. Changing the MSA register size would
definitely cause a lot of other changes in logic, and many assumptions
would be broken in multiple places, including process signal conventions.

Rolling the code into a macro (FPR_IDX) and turning a pretty short set of
assignments into a loop actually INCREASES code complexity for future
maintenance.


>
>> +		break;
>> +
>> +	case 3: /* doubleword, no conversion */
>> +		break;
> don't you still need to copy the value though?

Good point! I have never tested this subroutine yet - the HW team still
has to produce a BIG ENDIAN CPU+MSA variant.
I will issue V2.

>
>> +	}
>> +}
>> +#endif
>> +#endif
>> +
>>   static void emulate_load_store_insn(struct pt_regs *regs,
>>   	void __user *addr, unsigned int __user *pc)
>>   {
>> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>>   #ifdef	CONFIG_EVA
>>   	mm_segment_t seg;
>>   #endif
>> +#ifdef CONFIG_CPU_HAS_MSA
>> +	union fpureg msadatabase[2], *msadata;
>> +	unsigned int func, df, rs, wd;
>> +#endif
>>   	origpc = (unsigned long)pc;
>>   	orig31 = regs->regs[31];
>>   
>> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>>   			break;
>>   		return;
>>   
>> +#ifdef CONFIG_CPU_HAS_MSA
>> +	case msa_op:
>> +		if (cpu_has_mdmx)
>> +			goto sigill;
>> +
>> +		func = insn.msa_mi10_format.func;
>> +		switch (func) {
>> +		default:
>> +			goto sigbus;
>> +
>> +		case msa_ld_op:
>> +		case msa_st_op:
>> +			;
>> +		}
>> +
>> +		if (!thread_msa_context_live())
>> +			goto sigbus;
> Will this ever happen? (I can't see AdE handler enabling interrupts).
>
> If the MSA context genuinely isn't live (i.e. it can be considered
> UNPREDICTABLE), then surely a load operation should still succeed?

thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
an MSA context for the thread.
It is different from "MSA is owned by the thread"; it only says that the
thread has already initialized MSA.

Unfortunate choice of function name, I believe.
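
(For reference, the helper is essentially just the flag test - roughly:

	static inline int thread_msa_context_live(void)
	{
		return test_thread_flag(TIF_MSA_CTX_LIVE);
	}

modulo the cpu_has_msa short-circuit.)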

This is a guard against a bad choice of exception priorities in HW...
sometimes it happens.

>
>> +
>> +		df = insn.msa_mi10_format.df;
>> +		rs = insn.msa_mi10_format.rs;
>> +		wd = insn.msa_mi10_format.wd;
>> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));
> "* (1 << df)"?
> why not just "<< df"?
>
>> +		/* align a working space in stack... */
>> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);
> Maybe you could just use __aligned(16) on a single local union fpureg.

I am not sure that it works on the stack. I don't trust the toolchain
here - it is not even able to align a stack frame to 16 bytes.

>
>> +			}
>> +		} else {
>> +			if (!access_ok(VERIFY_WRITE, addr, 16))
>> +				goto sigbus;
>> +			compute_return_epc(regs);
> forgot to preempt_disable()?

Yes.

>
>> +			if (test_thread_flag(TIF_USEDMSA)) {
>> +#ifdef __BIG_ENDIAN
>> +				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
>> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
>> +#else
>> +				msa_from_wd(wd, msadata);
>> +#endif
>> +				preempt_enable();
>> +			} else {
>> +				preempt_enable();
>> +#ifdef __BIG_ENDIAN
>> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
>> +#else
>> +				*msadata = current->thread.fpu.fpr[wd];
> hmm, you could cheat and change this to the following?:
> 				msadata = &current->thread.fpu.fpr[wd];
Yes. But does it make sense? It just replaces two doubleword copies with
a single pointer assignment. However, if msadata is not changed, it gives
the compiler some room for optimization.

- Leonid.



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 19:46   ` Leonid Yegoshin
@ 2015-03-18 22:12     ` James Hogan
  2015-03-18 23:25       ` Leonid Yegoshin
  0 siblings, 1 reply; 8+ messages in thread
From: James Hogan @ 2015-03-18 22:12 UTC (permalink / raw)
  To: Leonid Yegoshin
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

Hi Leonid,

On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
> On 03/18/2015 04:27 AM, James Hogan wrote:
> >
> >> +		break;
> >> +
> >> +	case 3: /* doubleword, no conversion */
> >> +		break;
> > don't you still need to copy the value though?
> 
> Good point! Never test this subroutine yet, HW team still should produce 
> BIG ENDIAN CPU+MSA variant.

P5600 big endian should be usable on Malta (I've run it before).

Failing that, it should be pretty easy to force qemu to trigger an AdE
exception on all unaligned ld/st in order to test this.

> I will issue V2.
> 
> >
> >> +	}
> >> +}
> >> +#endif
> >> +#endif
> >> +
> >>   static void emulate_load_store_insn(struct pt_regs *regs,
> >>   	void __user *addr, unsigned int __user *pc)
> >>   {
> >> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
> >>   #ifdef	CONFIG_EVA
> >>   	mm_segment_t seg;
> >>   #endif
> >> +#ifdef CONFIG_CPU_HAS_MSA
> >> +	union fpureg msadatabase[2], *msadata;
> >> +	unsigned int func, df, rs, wd;
> >> +#endif
> >>   	origpc = (unsigned long)pc;
> >>   	orig31 = regs->regs[31];
> >>   
> >> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
> >>   			break;
> >>   		return;
> >>   
> >> +#ifdef CONFIG_CPU_HAS_MSA
> >> +	case msa_op:
> >> +		if (cpu_has_mdmx)
> >> +			goto sigill;
> >> +
> >> +		func = insn.msa_mi10_format.func;
> >> +		switch (func) {
> >> +		default:
> >> +			goto sigbus;
> >> +
> >> +		case msa_ld_op:
> >> +		case msa_st_op:
> >> +			;
> >> +		}
> >> +
> >> +		if (!thread_msa_context_live())
> >> +			goto sigbus;
> > Will this ever happen? (I can't see AdE handler enabling interrupts).
> >
> > If the MSA context genuinely isn't live (i.e. it can be considered
> > UNPREDICTABLE), then surely a load operation should still succeed?
> 
> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of 
> MSA context for thread.
> It differs from MSA is owned by thread, it just says that thread has 
> already initialized MSA.
> 
> Unfortunate choice of function name, I believe.

Right (I mis-read when it's cleared when I grepped). Still, that would
make it even harder to hit since lose_fpu wouldn't clear it, and you
already would've taken an MSA disabled exception first.

Anyway, my point was that there's nothing invalid about an unaligned
load being the first MSA instruction. You might use it to load the
initial vector state.
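
e.g. (hypothetical user code):

	ld.b	$w0, 1($a0)	# first MSA instruction executed, unaligned address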

> 
> This is a guard against bad selection of exception priorities in HW... 
> sometime it happens.
> 
> >
> >> +
> >> +		df = insn.msa_mi10_format.df;
> >> +		rs = insn.msa_mi10_format.rs;
> >> +		wd = insn.msa_mi10_format.wd;
> >> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));
> > "* (1 << df)"?
> > why not just "<< df"?
> >
> >> +		/* align a working space in stack... */
> >> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);
> > Maybe you could just use __aligned(16) on a single local union fpureg.
> 
> I am not sure that it works in stack. I don't trust toolchain here - 
> they even are not able to align a frame in stack to 16bytes.

I did wonder that, but found the following bug report which seems to
indicate that it was fixed generically in GCC 4.6:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=16660

Unfortunately Linux supports MSA built with a toolchain that doesn't, so
that may not be good enough, I don't know :-(.

Cheers
James


* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 22:12     ` James Hogan
@ 2015-03-18 23:25       ` Leonid Yegoshin
  2015-03-19  9:51         ` James Hogan
  0 siblings, 1 reply; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-18 23:25 UTC (permalink / raw)
  To: James Hogan
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

On 03/18/2015 03:12 PM, James Hogan wrote:
> Hi Leonid,
>
> On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
>
>> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
>> MSA context for thread.
>> It differs from MSA is owned by thread, it just says that thread has
>> already initialized MSA.
>>
>> Unfortunate choice of function name, I believe.
> Right (I mis-read when its cleared when i grepped). Still, that would
> make it even harder to hit since lose_fpu wouldn't clear it, and you
> already would've taken an MSA disabled exception first.
No, lose_fpu disables MSA now, saves MSA context and switches off 
TIF_USEDMSA. See 33c771ba5c5d067f85a5a6c4b11047219b5b8f4e, "MIPS: 
save/disable MSA in lose_fpu".
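
Roughly, the MSA part of lose_fpu() now does (conceptual outline only,
not the exact code):

	if (is_msa_enabled()) {
		if (save)
			save_msa(current);
		disable_msa();
		clear_thread_flag(TIF_USEDMSA);
	}
	/* TIF_MSA_CTX_LIVE is left set - the saved context stays valid */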

However, a process still has its MSA context initialized, and that is
indicated by TIF_MSA_CTX_LIVE.
It must have it before it can get any AdE exception on an MSA instruction.

>
> Anyway, my point was that there's nothing invalid about an unaligned
> load being the first MSA instruction. You might use it to load the
> initial vector state.

No, it is invalid. If MSA is disabled it should trigger an "MSA Disabled"
exception.

Unfortunately, some HW versions raise AdE first, and that may be logical
from a HW point of view (if the access is done before the instruction is
completely decoded). But that is wrong.




* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-18 23:25       ` Leonid Yegoshin
@ 2015-03-19  9:51         ` James Hogan
  2015-03-19 23:23           ` Leonid Yegoshin
  0 siblings, 1 reply; 8+ messages in thread
From: James Hogan @ 2015-03-19  9:51 UTC (permalink / raw)
  To: Leonid Yegoshin
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

On 18/03/15 23:25, Leonid Yegoshin wrote:
> On 03/18/2015 03:12 PM, James Hogan wrote:
>> Hi Leonid,
>>
>> On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
>>
>>> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
>>> MSA context for thread.
>>> It differs from MSA is owned by thread, it just says that thread has
>>> already initialized MSA.
>>>
>>> Unfortunate choice of function name, I believe.
>> Right (I mis-read when its cleared when i grepped). Still, that would
>> make it even harder to hit since lose_fpu wouldn't clear it, and you
>> already would've taken an MSA disabled exception first.
> No, lose_fpu disables MSA now, saves MSA context and switches off
> TIF_USEDMSA. See 33c771ba5c5d067f85a5a6c4b11047219b5b8f4e, "MIPS:
> save/disable MSA in lose_fpu".
> 
> However, a process still has MSA context initialized and it is indicated
> by TIF_MSA_CTX_LIVE.
> It should have it before it can get any AdE exception on MSA instruction.

Yes, exactly.

> 
>>
>> Anyway, my point was that there's nothing invalid about an unaligned
>> load being the first MSA instruction. You might use it to load the
>> initial vector state.
> 
> No, it is invalid. If MSA is disabled it should trigger "MSA Disabled"
> exception.

It's valid for the user to start their program with a ld.b.
As you say, it'll raise an MSA disabled exception first though. The
handler will own MSA, and set TIF_MSA_CTX_LIVE, which makes the check
pointless?

I suppose an AdE from a normal unaligned load could still race with
another thread modifying the instruction to an MSA ld.b, but even if it
did, I don't think it would do any harm?

> 
> Unfortunately, some HW versions had AdE first and it may be logical from
> some HW point (if access is done before instruction is completely
> decoded). But that is wrong.

Yes, MSA Disabled would clearly come under "Instruction Validity
Exceptions", which is very sensibly higher priority than "Address error
- Data access".

Anyway, at the very least it needs a comment to justify what it is
trying to catch and what harm it is trying to avoid, since it isn't
obvious, and tbh seems pointless.

Cheers
James



* Re: [PATCH] MIPS: MSA: misaligned support
  2015-03-19  9:51         ` James Hogan
@ 2015-03-19 23:23           ` Leonid Yegoshin
  0 siblings, 0 replies; 8+ messages in thread
From: Leonid Yegoshin @ 2015-03-19 23:23 UTC (permalink / raw)
  To: James Hogan
  Cc: linux-mips, wangr, peterz, Qais Yousef, linux-kernel, ralf,
	davidlohr, chenhc, manuel.lauss, mingo

On 03/19/2015 02:51 AM, James Hogan wrote:
> On 18/03/15 23:25, Leonid Yegoshin wrote:
>> On 03/18/2015 03:12 PM, James Hogan wrote:
>>> Hi Leonid,
>>>
>>> On Wed, Mar 18, 2015 at 12:46:51PM -0700, Leonid Yegoshin wrote:
>>>
>>>> thread_msa_context_live() == check of TIF_MSA_CTX_LIVE == existence of
>>>> MSA context for thread.
>>>> It differs from MSA is owned by thread, it just says that thread has
>>>> already initialized MSA.
>>>>
>>>> Unfortunate choice of function name, I believe.
>>> Right (I mis-read when its cleared when i grepped). Still, that would
>>> make it even harder to hit since lose_fpu wouldn't clear it, and you
>>> already would've taken an MSA disabled exception first.
>> No, lose_fpu disables MSA now, saves MSA context and switches off
>> TIF_USEDMSA. See 33c771ba5c5d067f85a5a6c4b11047219b5b8f4e, "MIPS:
>> save/disable MSA in lose_fpu".
>>
>> However, a process still has MSA context initialized and it is indicated
>> by TIF_MSA_CTX_LIVE.
>> It should have it before it can get any AdE exception on MSA instruction.
> Yes, exactly.
>
>>> Anyway, my point was that there's nothing invalid about an unaligned
>>> load being the first MSA instruction. You might use it to load the
>>> initial vector state.
>> No, it is invalid. If MSA is disabled it should trigger "MSA Disabled"
>> exception.
> It's valid for the user to start their program with a ld.b.
> As you say, it'll raise an MSA disabled exception first though. The
> handler will own MSA, and set TIF_MSA_CTX_LIVE, which makes the check
> pointless?

"Unfortunately, some HW versions had AdE first and it may be logical 
from some HW point (if access is done before instruction is completely 
decoded). But that is wrong."

>
> I suppose an AdE from a normal unaligned load could still race with
> another thread modifying the instruction to an MSA ld.b, but even if it
> did, I don't think it would do any harm?
>
>> Unfortunately, some HW versions had AdE first and it may be logical from
>> some HW point (if access is done before instruction is completely
>> decoded). But that is wrong.
> Yes, MSA Disabled would clearly come under "Instruction Validity
> Exceptions", which is very sensibly higher priority than "Address error
> - Data access".
>
> Anyway, at the very least it needs a comment to justify what it is
> trying to catch and what harm it is trying to avoid, since it isn't
> obvious, and tbh seems pointless.
>
> Cheers
> James
>


