All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions
@ 2011-02-14 10:22 Peter Maydell
  2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions Peter Maydell
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Peter Maydell @ 2011-02-14 10:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: patches

This patch series fixes bugs in the Neon VZIP and VUZP instructions
by abandoning the existing inline implementations in favour of
calling out to straightforward helper functions. The inline routines
could generate 50+ TCG ops each, which is well over the recommended
limit in tcg/README for using helpers instead; they also did
not give the correct results...

I've tested these patches using the usual random instruction
generation approach.

V2 changes: moved the decoding of register numbers, size and q
flag out of the helper functions into translate.c; split the
helpers up into one per (size, q) combination.

Peter Maydell (2):
  target-arm: Move Neon VUZP to helper functions
  target-arm: Move Neon VZIP to helper functions

 target-arm/helpers.h     |   11 +++
 target-arm/neon_helper.c |  186 +++++++++++++++++++++++++++++++++++++++
 target-arm/translate.c   |  215 +++++++++++++++-------------------------------
 3 files changed, 267 insertions(+), 145 deletions(-)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions
  2011-02-14 10:22 [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Peter Maydell
@ 2011-02-14 10:22 ` Peter Maydell
  2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP " Peter Maydell
  2011-02-20 16:35 ` [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Aurelien Jarno
  2 siblings, 0 replies; 4+ messages in thread
From: Peter Maydell @ 2011-02-14 10:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: patches

Move the implementation of the Neon VUZP unzip instruction from inline
code to helper functions. (At 50+ TCG ops it was well over the
recommended limit for coding inline.) The helper implementations also
fix the handling of the quadword version of the instruction.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target-arm/helpers.h     |    6 +++
 target-arm/neon_helper.c |   94 ++++++++++++++++++++++++++++++++++++++
 target-arm/translate.c   |  112 +++++++++++++++-------------------------------
 3 files changed, 137 insertions(+), 75 deletions(-)

diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index 77f1635..88c2fce 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -460,4 +460,10 @@ DEF_HELPER_3(iwmmxt_muladdswl, i64, i64, i32, i32)
 
 DEF_HELPER_2(set_teecr, void, env, i32)
 
+DEF_HELPER_3(neon_unzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_unzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip32, void, env, i32, i32)
+
 #include "def-helper.h"
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index dc09968..6e2a2fd 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1663,3 +1663,97 @@ uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
     float32 f1 = float32_abs(vfp_itos(b));
     return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
 }
+
+#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
+
+void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
+        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
+        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
+        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
+    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
+        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
+        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
+        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
+    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
+        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
+        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
+        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
+    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
+        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
+        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
+        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
+        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
+    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
+        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
+    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
+        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
+    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
+        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
+    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
+    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
+    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm = float64_val(env->vfp.regs[rm]);
+    uint64_t zd = float64_val(env->vfp.regs[rd]);
+    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
+        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
+        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
+        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
+    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
+        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
+        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
+        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rd] = make_float64(d0);
+}
+
+void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm = float64_val(env->vfp.regs[rm]);
+    uint64_t zd = float64_val(env->vfp.regs[rd]);
+    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
+        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
+    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
+        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rd] = make_float64(d0);
+}
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 362d1d0..6d94223 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3614,40 +3614,43 @@ static inline TCGv neon_get_scalar(int size, int reg)
     return tmp;
 }
 
-static void gen_neon_unzip_u8(TCGv t0, TCGv t1)
+static int gen_neon_unzip(int rd, int rm, int size, int q)
 {
-    TCGv rd, rm, tmp;
-
-    rd = new_tmp();
-    rm = new_tmp();
-    tmp = new_tmp();
-
-    tcg_gen_andi_i32(rd, t0, 0xff);
-    tcg_gen_shri_i32(tmp, t0, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff00);
-    tcg_gen_or_i32(rd, rd, tmp);
-    tcg_gen_shli_i32(tmp, t1, 16);
-    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
-    tcg_gen_or_i32(rd, rd, tmp);
-    tcg_gen_shli_i32(tmp, t1, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff000000);
-    tcg_gen_or_i32(rd, rd, tmp);
-
-    tcg_gen_shri_i32(rm, t0, 8);
-    tcg_gen_andi_i32(rm, rm, 0xff);
-    tcg_gen_shri_i32(tmp, t0, 16);
-    tcg_gen_andi_i32(tmp, tmp, 0xff00);
-    tcg_gen_or_i32(rm, rm, tmp);
-    tcg_gen_shli_i32(tmp, t1, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
-    tcg_gen_or_i32(rm, rm, tmp);
-    tcg_gen_andi_i32(tmp, t1, 0xff000000);
-    tcg_gen_or_i32(t1, rm, tmp);
-    tcg_gen_mov_i32(t0, rd);
-
-    dead_tmp(tmp);
-    dead_tmp(rm);
-    dead_tmp(rd);
+    TCGv tmp, tmp2;
+    if (size == 3 || (!q && size == 2)) {
+        return 1;
+    }
+    tmp = tcg_const_i32(rd);
+    tmp2 = tcg_const_i32(rm);
+    if (q) {
+        switch (size) {
+        case 0:
+            gen_helper_neon_qunzip8(cpu_env, tmp, tmp2);
+            break;
+        case 1:
+            gen_helper_neon_qunzip16(cpu_env, tmp, tmp2);
+            break;
+        case 2:
+            gen_helper_neon_qunzip32(cpu_env, tmp, tmp2);
+            break;
+        default:
+            abort();
+        }
+    } else {
+        switch (size) {
+        case 0:
+            gen_helper_neon_unzip8(cpu_env, tmp, tmp2);
+            break;
+        case 1:
+            gen_helper_neon_unzip16(cpu_env, tmp, tmp2);
+            break;
+        default:
+            abort();
+        }
+    }
+    tcg_temp_free_i32(tmp);
+    tcg_temp_free_i32(tmp2);
+    return 0;
 }
 
 static void gen_neon_zip_u8(TCGv t0, TCGv t1)
@@ -3705,25 +3708,6 @@ static void gen_neon_zip_u16(TCGv t0, TCGv t1)
     dead_tmp(tmp);
 }
 
-static void gen_neon_unzip(int reg, int q, int tmp, int size)
-{
-    int n;
-    TCGv t0, t1;
-
-    for (n = 0; n < q + 1; n += 2) {
-        t0 = neon_load_reg(reg, n);
-        t1 = neon_load_reg(reg, n + 1);
-        switch (size) {
-        case 0: gen_neon_unzip_u8(t0, t1); break;
-        case 1: gen_neon_zip_u16(t0, t1); break; /* zip and unzip are the same.  */
-        case 2: /* no-op */; break;
-        default: abort();
-        }
-        neon_store_scratch(tmp + n, t0);
-        neon_store_scratch(tmp + n + 1, t1);
-    }
-}
-
 static void gen_neon_trn_u8(TCGv t0, TCGv t1)
 {
     TCGv rd, tmp;
@@ -5436,30 +5420,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     }
                     break;
                 case 34: /* VUZP */
-                    /* Reg  Before       After
-                       Rd   A3 A2 A1 A0  B2 B0 A2 A0
-                       Rm   B3 B2 B1 B0  B3 B1 A3 A1
-                     */
-                    if (size == 3)
+                    if (gen_neon_unzip(rd, rm, size, q)) {
                         return 1;
-                    gen_neon_unzip(rd, q, 0, size);
-                    gen_neon_unzip(rm, q, 4, size);
-                    if (q) {
-                        static int unzip_order_q[8] =
-                            {0, 2, 4, 6, 1, 3, 5, 7};
-                        for (n = 0; n < 8; n++) {
-                            int reg = (n < 4) ? rd : rm;
-                            tmp = neon_load_scratch(unzip_order_q[n]);
-                            neon_store_reg(reg, n % 4, tmp);
-                        }
-                    } else {
-                        static int unzip_order[4] =
-                            {0, 4, 1, 5};
-                        for (n = 0; n < 4; n++) {
-                            int reg = (n < 2) ? rd : rm;
-                            tmp = neon_load_scratch(unzip_order[n]);
-                            neon_store_reg(reg, n % 2, tmp);
-                        }
                     }
                     break;
                 case 35: /* VZIP */
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP to helper functions
  2011-02-14 10:22 [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Peter Maydell
  2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions Peter Maydell
@ 2011-02-14 10:22 ` Peter Maydell
  2011-02-20 16:35 ` [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Aurelien Jarno
  2 siblings, 0 replies; 4+ messages in thread
From: Peter Maydell @ 2011-02-14 10:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: patches

Move the implementation of the Neon VUZP unzip instruction from inline
code to helper functions. (At 50+ TCG ops it was well over the
recommended limit for coding inline.) The helper implementations also
give the correct answers where the inline implementation did not.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target-arm/helpers.h     |    5 ++
 target-arm/neon_helper.c |   92 ++++++++++++++++++++++++++++++++++++++
 target-arm/translate.c   |  109 +++++++++++++++-------------------------------
 3 files changed, 133 insertions(+), 73 deletions(-)

diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index 88c2fce..e494b47 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -465,5 +465,10 @@ DEF_HELPER_3(neon_unzip16, void, env, i32, i32)
 DEF_HELPER_3(neon_qunzip8, void, env, i32, i32)
 DEF_HELPER_3(neon_qunzip16, void, env, i32, i32)
 DEF_HELPER_3(neon_qunzip32, void, env, i32, i32)
+DEF_HELPER_3(neon_zip8, void, env, i32, i32)
+DEF_HELPER_3(neon_zip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip32, void, env, i32, i32)
 
 #include "def-helper.h"
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 6e2a2fd..cee9ee4 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1757,3 +1757,95 @@ void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
     env->vfp.regs[rm] = make_float64(m0);
     env->vfp.regs[rd] = make_float64(d0);
 }
+
+void HELPER(neon_qzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
+        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
+        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
+        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
+    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
+        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
+        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
+        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
+    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
+        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
+        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
+        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
+    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
+        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
+        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
+        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
+        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
+    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
+        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
+    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
+        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
+    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
+        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qzip32)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
+    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
+    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
+    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_zip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm = float64_val(env->vfp.regs[rm]);
+    uint64_t zd = float64_val(env->vfp.regs[rd]);
+    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
+        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
+        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
+        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
+    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
+        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
+        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
+        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rd] = make_float64(d0);
+}
+
+void HELPER(neon_zip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm = float64_val(env->vfp.regs[rm]);
+    uint64_t zd = float64_val(env->vfp.regs[rd]);
+    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
+        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
+    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
+        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rd] = make_float64(d0);
+}
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 6d94223..50c8fe0 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3653,59 +3653,43 @@ static int gen_neon_unzip(int rd, int rm, int size, int q)
     return 0;
 }
 
-static void gen_neon_zip_u8(TCGv t0, TCGv t1)
-{
-    TCGv rd, rm, tmp;
-
-    rd = new_tmp();
-    rm = new_tmp();
-    tmp = new_tmp();
-
-    tcg_gen_andi_i32(rd, t0, 0xff);
-    tcg_gen_shli_i32(tmp, t1, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff00);
-    tcg_gen_or_i32(rd, rd, tmp);
-    tcg_gen_shli_i32(tmp, t0, 16);
-    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
-    tcg_gen_or_i32(rd, rd, tmp);
-    tcg_gen_shli_i32(tmp, t1, 24);
-    tcg_gen_andi_i32(tmp, tmp, 0xff000000);
-    tcg_gen_or_i32(rd, rd, tmp);
-
-    tcg_gen_andi_i32(rm, t1, 0xff000000);
-    tcg_gen_shri_i32(tmp, t0, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
-    tcg_gen_or_i32(rm, rm, tmp);
-    tcg_gen_shri_i32(tmp, t1, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff00);
-    tcg_gen_or_i32(rm, rm, tmp);
-    tcg_gen_shri_i32(tmp, t0, 16);
-    tcg_gen_andi_i32(tmp, tmp, 0xff);
-    tcg_gen_or_i32(t1, rm, tmp);
-    tcg_gen_mov_i32(t0, rd);
-
-    dead_tmp(tmp);
-    dead_tmp(rm);
-    dead_tmp(rd);
-}
-
-static void gen_neon_zip_u16(TCGv t0, TCGv t1)
+static int gen_neon_zip(int rd, int rm, int size, int q)
 {
     TCGv tmp, tmp2;
-
-    tmp = new_tmp();
-    tmp2 = new_tmp();
-
-    tcg_gen_andi_i32(tmp, t0, 0xffff);
-    tcg_gen_shli_i32(tmp2, t1, 16);
-    tcg_gen_or_i32(tmp, tmp, tmp2);
-    tcg_gen_andi_i32(t1, t1, 0xffff0000);
-    tcg_gen_shri_i32(tmp2, t0, 16);
-    tcg_gen_or_i32(t1, t1, tmp2);
-    tcg_gen_mov_i32(t0, tmp);
-
-    dead_tmp(tmp2);
-    dead_tmp(tmp);
+    if (size == 3 || (!q && size == 2)) {
+        return 1;
+    }
+    tmp = tcg_const_i32(rd);
+    tmp2 = tcg_const_i32(rm);
+    if (q) {
+        switch (size) {
+        case 0:
+            gen_helper_neon_qzip8(cpu_env, tmp, tmp2);
+            break;
+        case 1:
+            gen_helper_neon_qzip16(cpu_env, tmp, tmp2);
+            break;
+        case 2:
+            gen_helper_neon_qzip32(cpu_env, tmp, tmp2);
+            break;
+        default:
+            abort();
+        }
+    } else {
+        switch (size) {
+        case 0:
+            gen_helper_neon_zip8(cpu_env, tmp, tmp2);
+            break;
+        case 1:
+            gen_helper_neon_zip16(cpu_env, tmp, tmp2);
+            break;
+        default:
+            abort();
+        }
+    }
+    tcg_temp_free_i32(tmp);
+    tcg_temp_free_i32(tmp2);
+    return 0;
 }
 
 static void gen_neon_trn_u8(TCGv t0, TCGv t1)
@@ -5425,29 +5409,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     }
                     break;
                 case 35: /* VZIP */
-                    /* Reg  Before       After
-                       Rd   A3 A2 A1 A0  B1 A1 B0 A0
-                       Rm   B3 B2 B1 B0  B3 A3 B2 A2
-                     */
-                    if (size == 3)
+                    if (gen_neon_zip(rd, rm, size, q)) {
                         return 1;
-                    count = (q ? 4 : 2);
-                    for (n = 0; n < count; n++) {
-                        tmp = neon_load_reg(rd, n);
-                        tmp2 = neon_load_reg(rd, n);
-                        switch (size) {
-                        case 0: gen_neon_zip_u8(tmp, tmp2); break;
-                        case 1: gen_neon_zip_u16(tmp, tmp2); break;
-                        case 2: /* no-op */; break;
-                        default: abort();
-                        }
-                        neon_store_scratch(n * 2, tmp);
-                        neon_store_scratch(n * 2 + 1, tmp2);
-                    }
-                    for (n = 0; n < count * 2; n++) {
-                        int reg = (n < count) ? rd : rm;
-                        tmp = neon_load_scratch(n);
-                        neon_store_reg(reg, n % count, tmp);
                     }
                     break;
                 case 36: case 37: /* VMOVN, VQMOVUN, VQMOVN */
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions
  2011-02-14 10:22 [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Peter Maydell
  2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions Peter Maydell
  2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP " Peter Maydell
@ 2011-02-20 16:35 ` Aurelien Jarno
  2 siblings, 0 replies; 4+ messages in thread
From: Aurelien Jarno @ 2011-02-20 16:35 UTC (permalink / raw)
  To: Peter Maydell; +Cc: qemu-devel, patches

On Mon, Feb 14, 2011 at 10:22:47AM +0000, Peter Maydell wrote:
> This patch series fixes bugs in the Neon VZIP and VUZP instructions
> by abandoning the existing inline implementations in favour of
> calling out to straightforward helper functions. The inline routines
> could generate 50+ TCG ops each, which is well over the recommended
> limit in tcg/README for using helpers instead; they also did
> not give the correct results...
> 
> I've tested these patches using the usual random instruction
> generation approach.
> 
> V2 changes: moved the decoding of register numbers, size and q
> flag out of the helper functions into translate.c; split the
> helpers up into one per (size, q) combination.
> 
> Peter Maydell (2):
>   target-arm: Move Neon VUZP to helper functions
>   target-arm: Move Neon VZIP to helper functions
> 
>  target-arm/helpers.h     |   11 +++
>  target-arm/neon_helper.c |  186 +++++++++++++++++++++++++++++++++++++++
>  target-arm/translate.c   |  215 +++++++++++++++-------------------------------
>  3 files changed, 267 insertions(+), 145 deletions(-)
> 

Thanks, both applied.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2011-02-20 16:36 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-14 10:22 [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Peter Maydell
2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions Peter Maydell
2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP " Peter Maydell
2011-02-20 16:35 ` [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Aurelien Jarno

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.