All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64
       [not found] <1459777195-7907-1-git-send-email-vijayak@caviumnetworks.com>
@ 2016-04-04 13:39 ` vijayak
  2016-04-04 13:44   ` Peter Maydell
  2016-04-04 13:39 ` [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking vijayak
  1 sibling, 1 reply; 18+ messages in thread
From: vijayak @ 2016-04-04 13:39 UTC (permalink / raw)
  To: qemu-arm, peter.maydell
  Cc: Prasun.Kapoor, Vijay, Vijaya Kumar K, qemu-devel, vijay.kilari

From: Vijay <vijayak@cavium.com>

Set the target page size to a minimum of 4K for aarch64.
This helps to reduce live migration downtime significantly.

Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
---
 target-arm/cpu.h |    7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 066ff67..2e4b48f 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -1562,11 +1562,18 @@ bool write_cpustate_to_list(ARMCPU *cpu);
 #if defined(CONFIG_USER_ONLY)
 #define TARGET_PAGE_BITS 12
 #else
+/*
+ * Aarch64 support minimum 4K page size
+ */
+#if defined(TARGET_AARCH64)
+#define TARGET_PAGE_BITS 12
+#else
 /* The ARM MMU allows 1k pages.  */
 /* ??? Linux doesn't actually use these, and they're deprecated in recent
    architecture revisions.  Maybe a configure option to disable them.  */
 #define TARGET_PAGE_BITS 10
 #endif
+#endif
 
 #if defined(TARGET_AARCH64)
 #  define TARGET_PHYS_ADDR_SPACE_BITS 48
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking
       [not found] <1459777195-7907-1-git-send-email-vijayak@caviumnetworks.com>
  2016-04-04 13:39 ` [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64 vijayak
@ 2016-04-04 13:39 ` vijayak
  2016-04-05 14:36   ` Peter Maydell
  2016-04-05 15:28   ` Peter Maydell
  1 sibling, 2 replies; 18+ messages in thread
From: vijayak @ 2016-04-04 13:39 UTC (permalink / raw)
  To: qemu-arm, peter.maydell
  Cc: Prasun.Kapoor, Vijay, Vijaya Kumar K, qemu-devel, vijay.kilari

From: Vijay <vijayak@cavium.com>

Use Neon instructions to perform zero checking of
the buffer. This helps in reducing downtime during
live migration.

Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
---
 util/cutils.c |   81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/util/cutils.c b/util/cutils.c
index 43d1afb..d343b9a 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
     return func;
 }
 #pragma GCC pop_options
+
+#elif defined __aarch64__
+#include "arm_neon.h"
+
+#define NEON_VECTYPE               uint64x2_t
+#define NEON_LOAD_N_ORR(v1, v2)    vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
+#define NEON_ORR(v1, v2)           vorrq_u64(v1, v2)
+#define NEON_EQ_ZERO(v1) \
+        ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
+         (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)
+
+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
+
+/*
+ * Zero page/buffer checking using SIMD(Neon)
+ */
+
+static bool
+can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
+                   * sizeof(NEON_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+    size_t i;
+    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
+    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
+    uint64_t const *data = buf;
+
+    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
+    len /= sizeof(unsigned long);
+
+    for (i = 0; i < len; i += 32) {
+        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
+        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
+        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
+        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
+        d4 = NEON_ORR(d0, d1);
+        d5 = NEON_ORR(d2, d3);
+        d6 = NEON_ORR(d4, d5);
+
+        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
+        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
+        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
+        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
+        d11 = NEON_ORR(d7, d8);
+        d12 = NEON_ORR(d9, d10);
+        d13 = NEON_ORR(d11, d12);
+
+        d14 = NEON_ORR(d6, d13);
+        if (NEON_EQ_ZERO(d14)) {
+            break;
+        }
+    }
+
+    return i * sizeof(unsigned long);
+}
+
+static inline bool neon_support(void)
+{
+    /*
+     * Check if neon feature is supported.
+     * By default neon is supported for aarch64.
+     */
+    return true;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
+           can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
+           buffer_find_nonzero_offset_inner(buf, len);
+}
 #else
 bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
 {
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64
  2016-04-04 13:39 ` [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64 vijayak
@ 2016-04-04 13:44   ` Peter Maydell
  2016-04-04 16:40     ` Vijay Kilari
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Maydell @ 2016-04-04 13:44 UTC (permalink / raw)
  To: vijayak; +Cc: Prasun.Kapoor, Vijay, qemu-arm, QEMU Developers, Vijay Kilari

On 4 April 2016 at 14:39,  <vijayak@caviumnetworks.com> wrote:
> From: Vijay <vijayak@cavium.com>
>
> Set target page size to minimum 4K for aarch64.
> This helps to reduce live migration downtime significantly.
>
> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
> ---
>  target-arm/cpu.h |    7 +++++++
>  1 file changed, 7 insertions(+)
>
> diff --git a/target-arm/cpu.h b/target-arm/cpu.h
> index 066ff67..2e4b48f 100644
> --- a/target-arm/cpu.h
> +++ b/target-arm/cpu.h
> @@ -1562,11 +1562,18 @@ bool write_cpustate_to_list(ARMCPU *cpu);
>  #if defined(CONFIG_USER_ONLY)
>  #define TARGET_PAGE_BITS 12
>  #else
> +/*
> + * Aarch64 support minimum 4K page size
> + */
> +#if defined(TARGET_AARCH64)
> +#define TARGET_PAGE_BITS 12

I agree that this would definitely improve performance (both for
migration and for emulated guests), but I'm afraid this breaks
running 32-bit ARMv5 and ARMv7M guests with this QEMU binary,
so we can't do this. If we want to allow the minimum page size to
be bigger than 1K for AArch64 CPUs then we need to make it a
runtime settable thing rather than compile-time (which is not
an entirely trivial thing).

> +#else
>  /* The ARM MMU allows 1k pages.  */
>  /* ??? Linux doesn't actually use these, and they're deprecated in recent
>     architecture revisions.  Maybe a configure option to disable them.  */
>  #define TARGET_PAGE_BITS 10
>  #endif
> +#endif
>
>  #if defined(TARGET_AARCH64)
>  #  define TARGET_PHYS_ADDR_SPACE_BITS 48

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64
  2016-04-04 13:44   ` Peter Maydell
@ 2016-04-04 16:40     ` Vijay Kilari
  2016-04-04 16:44       ` Peter Maydell
  0 siblings, 1 reply; 18+ messages in thread
From: Vijay Kilari @ 2016-04-04 16:40 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Vijaya Kumar K, Prasun Kapoor, qemu-arm, QEMU Developers, Vijay

On Mon, Apr 4, 2016 at 7:14 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 4 April 2016 at 14:39,  <vijayak@caviumnetworks.com> wrote:
>> From: Vijay <vijayak@cavium.com>
>>
>> Set target page size to minimum 4K for aarch64.
>> This helps to reduce live migration downtime significantly.
>>
>> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
>> ---
>>  target-arm/cpu.h |    7 +++++++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/target-arm/cpu.h b/target-arm/cpu.h
>> index 066ff67..2e4b48f 100644
>> --- a/target-arm/cpu.h
>> +++ b/target-arm/cpu.h
>> @@ -1562,11 +1562,18 @@ bool write_cpustate_to_list(ARMCPU *cpu);
>>  #if defined(CONFIG_USER_ONLY)
>>  #define TARGET_PAGE_BITS 12
>>  #else
>> +/*
>> + * Aarch64 support minimum 4K page size
>> + */
>> +#if defined(TARGET_AARCH64)
>> +#define TARGET_PAGE_BITS 12
>
> I agree that this would definitely improve performance (both for
> migration and for emulated guests), but I'm afraid this breaks
> running 32-bit ARMv5 and ARMv7M guests with this QEMU binary,
> so we can't do this. If we want to allow the minimum page size to
> be bigger than 1K for AArch64 CPUs then we need to make it a
> runtime settable thing rather than compile-time (which is not
> an entirely trivial thing).

Do you mean to say that, based on the -cpu type QEMU option,
the page size should be chosen at runtime?

>
>> +#else
>>  /* The ARM MMU allows 1k pages.  */
>>  /* ??? Linux doesn't actually use these, and they're deprecated in recent
>>     architecture revisions.  Maybe a configure option to disable them.  */
>>  #define TARGET_PAGE_BITS 10
>>  #endif
>> +#endif
>>
>>  #if defined(TARGET_AARCH64)
>>  #  define TARGET_PHYS_ADDR_SPACE_BITS 48
>
> thanks
> -- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64
  2016-04-04 16:40     ` Vijay Kilari
@ 2016-04-04 16:44       ` Peter Maydell
  2016-04-06 15:01         ` Vijay Kilari
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Maydell @ 2016-04-04 16:44 UTC (permalink / raw)
  To: Vijay Kilari
  Cc: Vijaya Kumar K, Prasun Kapoor, qemu-arm, QEMU Developers, Vijay

On 4 April 2016 at 17:40, Vijay Kilari <vijay.kilari@gmail.com> wrote:
> On Mon, Apr 4, 2016 at 7:14 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
>> I agree that this would definitely improve performance (both for
>> migration and for emulated guests), but I'm afraid this breaks
>> running 32-bit ARMv5 and ARMv7M guests with this QEMU binary,
>> so we can't do this. If we want to allow the minimum page size to
>> be bigger than 1K for AArch64 CPUs then we need to make it a
>> runtime settable thing rather than compile-time (which is not
>> an entirely trivial thing).
>
> Do you mean to say that based on -cpu type qemu option
> choose the page size at runtime?

If you want to avoid defining TARGET_PAGE_SIZE to the
lowest-common-denominator 1K, then yes, you'd need to
choose it at runtime. That could be painful to implement.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking
  2016-04-04 13:39 ` [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking vijayak
@ 2016-04-05 14:36   ` Peter Maydell
  2016-04-05 15:21     ` Paolo Bonzini
  2016-04-06  8:32     ` [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking Vijay Kilari
  2016-04-05 15:28   ` Peter Maydell
  1 sibling, 2 replies; 18+ messages in thread
From: Peter Maydell @ 2016-04-05 14:36 UTC (permalink / raw)
  To: Vijaya Kumar K
  Cc: Vijay Kilari, Prasun Kapoor, QEMU Developers, qemu-arm,
	Paolo Bonzini, Vijay

On 4 April 2016 at 14:39,  <vijayak@caviumnetworks.com> wrote:
> From: Vijay <vijayak@cavium.com>
>
> Use Neon instructions to perform zero checking of
> buffer. This is helps in reducing downtime during
> live migration.
>
> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
> ---
>  util/cutils.c |   81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 81 insertions(+)
>
> diff --git a/util/cutils.c b/util/cutils.c
> index 43d1afb..d343b9a 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>      return func;
>  }
>  #pragma GCC pop_options
> +
> +#elif defined __aarch64__
> +#include "arm_neon.h"

Can we rely on all compilers having this, or do we need to
test in configure?

> +
> +#define NEON_VECTYPE               uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2)    vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2)           vorrq_u64(v1, v2)
> +#define NEON_EQ_ZERO(v1) \
> +        ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
> +         (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)

The intrinsics are a bit confusing, but shouldn't we be
testing that both lanes of v1 are 0, rather than whether
either of them is? (so "&&", not "||").

> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
> +
> +/*
> + * Zero page/buffer checking using SIMD(Neon)
> + */
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
> +                   * sizeof(NEON_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
> +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
> +    uint64_t const *data = buf;
> +
> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
> +    len /= sizeof(unsigned long);
> +
> +    for (i = 0; i < len; i += 32) {
> +        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        d4 = NEON_ORR(d0, d1);
> +        d5 = NEON_ORR(d2, d3);
> +        d6 = NEON_ORR(d4, d5);
> +
> +        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
> +        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
> +        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
> +        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
> +        d11 = NEON_ORR(d7, d8);
> +        d12 = NEON_ORR(d9, d10);
> +        d13 = NEON_ORR(d11, d12);
> +
> +        d14 = NEON_ORR(d6, d13);
> +        if (NEON_EQ_ZERO(d14)) {
> +            break;
> +        }
> +    }

Both the other optimised find_nonzero implementations in this
file have two loops, not just one. Is it OK that this
implementation has only a single loop?

Paolo: do you know why we have two loops in the other
implementations?

> +
> +    return i * sizeof(unsigned long);
> +}
> +
> +static inline bool neon_support(void)
> +{
> +    /*
> +     * Check if neon feature is supported.
> +     * By default neon is supported for aarch64.
> +     */
> +    return true;
> +}

There doesn't seem much point in this. We can assume Neon exists
on any CPU we're going to run on (it's part of the ABI, the kernel
assumes it, etc etc). So you can just implement the functions without
the indirection functions below.

> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
> +           can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
> +           buffer_find_nonzero_offset_inner(buf, len);
> +}
>  #else
>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>  {
> --

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking
  2016-04-05 14:36   ` Peter Maydell
@ 2016-04-05 15:21     ` Paolo Bonzini
  2016-04-05 16:01       ` Peter Maydell
  2016-04-06  8:32     ` [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking Vijay Kilari
  1 sibling, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2016-04-05 15:21 UTC (permalink / raw)
  To: Peter Maydell, Vijaya Kumar K
  Cc: Prasun Kapoor, Vijay, qemu-arm, QEMU Developers, Vijay Kilari



On 05/04/2016 16:36, Peter Maydell wrote:
>> > +
>> > +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>> > +
>> > +/*
>> > + * Zero page/buffer checking using SIMD(Neon)
>> > + */
>> > +
>> > +static bool
>> > +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> > +{
>> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
>> > +                   * sizeof(NEON_VECTYPE)) == 0
>> > +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
>> > +}
>> > +
>> > +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> > +{
>> > +    size_t i;
>> > +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
>> > +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
>> > +    uint64_t const *data = buf;
>> > +
>> > +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
>> > +    len /= sizeof(unsigned long);
>> > +
>> > +    for (i = 0; i < len; i += 32) {
>> > +        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
>> > +        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
>> > +        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
>> > +        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
>> > +        d4 = NEON_ORR(d0, d1);
>> > +        d5 = NEON_ORR(d2, d3);
>> > +        d6 = NEON_ORR(d4, d5);
>> > +
>> > +        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
>> > +        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
>> > +        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
>> > +        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
>> > +        d11 = NEON_ORR(d7, d8);
>> > +        d12 = NEON_ORR(d9, d10);
>> > +        d13 = NEON_ORR(d11, d12);
>> > +
>> > +        d14 = NEON_ORR(d6, d13);
>> > +        if (NEON_EQ_ZERO(d14)) {
>> > +            break;
>> > +        }
>> > +    }
> Both the other optimised find_nonzero implementations in this
> file have two loops, not just one. Is it OK that this
> implementation has only a single loop?
> 
> Paolo: do you know why we have two loops in the other
> implementations?

Because usually the first one or two iterations are enough to exit the
function if the page is nonzero.  It's measurably slower to go through
the unrolled loop in that case.  On the other hand, once the first few
iterations found only zero bytes, the buffer is very likely entirely
zero and the unrolled loop helps.

But in theory it should be enough to add a new #elif branch like this:

#include "arm_neon.h"
#define VECTYPE   uint64x2_t
#define VEC_OR(a, b) ((a) | (b))
#define ALL_EQ(a, b) /* ??? :) */

around the

/* vector definitions */

comment in util/cutils.c.  GCC should do everything else.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking
  2016-04-04 13:39 ` [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking vijayak
  2016-04-05 14:36   ` Peter Maydell
@ 2016-04-05 15:28   ` Peter Maydell
  1 sibling, 0 replies; 18+ messages in thread
From: Peter Maydell @ 2016-04-05 15:28 UTC (permalink / raw)
  To: Vijaya Kumar K
  Cc: Prasun Kapoor, Vijay, qemu-arm, QEMU Developers, Vijay Kilari

On 4 April 2016 at 14:39,  <vijayak@caviumnetworks.com> wrote:
> From: Vijay <vijayak@cavium.com>
>
> Use Neon instructions to perform zero checking of
> buffer. This is helps in reducing downtime during
> live migration.

One other comment I forgot:

> +#define NEON_VECTYPE               uint64x2_t

This is a 128-bit type...

> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
> +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;

...so it's a bit confusing to use d0, d1, etc, which implies
a 64-bit value.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking
  2016-04-05 15:21     ` Paolo Bonzini
@ 2016-04-05 16:01       ` Peter Maydell
       [not found]         ` <C94A741879221447B4FC9B607EB4FFCD79EA34F4@DGGEMA504-MBX.china.huawei.com>
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Maydell @ 2016-04-05 16:01 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Vijay Kilari, Vijaya Kumar K, QEMU Developers, Prasun Kapoor,
	qemu-arm, Vijay

On 5 April 2016 at 16:21, Paolo Bonzini <pbonzini@redhat.com> wrote:
> But in theory it should be enough to add a new #elif branch like this:
>
> #include "arm_neon.h"
> #define VECTYPE   uint64x2_t
> #define VEC_OR(a, b) ((a) | (b))
> #define ALL_EQ(a, b) /* ??? :) */

#define ALL_EQ(a, b) (vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) && \
                      vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1))

will do I think (probably suboptimal for a true vector compare but
works OK here as we're actually only interested in comparing against
constant zero; the compiler generates "load 64bit value from vector
register to integer; cbnz" for each half of the vector).

Worth benchmarking that (and the variant where we use the C code
but move the loop unrolling out to 16) against the handwritten
intrinsics version.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking
  2016-04-05 14:36   ` Peter Maydell
  2016-04-05 15:21     ` Paolo Bonzini
@ 2016-04-06  8:32     ` Vijay Kilari
  1 sibling, 0 replies; 18+ messages in thread
From: Vijay Kilari @ 2016-04-06  8:32 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Vijaya Kumar K, QEMU Developers, Prasun Kapoor, qemu-arm, Vijay,
	Paolo Bonzini

On Tue, Apr 5, 2016 at 8:06 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 4 April 2016 at 14:39,  <vijayak@caviumnetworks.com> wrote:
>> From: Vijay <vijayak@cavium.com>
>>
>> Use Neon instructions to perform zero checking of
>> buffer. This is helps in reducing downtime during
>> live migration.
>>
>> Signed-off-by: Vijaya Kumar K <vijayak@caviumnetworks.com>
>> ---
>>  util/cutils.c |   81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 81 insertions(+)
>>
>> diff --git a/util/cutils.c b/util/cutils.c
>> index 43d1afb..d343b9a 100644
>> --- a/util/cutils.c
>> +++ b/util/cutils.c
>> @@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>>      return func;
>>  }
>>  #pragma GCC pop_options
>> +
>> +#elif defined __aarch64__
>> +#include "arm_neon.h"
>
> Can we rely on all compilers having this, or do we need to
> test in configure?

GCC and armcc support the same intrinsics. Both need the inclusion
of arm_neon.h.

>
>> +
>> +#define NEON_VECTYPE               uint64x2_t
>> +#define NEON_LOAD_N_ORR(v1, v2)    vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
>> +#define NEON_ORR(v1, v2)           vorrq_u64(v1, v2)
>> +#define NEON_EQ_ZERO(v1) \
>> +        ((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
>> +         (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)
>
> The intrinsics are a bit confusing, but shouldn't we be
> testing that both lanes of v1 are 0, rather than whether
> either of them is? (so "&&", not "||").

The above check is correct. vceqzq() sets all bits of a lane to 1 if
that lane's value is 0. So if either lane of the result is 0, it means
the buffer is non-zero. I think redefining this macro as below would be
better and would avoid vceqzq_u64():

#define NEON_NOT_EQ_ZERO(v1) \
        ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1)) != 0)

>
>> +
>> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>> +
>> +/*
>> + * Zero page/buffer checking using SIMD(Neon)
>> + */
>> +
>> +static bool
>> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> +{
>> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
>> +                   * sizeof(NEON_VECTYPE)) == 0
>> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
>> +}
>> +
>> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> +{
>> +    size_t i;
>> +    NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
>> +    NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
>> +    uint64_t const *data = buf;
>> +
>> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
>> +    len /= sizeof(unsigned long);
>> +
>> +    for (i = 0; i < len; i += 32) {
>> +        d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
>> +        d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
>> +        d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
>> +        d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
>> +        d4 = NEON_ORR(d0, d1);
>> +        d5 = NEON_ORR(d2, d3);
>> +        d6 = NEON_ORR(d4, d5);
>> +
>> +        d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
>> +        d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
>> +        d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
>> +        d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
>> +        d11 = NEON_ORR(d7, d8);
>> +        d12 = NEON_ORR(d9, d10);
>> +        d13 = NEON_ORR(d11, d12);
>> +
>> +        d14 = NEON_ORR(d6, d13);
>> +        if (NEON_EQ_ZERO(d14)) {
>> +            break;
>> +        }
>> +    }
>
> Both the other optimised find_nonzero implementations in this
> file have two loops, not just one. Is it OK that this
> implementation has only a single loop?
>
> Paolo: do you know why we have two loops in the other
> implementations?

Paolo is right, as he mentioned in the previous email.
But with two loops, I don't see much benefit, so I restricted it to
one loop.

>
>> +
>> +    return i * sizeof(unsigned long);
>> +}
>> +
>> +static inline bool neon_support(void)
>> +{
>> +    /*
>> +     * Check if neon feature is supported.
>> +     * By default neon is supported for aarch64.
>> +     */
>> +    return true;
>> +}
>
> There doesn't seem much point in this. We can assume Neon exists
> on any CPU we're going to run on (it's part of the ABI, the kernel
> assumes it, etc etc). So you can just implement the functions without
> the indirection functions below.
>
 Hmm. One reason is that compilation fails if we don't call the
can_use_buffer_find_nonzero_offset_inner() function from inside the Neon
implementation, so I added this in a way similar to the Intel AVX2 code.
I also thought that if any platform does not implement Neon, it can
simply change this function.

>> +
>> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
>> +           can_use_buffer_find_nonzero_offset_inner(buf, len);
>> +}
>> +
>> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
>> +           buffer_find_nonzero_offset_inner(buf, len);
>> +}
>>  #else
>>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>>  {
>> --
>
> thanks
> -- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64
  2016-04-04 16:44       ` Peter Maydell
@ 2016-04-06 15:01         ` Vijay Kilari
  2016-05-31  9:04           ` Vijay Kilari
  0 siblings, 1 reply; 18+ messages in thread
From: Vijay Kilari @ 2016-04-06 15:01 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Vijaya Kumar K, Prasun Kapoor, qemu-arm, QEMU Developers, Vijay

On Mon, Apr 4, 2016 at 10:14 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 4 April 2016 at 17:40, Vijay Kilari <vijay.kilari@gmail.com> wrote:
>> On Mon, Apr 4, 2016 at 7:14 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
>>> I agree that this would definitely improve performance (both for
>>> migration and for emulated guests), but I'm afraid this breaks
>>> running 32-bit ARMv5 and ARMv7M guests with this QEMU binary,
>>> so we can't do this. If we want to allow the minimum page size to
>>> be bigger than 1K for AArch64 CPUs then we need to make it a
>>> runtime settable thing rather than compile-time (which is not
>>> an entirely trivial thing).
>>
>> Do you mean to say that based on -cpu type qemu option
>> choose the page size at runtime?
>
> If you want to avoid defining TARGET_PAGE_SIZE to the
> lowest-common-denominator 1K, then yes, you'd need to
> choose it at runtime. That could be painful to implement.

Had a look at it. Needs some changes in common code as well.
I will send this as a separate patch series and drop this patch
from this series.

>
> thanks
> -- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64
  2016-04-06 15:01         ` Vijay Kilari
@ 2016-05-31  9:04           ` Vijay Kilari
  2016-05-31  9:31             ` Peter Maydell
  0 siblings, 1 reply; 18+ messages in thread
From: Vijay Kilari @ 2016-05-31  9:04 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Vijaya Kumar K, qemu-arm, QEMU Developers, Prasun Kapoor, Vijay

Hi Peter

On Wed, Apr 6, 2016 at 8:31 PM, Vijay Kilari <vijay.kilari@gmail.com> wrote:
> On Mon, Apr 4, 2016 at 10:14 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
>> On 4 April 2016 at 17:40, Vijay Kilari <vijay.kilari@gmail.com> wrote:
>>> On Mon, Apr 4, 2016 at 7:14 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
>>>> I agree that this would definitely improve performance (both for
>>>> migration and for emulated guests), but I'm afraid this breaks
>>>> running 32-bit ARMv5 and ARMv7M guests with this QEMU binary,
>>>> so we can't do this. If we want to allow the minimum page size to
>>>> be bigger than 1K for AArch64 CPUs then we need to make it a
>>>> runtime settable thing rather than compile-time (which is not
>>>> an entirely trivial thing).
>>>
>>> Do you mean to say that based on -cpu type qemu option
>>> choose the page size at runtime?
>>
>> If you want to avoid defining TARGET_PAGE_SIZE to the
>> lowest-common-denominator 1K, then yes, you'd need to
>> choose it at runtime. That could be painful to implement.
>
> Had a look at it. Needs some changes in common code as well.
> I will send this as a separate patch series and drop this patch
> from this series.

The L1 page table size and the L1 shift depend on TARGET_PAGE_BITS (the
page size), as shown in the code snippet below from translate-all.c:

/* The bits remaining after N lower levels of page tables.  */
#define V_L1_BITS_REM \
    ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS)

#if V_L1_BITS_REM < 4
#define V_L1_BITS  (V_L1_BITS_REM + V_L2_BITS)
#else
#define V_L1_BITS  V_L1_BITS_REM
#endif

#define V_L1_SIZE  ((target_ulong)1 << V_L1_BITS)

#define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS)

/* The bottom level has pointers to PageDesc */
static void *l1_map[V_L1_SIZE];

How about adding a CONFIG_PAGE_SIZE option to configure?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64
  2016-05-31  9:04           ` Vijay Kilari
@ 2016-05-31  9:31             ` Peter Maydell
  0 siblings, 0 replies; 18+ messages in thread
From: Peter Maydell @ 2016-05-31  9:31 UTC (permalink / raw)
  To: Vijay Kilari
  Cc: Vijaya Kumar K, qemu-arm, QEMU Developers, Prasun Kapoor, Vijay

On 31 May 2016 at 10:04, Vijay Kilari <vijay.kilari@gmail.com> wrote:
> On Wed, Apr 6, 2016 at 8:31 PM, Vijay Kilari <vijay.kilari@gmail.com> wrote:
>> On Mon, Apr 4, 2016 at 10:14 PM, Peter Maydell <peter.maydell@linaro.org> wrote:
>>> If you want to avoid defining TARGET_PAGE_SIZE to the
>>> lowest-common-denominator 1K, then yes, you'd need to
>>> choose it at runtime. That could be painful to implement.

> The L1 page table size, L1 shift are dependent on TARGET_PAGE_BITS(page size).
> as shown in snippet code below from translate-all.c

Yes, that's why I said "painful to implement" :-)

> How about adding a CONFIG_PAGE_SIZE option to configure?

I don't want this to be a configure option, because QEMU
should just work for everybody. Otherwise we have some QEMU
binaries which silently don't implement the architecture/CPU
that they ought to.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [Qemu-arm] about armv8's prefetch decode
       [not found]         ` <C94A741879221447B4FC9B607EB4FFCD79EA34F4@DGGEMA504-MBX.china.huawei.com>
@ 2017-03-23 16:56           ` Pranith Kumar
  2017-03-24  6:14             ` [Qemu-devel] [Qemu-arm] [patch 1/1]about " Wangjintang
  0 siblings, 1 reply; 18+ messages in thread
From: Pranith Kumar @ 2017-03-23 16:56 UTC (permalink / raw)
  To: Wangjintang
  Cc: Peter Maydell, Shlomo Pongratz (A), qemu-arm, Ori Chalak (A), qemu-devel

Hi Jed,

On Mon, Mar 20, 2017 at 2:35 AM, Wangjintang <wangjintang@huawei.com> wrote:
> Hi,
>
>         We see that ARMv8's prefetch instruction decode has been skipped in QEMU. Some users need the prefetch instructions; for example, they use QEMU to generate an instruction trace. We would like to merge this patch into the community tree — is that OK?  Thanks.
>

Your patch seems to be missing. Can you retry with the content of the
patch pasted in the email?

Thanks,
--
Pranith

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [Qemu-arm] [patch 1/1]about armv8's prefetch decode
  2017-03-23 16:56           ` [Qemu-devel] [Qemu-arm] about armv8's prefetch decode Pranith Kumar
@ 2017-03-24  6:14             ` Wangjintang
  2017-03-24 10:06               ` Peter Maydell
  0 siblings, 1 reply; 18+ messages in thread
From: Wangjintang @ 2017-03-24  6:14 UTC (permalink / raw)
  To: Pranith Kumar, Peter Maydell
  Cc: Shlomo Pongratz (A), qemu-arm, Ori Chalak (A),
	Wanghaibin (Benjamin),
	qemu-devel

Hi Pranith,
 
 	Thanks for your reply. The patch is below; the newly added code is off by default. Please review.
The main idea is to translate ARMv8's prefetch instructions into intermediate code while not affecting the rm/rn registers.


diff --git a/translate-a64.c b/translate-a64.c
index 814f30f..86da8ee 100644
--- a/translate-a64.c
+++ b/translate-a64.c
@@ -2061,7 +2061,11 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn)
     } else {
         if (opc == 3) {
             /* PRFM (literal) : prefetch */
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            ;
+            #else
             return;
+            #endif
         }
         size = 2 + extract32(opc, 0, 1);
         is_signed = extract32(opc, 1, 1);
@@ -2075,9 +2079,19 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn)
     } else {
         /* Only unsigned 32bit loads target 32bit registers.  */
         bool iss_sf = opc != 0;
-
-        do_gpr_ld(s, tcg_rt, tcg_addr, size, is_signed, false,
-                  true, rt, iss_sf, false);
+        #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+        if (opc == 3) {
+            TCGv_i64 v = tcg_temp_new_i64();
+            do_gpr_ld(s, v, tcg_addr, size, is_signed, false,
+                      true, rt, iss_sf, false);
+            tcg_temp_free_i64(v);
+        } else {
+        #endif
+            do_gpr_ld(s, tcg_rt, tcg_addr, size, is_signed, false,
+                      true, rt, iss_sf, false);
+        #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+        }
+        #endif
     }
     tcg_temp_free_i64(tcg_addr);
 }
@@ -2283,7 +2297,11 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn,
                 unallocated_encoding(s);
                 return;
             }
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            rn = 31;
+            #else
             return;
+            #endif
         }
         if (opc == 3 && size > 1) {
             unallocated_encoding(s);
@@ -2334,9 +2352,21 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn,
             do_gpr_st_memidx(s, tcg_rt, tcg_addr, size, memidx,
                              iss_valid, rt, iss_sf, false);
         } else {
-            do_gpr_ld_memidx(s, tcg_rt, tcg_addr, size,
-                             is_signed, is_extended, memidx,
-                             iss_valid, rt, iss_sf, false);
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            if (size == 3 && opc == 2) {
+                TCGv_i64 v = tcg_temp_new_i64();
+                do_gpr_ld_memidx(s, v, tcg_addr, size,
+                                  is_signed, is_extended, memidx,
+                                  iss_valid, rt, iss_sf, false);
+                tcg_temp_free_i64(v);
+             } else {
+             #endif
+                 do_gpr_ld_memidx(s, tcg_rt, tcg_addr, size,
+                                  is_signed, is_extended, memidx,
+                                  iss_valid, rt, iss_sf, false);
+             #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+             }
+             #endif
         }
     }

@@ -2383,6 +2413,9 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn,
     bool is_signed = false;
     bool is_store = false;
     bool is_extended = false;
+    #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+    TCGv_i64 v = tcg_temp_new_i64();
+    #endif

     TCGv_i64 tcg_rm;
     TCGv_i64 tcg_addr;
@@ -2405,7 +2438,11 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn,
     } else {
         if (size == 3 && opc == 2) {
             /* PRFM - prefetch */
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            rn = 31;
+            #else
             return;
+            #endif
         }
         if (opc == 3 && size > 1) {
             unallocated_encoding(s);
@@ -2422,9 +2459,17 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn,
     tcg_addr = read_cpu_reg_sp(s, rn, 1);

     tcg_rm = read_cpu_reg(s, rm, 1);
-    ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0);
-
-    tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_rm);
+    #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+    if ((!is_vector) &&(size == 3 && opc == 2)) {
+        ext_and_shift_reg(v, tcg_rm, opt, shift ? size : 0);
+        tcg_gen_add_i64(tcg_addr, tcg_addr, v);
+    } else {
+    #endif
+        ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0);
+        tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_rm);
+    #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+    }
+    #endif

     if (is_vector) {
         if (is_store) {
@@ -2439,11 +2484,25 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn,
             do_gpr_st(s, tcg_rt, tcg_addr, size,
                       true, rt, iss_sf, false);
         } else {
-            do_gpr_ld(s, tcg_rt, tcg_addr, size,
-                      is_signed, is_extended,
-                      true, rt, iss_sf, false);
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            if (size == 3 && opc == 2) {
+                is_signed = false;
+                do_gpr_ld(s, v, tcg_addr, size,
+                          is_signed, is_extended,
+                          true, rt, iss_sf, false);
+            } else {
+            #endif
+                do_gpr_ld(s, tcg_rt, tcg_addr, size,
+                          is_signed, is_extended,
+                          true, rt, iss_sf, false);
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            }
+            #endif
         }
     }
+    #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+    tcg_temp_free_i64(v);
+    #endif
 }

 /*
@@ -2492,7 +2551,11 @@ static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn,
     } else {
         if (size == 3 && opc == 2) {
             /* PRFM - prefetch */
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            rn = 31;
+            #else
             return;
+            #endif
         }
         if (opc == 3 && size > 1) {
             unallocated_encoding(s);
@@ -2523,8 +2586,20 @@ static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn,
             do_gpr_st(s, tcg_rt, tcg_addr, size,
                       true, rt, iss_sf, false);
         } else {
-            do_gpr_ld(s, tcg_rt, tcg_addr, size, is_signed, is_extended,
-                      true, rt, iss_sf, false);
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            if (size == 3 && opc == 2) {
+                TCGv_i64 v = tcg_temp_new_i64();
+                is_signed = false;
+                do_gpr_ld(s, v, tcg_addr, size, is_signed, is_extended,
+                          true, rt, iss_sf, false);
+                tcg_temp_free_i64(v);
+            } else {
+            #endif
+                do_gpr_ld(s, tcg_rt, tcg_addr, size, is_signed, is_extended,
+                          true, rt, iss_sf, false);
+            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
+            }
+            #endif
         }
     }
 }
diff --git a/translate.h b/translate.h
index 2cc4412..6bb1e24 100644
--- a/translate.h
+++ b/translate.h
@@ -135,6 +135,8 @@ static void disas_set_insn_syndrome(DisasContext *s, uint32_t syn)
 #define DISAS_SMC 9
 #define DISAS_YIELD 10

+#undef  TCG_AARCH64_PREFETCH_TRANSLATE
+
 #ifdef TARGET_AARCH64
 void a64_translate_init(void);
 void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb);


Best Regards,
Wang jintang / Jed



Huawei Technologies Co., Ltd. 
Email: wangjintang@huawei.com
Building Z8, Huawei R&D center, Jiangshu Rd 360, Binjiang District,Hangzhou 310051, P.R.China
http://www.huawei.com 

> -----Original Message-----
> From: Pranith Kumar [mailto:bobby.prani@gmail.com]
> Sent: Friday, March 24, 2017 12:56 AM
> To: Wangjintang
> Cc: Peter Maydell; Shlomo Pongratz (A); qemu-arm; Ori Chalak (A);
> qemu-devel
> Subject: Re: [Qemu-arm] about armv8's prefetch decode
> 
> Hi Jed,
> 
> On Mon, Mar 20, 2017 at 2:35 AM, Wangjintang <wangjintang@huawei.com>
> wrote:
> > Hi,
> >
> >         We see that armv8's prefetch instruction decode have been
> skipped in qemu. For some user, they need prefetch instruction, for example,
> they use qemu to generate the instruction trace. We want to merge this
> patch to community, it's ok or not?  Thanks.
> >
> 
> Your patch seems to be missing. Can you retry with the content of the
> patch pasted in the email?
> 
> Thanks,
> --
> Pranith

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [Qemu-arm] [patch 1/1]about armv8's prefetch decode
  2017-03-24  6:14             ` [Qemu-devel] [Qemu-arm] [patch 1/1]about " Wangjintang
@ 2017-03-24 10:06               ` Peter Maydell
  2017-03-25  2:22                 ` Wangjintang
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Maydell @ 2017-03-24 10:06 UTC (permalink / raw)
  To: Wangjintang
  Cc: Pranith Kumar, Shlomo Pongratz (A), Wanghaibin (Benjamin),
	qemu-arm, qemu-devel, Ori Chalak (A)

On 24 March 2017 at 06:14, Wangjintang <wangjintang@huawei.com> wrote:
> Hi Pranith,
>
>         Thanks for your reply. patch as below, new added code default is off, please review.
> The major thinking is about translate Armv8's prefetch instruction into intermediate code, at the same time don't effect the rm/rn register.
>
>
> diff --git a/translate-a64.c b/translate-a64.c
> index 814f30f..86da8ee 100644
> --- a/translate-a64.c
> +++ b/translate-a64.c
> @@ -2061,7 +2061,11 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn)
>      } else {
>          if (opc == 3) {
>              /* PRFM (literal) : prefetch */
> +            #ifdef TCG_AARCH64_PREFETCH_TRANSLATE
> +            ;
> +            #else
>              return;
> +            #endif
>          }

No, these changes look wrong. PRFM instructions do not need to
do anything and should definitely not be emitting any intermediate
code. In particular if you let execution fall through and try
do_gpr_ld() then it will really do a load, which might cause
an exception -- this is specifically forbidden for PRFM.
Architecturally the ARM ARM says "it is valid for the PE to
treat any or all prefetch instructions as a NOP", which is
what QEMU does.

The existing code is correct. In general you should not
expect to be able to deduce the guest instructions from
the intermediate code representation.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [Qemu-arm] [patch 1/1]about armv8's prefetch decode
  2017-03-24 10:06               ` Peter Maydell
@ 2017-03-25  2:22                 ` Wangjintang
  2017-03-25 12:35                   ` Peter Maydell
  0 siblings, 1 reply; 18+ messages in thread
From: Wangjintang @ 2017-03-25  2:22 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Pranith Kumar, Shlomo Pongratz (A), Ori Chalak (A),
	Wanghaibin (Benjamin),
	qemu-arm, qemu-devel

Hi Peter,
	More detail illustration at below.

> -----Original Message-----
> From: Peter Maydell [mailto:peter.maydell@linaro.org]
> Sent: Friday, March 24, 2017 6:06 PM
> To: Wangjintang
> Cc: Pranith Kumar; Shlomo Pongratz (A); Wanghaibin (Benjamin); qemu-arm;
> qemu-devel; Ori Chalak (A)
> Subject: Re: [Qemu-arm] [patch 1/1]about armv8's prefetch decode
> No, these changes look wrong. PRFM instructions do not need to
> do anything and should definitely not be emitting any intermediate
> code. In particular if you let execution fall through and try
> do_gpr_ld() then it will really do a load, which might cause
> an exception -- this is specifically forbidden for PRFM.
> Architecturally the ARM ARM says "it is valid for the PE to
> treat any or all prefetch instructions as a NOP", which is
> what QEMU does.
> 
> The existing code is correct. In general you should not
> expect to be able to deduce the guest instructions from
> the intermediate code representation.
> 

"it is valid for the PE to treat any or all prefetch instructions as a NOP", 
from software view, it's right.
The patch treats the prefetch as a load instruction while not affecting the
rm/rt registers. Only the PRFM instruction is emitted as intermediate code
and performs a real load, so we can obtain the memory address associated
with the prefetch instruction. Because the rm/rt registers are not
modified, the application still runs correctly.
BTW, the newly added code is disabled by default, so it has no effect on
common users.

In our case, we need the full instruction trace and the memory addresses
accessed by ld/st instructions; the trace serves as the input to a chip
cycle-accurate model, similar to flexus + QEMU.
The current code skips generating intermediate code for prefetch
instructions, so we can see the prefetch instruction but cannot get the
memory address associated with it.
We have measured that the ratio of prefetch instructions is about 2%-3%
when running Dhrystone in system mode. That ratio is high.
________________                       ________________
|                |                     |                |
|                |                     |                |
|   Qemu        |                     |  chip          |
|                |   instruction trace    | cycle-accurate   |
|                |    ----------------->      | model          |
|                |   memory trace      |                |
|________________|                     |________________|



Ori Chalak explains this as below:
" Indeed, prefetch instruction affects only the micro architecture, 
and hence not needed for running correctly the generated code.
However, we developed a performance simulator for a detailed 
ARMv8 CPU model, and use Qemu to resolve the functionality.
And for this purpose we need to translate all instructions that 
may affect the pipeline behavior, caches, etc.

This is not the major usage of Qemu, however there may be 
others doing this and it may help them.
http://www.linux-kvm.org/images/4/45/01x09-Christopher_Covington-Using_Upstream_QEMU_for_CASS.pdf "


Best Regards,
Wang jintang / Jed
Huawei Technologies Co., Ltd. 
Email: wangjintang@huawei.com
http://www.huawei.com

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [Qemu-arm] [patch 1/1]about armv8's prefetch decode
  2017-03-25  2:22                 ` Wangjintang
@ 2017-03-25 12:35                   ` Peter Maydell
  0 siblings, 0 replies; 18+ messages in thread
From: Peter Maydell @ 2017-03-25 12:35 UTC (permalink / raw)
  To: Wangjintang
  Cc: Pranith Kumar, Shlomo Pongratz (A), Ori Chalak (A),
	Wanghaibin (Benjamin),
	qemu-arm, qemu-devel

On 25 March 2017 at 02:22, Wangjintang <wangjintang@huawei.com> wrote:
> the patch regard the prefetch as load instruction, at the same time
> don't affect rm/rt register. Only the PRFM instruction been emitted to
> intermediate code and do a really load, then we can get the memory
> address relative to the prefetch instruction. Because the rm/rt register
> don't been modified, so the application can run correctly.

It will still fault if the address is not valid, which is
not a permitted behaviour.

> In our case, we need all the instruction trace & ld/st instruction's
> access memory address, the trace as the input for chip cycle-accurate
> model. Similar with flexus + qemu.
> Current code that skip generate prefetch instructions' intermediate code,
> So we can get prefetch instruction, but can't get the prefetch instruction
> relative memory address.

I understand the use case you would like, but if we want
to support that kind of thing we should do it with a much
more significant and consistent degree of support for
tracing of guest code actions, not with a small ad-hoc
change that happens to fix the immediate thing you're
running into for your specific problem.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2017-03-25 12:35 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1459777195-7907-1-git-send-email-vijayak@caviumnetworks.com>
2016-04-04 13:39 ` [Qemu-devel] [RFC PATCH v1 1/2] target-arm: Update page size for aarch64 vijayak
2016-04-04 13:44   ` Peter Maydell
2016-04-04 16:40     ` Vijay Kilari
2016-04-04 16:44       ` Peter Maydell
2016-04-06 15:01         ` Vijay Kilari
2016-05-31  9:04           ` Vijay Kilari
2016-05-31  9:31             ` Peter Maydell
2016-04-04 13:39 ` [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking vijayak
2016-04-05 14:36   ` Peter Maydell
2016-04-05 15:21     ` Paolo Bonzini
2016-04-05 16:01       ` Peter Maydell
     [not found]         ` <C94A741879221447B4FC9B607EB4FFCD79EA34F4@DGGEMA504-MBX.china.huawei.com>
2017-03-23 16:56           ` [Qemu-devel] [Qemu-arm] about armv8's prefetch decode Pranith Kumar
2017-03-24  6:14             ` [Qemu-devel] [Qemu-arm] [patch 1/1]about " Wangjintang
2017-03-24 10:06               ` Peter Maydell
2017-03-25  2:22                 ` Wangjintang
2017-03-25 12:35                   ` Peter Maydell
2016-04-06  8:32     ` [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking Vijay Kilari
2016-04-05 15:28   ` Peter Maydell

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.