* [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment
@ 2017-08-28  3:53 Pranith Kumar
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Pranith Kumar @ 2017-08-28  3:53 UTC (permalink / raw)
  To: alex.bennee, Peter Maydell, open list:ARM, open list:All patches CC here
  Cc: rth, pbonzini

Remove the comment, which has not been true since MTTCG was introduced.

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 target/arm/translate-a64.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 2200e25be0..f42b155d7d 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -2012,10 +2012,6 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn)
     }
     tcg_addr = read_cpu_reg_sp(s, rn, 1);
 
-    /* Note that since TCG is single threaded load-acquire/store-release
-     * semantics require no extra if (is_lasr) { ... } handling.
-     */
-
     if (is_excl) {
         if (!is_store) {
             s->is_ldex = true;
-- 
2.13.0


* [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items
  2017-08-28  3:53 [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Pranith Kumar
@ 2017-08-28  3:53 ` Pranith Kumar
  2017-08-28 17:47   ` Richard Henderson
  2017-08-28 19:05   ` Emilio G. Cota
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics Pranith Kumar
  2017-08-28 17:42 ` [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Richard Henderson
  2 siblings, 2 replies; 12+ messages in thread
From: Pranith Kumar @ 2017-08-28  3:53 UTC (permalink / raw)
  To: alex.bennee, Paolo Bonzini, Richard Henderson, Sergey Fedorov,
	open list:All patches CC here

Using heaptrack, I found that quite a few of our temporary allocations
are coming from allocating work items. Instead of doing this
continuously, we can cache the allocated items and reuse them instead
of freeing them.

This reduces the number of allocations by 25% (200000 -> 150000 for
ARM64 boot+shutdown test).

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 cpus-common.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 70 insertions(+), 15 deletions(-)

diff --git a/cpus-common.c b/cpus-common.c
index 59f751ecf9..a1c4c7d1a3 100644
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -24,6 +24,7 @@
 #include "sysemu/cpus.h"
 
 static QemuMutex qemu_cpu_list_lock;
+static QemuMutex qemu_wi_pool_lock;
 static QemuCond exclusive_cond;
 static QemuCond exclusive_resume;
 static QemuCond qemu_work_cond;
@@ -33,6 +34,58 @@ static QemuCond qemu_work_cond;
  */
 static int pending_cpus;
 
+typedef struct qemu_work_item {
+    struct qemu_work_item *next;
+    run_on_cpu_func func;
+    run_on_cpu_data data;
+    bool free, exclusive, done;
+} qemu_work_item;
+
+typedef struct qemu_wi_pool {
+    qemu_work_item *first, *last;
+} qemu_wi_pool;
+
+qemu_wi_pool *wi_free_pool;
+
+static void qemu_init_workitem_pool(void)
+{
+    wi_free_pool = g_malloc0(sizeof(qemu_wi_pool));
+    wi_free_pool->first = NULL;
+    wi_free_pool->last  = NULL;
+}
+
+static void qemu_wi_pool_insert(qemu_work_item *item)
+{
+    qemu_mutex_lock(&qemu_wi_pool_lock);
+    if (wi_free_pool->last == NULL) {
+        wi_free_pool->first = item;
+        wi_free_pool->last = item;
+    } else {
+        wi_free_pool->last->next = item;
+        wi_free_pool->last = item;
+    }
+    qemu_mutex_unlock(&qemu_wi_pool_lock);
+}
+
+static qemu_work_item* qemu_wi_pool_remove(void)
+{
+    qemu_mutex_lock(&qemu_wi_pool_lock);
+    qemu_work_item *ret = wi_free_pool->first;
+
+    if (ret == NULL)
+        goto out;
+
+    wi_free_pool->first = ret->next;
+    if (wi_free_pool->last == ret) {
+        wi_free_pool->last = NULL;
+    }
+    ret->next = NULL;
+
+ out:
+    qemu_mutex_unlock(&qemu_wi_pool_lock);
+    return ret;
+}
+
 void qemu_init_cpu_list(void)
 {
     /* This is needed because qemu_init_cpu_list is also called by the
@@ -43,6 +96,9 @@ void qemu_init_cpu_list(void)
     qemu_cond_init(&exclusive_cond);
     qemu_cond_init(&exclusive_resume);
     qemu_cond_init(&qemu_work_cond);
+
+    qemu_init_workitem_pool();
+    qemu_mutex_init(&qemu_wi_pool_lock);
 }
 
 void cpu_list_lock(void)
@@ -106,14 +162,7 @@ void cpu_list_remove(CPUState *cpu)
     qemu_mutex_unlock(&qemu_cpu_list_lock);
 }
 
-struct qemu_work_item {
-    struct qemu_work_item *next;
-    run_on_cpu_func func;
-    run_on_cpu_data data;
-    bool free, exclusive, done;
-};
-
-static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
+static void queue_work_on_cpu(CPUState *cpu, qemu_work_item *wi)
 {
     qemu_mutex_lock(&cpu->work_mutex);
     if (cpu->queued_work_first == NULL) {
@@ -132,7 +181,7 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
 void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
                    QemuMutex *mutex)
 {
-    struct qemu_work_item wi;
+    qemu_work_item wi;
 
     if (qemu_cpu_is_self(cpu)) {
         func(cpu, data);
@@ -145,6 +194,7 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
     wi.free = false;
     wi.exclusive = false;
 
+
     queue_work_on_cpu(cpu, &wi);
     while (!atomic_mb_read(&wi.done)) {
         CPUState *self_cpu = current_cpu;
@@ -156,9 +206,11 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
 
 void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
 {
-    struct qemu_work_item *wi;
+    qemu_work_item *wi = qemu_wi_pool_remove();
 
-    wi = g_malloc0(sizeof(struct qemu_work_item));
+    if (!wi) {
+        wi = g_malloc0(sizeof(qemu_work_item));
+    }
     wi->func = func;
     wi->data = data;
     wi->free = true;
@@ -299,9 +351,11 @@ void cpu_exec_end(CPUState *cpu)
 void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func,
                            run_on_cpu_data data)
 {
-    struct qemu_work_item *wi;
+    qemu_work_item *wi = qemu_wi_pool_remove();
 
-    wi = g_malloc0(sizeof(struct qemu_work_item));
+    if (!wi) {
+        wi = g_malloc0(sizeof(qemu_work_item));
+    }
     wi->func = func;
     wi->data = data;
     wi->free = true;
@@ -312,7 +366,7 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func,
 
 void process_queued_cpu_work(CPUState *cpu)
 {
-    struct qemu_work_item *wi;
+    qemu_work_item *wi;
 
     if (cpu->queued_work_first == NULL) {
         return;
@@ -343,7 +397,8 @@ void process_queued_cpu_work(CPUState *cpu)
         }
         qemu_mutex_lock(&cpu->work_mutex);
         if (wi->free) {
-            g_free(wi);
+            memset(wi, 0, sizeof(qemu_work_item));
+            qemu_wi_pool_insert(wi);
         } else {
             atomic_mb_set(&wi->done, true);
         }
-- 
2.13.0


* [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics
  2017-08-28  3:53 [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Pranith Kumar
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar
@ 2017-08-28  3:53 ` Pranith Kumar
  2017-08-28 17:57   ` Richard Henderson
  2017-08-28 17:42 ` [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Richard Henderson
  2 siblings, 1 reply; 12+ messages in thread
From: Pranith Kumar @ 2017-08-28  3:53 UTC (permalink / raw)
  To: alex.bennee, Claudio Fontana, Richard Henderson,
	Andrzej Zaborowski, Aurelien Jarno, open list:AArch64 target,
	open list:All patches CC here
  Cc: pbonzini

Currently, we cannot use mttcg for running strong memory model guests
on weak memory model hosts due to missing ordering semantics.

We implicitly generate fence instructions for stronger guests when an
ordering mismatch is detected. Fences are generated only for the
orderings that actually require them; for example, no fence is needed
between a store and a subsequent load on x86, since the absence of a
fence in the guest binary already indicates that this ordering need
not be enforced. Also note that if multiple consecutive fence
instructions end up in the generated IR, they are combined in the TCG
optimization pass.

This patch allows us to boot an x86 guest on ARM64 hosts using mttcg.
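
As a worked illustration (not part of the patch), here is how the mismatch
mask plays out for an x86 guest on an AArch64 host. The macro values below
are assumptions for the sake of the example, not definitions taken from
this series:

/* Hypothetical values, for illustration only. */
#define GUEST_MO   (TCG_MO_ALL & ~TCG_MO_ST_LD)  /* x86: only store->load may reorder */
#define TARGET_MO  (0)                           /* AArch64: anything may reorder     */

/* Before a guest load the patch requests load->load and load->store
 * ordering.  The guest guarantees both, the host guarantees neither,
 * so the whole request survives and one fence is emitted; redundant
 * adjacent fences are later folded by the TCG optimizer. */
TCGBar mismatch = (TCG_MO_LD_LD | TCG_MO_LD_ST) & (GUEST_MO & ~TARGET_MO);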

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 tcg/aarch64/tcg-target.h |  2 ++
 tcg/arm/tcg-target.h     |  2 ++
 tcg/mips/tcg-target.h    |  2 ++
 tcg/ppc/tcg-target.h     |  2 ++
 tcg/tcg-op.c             | 17 +++++++++++++++++
 tcg/tcg-op.h             |  1 +
 6 files changed, 26 insertions(+)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 55a46ac825..b41a248bee 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
     __builtin___clear_cache((char *)start, (char *)stop);
 }
 
+#define TCG_TARGET_DEFAULT_MO (0)
+
 #endif /* AARCH64_TCG_TARGET_H */
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 5ef1086710..a38be15a39 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -134,4 +134,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
     __builtin___clear_cache((char *) start, (char *) stop);
 }
 
+#define TCG_TARGET_DEFAULT_MO (0)
+
 #endif
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index d75cb63ed3..e9558d15bc 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -206,4 +206,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
     cacheflush ((void *)start, stop-start, ICACHE);
 }
 
+#define TCG_TARGET_DEFAULT_MO (0)
+
 #endif
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 5f4a40a5b4..5a092b038a 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -125,4 +125,6 @@ extern bool have_isa_3_00;
 
 void flush_icache_range(uintptr_t start, uintptr_t stop);
 
+#define TCG_TARGET_DEFAULT_MO (0)
+
 #endif
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 87f673ef49..085fe66fb2 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -28,6 +28,7 @@
 #include "exec/exec-all.h"
 #include "tcg.h"
 #include "tcg-op.h"
+#include "tcg-mo.h"
 #include "trace-tcg.h"
 #include "trace/mem.h"
 
@@ -2662,8 +2663,21 @@ static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr,
 #endif
 }
 
+void tcg_gen_req_mo(TCGBar type)
+{
+#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
+    TCGBar order_mismatch = type & (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
+    if (order_mismatch) {
+        tcg_gen_mb(order_mismatch | TCG_BAR_SC);
+    }
+#else
+    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+#endif
+}
+
 void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
 {
+    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST);
     memop = tcg_canonicalize_memop(memop, 0, 0);
     trace_guest_mem_before_tcg(tcg_ctx.cpu, tcg_ctx.tcg_env,
                                addr, trace_mem_get_info(memop, 0));
@@ -2672,6 +2686,7 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
 
 void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
 {
+    tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);
     memop = tcg_canonicalize_memop(memop, 0, 1);
     trace_guest_mem_before_tcg(tcg_ctx.cpu, tcg_ctx.tcg_env,
                                addr, trace_mem_get_info(memop, 1));
@@ -2680,6 +2695,7 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
 
 void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
 {
+    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST);
     if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
         tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
         if (memop & MO_SIGN) {
@@ -2698,6 +2714,7 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
 
 void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
 {
+    tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);
     if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
         tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop);
         return;
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 5d3278f243..6ad2c6d60e 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -262,6 +262,7 @@ static inline void tcg_gen_br(TCGLabel *l)
 }
 
 void tcg_gen_mb(TCGBar);
+void tcg_gen_req_mo(TCGBar type);
 
 /* Helper calls. */
 
-- 
2.13.0


* Re: [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment
  2017-08-28  3:53 [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Pranith Kumar
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics Pranith Kumar
@ 2017-08-28 17:42 ` Richard Henderson
  2 siblings, 0 replies; 12+ messages in thread
From: Richard Henderson @ 2017-08-28 17:42 UTC (permalink / raw)
  To: Pranith Kumar, alex.bennee, Peter Maydell, open list:ARM,
	open list:All patches CC here
  Cc: pbonzini

On 08/27/2017 08:53 PM, Pranith Kumar wrote:
> Remove the comment, which has not been true since MTTCG was introduced.
> 
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> ---
>  target/arm/translate-a64.c | 4 ----
>  1 file changed, 4 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~


* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar
@ 2017-08-28 17:47   ` Richard Henderson
  2017-08-28 21:43     ` Pranith Kumar
  2017-08-28 19:05   ` Emilio G. Cota
  1 sibling, 1 reply; 12+ messages in thread
From: Richard Henderson @ 2017-08-28 17:47 UTC (permalink / raw)
  To: Pranith Kumar, alex.bennee, Paolo Bonzini, Sergey Fedorov,
	open list:All patches CC here

On 08/27/2017 08:53 PM, Pranith Kumar wrote:
> Using heaptrack, I found that quite a few of our temporary allocations
> are coming from allocating work items. Instead of doing this
> continuously, we can cache the allocated items and reuse them instead
> of freeing them.
> 
> This reduces the number of allocations by 25% (200000 -> 150000 for
> ARM64 boot+shutdown test).
> 
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> ---
>  cpus-common.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 70 insertions(+), 15 deletions(-)
> 
> diff --git a/cpus-common.c b/cpus-common.c
> index 59f751ecf9..a1c4c7d1a3 100644
> --- a/cpus-common.c
> +++ b/cpus-common.c
> @@ -24,6 +24,7 @@
>  #include "sysemu/cpus.h"
>  
>  static QemuMutex qemu_cpu_list_lock;
> +static QemuMutex qemu_wi_pool_lock;
>  static QemuCond exclusive_cond;
>  static QemuCond exclusive_resume;
>  static QemuCond qemu_work_cond;
> @@ -33,6 +34,58 @@ static QemuCond qemu_work_cond;
>   */
>  static int pending_cpus;
>  
> +typedef struct qemu_work_item {
> +    struct qemu_work_item *next;
> +    run_on_cpu_func func;
> +    run_on_cpu_data data;
> +    bool free, exclusive, done;
> +} qemu_work_item;
> +
> +typedef struct qemu_wi_pool {
> +    qemu_work_item *first, *last;
> +} qemu_wi_pool;
> +
> +qemu_wi_pool *wi_free_pool;
> +
> +static void qemu_init_workitem_pool(void)
> +{
> +    wi_free_pool = g_malloc0(sizeof(qemu_wi_pool));
> +    wi_free_pool->first = NULL;
> +    wi_free_pool->last  = NULL;
> +}
> +
> +static void qemu_wi_pool_insert(qemu_work_item *item)
> +{
> +    qemu_mutex_lock(&qemu_wi_pool_lock);
> +    if (wi_free_pool->last == NULL) {
> +        wi_free_pool->first = item;
> +        wi_free_pool->last = item;
> +    } else {
> +        wi_free_pool->last->next = item;
> +        wi_free_pool->last = item;
> +    }
> +    qemu_mutex_unlock(&qemu_wi_pool_lock);
> +}
> +
> +static qemu_work_item* qemu_wi_pool_remove(void)
> +{
> +    qemu_mutex_lock(&qemu_wi_pool_lock);
> +    qemu_work_item *ret = wi_free_pool->first;
> +
> +    if (ret == NULL)
> +        goto out;
> +
> +    wi_free_pool->first = ret->next;
> +    if (wi_free_pool->last == ret) {
> +        wi_free_pool->last = NULL;
> +    }

Why does this list need to record a "last" element?
It would seem a simple lifo would be sufficient.

(You would also be able to manage the list via cmpxchg without a separate lock,
but perhaps the difference between the two isn't measurable.)
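
For illustration, a lock-free LIFO along those lines could look like the
sketch below.  This assumes the qemu_work_item layout from the patch and
atomic_read()/atomic_cmpxchg() from qemu/atomic.h, and it has the usual
ABA caveat of a cmpxchg-based pop; it is a sketch, not a drop-in
replacement:

static qemu_work_item *wi_free_list;   /* LIFO head, no lock needed */

static void qemu_wi_pool_insert(qemu_work_item *item)
{
    qemu_work_item *old;
    do {
        old = atomic_read(&wi_free_list);
        item->next = old;
    } while (atomic_cmpxchg(&wi_free_list, old, item) != old);
}

static qemu_work_item *qemu_wi_pool_remove(void)
{
    qemu_work_item *old, *next;
    do {
        old = atomic_read(&wi_free_list);
        if (old == NULL) {
            return NULL;
        }
        next = old->next;
    } while (atomic_cmpxchg(&wi_free_list, old, next) != old);
    return old;
}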


r~


* Re: [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics Pranith Kumar
@ 2017-08-28 17:57   ` Richard Henderson
  2017-08-28 21:41     ` Pranith Kumar
  0 siblings, 1 reply; 12+ messages in thread
From: Richard Henderson @ 2017-08-28 17:57 UTC (permalink / raw)
  To: Pranith Kumar, alex.bennee, Claudio Fontana, Andrzej Zaborowski,
	Aurelien Jarno, open list:AArch64 target,
	open list:All patches CC here
  Cc: pbonzini

On 08/27/2017 08:53 PM, Pranith Kumar wrote:
> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
> index 55a46ac825..b41a248bee 100644
> --- a/tcg/aarch64/tcg-target.h
> +++ b/tcg/aarch64/tcg-target.h
> @@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
>      __builtin___clear_cache((char *)start, (char *)stop);
>  }
>  
> +#define TCG_TARGET_DEFAULT_MO (0)
> +
>  #endif /* AARCH64_TCG_TARGET_H */

Please add all of these in one patch, separate from the tcg-op.c changes.
We should also just make this mandatory and remove any related #ifdefs.

> +void tcg_gen_req_mo(TCGBar type)

static, until we find that we need it somewhere else.

> +#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
> +    TCGBar order_mismatch = type & (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
> +    if (order_mismatch) {
> +        tcg_gen_mb(order_mismatch | TCG_BAR_SC);
> +    }
> +#else
> +    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
> +#endif

Hmm.  How about

static void tcg_gen_req_mo(TCGBar type)
{
#ifdef TCG_GUEST_DEFAULT_MO
    type &= TCG_GUEST_DEFAULT_MO;
#endif
#ifdef TCG_TARGET_DEFAULT_MO
    type &= ~TCG_TARGET_DEFAULT_MO;
#endif
    if (type) {
        tcg_gen_mb(type | TCG_BAR_SC);
    }
}

Just because one of those is undefined doesn't mean we can't infer tighter
barriers from the others, including the initial value of TYPE.

>  void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
>  {
> +    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST);
>      memop = tcg_canonicalize_memop(memop, 0, 0);

You're putting the barrier before the load, so that should be

  TCG_MO_LD_LD | TCG_MO_ST_LD

i.e.  TCG_MO_<any>_<current op>

If you were putting the barrier afterward (an equally reasonable option), you'd
reverse that and use what you have above.
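
Spelled out as a sketch (illustrative only, using the masks discussed
above), the four cases would be:

/* Barrier emitted before the access: TCG_MO_<any>_<current op>. */
tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);   /* before a load  */
tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);   /* before a store */

/* Barrier emitted after the access: TCG_MO_<current op>_<any>. */
tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST);   /* after a load   */
tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);   /* after a store  */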


r~


* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items
  2017-08-28  3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar
  2017-08-28 17:47   ` Richard Henderson
@ 2017-08-28 19:05   ` Emilio G. Cota
  2017-08-28 21:51     ` Pranith Kumar
  1 sibling, 1 reply; 12+ messages in thread
From: Emilio G. Cota @ 2017-08-28 19:05 UTC (permalink / raw)
  To: Pranith Kumar
  Cc: alex.bennee, Paolo Bonzini, Richard Henderson, Sergey Fedorov,
	open list:All patches CC here

On Sun, Aug 27, 2017 at 23:53:25 -0400, Pranith Kumar wrote:
> Using heaptrack, I found that quite a few of our temporary allocations
> are coming from allocating work items. Instead of doing this
> continuously, we can cache the allocated items and reuse them instead
> of freeing them.
> 
> This reduces the number of allocations by 25% (200000 -> 150000 for
> ARM64 boot+shutdown test).
> 

But what is the perf difference, if any?

Adding a lock (or a cmpxchg) here is not a great idea. However, this is
not yet immediately obvious because of other scalability bottlenecks.
(If you boot many arm64 cores, you'll see that most of the time is spent
idling on the BQL; see
  https://lists.gnu.org/archive/html/qemu-devel/2017-08/msg05207.html )

You're most likely better off using glib's slices, see
  https://developer.gnome.org/glib/stable/glib-Memory-Slices.html
These slices use per-thread lists, so scalability should be OK.
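
For illustration, the work-item allocations would then look roughly like
this (g_slice_new0()/g_slice_free() are GLib's slice-allocator API; the
qemu_work_item name is the one from your patch):

/* In async_run_on_cpu()/async_safe_run_on_cpu(), instead of g_malloc0(): */
qemu_work_item *wi = g_slice_new0(qemu_work_item);
wi->func = func;
wi->data = data;
wi->free = true;

/* In process_queued_cpu_work(), instead of g_free(wi): */
g_slice_free(qemu_work_item, wi);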

I also suggest profiling with either or both of jemalloc/tcmalloc
(build with --enable-jemalloc/tcmalloc) in addition to using glibc's
allocator, and then based on perf numbers decide whether this is something
worth optimizing.

Thanks,

                Emilio


* Re: [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics
  2017-08-28 17:57   ` Richard Henderson
@ 2017-08-28 21:41     ` Pranith Kumar
  2017-08-28 22:39       ` Richard Henderson
  0 siblings, 1 reply; 12+ messages in thread
From: Pranith Kumar @ 2017-08-28 21:41 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Alex Bennée, Andrzej Zaborowski, Aurelien Jarno,
	open list:AArch64 target, open list:All patches CC here,
	Paolo Bonzini

On Mon, Aug 28, 2017 at 1:57 PM, Richard Henderson <rth@twiddle.net> wrote:
> On 08/27/2017 08:53 PM, Pranith Kumar wrote:
>> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
>> index 55a46ac825..b41a248bee 100644
>> --- a/tcg/aarch64/tcg-target.h
>> +++ b/tcg/aarch64/tcg-target.h
>> @@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
>>      __builtin___clear_cache((char *)start, (char *)stop);
>>  }
>>
>> +#define TCG_TARGET_DEFAULT_MO (0)
>> +
>>  #endif /* AARCH64_TCG_TARGET_H */
>
> Please add all of these in one patch, separate from the tcg-op.c changes.
> We should also just make this mandatory and remove any related #ifdefs.

I tried looking up the ordering semantics for architectures like ia64
and s390, but it is not really clear. I think every arch except x86 can
be defined as weak, even though archs like sparc can also be configured
as TSO. Is this right?

>
>> +void tcg_gen_req_mo(TCGBar type)
>
> static, until we find that we need it somewhere else.
>

Will fix.

>> +#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
>> +    TCGBar order_mismatch = type & (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
>> +    if (order_mismatch) {
>> +        tcg_gen_mb(order_mismatch | TCG_BAR_SC);
>> +    }
>> +#else
>> +    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
>> +#endif
>
> Hmm.  How about
>
> static void tcg_gen_req_mo(TCGBar type)
> {
> #ifdef TCG_GUEST_DEFAULT_MO
>     type &= TCG_GUEST_DEFAULT_MO;
> #endif
> #ifdef TCG_TARGET_DEFAULT_MO
>     type &= ~TCG_TARGET_DEFAULT_MO;
> #endif
>     if (type) {
>         tcg_gen_mb(type | TCG_BAR_SC);
>     }
> }

Yes, this looks better, and until we can get all the possible
definitions of TCG_GUEST_DEFAULT_MO and TCG_TARGET_DEFAULT_MO figured
out, I would like to keep the #ifdefs.

>
> Just because one of those is undefined doesn't mean we can't infer tighter
> barriers from the others, including the initial value of TYPE.
>
>>  void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
>>  {
>> +    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST);
>>      memop = tcg_canonicalize_memop(memop, 0, 0);
>
> You're putting the barrier before the load, so that should be
>
>   TCG_MO_LD_LD | TCG_MO_ST_LD
>
> i.e.  TCG_MO_<any>_<current op>
>
> If you were putting the barrier afterward (an equally reasonable option), you'd
> reverse that and use what you have above.

OK, will fix this. Thanks for the review.

-- 
Pranith


* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items
  2017-08-28 17:47   ` Richard Henderson
@ 2017-08-28 21:43     ` Pranith Kumar
  2017-08-29 20:38       ` Paolo Bonzini
  0 siblings, 1 reply; 12+ messages in thread
From: Pranith Kumar @ 2017-08-28 21:43 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Alex Bennée, Paolo Bonzini, Sergey Fedorov,
	open list:All patches CC here

On Mon, Aug 28, 2017 at 1:47 PM, Richard Henderson
<richard.henderson@linaro.org> wrote:
> On 08/27/2017 08:53 PM, Pranith Kumar wrote:
>> Using heaptrack, I found that quite a few of our temporary allocations
>> are coming from allocating work items. Instead of doing this
>> continuously, we can cache the allocated items and reuse them instead
>> of freeing them.
>>
>> This reduces the number of allocations by 25% (200000 -> 150000 for
>> ARM64 boot+shutdown test).
>>
>> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
>
> Why does this list need to record a "last" element?
> It would seem a simple lifo would be sufficient.
>
> (You would also be able to manage the list via cmpxchg without a separate lock,
> but perhaps the difference between the two isn't measurable.)
>

Yes, seems like a better design choice. Will fix in next iteration.

Thanks,
-- 
Pranith


* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items
  2017-08-28 19:05   ` Emilio G. Cota
@ 2017-08-28 21:51     ` Pranith Kumar
  0 siblings, 0 replies; 12+ messages in thread
From: Pranith Kumar @ 2017-08-28 21:51 UTC (permalink / raw)
  To: Emilio G. Cota
  Cc: Alex Bennée, Paolo Bonzini, Richard Henderson,
	Sergey Fedorov, open list:All patches CC here

On Mon, Aug 28, 2017 at 3:05 PM, Emilio G. Cota <cota@braap.org> wrote:
> On Sun, Aug 27, 2017 at 23:53:25 -0400, Pranith Kumar wrote:
>> Using heaptrack, I found that quite a few of our temporary allocations
>> are coming from allocating work items. Instead of doing this
>> continuously, we can cache the allocated items and reuse them instead
>> of freeing them.
>>
>> This reduces the number of allocations by 25% (200000 -> 150000 for
>> ARM64 boot+shutdown test).
>>
>
> But what is the perf difference, if any?
>
> Adding a lock (or a cmpxchg) here is not a great idea. However, this is not yet
> immediately obvious because of other scalability bottlenecks. (if
> you boot many arm64 cores you'll see most of the time is spent idling
> on the BQL, see
>   https://lists.gnu.org/archive/html/qemu-devel/2017-08/msg05207.html )
>
> You're most likely better off using glib's slices, see
>   https://developer.gnome.org/glib/stable/glib-Memory-Slices.html
> These slices use per-thread lists, so scalability should be OK.

I think we should modify our g_malloc() to internally use this. Seems
like an idea worth trying out.

>
> I also suggest profiling with either or both of jemalloc/tcmalloc
> (build with --enable-jemalloc/tcmalloc) in addition to using glibc's
> allocator, and then based on perf numbers decide whether this is something
> worth optimizing.
>

OK, I will try to get some perf numbers.

-- 
Pranith


* Re: [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics
  2017-08-28 21:41     ` Pranith Kumar
@ 2017-08-28 22:39       ` Richard Henderson
  0 siblings, 0 replies; 12+ messages in thread
From: Richard Henderson @ 2017-08-28 22:39 UTC (permalink / raw)
  To: Pranith Kumar
  Cc: Alex Bennée, Andrzej Zaborowski, Aurelien Jarno,
	open list:AArch64 target, open list:All patches CC here,
	Paolo Bonzini

On 08/28/2017 02:41 PM, Pranith Kumar wrote:
> On Mon, Aug 28, 2017 at 1:57 PM, Richard Henderson <rth@twiddle.net> wrote:
>> On 08/27/2017 08:53 PM, Pranith Kumar wrote:
>>> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
>>> index 55a46ac825..b41a248bee 100644
>>> --- a/tcg/aarch64/tcg-target.h
>>> +++ b/tcg/aarch64/tcg-target.h
>>> @@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
>>>      __builtin___clear_cache((char *)start, (char *)stop);
>>>  }
>>>
>>> +#define TCG_TARGET_DEFAULT_MO (0)
>>> +
>>>  #endif /* AARCH64_TCG_TARGET_H */
>>
>> Please add all of these in one patch, separate from the tcg-op.c changes.
>> We should also just make this mandatory and remove any related #ifdefs.
> 
> I tried looking up ordering semantics for architectures like ia64 and
> s390. It is not really clear. I think every arch but for x86 can be
> defined as weak, even though archs like sparc can also be configured
> as TSO. Is this right?

s390 has the same memory ordering as i386.

But you're right that the RISC chips should generally be 0.

I'll try to figure out when sparc can use PSO (loosest for sparc < 8, and
modern Niagara), but leave it 0 for now.
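
For illustration (not from this series), a TSO host could then advertise
something like:

/* i386 or s390 as hosts: TSO, i.e. only store->load pairs may reorder. */
#define TCG_TARGET_DEFAULT_MO  (TCG_MO_ALL & ~TCG_MO_ST_LD)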


r~


* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items
  2017-08-28 21:43     ` Pranith Kumar
@ 2017-08-29 20:38       ` Paolo Bonzini
  0 siblings, 0 replies; 12+ messages in thread
From: Paolo Bonzini @ 2017-08-29 20:38 UTC (permalink / raw)
  To: Pranith Kumar
  Cc: Alex Bennée, richard.henderson, Sergey Fedorov,
	open list:All patches CC here

On Aug 28, 2017 at 11:43 PM, "Pranith Kumar" <bobby.prani@gmail.com> wrote:

On Mon, Aug 28, 2017 at 1:47 PM, Richard Henderson
<richard.henderson@linaro.org> wrote:
> On 08/27/2017 08:53 PM, Pranith Kumar wrote:
>> Using heaptrack, I found that quite a few of our temporary allocations
>> are coming from allocating work items. Instead of doing this
>> continuously, we can cache the allocated items and reuse them instead
>> of freeing them.
>>
>> This reduces the number of allocations by 25% (200000 -> 150000 for
>> ARM64 boot+shutdown test).
>>
>> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
>
> Why does this list need to record a "last" element?
> It would seem a simple lifo would be sufficient.
>
> (You would also be able to manage the list via cmpxchg without a separate
> lock,
> but perhaps the difference between the two isn't measurable.)
>

Yes, seems like a better design choice. Will fix in next iteration.


More recent glibc will also have an efficient per-thread allocator, and
though I haven't yet benchmarked the newer glibc malloc, GSlice is slower
than at least tcmalloc and jemalloc. Perhaps you could instead make the
work items statically allocated?
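
A very rough sketch of the statically-allocated idea, in case it helps
(the pool size, helper names and fallback behaviour are all made up for
illustration; callers would hold the pool lock, or use cmpxchg as
discussed earlier in the thread):

#define WI_POOL_SIZE 64

static qemu_work_item wi_static_items[WI_POOL_SIZE];
static qemu_work_item *wi_static_free;          /* LIFO of unused entries */

static void wi_static_init(void)                /* e.g. from qemu_init_cpu_list() */
{
    for (int i = 0; i < WI_POOL_SIZE; i++) {
        wi_static_items[i].next = wi_static_free;
        wi_static_free = &wi_static_items[i];
    }
}

static qemu_work_item *wi_static_alloc(void)
{
    qemu_work_item *wi = wi_static_free;
    if (wi) {
        wi_static_free = wi->next;
        memset(wi, 0, sizeof(*wi));
        return wi;
    }
    return g_malloc0(sizeof(*wi));              /* pool exhausted: fall back */
}

static void wi_static_release(qemu_work_item *wi)
{
    if (wi >= wi_static_items && wi < wi_static_items + WI_POOL_SIZE) {
        wi->next = wi_static_free;
        wi_static_free = wi;
    } else {
        g_free(wi);                             /* came from the fallback path */
    }
}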

Thanks,

Paolo


Thanks,
--
Pranith

