* [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment @ 2017-08-28 3:53 Pranith Kumar 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar ` (2 more replies) 0 siblings, 3 replies; 12+ messages in thread From: Pranith Kumar @ 2017-08-28 3:53 UTC (permalink / raw) To: alex.bennee, Peter Maydell, open list:ARM, open list:All patches CC here Cc: rth, pbonzini Update the comment which is not true since MTTCG. Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> --- target/arm/translate-a64.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 2200e25be0..f42b155d7d 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -2012,10 +2012,6 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) } tcg_addr = read_cpu_reg_sp(s, rn, 1); - /* Note that since TCG is single threaded load-acquire/store-release - * semantics require no extra if (is_lasr) { ... } handling. - */ - if (is_excl) { if (!is_store) { s->is_ldex = true; -- 2.13.0 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items 2017-08-28 3:53 [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Pranith Kumar @ 2017-08-28 3:53 ` Pranith Kumar 2017-08-28 17:47 ` Richard Henderson 2017-08-28 19:05 ` Emilio G. Cota 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics Pranith Kumar 2017-08-28 17:42 ` [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Richard Henderson 2 siblings, 2 replies; 12+ messages in thread From: Pranith Kumar @ 2017-08-28 3:53 UTC (permalink / raw) To: alex.bennee, Paolo Bonzini, Richard Henderson, Sergey Fedorov, open list:All patches CC here Using heaptrack, I found that quite a few of our temporary allocations are coming from allocating work items. Instead of doing this continously, we can cache the allocated items and reuse them instead of freeing them. This reduces the number of allocations by 25% (200000 -> 150000 for ARM64 boot+shutdown test). Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> --- cpus-common.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 15 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index 59f751ecf9..a1c4c7d1a3 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -24,6 +24,7 @@ #include "sysemu/cpus.h" static QemuMutex qemu_cpu_list_lock; +static QemuMutex qemu_wi_pool_lock; static QemuCond exclusive_cond; static QemuCond exclusive_resume; static QemuCond qemu_work_cond; @@ -33,6 +34,58 @@ static QemuCond qemu_work_cond; */ static int pending_cpus; +typedef struct qemu_work_item { + struct qemu_work_item *next; + run_on_cpu_func func; + run_on_cpu_data data; + bool free, exclusive, done; +} qemu_work_item; + +typedef struct qemu_wi_pool { + qemu_work_item *first, *last; +} qemu_wi_pool; + +qemu_wi_pool *wi_free_pool; + +static void qemu_init_workitem_pool(void) +{ + wi_free_pool = g_malloc0(sizeof(qemu_wi_pool)); + wi_free_pool->first = NULL; + 
wi_free_pool->last = NULL; +} + +static void qemu_wi_pool_insert(qemu_work_item *item) +{ + qemu_mutex_lock(&qemu_wi_pool_lock); + if (wi_free_pool->last == NULL) { + wi_free_pool->first = item; + wi_free_pool->last = item; + } else { + wi_free_pool->last->next = item; + wi_free_pool->last = item; + } + qemu_mutex_unlock(&qemu_wi_pool_lock); +} + +static qemu_work_item* qemu_wi_pool_remove(void) +{ + qemu_mutex_lock(&qemu_wi_pool_lock); + qemu_work_item *ret = wi_free_pool->first; + + if (ret == NULL) + goto out; + + wi_free_pool->first = ret->next; + if (wi_free_pool->last == ret) { + wi_free_pool->last = NULL; + } + ret->next = NULL; + + out: + qemu_mutex_unlock(&qemu_wi_pool_lock); + return ret; +} + void qemu_init_cpu_list(void) { /* This is needed because qemu_init_cpu_list is also called by the @@ -43,6 +96,9 @@ void qemu_init_cpu_list(void) qemu_cond_init(&exclusive_cond); qemu_cond_init(&exclusive_resume); qemu_cond_init(&qemu_work_cond); + + qemu_init_workitem_pool(); + qemu_mutex_init(&qemu_wi_pool_lock); } void cpu_list_lock(void) @@ -106,14 +162,7 @@ void cpu_list_remove(CPUState *cpu) qemu_mutex_unlock(&qemu_cpu_list_lock); } -struct qemu_work_item { - struct qemu_work_item *next; - run_on_cpu_func func; - run_on_cpu_data data; - bool free, exclusive, done; -}; - -static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +static void queue_work_on_cpu(CPUState *cpu, qemu_work_item *wi) { qemu_mutex_lock(&cpu->work_mutex); if (cpu->queued_work_first == NULL) { @@ -132,7 +181,7 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, QemuMutex *mutex) { - struct qemu_work_item wi; + qemu_work_item wi; if (qemu_cpu_is_self(cpu)) { func(cpu, data); @@ -145,6 +194,7 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, wi.free = false; wi.exclusive = false; + queue_work_on_cpu(cpu, &wi); while (!atomic_mb_read(&wi.done)) { 
CPUState *self_cpu = current_cpu; @@ -156,9 +206,11 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) { - struct qemu_work_item *wi; + qemu_work_item *wi = qemu_wi_pool_remove(); - wi = g_malloc0(sizeof(struct qemu_work_item)); + if (!wi) { + wi = g_malloc0(sizeof(qemu_work_item)); + } wi->func = func; wi->data = data; wi->free = true; @@ -299,9 +351,11 @@ void cpu_exec_end(CPUState *cpu) void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) { - struct qemu_work_item *wi; + qemu_work_item *wi = qemu_wi_pool_remove(); - wi = g_malloc0(sizeof(struct qemu_work_item)); + if (!wi) { + wi = g_malloc0(sizeof(qemu_work_item)); + } wi->func = func; wi->data = data; wi->free = true; @@ -312,7 +366,7 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void process_queued_cpu_work(CPUState *cpu) { - struct qemu_work_item *wi; + qemu_work_item *wi; if (cpu->queued_work_first == NULL) { return; @@ -343,7 +397,8 @@ void process_queued_cpu_work(CPUState *cpu) } qemu_mutex_lock(&cpu->work_mutex); if (wi->free) { - g_free(wi); + memset(wi, 0, sizeof(qemu_work_item)); + qemu_wi_pool_insert(wi); } else { atomic_mb_set(&wi->done, true); } -- 2.13.0 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar @ 2017-08-28 17:47 ` Richard Henderson 2017-08-28 21:43 ` Pranith Kumar 2017-08-28 19:05 ` Emilio G. Cota 1 sibling, 1 reply; 12+ messages in thread From: Richard Henderson @ 2017-08-28 17:47 UTC (permalink / raw) To: Pranith Kumar, alex.bennee, Paolo Bonzini, Sergey Fedorov, open list:All patches CC here On 08/27/2017 08:53 PM, Pranith Kumar wrote: > Using heaptrack, I found that quite a few of our temporary allocations > are coming from allocating work items. Instead of doing this > continously, we can cache the allocated items and reuse them instead > of freeing them. > > This reduces the number of allocations by 25% (200000 -> 150000 for > ARM64 boot+shutdown test). > > Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> > --- > cpus-common.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++----------- > 1 file changed, 70 insertions(+), 15 deletions(-) > > diff --git a/cpus-common.c b/cpus-common.c > index 59f751ecf9..a1c4c7d1a3 100644 > --- a/cpus-common.c > +++ b/cpus-common.c > @@ -24,6 +24,7 @@ > #include "sysemu/cpus.h" > > static QemuMutex qemu_cpu_list_lock; > +static QemuMutex qemu_wi_pool_lock; > static QemuCond exclusive_cond; > static QemuCond exclusive_resume; > static QemuCond qemu_work_cond; > @@ -33,6 +34,58 @@ static QemuCond qemu_work_cond; > */ > static int pending_cpus; > > +typedef struct qemu_work_item { > + struct qemu_work_item *next; > + run_on_cpu_func func; > + run_on_cpu_data data; > + bool free, exclusive, done; > +} qemu_work_item; > + > +typedef struct qemu_wi_pool { > + qemu_work_item *first, *last; > +} qemu_wi_pool; > + > +qemu_wi_pool *wi_free_pool; > + > +static void qemu_init_workitem_pool(void) > +{ > + wi_free_pool = g_malloc0(sizeof(qemu_wi_pool)); > + wi_free_pool->first = NULL; > + wi_free_pool->last = NULL; > +} > + > +static void 
qemu_wi_pool_insert(qemu_work_item *item) > +{ > + qemu_mutex_lock(&qemu_wi_pool_lock); > + if (wi_free_pool->last == NULL) { > + wi_free_pool->first = item; > + wi_free_pool->last = item; > + } else { > + wi_free_pool->last->next = item; > + wi_free_pool->last = item; > + } > + qemu_mutex_unlock(&qemu_wi_pool_lock); > +} > + > +static qemu_work_item* qemu_wi_pool_remove(void) > +{ > + qemu_mutex_lock(&qemu_wi_pool_lock); > + qemu_work_item *ret = wi_free_pool->first; > + > + if (ret == NULL) > + goto out; > + > + wi_free_pool->first = ret->next; > + if (wi_free_pool->last == ret) { > + wi_free_pool->last = NULL; > + } Why does this list need to record a "last" element? It would seem a simple lifo would be sufficient. (You would also be able to manage the list via cmpxchg without a separate lock, but perhaps the difference between the two isn't measurable.) r~ ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items 2017-08-28 17:47 ` Richard Henderson @ 2017-08-28 21:43 ` Pranith Kumar 2017-08-29 20:38 ` Paolo Bonzini 0 siblings, 1 reply; 12+ messages in thread From: Pranith Kumar @ 2017-08-28 21:43 UTC (permalink / raw) To: Richard Henderson Cc: Alex Bennée, Paolo Bonzini, Sergey Fedorov, open list:All patches CC here On Mon, Aug 28, 2017 at 1:47 PM, Richard Henderson <richard.henderson@linaro.org> wrote: > On 08/27/2017 08:53 PM, Pranith Kumar wrote: >> Using heaptrack, I found that quite a few of our temporary allocations >> are coming from allocating work items. Instead of doing this >> continously, we can cache the allocated items and reuse them instead >> of freeing them. >> >> This reduces the number of allocations by 25% (200000 -> 150000 for >> ARM64 boot+shutdown test). >> >> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> > > Why does this list need to record a "last" element? > It would seem a simple lifo would be sufficient. > > (You would also be able to manage the list via cmpxchg without a separate lock, > but perhaps the difference between the two isn't measurable.) > Yes, seems like a better design choice. Will fix in next iteration. Thanks, -- Pranith ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items 2017-08-28 21:43 ` Pranith Kumar @ 2017-08-29 20:38 ` Paolo Bonzini 0 siblings, 0 replies; 12+ messages in thread From: Paolo Bonzini @ 2017-08-29 20:38 UTC (permalink / raw) To: Pranith Kumar Cc: Alex Bennée, richard.henderson, Sergey Fedorov, open list:All patches CC here Il 28 ago 2017 11:43 PM, "Pranith Kumar" <bobby.prani@gmail.com> ha scritto: On Mon, Aug 28, 2017 at 1:47 PM, Richard Henderson <richard.henderson@linaro.org> wrote: > On 08/27/2017 08:53 PM, Pranith Kumar wrote: >> Using heaptrack, I found that quite a few of our temporary allocations >> are coming from allocating work items. Instead of doing this >> continously, we can cache the allocated items and reuse them instead >> of freeing them. >> >> This reduces the number of allocations by 25% (200000 -> 150000 for >> ARM64 boot+shutdown test). >> >> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> > > Why does this list need to record a "last" element? > It would seem a simple lifo would be sufficient. > > (You would also be able to manage the list via cmpxchg without a separate lock, > but perhaps the difference between the two isn't measurable.) > Yes, seems like a better design choice. Will fix in next iteration. More recent glibc will also have an efficient per-thread allocator, and though I haven't yet benchmarked the newer glibc malloc, GSlice is slower than at least both tcmalloc and jemalloc. Perhaps you could instead make work items statically allocated? Thanks, Paolo Thanks, -- Pranith ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar 2017-08-28 17:47 ` Richard Henderson @ 2017-08-28 19:05 ` Emilio G. Cota 2017-08-28 21:51 ` Pranith Kumar 1 sibling, 1 reply; 12+ messages in thread From: Emilio G. Cota @ 2017-08-28 19:05 UTC (permalink / raw) To: Pranith Kumar Cc: alex.bennee, Paolo Bonzini, Richard Henderson, Sergey Fedorov, open list:All patches CC here On Sun, Aug 27, 2017 at 23:53:25 -0400, Pranith Kumar wrote: > Using heaptrack, I found that quite a few of our temporary allocations > are coming from allocating work items. Instead of doing this > continously, we can cache the allocated items and reuse them instead > of freeing them. > > This reduces the number of allocations by 25% (200000 -> 150000 for > ARM64 boot+shutdown test). > But what is the perf difference, if any? Adding a lock (or a cmpxchg) here is not a great idea. However, this is not yet immediately obvious because of other scalability bottlenecks. (if you boot many arm64 cores you'll see most of the time is spent idling on the BQL, see https://lists.gnu.org/archive/html/qemu-devel/2017-08/msg05207.html ) You're most likely better off using glib's slices, see https://developer.gnome.org/glib/stable/glib-Memory-Slices.html These slices use per-thread lists, so scalability should be OK. I also suggest profiling with either or both of jemalloc/tcmalloc (build with --enable-jemalloc/tcmalloc) in addition to using glibc's allocator, and then based on perf numbers decide whether this is something worth optimizing. Thanks, Emilio ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items 2017-08-28 19:05 ` Emilio G. Cota @ 2017-08-28 21:51 ` Pranith Kumar 0 siblings, 0 replies; 12+ messages in thread From: Pranith Kumar @ 2017-08-28 21:51 UTC (permalink / raw) To: Emilio G. Cota Cc: Alex Bennée, Paolo Bonzini, Richard Henderson, Sergey Fedorov, open list:All patches CC here On Mon, Aug 28, 2017 at 3:05 PM, Emilio G. Cota <cota@braap.org> wrote: > On Sun, Aug 27, 2017 at 23:53:25 -0400, Pranith Kumar wrote: >> Using heaptrack, I found that quite a few of our temporary allocations >> are coming from allocating work items. Instead of doing this >> continously, we can cache the allocated items and reuse them instead >> of freeing them. >> >> This reduces the number of allocations by 25% (200000 -> 150000 for >> ARM64 boot+shutdown test). >> > > But what is the perf difference, if any? > > Adding a lock (or a cmpxchg) here is not a great idea. However, this is not yet > immediately obvious because of other scalability bottlenecks. (if > you boot many arm64 cores you'll see most of the time is spent idling > on the BQL, see > https://lists.gnu.org/archive/html/qemu-devel/2017-08/msg05207.html ) > > You're most likely better off using glib's slices, see > https://developer.gnome.org/glib/stable/glib-Memory-Slices.html > These slices use per-thread lists, so scalability should be OK. I think we should modify our g_malloc() to internally use this. Seems like an idea worth trying out. > > I also suggest profiling with either or both of jemalloc/tcmalloc > (build with --enable-jemalloc/tcmalloc) in addition to using glibc's > allocator, and then based on perf numbers decide whether this is something > worth optimizing. > OK, I will try to get some perf numbers. -- Pranith ^ permalink raw reply [flat|nested] 12+ messages in thread
* [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics 2017-08-28 3:53 [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Pranith Kumar 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar @ 2017-08-28 3:53 ` Pranith Kumar 2017-08-28 17:57 ` Richard Henderson 2017-08-28 17:42 ` [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Richard Henderson 2 siblings, 1 reply; 12+ messages in thread From: Pranith Kumar @ 2017-08-28 3:53 UTC (permalink / raw) To: alex.bennee, Claudio Fontana, Richard Henderson, Andrzej Zaborowski, Aurelien Jarno, open list:AArch64 target, open list:All patches CC here Cc: pbonzini Currently, we cannot use mttcg for running strong memory model guests on weak memory model hosts due to missing ordering semantics. We implicitly generate fence instructions for stronger guests if an ordering mismatch is detected. We generate fences only for the orders for which fence instructions are necessary, for example a fence is not necessary between a store and a subsequent load on x86 since its absence in the guest binary tells that ordering need not be ensured. Also note that if we find multiple subsequent fence instructions in the generated IR, we combine them in the TCG optimization pass. This patch allows us to boot an x86 guest on ARM64 hosts using mttcg. 
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> --- tcg/aarch64/tcg-target.h | 2 ++ tcg/arm/tcg-target.h | 2 ++ tcg/mips/tcg-target.h | 2 ++ tcg/ppc/tcg-target.h | 2 ++ tcg/tcg-op.c | 17 +++++++++++++++++ tcg/tcg-op.h | 1 + 6 files changed, 26 insertions(+) diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index 55a46ac825..b41a248bee 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop) __builtin___clear_cache((char *)start, (char *)stop); } +#define TCG_TARGET_DEFAULT_MO (0) + #endif /* AARCH64_TCG_TARGET_H */ diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 5ef1086710..a38be15a39 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -134,4 +134,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop) __builtin___clear_cache((char *) start, (char *) stop); } +#define TCG_TARGET_DEFAULT_MO (0) + #endif diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h index d75cb63ed3..e9558d15bc 100644 --- a/tcg/mips/tcg-target.h +++ b/tcg/mips/tcg-target.h @@ -206,4 +206,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop) cacheflush ((void *)start, stop-start, ICACHE); } +#define TCG_TARGET_DEFAULT_MO (0) + #endif diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h index 5f4a40a5b4..5a092b038a 100644 --- a/tcg/ppc/tcg-target.h +++ b/tcg/ppc/tcg-target.h @@ -125,4 +125,6 @@ extern bool have_isa_3_00; void flush_icache_range(uintptr_t start, uintptr_t stop); +#define TCG_TARGET_DEFAULT_MO (0) + #endif diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 87f673ef49..085fe66fb2 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -28,6 +28,7 @@ #include "exec/exec-all.h" #include "tcg.h" #include "tcg-op.h" +#include "tcg-mo.h" #include "trace-tcg.h" #include "trace/mem.h" @@ -2662,8 +2663,21 @@ static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr, #endif } +void 
tcg_gen_req_mo(TCGBar type) +{ +#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO) + TCGBar order_mismatch = type & (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO); + if (order_mismatch) { + tcg_gen_mb(order_mismatch | TCG_BAR_SC); + } +#else + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); +#endif +} + void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop) { + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST); memop = tcg_canonicalize_memop(memop, 0, 0); trace_guest_mem_before_tcg(tcg_ctx.cpu, tcg_ctx.tcg_env, addr, trace_mem_get_info(memop, 0)); @@ -2672,6 +2686,7 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop) void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop) { + tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST); memop = tcg_canonicalize_memop(memop, 0, 1); trace_guest_mem_before_tcg(tcg_ctx.cpu, tcg_ctx.tcg_env, addr, trace_mem_get_info(memop, 1)); @@ -2680,6 +2695,7 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop) void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop) { + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST); if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop); if (memop & MO_SIGN) { @@ -2698,6 +2714,7 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop) void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop) { + tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST); if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) { tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop); return; diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h index 5d3278f243..6ad2c6d60e 100644 --- a/tcg/tcg-op.h +++ b/tcg/tcg-op.h @@ -262,6 +262,7 @@ static inline void tcg_gen_br(TCGLabel *l) } void tcg_gen_mb(TCGBar); +void tcg_gen_req_mo(TCGBar type); /* Helper calls. */ -- 2.13.0 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics Pranith Kumar @ 2017-08-28 17:57 ` Richard Henderson 2017-08-28 21:41 ` Pranith Kumar 0 siblings, 1 reply; 12+ messages in thread From: Richard Henderson @ 2017-08-28 17:57 UTC (permalink / raw) To: Pranith Kumar, alex.bennee, Claudio Fontana, Andrzej Zaborowski, Aurelien Jarno, open list:AArch64 target, open list:All patches CC here Cc: pbonzini On 08/27/2017 08:53 PM, Pranith Kumar wrote: > diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h > index 55a46ac825..b41a248bee 100644 > --- a/tcg/aarch64/tcg-target.h > +++ b/tcg/aarch64/tcg-target.h > @@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop) > __builtin___clear_cache((char *)start, (char *)stop); > } > > +#define TCG_TARGET_DEFAULT_MO (0) > + > #endif /* AARCH64_TCG_TARGET_H */ Please add all of these in one patch, separate from the tcg-op.c changes. We should also just make this mandatory and remove any related #ifdefs. > +void tcg_gen_req_mo(TCGBar type) static, until we find that we need it somewhere else. > +#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO) > + TCGBar order_mismatch = type & (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO); > + if (order_mismatch) { > + tcg_gen_mb(order_mismatch | TCG_BAR_SC); > + } > +#else > + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); > +#endif Hmm. How about static void tcg_gen_reg_mo(TCGBar type) { #ifdef TCG_GUEST_DEFAULT_MO type &= TCG_GUEST_DEFAULT_MO; #endif #ifdef TCG_TARGET_DEFAULT_MO type &= ~TCG_TARGET_DEFAULT_MO; #endif if (type) { tcg_gen_mb(type | TCG_BAR_SC); } } Just because one of those is undefined doesn't mean we can't infer tighter barriers from the others, including the initial value of TYPE. 
> void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop) > { > + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST); > memop = tcg_canonicalize_memop(memop, 0, 0); You're putting the barrier before the load, so that should be TCG_MO_LD_LD | TCG_MO_ST_LD i.e. TCG_MO_<any>_<current op> If you were putting the barrier afterward (an equally reasonable option), you'd reverse that and use what you have above. r~ ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics 2017-08-28 17:57 ` Richard Henderson @ 2017-08-28 21:41 ` Pranith Kumar 2017-08-28 22:39 ` Richard Henderson 0 siblings, 1 reply; 12+ messages in thread From: Pranith Kumar @ 2017-08-28 21:41 UTC (permalink / raw) To: Richard Henderson Cc: Alex Bennée, Andrzej Zaborowski, Aurelien Jarno, open list:AArch64 target, open list:All patches CC here, Paolo Bonzini On Mon, Aug 28, 2017 at 1:57 PM, Richard Henderson <rth@twiddle.net> wrote: > On 08/27/2017 08:53 PM, Pranith Kumar wrote: >> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h >> index 55a46ac825..b41a248bee 100644 >> --- a/tcg/aarch64/tcg-target.h >> +++ b/tcg/aarch64/tcg-target.h >> @@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop) >> __builtin___clear_cache((char *)start, (char *)stop); >> } >> >> +#define TCG_TARGET_DEFAULT_MO (0) >> + >> #endif /* AARCH64_TCG_TARGET_H */ > > Please add all of these in one patch, separate from the tcg-op.c changes. > We should also just make this mandatory and remove any related #ifdefs. I tried looking up ordering semantics for architectures like ia64 and s390. It is not really clear. I think every arch but for x86 can be defined as weak, even though archs like sparc can also be configured as TSO. Is this right? > >> +void tcg_gen_req_mo(TCGBar type) > > static, until we find that we need it somewhere else. > Will fix. >> +#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO) >> + TCGBar order_mismatch = type & (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO); >> + if (order_mismatch) { >> + tcg_gen_mb(order_mismatch | TCG_BAR_SC); >> + } >> +#else >> + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); >> +#endif > > Hmm. 
How about > > static void tcg_gen_reg_mo(TCGBar type) > { > #ifdef TCG_GUEST_DEFAULT_MO > type &= TCG_GUEST_DEFAULT_MO; > #endif > #ifdef TCG_TARGET_DEFAULT_MO > type &= ~TCG_TARGET_DEFAULT_MO; > #endif > if (type) { > tcg_gen_mb(type | TCG_BAR_SC); > } > } Yes, this looks better and until we can get all the possible definitions of TCG_GUEST_DEFAULT_MO and TCG_TARGET_DEFAULT_MO figured out I would like to keep the #ifdefs. > > Just because one of those is undefined doesn't mean we can't infer tighter > barriers from the others, including the initial value of TYPE. > >> void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop) >> { >> + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_LD_ST); >> memop = tcg_canonicalize_memop(memop, 0, 0); > > You're putting the barrier before the load, so that should be > > TCG_MO_LD_LD | TCG_MO_ST_LD > > i.e. TCG_MO_<any>_<current op> > > If you were putting the barrier afterward (an equally reasonable option), you'd > reverse that and use what you have above. OK, will fix this. Thanks for the review. -- Pranith ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics 2017-08-28 21:41 ` Pranith Kumar @ 2017-08-28 22:39 ` Richard Henderson 0 siblings, 0 replies; 12+ messages in thread From: Richard Henderson @ 2017-08-28 22:39 UTC (permalink / raw) To: Pranith Kumar Cc: Alex Bennée, Andrzej Zaborowski, Aurelien Jarno, open list:AArch64 target, open list:All patches CC here, Paolo Bonzini On 08/28/2017 02:41 PM, Pranith Kumar wrote: > On Mon, Aug 28, 2017 at 1:57 PM, Richard Henderson <rth@twiddle.net> wrote: >> On 08/27/2017 08:53 PM, Pranith Kumar wrote: >>> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h >>> index 55a46ac825..b41a248bee 100644 >>> --- a/tcg/aarch64/tcg-target.h >>> +++ b/tcg/aarch64/tcg-target.h >>> @@ -117,4 +117,6 @@ static inline void flush_icache_range(uintptr_t start, uintptr_t stop) >>> __builtin___clear_cache((char *)start, (char *)stop); >>> } >>> >>> +#define TCG_TARGET_DEFAULT_MO (0) >>> + >>> #endif /* AARCH64_TCG_TARGET_H */ >> >> Please add all of these in one patch, separate from the tcg-op.c changes. >> We should also just make this mandatory and remove any related #ifdefs. > > I tried looking up ordering semantics for architectures like ia64 and > s390. It is not really clear. I think every arch but for x86 can be > defined as weak, even though archs like sparc can also be configured > as TSO. Is this right? s390 has the same memory ordering as i386. But you're right that the risc chips should generally be 0. I'll try and figure out when sparc can use PSO (loosest for sparc < 8, and modern niagara), but leave it 0 for now. r~ ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment 2017-08-28 3:53 [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Pranith Kumar 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics Pranith Kumar @ 2017-08-28 17:42 ` Richard Henderson 2 siblings, 0 replies; 12+ messages in thread From: Richard Henderson @ 2017-08-28 17:42 UTC (permalink / raw) To: Pranith Kumar, alex.bennee, Peter Maydell, open list:ARM, open list:All patches CC here Cc: pbonzini On 08/27/2017 08:53 PM, Pranith Kumar wrote: > Update the comment which is not true since MTTCG. > > Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> > --- > target/arm/translate-a64.c | 4 ---- > 1 file changed, 4 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~ ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2017-08-29 20:38 UTC | newest] Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2017-08-28 3:53 [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Pranith Kumar 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 2/3] cpus-common: Cache allocated work items Pranith Kumar 2017-08-28 17:47 ` Richard Henderson 2017-08-28 21:43 ` Pranith Kumar 2017-08-29 20:38 ` Paolo Bonzini 2017-08-28 19:05 ` Emilio G. Cota 2017-08-28 21:51 ` Pranith Kumar 2017-08-28 3:53 ` [Qemu-devel] [RFC PATCH 3/3] mttcg: Implement implicit ordering semantics Pranith Kumar 2017-08-28 17:57 ` Richard Henderson 2017-08-28 21:41 ` Pranith Kumar 2017-08-28 22:39 ` Richard Henderson 2017-08-28 17:42 ` [Qemu-devel] [PATCH 1/3] target/arm: Remove stale comment Richard Henderson
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.