From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:58994) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1bHboF-0006lH-FX for qemu-devel@nongnu.org; Mon, 27 Jun 2016 15:03:06 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1bHboC-0008Oa-74 for qemu-devel@nongnu.org; Mon, 27 Jun 2016 15:03:02 -0400 Received: from out3-smtp.messagingengine.com ([66.111.4.27]:42378) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1bHbo9-0008Gg-V2 for qemu-devel@nongnu.org; Mon, 27 Jun 2016 15:03:00 -0400 From: "Emilio G. Cota" Date: Mon, 27 Jun 2016 15:02:08 -0400 Message-Id: <1467054136-10430-23-git-send-email-cota@braap.org> In-Reply-To: <1467054136-10430-1-git-send-email-cota@braap.org> References: <1467054136-10430-1-git-send-email-cota@braap.org> Subject: [Qemu-devel] [RFC 22/30] target-arm: emulate LL/SC using cmpxchg helpers List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: QEMU Developers , MTTCG Devel Cc: =?UTF-8?q?Alex=20Benn=C3=A9e?= , Paolo Bonzini , Richard Henderson , Sergey Fedorov , Alvise Rigo , Peter Maydell Emulating LL/SC with cmpxchg is not correct, since it can suffer from the ABA problem. Portable parallel code, however, is written assuming only cmpxchg--and not LL/SC--is available. This means that in practice emulating LL/SC with cmpxchg is a viable alternative. The appended emulates LL/SC pairs in ARM with cmpxchg helpers. This works in both user and system mode. In usermode, it avoids pausing all other CPUs to perform the LL/SC pair. The subsequent performance and scalability improvement is significant, as the plots below show. They plot the throughput of atomic_add-bench compiled for ARM and executed on a 64-core x86 machine. Hi-res plots: http://imgur.com/a/aNQpB atomic_add-bench: 1000000 ops/thread, [0,1] range 9 ++---------+----------+----------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + | 8 +Emaster +-H--+ ++ | | | 7 ++E ++ | | | 6 ++++ ++ | | | 5 ++ | ++ 4 ++ | ++ | | | 3 ++ | ++ | | | 2 ++ | ++ |H++E+--- +++ ---+E+------+E+------+E| 1 +++ +E+-----+E+------+E+------+E+------+E+-- +++ +++ ++ ++H+ + +++ + +++ ++++ + + + | 0 ++--H----H-+-----H----+----------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,2] range 16 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + | 14 ++master +-H--+ ++ | | | 12 ++| ++ | E | 10 ++| ++ | | | 8 ++++ ++ |E+| | | | | 6 ++ | ++ | | | 4 ++ | ++ | +E+--- +++ +++ +++ ---+E+------+E| 2 +H+ +E+------E-------+E+-----+E+------+E+------+E+-- +++ + | + +++ + ++++ + + + | 0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,128] range 70 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + ++++ + | 60 ++master +-H--+ ----E------+E+-------++ | -+E+--- +++ +++ +E| | +++ ---- +++ ++| 50 ++ +++ ---+E+- ++ | -E--- | 40 ++ ---+++ ++ | +++--- | | -+E+ | 30 ++ +++---- ++ | +E+ | 20 ++ +++-- ++ | +E+ | |+E+ | 10 +E+ ++ + + + + + + + | 0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,1024] range 120 ++---------+---------+----------+---------+----------+----------+---++ +cmpxchg +-E--+ + + + + + | | master +-H--+ ++| 100 ++ ----E+ | +++ ---+E+--- ++| | --E--- +++ | 80 ++ ---- +++ ++ | ---+E+- | 60 ++ -+E+-- ++ | +++ ---- +++ | | -+E+- | 40 ++ +++---- ++ | +++ ---+E+ | | -+E+--- | 20 ++ +E+ ++ |+E+++ | +E+ + + + + + + | 0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads Signed-off-by: Emilio G. Cota --- target-arm/translate.c | 80 +++++++++----------------------------------------- 1 file changed, 14 insertions(+), 66 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index bd5d5cb..0d4a1a9 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -7715,15 +7715,6 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi) tcg_gen_or_i32(cpu_ZF, lo, hi); } -/* Load/Store exclusive instructions are implemented by remembering - the value/address loaded, and seeing if these are the same - when the store is performed. This should be sufficient to implement - the architecturally mandated semantics, and avoids having to monitor - regular stores. - - In system emulation mode only one CPU will be running at once, so - this sequence is effectively atomic. In user emulation mode we - throw an exception and handle the atomic operation elsewhere. */ static void gen_load_exclusive(DisasContext *s, int rt, int rt2, TCGv_i32 addr, int size) { @@ -7768,21 +7759,11 @@ static void gen_clrex(DisasContext *s) tcg_gen_movi_i64(cpu_exclusive_addr, -1); } -#ifdef CONFIG_USER_ONLY -static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, - TCGv_i32 addr, int size) -{ - tcg_gen_extu_i32_i64(cpu_exclusive_test, addr); - tcg_gen_movi_i32(cpu_exclusive_info, - size | (rd << 4) | (rt << 8) | (rt2 << 12)); - gen_exception_internal_insn(s, 4, EXCP_STREX); -} -#else static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, TCGv_i32 addr, int size) { - TCGv_i32 tmp; - TCGv_i64 val64, extaddr; + TCGv_i32 t0, t1, t2; + TCGv_i64 extaddr; TCGLabel *done_label; TCGLabel *fail_label; @@ -7799,69 +7780,36 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label); tcg_temp_free_i64(extaddr); - tmp = tcg_temp_new_i32(); + t0 = tcg_temp_new_i32(); + t1 = load_reg(s, rt); switch (size) { case 0: - gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); + gen_helper_cmpxchgb(t0, cpu_env, addr, cpu_exclusive_val, t1); break; case 1: - gen_aa32_ld16u(s, tmp, addr, get_mem_index(s)); + gen_helper_cmpxchgw(t0, cpu_env, addr, cpu_exclusive_val, t1); break; case 2: - case 3: - gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); - break; - default: - abort(); - } - - val64 = tcg_temp_new_i64(); - if (size == 3) { - TCGv_i32 tmp2 = tcg_temp_new_i32(); - TCGv_i32 tmp3 = tcg_temp_new_i32(); - tcg_gen_addi_i32(tmp2, addr, 4); - gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s)); - tcg_temp_free_i32(tmp2); - tcg_gen_concat_i32_i64(val64, tmp, tmp3); - tcg_temp_free_i32(tmp3); - } else { - tcg_gen_extu_i32_i64(val64, tmp); - } - tcg_temp_free_i32(tmp); - - tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label); - tcg_temp_free_i64(val64); - - tmp = load_reg(s, rt); - switch (size) { - case 0: - gen_aa32_st8(s, tmp, addr, get_mem_index(s)); + gen_helper_cmpxchgl(t0, cpu_env, addr, cpu_exclusive_val, t1); break; - case 1: - gen_aa32_st16(s, tmp, addr, get_mem_index(s)); - break; - case 2: case 3: - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); + t2 = load_reg(s, rt2); + gen_helper_cmpxchgq(t0, cpu_env, addr, cpu_exclusive_val, t1, t2); + tcg_temp_free_i32(t2); break; default: abort(); } - tcg_temp_free_i32(tmp); - if (size == 3) { - tcg_gen_addi_i32(addr, addr, 4); - tmp = load_reg(s, rt2); - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); - tcg_temp_free_i32(tmp); - } - tcg_gen_movi_i32(cpu_R[rd], 0); + tcg_temp_free_i32(t1); + tcg_gen_mov_i32(cpu_R[rd], t0); + tcg_temp_free_i32(t0); tcg_gen_br(done_label); + gen_set_label(fail_label); tcg_gen_movi_i32(cpu_R[rd], 1); gen_set_label(done_label); tcg_gen_movi_i64(cpu_exclusive_addr, -1); } -#endif /* gen_srs: * @env: CPUARMState -- 2.5.0