From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:48787) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1auiIz-0002tm-OH for qemu-devel@nongnu.org; Mon, 25 Apr 2016 11:20:11 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1auiIt-0007Re-DF for qemu-devel@nongnu.org; Mon, 25 Apr 2016 11:20:09 -0400 Received: from mail-wm0-x22d.google.com ([2a00:1450:400c:c09::22d]:35918) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1auiIs-0007Py-Jx for qemu-devel@nongnu.org; Mon, 25 Apr 2016 11:20:03 -0400 Received: by mail-wm0-x22d.google.com with SMTP id v188so102531143wme.1 for ; Mon, 25 Apr 2016 08:20:02 -0700 (PDT) References: <571A6C85.5020707@twiddle.net> <1461468462-31118-1-git-send-email-cota@braap.org> From: Alex =?utf-8?Q?Benn=C3=A9e?= In-reply-to: <1461468462-31118-1-git-send-email-cota@braap.org> Date: Mon, 25 Apr 2016 16:19:59 +0100 Message-ID: <87vb35emw0.fsf@linaro.org> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Subject: Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: "Emilio G. Cota" Cc: QEMU Developers , MTTCG Devel , Paolo Bonzini , Peter Crosthwaite , Richard Henderson , Sergey Fedorov Emilio G. Cota writes: > [ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after > reverting "translate-all: introduces tb_flush_safe". A trivial > conflict must be solved after applying. ] > > This is a first attempt at making tb_flush not have to stop all CPUs. > There are issues as pointed out below, but this could be a good start. > > Context: > https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html > https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html > > Changes from v1: > - When a static buffer is used, split it in two instead of using > a second buffer. > > Known issues: > - Fails Alex' unit test with low enough -tb-size, see > https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html > Seems to work in MTTCG, although I've only tested with tb_lock > always being taken in tb_find_fast. With --enable-debug-tcg I get it failing pretty quickly: #4 0x00005555556d332a in tcg_global_alloc (s=0x555556007ba0 ) at /home/alex/lsrc/qemu/qemu.git/tcg/tcg.c:463 463 tcg_debug_assert(s->nb_globals == s->nb_temps); (gdb) p s->nb_globals $1 = 24 (gdb) p s->nb_temps $2 = 31 Seems odd though, the other threads are all waiting on the tb_lock. > - Windows; not even compile-tested! > > Signed-off-by: Emilio G. Cota > --- > translate-all.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 133 insertions(+), 13 deletions(-) > > diff --git a/translate-all.c b/translate-all.c > index 8e70583..6830371 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -535,6 +535,9 @@ static inline void *split_cross_256mb(void *buf1, size_t size1) > #ifdef USE_STATIC_CODE_GEN_BUFFER > static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] > __attribute__((aligned(CODE_GEN_ALIGN))); > +static int static_buf_mask = 1; > +static void *static_buf1; > +static void *static_buf2; > > # ifdef _WIN32 > static inline void do_protect(void *addr, long size, int prot) > @@ -577,6 +580,13 @@ static inline void map_none(void *addr, long size) > } > # endif /* WIN32 */ > > +static void map_static_code_gen_buffer(void *buf, size_t size) > +{ > + map_exec(buf, size); > + map_none(buf + size, qemu_real_host_page_size); > + qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE); > +} > + > static inline void *alloc_code_gen_buffer(void) > { > void *buf = static_code_gen_buffer; > @@ -586,28 +596,41 @@ static inline void *alloc_code_gen_buffer(void) > full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer)) > & qemu_real_host_page_mask) - (uintptr_t)buf; > > - /* Reserve a guard page. */ > - size = full_size - qemu_real_host_page_size; > + /* > + * Reserve two guard pages, one after each of the two buffers: > + * | buf1 |g1| buf2 |g2| > + */ > + size = full_size - 2 * qemu_real_host_page_size; > > /* Honor a command-line option limiting the size of the buffer. */ > if (size > tcg_ctx.code_gen_buffer_size) { > size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size) > & qemu_real_host_page_mask) - (uintptr_t)buf; > } > - tcg_ctx.code_gen_buffer_size = size; > > #ifdef __mips__ > - if (cross_256mb(buf, size)) { > - buf = split_cross_256mb(buf, size); > - size = tcg_ctx.code_gen_buffer_size; > + /* > + * Pass 'size + page_size', since we want 'buf1 | guard1 | buf2' to be > + * within the boundary. > + */ > + if (cross_256mb(buf, size + qemu_real_host_page_size)) { > + buf = split_cross_256mb(buf, size + qemu_real_host_page_size); > + size = tcg_ctx.code_gen_buffer_size - qemu_real_host_page_size; > } > #endif > > - map_exec(buf, size); > - map_none(buf + size, qemu_real_host_page_size); > - qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE); > + /* split the buffer in two */ > + size /= 2; > + tcg_ctx.code_gen_buffer_size = size; > > - return buf; > + static_buf1 = buf; > + static_buf2 = buf + size + qemu_real_host_page_mask; > + > + map_static_code_gen_buffer(static_buf1, size); > + map_static_code_gen_buffer(static_buf2, size); > + > + assert(static_buf_mask == 1); > + return static_buf1; > } > #elif defined(_WIN32) > static inline void *alloc_code_gen_buffer(void) > @@ -825,10 +848,100 @@ static void page_flush_tb(void) > } > } > > +#ifdef USE_STATIC_CODE_GEN_BUFFER > + > +struct code_gen_desc { > + struct rcu_head rcu; > + int clear_bit; > +}; > + > +static void code_gen_buffer_clear(struct rcu_head *rcu) > +{ > + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); > + > + tb_lock(); > + static_buf_mask &= ~desc->clear_bit; > + tb_unlock(); > + g_free(desc); > +} > + > +static void *code_gen_buffer_replace(void) > +{ > + struct code_gen_desc *desc = g_malloc0(sizeof(*desc)); > + > + /* > + * If both bits are set, we're having two concurrent flushes. This > + * can easily happen if the buffers are heavily undersized. > + */ > + assert(static_buf_mask == 1 || static_buf_mask == 2); > + > + desc->clear_bit = static_buf_mask; > + call_rcu1(&desc->rcu, code_gen_buffer_clear); > + > + if (static_buf_mask == 1) { > + static_buf_mask |= 2; > + return static_buf2; > + } > + static_buf_mask |= 1; > + return static_buf1; > +} > + > +#elif defined(_WIN32) > + > +struct code_gen_desc { > + struct rcu_head rcu; > + void *buf; > +}; > + > +static void code_gen_buffer_vfree(struct rcu_head *rcu) > +{ > + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); > + > + VirtualFree(desc->buf, 0, MEM_RELEASE); > + g_free(desc); > +} > + > +static void *code_gen_buffer_replace(void) > +{ > + struct code_gen_desc *desc; > + > + desc = g_malloc0(sizeof(*desc)); > + desc->buf = tcg_ctx.code_gen_buffer; > + call_rcu1(&desc->rcu, code_gen_buffer_vfree); > + > + return alloc_code_gen_buffer(); > +} > + > +#else /* UNIX, dynamically-allocated code buffer */ > + > +struct code_gen_desc { > + struct rcu_head rcu; > + void *buf; > + size_t size; > +}; > + > +static void code_gen_buffer_unmap(struct rcu_head *rcu) > +{ > + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); > + > + munmap(desc->buf, desc->size + qemu_real_host_page_size); > + g_free(desc); > +} > + > +static void *code_gen_buffer_replace(void) > +{ > + struct code_gen_desc *desc; > + > + desc = g_malloc0(sizeof(*desc)); > + desc->buf = tcg_ctx.code_gen_buffer; > + desc->size = tcg_ctx.code_gen_buffer_size; > + call_rcu1(&desc->rcu, code_gen_buffer_unmap); > + > + return alloc_code_gen_buffer(); > +} > +#endif /* USE_STATIC_CODE_GEN_BUFFER */ > + > /* flush all the translation blocks */ > -/* XXX: tb_flush is currently not thread safe. System emulation calls it only > - * with tb_lock taken or from safe_work, so no need to take tb_lock here. > - */ > void tb_flush(CPUState *cpu) > { > #if defined(DEBUG_FLUSH) > @@ -852,10 +965,17 @@ void tb_flush(CPUState *cpu) > memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash)); > page_flush_tb(); > > + tcg_ctx.code_gen_buffer = code_gen_buffer_replace(); > tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; > + tcg_prologue_init(&tcg_ctx); > /* XXX: flush processor icache at this point if cache flush is > expensive */ > tcg_ctx.tb_ctx.tb_flush_count++; > + > + /* exit all CPUs so that the old buffer is quickly cleared. */ > + CPU_FOREACH(cpu) { > + cpu_exit(cpu); > + } > } > > #ifdef DEBUG_TB_CHECK -- Alex Bennée