* [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-22  0:06 UTC
  To: QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

Known issues:
- Basically compile-tested only, since I've only run this with
  single-threaded TCG; I also tried running it with linux-user,
  but in order to trigger tb_flush I had to make code_gen_buffer
  so small that the CPU calling tb_flush would immediately fill
  the 2nd buffer, triggering the assert. If you have a working
  multi-threaded workload that would be good to test this, please
  let me know.
- Windows; not even compile-tested!

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 5 deletions(-)

diff --git a/translate-all.c b/translate-all.c
index bba9b62..4c14b4d 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
 #endif
 
 #ifdef USE_STATIC_CODE_GEN_BUFFER
-static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
+static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
     __attribute__((aligned(CODE_GEN_ALIGN)));
+static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
+    __attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
 
 # ifdef _WIN32
 static inline void do_protect(void *addr, long size, int prot)
@@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
 }
 # endif /* WIN32 */
 
-static inline void *alloc_code_gen_buffer(void)
+static void *alloc_static_code_gen_buffer(void *buf)
 {
-    void *buf = static_code_gen_buffer;
     size_t full_size, size;
 
     /* The size of the buffer, rounded down to end on a page boundary.  */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
                  & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Reserve a guard page.  */
@@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
 
     return buf;
 }
+
+static inline void *alloc_code_gen_buffer(void)
+{
+    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
+    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
+
+    assert(static_buf_mask == 1);
+    return static_buf1;
+}
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
@@ -829,8 +842,100 @@ static void page_flush_tb(void)
     }
 }
 
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    static_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+    desc->clear_bit = static_buf_mask;
+    call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+    if (static_buf_mask == 1) {
+        static_buf_mask |= 2;
+        return static_buf2;
+    }
+    static_buf_mask |= 1;
+    return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    VirtualFree(desc->buf, 0, MEM_RELEASE);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+    return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+    size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    munmap(desc->buf, desc->size + qemu_real_host_page_size);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    desc->size = tcg_ctx.code_gen_buffer_size;
+    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+    return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
 /* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
 #if defined(DEBUG_FLUSH)
@@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
+    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    tcg_prologue_init(&tcg_ctx);
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared. */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
-- 
2.5.0
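
For readers new to QEMU's call_rcu1: the scheme above hands out the idle
half on each flush and defers marking the old half reusable until a grace
period has elapsed. Below is a minimal standalone sketch of the same
flip-flop pattern, written against liburcu rather than QEMU's RCU; every
name in it is hypothetical and only mirrors the patch's structure
(compile with cc -c, link with -lurcu):

    #include <urcu.h>            /* liburcu: call_rcu */
    #include <urcu/compiler.h>   /* caa_container_of */
    #include <assert.h>
    #include <stdlib.h>

    static char buf1[4096], buf2[4096];
    static int buf_mask = 1;     /* bit 0: buf1 in use; bit 1: buf2 in use */

    struct buf_desc {
        struct rcu_head rcu;
        int clear_bit;
    };

    /* runs after a grace period: no reader can still see the old buffer */
    static void buf_clear(struct rcu_head *rcu)
    {
        struct buf_desc *desc = caa_container_of(rcu, struct buf_desc, rcu);

        buf_mask &= ~desc->clear_bit;   /* the patch takes tb_lock here */
        free(desc);
    }

    /* analogue of code_gen_buffer_replace(), called under the writer lock */
    static void *buf_replace(void)
    {
        struct buf_desc *desc = calloc(1, sizeof(*desc));

        assert(buf_mask == 1 || buf_mask == 2);
        desc->clear_bit = buf_mask;
        call_rcu(&desc->rcu, buf_clear);

        if (buf_mask == 1) {
            buf_mask |= 2;
            return buf2;
        }
        buf_mask |= 1;
        return buf1;
    }

The property relied on is that buf_clear only runs once every RCU
read-side critical section that could still be executing code from the
old half has ended.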


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-22 14:41 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> Known issues:
> - Basically compile-tested only, since I've only run this with
>   single-threaded TCG; I also tried running it with linux-user,
>   but in order to trigger tb_flush I had to make code_gen_buffer
>   so small that the CPU calling tb_flush would immediately fill
>   the 2nd buffer, triggering the assert. If you have a working
>   multi-threaded workload that would be good to test this, please
>   let me know.

With my latest mttcg unit tests:

./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
  -device virtio-serial-device -device virtconsole,chardev=ctd \
  -chardev testdev,id=ctd -display none -serial stdio \
  -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
  -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on


> (snip)


--
Alex Bennée


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-22 14:47 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Alex Bennée <alex.bennee@linaro.org> writes:

> Emilio G. Cota <cota@braap.org> writes:
>
>> This is a first attempt at making tb_flush not have to stop all CPUs.
>> There are issues as pointed out below, but this could be a good start.
>>
>> Context:
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>>
>> Known issues:
>> - Basically compile-tested only, since I've only run this with
>>   single-threaded TCG; I also tried running it with linux-user,
>>   but in order to trigger tb_flush I had to make code_gen_buffer
>>   so small that the CPU calling tb_flush would immediately fill
>>   the 2nd buffer, triggering the assert. If you have a working
>>   multi-threaded workload that would be good to test this, please
>>   let me know.
>
> With my latest mttcg unit tests:
>
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>   -chardev testdev,id=ctd -display none -serial stdio \
>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>   -append "tight smc irq mod=1 rounds=100000"  -name
>   arm,debug-threads=on

Ahh, I just realised you wanted a linux-user workload.

>> (snip)


--
Alex Bennée


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Richard Henderson @ 2016-04-22 18:25 UTC
  To: Emilio G. Cota, QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov

On 04/21/2016 05:06 PM, Emilio G. Cota wrote:
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>      __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +    __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;

I don't like this at all.

(1) This is (by default) 32MB we're adding to the RSS of the
    simulator.  Surely we can do better than this.

(2) On some hosts we require a maximum displacement from
    any point in the code gen buffer to the tcg prologue.
    That means you can't simply allocate two separate buffers.

    You have to take a single buffer, of known good size and
    alignment, and split it in half.


r~
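
A rough sketch of the single-buffer split being suggested here, under the
layout |  half 1  |guard|  half 2  |guard|; the function and its names are
illustrative assumptions, not code from any version of the patch:

    #include <stddef.h>
    #include <stdint.h>

    /* carve one allocation into two halves, each with its own guard page;
       assumes page_size is a power of two */
    static void *split_in_half(void *buf, size_t full_size, size_t page_size,
                               void **half2, size_t *half_size)
    {
        /* keep room for one guard page after each half */
        size_t usable = full_size - 2 * page_size;
        /* page-align the size of each half */
        size_t half = (usable / 2) & ~(page_size - 1);

        *half_size = half;
        *half2 = (uint8_t *)buf + half + page_size;
        return buf;   /* half 1 starts at the base of the allocation */
    }

Because both halves live in one allocation of known size and alignment,
the displacement from any point in either half to the shared prologue
stays bounded, which addresses point (2). The v2 patch below adopts this
approach for the static buffer.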


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-24  3:20 UTC
  To: Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
(snip)
> > Known issues:
> > - Basically compile-tested only, since I've only run this with
> >   single-threaded TCG; I also tried running it with linux-user,
> >   but in order to trigger tb_flush I had to make code_gen_buffer
> >   so small that the CPU calling tb_flush would immediately fill
> >   the 2nd buffer, triggering the assert. If you have a working
> >   multi-threaded workload that would be good to test this, please
> >   let me know.
> 
> With my latest mttcg unit tests:
> 
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>   -chardev testdev,id=ctd -display none -serial stdio \
>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>   -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on

This is useful. Never mind the need for testing linux-user, I can test
both code paths (i.e. dynamic allocation and static buf) with qemu-system
by simply defining USE_STATIC_CODE_GEN_BUFFER.

After applying a modified version of this patch (that I'll send in
a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
"translate-all: introduces tb_flush_safe"), I can easily trigger
this error when setting a low enough TB size, e.g. -tb-size 32:

 CPU1: online and setting up with pattern 0xa0b78cbf
 CPU2: online and setting up with pattern 0x22287c45
 CPU3: online and setting up with pattern 0x6262c5c5
 CPU0: online and setting up with pattern 0xa65e7ad6
 qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
 qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
 qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
 qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
 qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
 PASS: smc: irq: 0 errors, IRQs not checked
 Unhandled exception 3 (pabt)
 Exception frame registers:
 pc : [<e59f2028>]    lr : [<40010700>]    psr: a0000153
 sp : 400ac5c0  ip : 400ab4e8  fp : 40032ca8
 r10: 00000000  r9 : 00000000  r8 : 00000000
 r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
 r3 : 00000000  r2 : 00000000  r1 : e59f2028  r0 : 00000000
 Flags: NzCv  IRQs on  FIQs off  Mode SVC_32
 Control: 00c5107d  Table: 40060000  DAC: 00000000
 IFAR: e59f2028    IFSR: 00000205

Any input on where to look would be appreciated. Thanks,

		Emilio


* [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-24  3:27 UTC
  To: QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

[ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
reverting "translate-all: introduces tb_flush_safe". A trivial
conflict must be solved after applying. ]

This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

Changes from v1:
- When a static buffer is used, split it in two instead of using
  a second buffer.

Known issues:
- Fails Alex' unit test with low enough -tb-size, see
  https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
  Seems to work in MTTCG, although I've only tested with tb_lock
  always being taken in tb_find_fast.
- Windows; not even compile-tested!

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 translate-all.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 133 insertions(+), 13 deletions(-)

diff --git a/translate-all.c b/translate-all.c
index 8e70583..6830371 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -535,6 +535,9 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
 #ifdef USE_STATIC_CODE_GEN_BUFFER
 static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
     __attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
 
 # ifdef _WIN32
 static inline void do_protect(void *addr, long size, int prot)
@@ -577,6 +580,13 @@ static inline void map_none(void *addr, long size)
 }
 # endif /* WIN32 */
 
+static void map_static_code_gen_buffer(void *buf, size_t size)
+{
+    map_exec(buf, size);
+    map_none(buf + size, qemu_real_host_page_size);
+    qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+}
+
 static inline void *alloc_code_gen_buffer(void)
 {
     void *buf = static_code_gen_buffer;
@@ -586,28 +596,41 @@ static inline void *alloc_code_gen_buffer(void)
     full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
                  & qemu_real_host_page_mask) - (uintptr_t)buf;
 
-    /* Reserve a guard page.  */
-    size = full_size - qemu_real_host_page_size;
+    /*
+     * Reserve two guard pages, one after each of the two buffers:
+     * |     buf1     |g1|    buf2    |g2|
+     */
+    size = full_size - 2 * qemu_real_host_page_size;
 
     /* Honor a command-line option limiting the size of the buffer.  */
     if (size > tcg_ctx.code_gen_buffer_size) {
         size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
                 & qemu_real_host_page_mask) - (uintptr_t)buf;
     }
-    tcg_ctx.code_gen_buffer_size = size;
 
 #ifdef __mips__
-    if (cross_256mb(buf, size)) {
-        buf = split_cross_256mb(buf, size);
-        size = tcg_ctx.code_gen_buffer_size;
+    /*
+     * Pass 'size + page_size', since we want 'buf1 | guard1 | buf2' to be
+     * within the boundary.
+     */
+    if (cross_256mb(buf, size + qemu_real_host_page_size)) {
+        buf = split_cross_256mb(buf, size + qemu_real_host_page_size);
+        size = tcg_ctx.code_gen_buffer_size - qemu_real_host_page_size;
     }
 #endif
 
-    map_exec(buf, size);
-    map_none(buf + size, qemu_real_host_page_size);
-    qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+    /* split the buffer in two */
+    size /= 2;
+    tcg_ctx.code_gen_buffer_size = size;
 
-    return buf;
+    static_buf1 = buf;
+    static_buf2 = buf + size + qemu_real_host_page_mask;
+
+    map_static_code_gen_buffer(static_buf1, size);
+    map_static_code_gen_buffer(static_buf2, size);
+
+    assert(static_buf_mask == 1);
+    return static_buf1;
 }
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
@@ -825,10 +848,100 @@ static void page_flush_tb(void)
     }
 }
 
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    static_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+    desc->clear_bit = static_buf_mask;
+    call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+    if (static_buf_mask == 1) {
+        static_buf_mask |= 2;
+        return static_buf2;
+    }
+    static_buf_mask |= 1;
+    return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    VirtualFree(desc->buf, 0, MEM_RELEASE);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+    return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+    size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    munmap(desc->buf, desc->size + qemu_real_host_page_size);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    desc->size = tcg_ctx.code_gen_buffer_size;
+    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+    return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
 /* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe.  System emulation calls it only
- * with tb_lock taken or from safe_work, so no need to take tb_lock here.
- */
 void tb_flush(CPUState *cpu)
 {
 #if defined(DEBUG_FLUSH)
@@ -852,10 +965,17 @@ void tb_flush(CPUState *cpu)
     memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
     page_flush_tb();
 
+    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    tcg_prologue_init(&tcg_ctx);
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared. */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
-- 
2.5.0


* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Richard Henderson @ 2016-04-24 18:12 UTC
  To: Emilio G. Cota, QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov

On 04/23/2016 08:27 PM, Emilio G. Cota wrote:
> [ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
> reverting "translate-all: introduces tb_flush_safe". A trivial
> conflict must be solved after applying. ]
>
> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
>    https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>    https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

I will again say that I don't believe that wasting all of this memory is as 
good as using locks -- tb_flush doesn't happen *that* often.

> +static void map_static_code_gen_buffer(void *buf, size_t size)
> +{
> +    map_exec(buf, size);
> +    map_none(buf + size, qemu_real_host_page_size);
> +    qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
> +}

Nit: I know it's only startup, but there's no reason to make multiple map_exec 
or madvise calls.  You can cover the entire buffer in one go, and then call 
map_none on the guard pages.

> +#ifdef USE_STATIC_CODE_GEN_BUFFER
...
> +#elif defined(_WIN32)
...
> +#else /* UNIX, dynamically-allocated code buffer */
...
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */

I'm not keen on your dynamic allocation implementations.  Why not split the one 
dynamic buffer the same way as the static buffer?  We are talking about >= 
256MB here, after all.

> +    tcg_prologue_init(&tcg_ctx);

We have some global variables in the tcg backends that are initialized by 
tcg_prologue_init.  I don't think we should be calling it again without locks 
being involved.

Of course, you don't have to call it again if you split one buffer.  Then you 
also get to share the same rcu implementation.


r~
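
One possible shape for the startup nit, reusing the map_exec, map_none and
qemu_madvise helpers already in translate-all.c; the half1/half2/half_size
parameters are assumptions of this sketch:

    static void map_split_code_gen_buffer(void *buf, size_t full_size,
                                          void *half1, void *half2,
                                          size_t half_size)
    {
        /* cover the entire buffer with one map_exec/madvise call each... */
        map_exec(buf, full_size);
        qemu_madvise(buf, full_size, QEMU_MADV_HUGEPAGE);

        /* ...then map_none only the guard page after each half */
        map_none(half1 + half_size, qemu_real_host_page_size);
        map_none(half2 + half_size, qemu_real_host_page_size);
    }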


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-25  8:35 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
>> Emilio G. Cota <cota@braap.org> writes:
> (snip)
>> > Known issues:
>> > - Basically compile-tested only, since I've only run this with
>> >   single-threaded TCG; I also tried running it with linux-user,
>> >   but in order to trigger tb_flush I had to make code_gen_buffer
>> >   so small that the CPU calling tb_flush would immediately fill
>> >   the 2nd buffer, triggering the assert. If you have a working
>> >   multi-threaded workload that would be good to test this, please
>> >   let me know.
>>
>> With my latest mttcg unit tests:
>>
>> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>>   -chardev testdev,id=ctd -display none -serial stdio \
>>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>>   -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on
>
> This is useful. Never mind the need for testing linux-user, I can test
> both code paths (i.e. dynamic allocation and static buf) with qemu-system
> by simply defining USE_STATIC_CODE_GEN_BUFFER.
>
> After applying a modified version of this patch (that I'll send in
> a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
> "translate-all: introduces tb_flush_safe"), I can easily trigger
> this error when setting a low enough TB size, e.g. -tb-size 32:
>
>  CPU1: online and setting up with pattern 0xa0b78cbf
>  CPU2: online and setting up with pattern 0x22287c45
>  CPU3: online and setting up with pattern 0x6262c5c5
>  CPU0: online and setting up with pattern 0xa65e7ad6
>  qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
>  qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
>  qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
>  qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
>  qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
>  PASS: smc: irq: 0 errors, IRQs not checked
>  Unhandled exception 3 (pabt)
>  Exception frame registers:
>  pc : [<e59f2028>]    lr : [<40010700>]    psr: a0000153
>  sp : 400ac5c0  ip : 400ab4e8  fp : 40032ca8
>  r10: 00000000  r9 : 00000000  r8 : 00000000
>  r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
>  r3 : 00000000  r2 : 00000000  r1 : e59f2028  r0 : 00000000
>  Flags: NzCv  IRQs on  FIQs off  Mode SVC_32
>  Control: 00c5107d  Table: 40060000  DAC: 00000000
>  IFAR: e59f2028    IFSR: 00000205
>
> Any input on where to look would be appreciated. Thanks,

I'll have a look and see if I can replicate.

>
> 		Emilio


--
Alex Bennée


* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-25 15:19 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> (snip)
> Known issues:
> - Fails Alex' unit test with low enough -tb-size, see
>   https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
>   Seems to work in MTTCG, although I've only tested with tb_lock
>   always being taken in tb_find_fast.

With --enable-debug-tcg I get it failing pretty quickly:

#4  0x00005555556d332a in tcg_global_alloc (s=0x555556007ba0 <tcg_ctx>)
 at /home/alex/lsrc/qemu/qemu.git/tcg/tcg.c:463
 463         tcg_debug_assert(s->nb_globals == s->nb_temps);
 (gdb) p s->nb_globals
 $1 = 24
 (gdb) p s->nb_temps
 $2 = 31

Seems odd though, the other threads are all waiting on the tb_lock.

> (snip)


--
Alex Bennée


* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-25 15:25 UTC
  To: Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

On Mon, Apr 25, 2016 at 16:19:59 +0100, Alex Bennée wrote:
> 
> Emilio G. Cota <cota@braap.org> writes:
> 
> > (snip)
> > Known issues:
> > - Fails Alex' unit test with low enough -tb-size, see
> >   https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
> >   Seems to work in MTTCG, although I've only tested with tb_lock
> >   always being taken in tb_find_fast.
> 
> With --enable-debug-tcg I get it failing pretty quickly:
> 
> #4  0x00005555556d332a in tcg_global_alloc (s=0x555556007ba0 <tcg_ctx>)
>  at /home/alex/lsrc/qemu/qemu.git/tcg/tcg.c:463
>  463         tcg_debug_assert(s->nb_globals == s->nb_temps);
>  (gdb) p s->nb_globals
>  $1 = 24
>  (gdb) p s->nb_temps
>  $2 = 31
> Seems odd though, the other threads are all waiting on the tb_lock.

It must be the tcg_prologue_init call, as Richard pointed out.
I'm on it, will report back.

Thanks,

		E.


* [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-25 23:46 UTC
  To: QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

This seems to half-work[*], although I'm uneasy about the whole idea.
I see two major hurdles:

* If the TB size is too small, this breaks badly, because we're not
  out of the RCU read critical section when another call to tb_flush
  is made. For instance, when booting ARM with this patch applied,
  I need to at least pass -tb-size 10 for it to fully boot debian
  jessie.
* We have different tb_flush callers that should be audited:
 $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
 exec.c:            tb_flush(cpu);
 gdbstub.c:        tb_flush(cpu);
 gdbstub.c:    tb_flush(cpu);
 hw/ppc/spapr_hcall.c:            tb_flush(CPU(cpu));
 target-alpha/sys_helper.c:    tb_flush(CPU(alpha_env_get_cpu(env)));
 target-i386/translate.c:        tb_flush(CPU(x86_env_get_cpu(env)));
 translate-all.c:        tb_flush(cpu);

With two code_gen "halves", if two tb_flush calls are done in the same
RCU read critical section, we're screwed. I added a cpu_exit at the end
of tb_flush to try to mitigate this, but I haven't audited all the callers
(for instance, what does the gdbstub do?).

If we end up having a mechanism to "stop all CPUs to do something", as
I think we'll end up needing for correct LL/SC emulation, we'll probably
be better off using that mechanism for tb_flush as well -- plus, we'll avoid
wasting memory.

Other issues:
- This could be split into at least 2 patches, one that touches
  tcg/ and another to deal with translate-all.
  Note that in translate-all, the initial allocation of code_gen doesn't
  allocate extra space for the guard page; reserving guard page space is
  done instead by the added split_code_gen_buffer function.
- Windows: not even compile-tested.

[*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
    Alex' tests, however, sometimes fail with:

Unhandled exception 3 (pabt)
Exception frame registers:
pc : [<fffffb44>]    lr : [<00000001>]    psr: 20000173
sp : 4004f528  ip : 40012048  fp : 40032ca8
r10: 40032ca8  r9 : 00000000  r8 : 00000000
r7 : 0000000e  r6 : 40030000  r5 : 40032ca8  r4 : 00001ac6
r3 : 40012030  r2 : 40012030  r1 : d5ffffe7  r0 : 00000028
Flags: nzCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000
IFAR: fffffb44    IFSR: 00000205

or with:

CPU0: 16986 irqs (0 races, 11 slow,  1322 ticks avg latency)
FAIL: smc: irq: 17295 IRQs sent, 16986 received

Unhandled exception 6 (irq)
Exception frame registers:
pc : [<00000020>]    lr : [<40010800>]    psr: 60000153
sp : 400b45c0  ip : 400b34e8  fp : 40032ca8
r10: 00000000  r9 : 00000000  r8 : 00000000
r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
r3 : 00000000  r2 : 00000000  r1 : 000000ff  r0 : 00000000
Flags: nZCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000

I built with --enable-debug-tcg.

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 tcg/tcg.c       |  22 +++++---
 tcg/tcg.h       |   1 +
 translate-all.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 144 insertions(+), 34 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index b46bf1a..7db8ce9 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -380,6 +380,20 @@ void tcg_context_init(TCGContext *s)
     }
 }
 
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size)
+{
+    size_t size = s->code_gen_buffer_size - prologue_size;
+
+    s->code_gen_ptr = buf;
+    s->code_gen_buffer = buf;
+    s->code_buf = buf;
+
+    /* Compute a high-water mark, at which we voluntarily flush the buffer
+       and start over.  The size here is arbitrary, significantly larger
+       than we expect the code generation for any one opcode to require.  */
+    s->code_gen_highwater = s->code_buf + (size - 1024);
+}
+
 void tcg_prologue_init(TCGContext *s)
 {
     size_t prologue_size, total_size;
@@ -398,16 +412,10 @@ void tcg_prologue_init(TCGContext *s)
 
     /* Deduct the prologue from the buffer.  */
     prologue_size = tcg_current_code_size(s);
-    s->code_gen_ptr = buf1;
-    s->code_gen_buffer = buf1;
-    s->code_buf = buf1;
     total_size = s->code_gen_buffer_size - prologue_size;
     s->code_gen_buffer_size = total_size;
 
-    /* Compute a high-water mark, at which we voluntarily flush the buffer
-       and start over.  The size here is arbitrary, significantly larger
-       than we expect the code generation for any one opcode to require.  */
-    s->code_gen_highwater = s->code_gen_buffer + (total_size - 1024);
+    tcg_code_gen_init(s, buf1, prologue_size);
 
     tcg_register_jit(s->code_gen_buffer, total_size);
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 40c8fbe..e849d3e 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -634,6 +634,7 @@ static inline void *tcg_malloc(int size)
 
 void tcg_context_init(TCGContext *s);
 void tcg_prologue_init(TCGContext *s);
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size);
 void tcg_func_start(TCGContext *s);
 
 int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
diff --git a/translate-all.c b/translate-all.c
index bba9b62..7a83176 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -485,6 +485,11 @@ static inline PageDesc *page_find(tb_page_addr_t index)
   (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
    ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
 
+static int code_gen_buf_mask;
+static void *code_gen_buf1;
+static void *code_gen_buf2;
+static size_t code_gen_prologue_size;
+
 static inline size_t size_code_gen_buffer(size_t tb_size)
 {
     /* Size the buffer.  */
@@ -508,6 +513,43 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
     return tb_size;
 }
 
+static void *split_code_gen_buffer(void *buf, size_t size)
+{
+    /* enforce alignment of the buffer to a page boundary */
+    if (unlikely((uintptr_t)buf & ~qemu_real_host_page_mask)) {
+        uintptr_t b;
+
+        b = QEMU_ALIGN_UP((uintptr_t)buf, qemu_real_host_page_size);
+        size -= b - (uintptr_t)buf;
+        buf = (void *)b;
+    }
+    /*
+     * Make sure the size is an even number of pages so that both halves will
+     * end on a page boundary.
+     */
+    size = QEMU_ALIGN_DOWN(size, 2 * qemu_real_host_page_size);
+
+    /* split in half, making room for 2 guard pages */
+    size -= 2 * qemu_real_host_page_size;
+    size /= 2;
+    code_gen_buf1 = buf;
+    code_gen_buf2 = buf + size + qemu_real_host_page_size;
+
+    /*
+     * write the prologue into buf2. This is safe because we'll later call
+     * tcg_prologue_init on buf1, from which we'll start execution.
+     */
+    tcg_ctx.code_gen_buffer = code_gen_buf2;
+    tcg_prologue_init(&tcg_ctx);
+    code_gen_prologue_size = (void *)tcg_ctx.code_ptr - code_gen_buf2;
+
+    tcg_ctx.code_gen_buffer_size = size;
+
+    /* start execution from buf1 */
+    code_gen_buf_mask = 1;
+    return code_gen_buf1;
+}
+
 #ifdef __mips__
 /* In order to use J and JAL within the code_gen_buffer, we require
    that the buffer not cross a 256MB boundary.  */
@@ -583,21 +625,17 @@ static inline void map_none(void *addr, long size)
 static inline void *alloc_code_gen_buffer(void)
 {
     void *buf = static_code_gen_buffer;
-    size_t full_size, size;
+    size_t size;
 
     /* The size of the buffer, rounded down to end on a page boundary.  */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
-                 & qemu_real_host_page_mask) - (uintptr_t)buf;
-
-    /* Reserve a guard page.  */
-    size = full_size - qemu_real_host_page_size;
+    size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+            & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Honor a command-line option limiting the size of the buffer.  */
     if (size > tcg_ctx.code_gen_buffer_size) {
         size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
                 & qemu_real_host_page_mask) - (uintptr_t)buf;
     }
-    tcg_ctx.code_gen_buffer_size = size;
 
 #ifdef __mips__
     if (cross_256mb(buf, size)) {
@@ -607,27 +645,40 @@ static inline void *alloc_code_gen_buffer(void)
 #endif
 
     map_exec(buf, size);
-    map_none(buf + size, qemu_real_host_page_size);
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    map_none(code_gen_buf1 + size, qemu_real_host_page_size);
+    map_none(code_gen_buf2 + size, qemu_real_host_page_size);
+
     return buf;
 }
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
     size_t size = tcg_ctx.code_gen_buffer_size;
-    void *buf1, *buf2;
+    void *buf;
+    void *ret;
 
-    /* Perform the allocation in two steps, so that the guard page
-       is reserved but uncommitted.  */
-    buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
-                        MEM_RESERVE, PAGE_NOACCESS);
-    if (buf1 != NULL) {
-        buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
-        assert(buf1 == buf2);
+    /* Perform the allocation in two steps, so that the guard pages
+       are reserved but uncommitted.  */
+    ret = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+    if (ret == NULL) {
+        return NULL;
     }
 
-    return buf1;
+    ret = split_code_gen_buffer(ret, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    buf = VirtualAlloc(code_gen_buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+    buf = VirtualAlloc(code_gen_buf2, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+
+    return ret;
 }
 #else
 static inline void *alloc_code_gen_buffer(void)
@@ -665,8 +716,7 @@ static inline void *alloc_code_gen_buffer(void)
 #  endif
 # endif
 
-    buf = mmap((void *)start, size + qemu_real_host_page_size,
-               PROT_NONE, flags, -1, 0);
+    buf = mmap((void *)start, size, PROT_NONE, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
     }
@@ -676,24 +726,24 @@ static inline void *alloc_code_gen_buffer(void)
         /* Try again, with the original still mapped, to avoid re-acquiring
            that 256mb crossing.  This time don't specify an address.  */
         size_t size2;
-        void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
-                          PROT_NONE, flags, -1, 0);
+        void *buf2 = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+
         switch (buf2 != MAP_FAILED) {
         case 1:
             if (!cross_256mb(buf2, size)) {
                 /* Success!  Use the new buffer.  */
-                munmap(buf, size + qemu_real_host_page_size);
+                munmap(buf, size);
                 break;
             }
             /* Failure.  Work with what we had.  */
-            munmap(buf2, size + qemu_real_host_page_size);
+            munmap(buf2, size);
             /* fallthru */
         default:
             /* Split the original buffer.  Free the smaller half.  */
             buf2 = split_cross_256mb(buf, size);
             size2 = tcg_ctx.code_gen_buffer_size;
             if (buf == buf2) {
-                munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+                munmap(buf + size2, size - size2);
             } else {
                 munmap(buf, size - size2);
             }
@@ -704,13 +754,19 @@ static inline void *alloc_code_gen_buffer(void)
     }
 #endif
 
-    /* Make the final buffer accessible.  The guard page at the end
-       will remain inaccessible with PROT_NONE.  */
+    /* Make the final buffer accessible. */
     mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
 
     /* Request large pages for the buffer.  */
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf + 1, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    mprotect(code_gen_buf1 + size, qemu_real_host_page_size, PROT_NONE);
+    mprotect(code_gen_buf2 + size, qemu_real_host_page_size, PROT_NONE);
+
     return buf;
 }
 #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
@@ -829,10 +885,48 @@ static void page_flush_tb(void)
     }
 }
 
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void clear_code_gen_buffer(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    code_gen_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *replace_code_gen_buffer(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(code_gen_buf_mask == 1 || code_gen_buf_mask == 2);
+
+    desc->clear_bit = code_gen_buf_mask;
+    call_rcu1(&desc->rcu, clear_code_gen_buffer);
+
+    if (code_gen_buf_mask == 1) {
+        code_gen_buf_mask |= 2;
+        return code_gen_buf2 + code_gen_prologue_size;
+    }
+    code_gen_buf_mask |= 1;
+    return code_gen_buf1 + code_gen_prologue_size;
+}
+
 /* flush all the translation blocks */
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
+    void *buf;
+
 #if defined(DEBUG_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -853,10 +947,17 @@ void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
-    tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    buf = replace_code_gen_buffer();
+    tcg_code_gen_init(&tcg_ctx, buf, code_gen_prologue_size);
+
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
-- 
2.5.0


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-25 23:46         ` [Qemu-devel] [RFC v3] " Emilio G. Cota
@ 2016-04-26  4:48           ` Richard Henderson
  2016-04-26  6:35             ` Alex Bennée
  2016-04-26  6:32           ` Alex Bennée
  1 sibling, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2016-04-26  4:48 UTC (permalink / raw)
  To: Emilio G. Cota, QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov

On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
> +    /*
> +     * write the prologue into buf2. This is safe because we'll later call
> +     * tcg_prologue_init on buf1, from which we'll start execution.
> +     */
> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
> +    tcg_prologue_init(&tcg_ctx);
> +

Ah, no.  Write only one prologue, not one per buffer.

If they're sufficiently close (i.e. one allocation under the max size),
then the same one can be used for both halves.
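
A flush would then only reset the fill state when switching halves,
along these lines (rough sketch only; the helper is invented, not in
the patch):

    /* The prologue was generated once, outside the half being reset, so
     * tb_ret_addr and friends stay valid; no tcg_prologue_init() here.  */
    static void tcg_switch_code_gen_half(TCGContext *s, void *half,
                                         size_t half_size)
    {
        s->code_gen_buffer = half;
        s->code_gen_ptr = half;
        s->code_buf = half;
        s->code_gen_highwater = (char *)half + (half_size - 1024);
    }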

The global variables that you didn't see in this revision are:

aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];


r~


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-25 23:46         ` [Qemu-devel] [RFC v3] " Emilio G. Cota
  2016-04-26  4:48           ` Richard Henderson
@ 2016-04-26  6:32           ` Alex Bennée
  2016-04-30  3:40             ` Emilio G. Cota
  1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-04-26  6:32 UTC (permalink / raw)
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> Context:
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> This seems to half-work[*], although I'm uneasy about the whole idea.
> I see two major hurdles:
>
> * If the TB size is too small, this breaks badly, because we're not
>   out of the RCU read critical section when another call to tb_flush
>   is made. For instance, when booting ARM with this patch applied,
>   I need to at least pass -tb-size 10 for it to fully boot debian
>   jessie.
> * We have different tb_flush callers that should be audited:
>  $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
>  exec.c:            tb_flush(cpu);
>  gdbstub.c:        tb_flush(cpu);
>  gdbstub.c:    tb_flush(cpu);
>  hw/ppc/spapr_hcall.c:            tb_flush(CPU(cpu));
>  target-alpha/sys_helper.c:    tb_flush(CPU(alpha_env_get_cpu(env)));
>  target-i386/translate.c:        tb_flush(CPU(x86_env_get_cpu(env)));
>  translate-all.c:        tb_flush(cpu);
>
> With two code_gen "halves", if two tb_flush calls are done in the same
> RCU read critical section, we're screwed. I added a cpu_exit at the end
> of tb_flush to try to mitigate this, but I haven't audited all the callers
> (for instance, what does the gdbstub do?).

I'm not sure we are going to get much from this approach. The tb_flush
is a fairly rare occurrence; it's not like it's on the critical
performance path (although of course pathological cases are possible).

I still think there are possibilities with a smaller TranslationRegion
approach, but that is more aimed at solving problems like bulk
invalidation of a page's worth of translations at a time and safer
inter-block patching. It doesn't do much to make tb_flush easier though.
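
Something like this is what I have in mind (hypothetical sketch, all
names invented):

    struct TranslationRegion {
        void *base;            /* start of this slice of code_gen_buffer */
        size_t size;
        unsigned tb_count;     /* TBs still live in this region */
        bool retired;          /* no new TBs here; recycle once drained */
    };

You would then invalidate or recycle one region at a time instead of
flushing the whole buffer.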

>
> If we end up having a mechanism to "stop all  CPUs to do something", as
> I think we'll end up needing for correct LL/SC emulation, we'll probably
> be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> wasting memory.

I'm fairly certain there will need to be a "stop everything" mode for
some things - I'm less certain of the best way of doing it. Did you get
a chance to look at my version of the async_safe_work mechanism?
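
The shape of it is roughly this (illustrative sketch only;
queue_safe_work is a stand-in name, not the posted API):

    /* Queue fn to run once every vCPU has left the execution loop; the
     * last vCPU to halt runs the queued work before any TB is re-entered. */
    static void async_safe_run(CPUState *src, void (*fn)(void *), void *arg)
    {
        CPUState *cpu;

        queue_safe_work(src, fn, arg);   /* hypothetical helper */
        CPU_FOREACH(cpu) {
            cpu_exit(cpu);               /* kick everyone out of TCG */
        }
    }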

>
> Other issues:
> - This could be split into at least 2 patches, one that touches
>   tcg/ and another to deal with translate-all.
>   Note that in translate-all, the initial allocation of code_gen doesn't
>   allocate extra space for the guard page; reserving guard page space is
>   done instead by the added split_code_gen_buffer function.
> - Windows: not even compile-tested.
>
> [*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
>     Alex' tests, however, sometimes fail with:
>
> Unhandled exception 3 (pabt)
> Exception frame registers:
> pc : [<fffffb44>]    lr : [<00000001>]    psr: 20000173
> sp : 4004f528  ip : 40012048  fp : 40032ca8
> r10: 40032ca8  r9 : 00000000  r8 : 00000000
> r7 : 0000000e  r6 : 40030000  r5 : 40032ca8  r4 : 00001ac6
> r3 : 40012030  r2 : 40012030  r1 : d5ffffe7  r0 : 00000028
> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32
> Control: 00c5107d  Table: 40060000  DAC: 00000000
> IFAR: fffffb44    IFSR: 00000205
>
> or with:
>
> CPU0: 16986 irqs (0 races, 11 slow,  1322 ticks avg latency)
> FAIL: smc: irq: 17295 IRQs sent, 16986 received
>
> Unhandled exception 6 (irq)
> Exception frame registers:
> pc : [<00000020>]    lr : [<40010800>]    psr: 60000153
> sp : 400b45c0  ip : 400b34e8  fp : 40032ca8
> r10: 00000000  r9 : 00000000  r8 : 00000000
> r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
> r3 : 00000000  r2 : 00000000  r1 : 000000ff  r0 : 00000000
> Flags: nZCv  IRQs on  FIQs off  Mode SVC_32
> Control: 00c5107d  Table: 40060000  DAC: 00000000
>
> I built with --enable-tcg-debug.

I'll have another go at reproducing. I could only get the asserts to
fire, which was odd because AFAICT the prologue generation that
triggered them was serialised with tb_lock.

> 
> [snip remainder of patch, unchanged from the v3 posting above]


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-26  4:48           ` Richard Henderson
@ 2016-04-26  6:35             ` Alex Bennée
  2016-04-26 15:42               ` Richard Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-04-26  6:35 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Paolo Bonzini,
	Peter Crosthwaite, Sergey Fedorov


Richard Henderson <rth@twiddle.net> writes:

> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>> +    /*
>> +     * write the prologue into buf2. This is safe because we'll later call
>> +     * tcg_prologue_init on buf1, from which we'll start execution.
>> +     */
>> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
>> +    tcg_prologue_init(&tcg_ctx);
>> +
>
> Ah, no.  Write only one prologue, not one per buffer.
>
> If they're sufficiently close (i.e. one allocation under the max size),
> then the same one can be used for both halves.
>
> The global variables that you didn't see in this revision are:
>
> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];

Aside from the existing code structure, is there any reason to have only
one prologue? It doesn't seem to be a large amount of code, and in the
case of smaller translation regions I would posit that having a
"local" prologue/epilogue would make the jumps cheaper.

>
>
> r~


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-26  6:35             ` Alex Bennée
@ 2016-04-26 15:42               ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-04-26 15:42 UTC (permalink / raw)
  To: Alex Bennée
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Paolo Bonzini,
	Peter Crosthwaite, Sergey Fedorov

On 04/25/2016 11:35 PM, Alex Bennée wrote:
> 
> Richard Henderson <rth@twiddle.net> writes:
> 
>> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>>> +    /*
>>> +     * write the prologue into buf2. This is safe because we'll later call
>>> +     * tcg_prologue_init on buf1, from which we'll start execution.
>>> +     */
>>> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
>>> +    tcg_prologue_init(&tcg_ctx);
>>> +
>>
>> Ah, no.  Write only one prologue, not one per buffer.
>>
>> If they're sufficiently close (i.e. one allocation under the max size),
>> then the same one can be used for both halves.
>>
>> The global variables that you didn't see in this revision are:
>>
>> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
>> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];
> 
> Aside from the existing code structure, is there any reason to have only
> one prologue? 

Well, there's also the gdb jit unwind info.  But aside from those, no.

> It doesn't seem to be a large amount of code and in the
> case of having smaller translation regions I would posit having a
> "local" prologue/epilogue would make the jumps cheaper.

Not really.  The jumps are generally in range already, based on the restriction
on max buffer size.

Arm32 (and ppc32, post direct-jump atomicity patchset) are really the only
ones that require a tiny (less than 64MB) buffer.  For anything bigger than
64MB, I don't see any reason to create two independent buffers.

The other consideration not yet mentioned is that you'd like to put the
entire buffer, in the case of x86_64 and some others, within 2GB of the main
executable, so that helper calls can use a direct call insn.
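
The reachability test is the usual displacement check, sketched here
generically (not a quote of tcg/i386):

    /* x86_64 CALL rel32: the displacement is relative to the end of the
     * 5-byte insn and must fit in a signed 32 bits.  */
    intptr_t disp = (intptr_t)func - ((intptr_t)s->code_ptr + 5);
    if (disp == (int32_t)disp) {
        /* in range: emit 0xE8 plus the 32-bit displacement */
    } else {
        /* out of range: load the 64-bit address, then call via register */
    }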



r~


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-26  6:32           ` Alex Bennée
@ 2016-04-30  3:40             ` Emilio G. Cota
  2016-05-09 11:21               ` Paolo Bonzini
  0 siblings, 1 reply; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-30  3:40 UTC (permalink / raw)
  To: Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

On Tue, Apr 26, 2016 at 07:32:39 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
> > With two code_gen "halves", if two tb_flush calls are done in the same
> > RCU read critical section, we're screwed. I added a cpu_exit at the end
> > of tb_flush to try to mitigate this, but I haven't audited all the callers
> > (for instance, what does the gdbstub do?).
> 
> I'm not sure we are going to get much from this approach. The tb_flush
> is a fairly rare occurrence its not like its on the critical performance
> path (although of course pathological cases are possible).

This is what I thought from the beginning, but wanted to give this
alternative a go anyway to see if it was feasible.

On my end I won't do any more work on this approach. Will go back
to locks, despite Paolo's (justified) dislike for them =)

> > If we end up having a mechanism to "stop all  CPUs to do something", as
> > I think we'll end up needing for correct LL/SC emulation, we'll probably
> > be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> > wasting memory.
> 
> I'm fairly certain there will need to be a "stop everything" mode for
> some things - I'm less certain of the best way of doing it. Did you get
> a chance to look at my version of the async_safe_work mechanism?

Not yet, but will get to it very soon.

Cheers,

		Emilio


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-30  3:40             ` Emilio G. Cota
@ 2016-05-09 11:21               ` Paolo Bonzini
  2016-05-09 11:50                 ` Alex Bennée
  2016-05-09 17:07                 ` Emilio G. Cota
  0 siblings, 2 replies; 21+ messages in thread
From: Paolo Bonzini @ 2016-05-09 11:21 UTC (permalink / raw)
  To: Emilio G. Cota, Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov



On 30/04/2016 05:40, Emilio G. Cota wrote:
>> The tb_flush
>> > is a fairly rare occurrence; it's not like it's on the critical performance
>> > path (although of course pathological cases are possible).
> This is what I thought from the beginning, but wanted to give this
> alternative a go anyway to see if it was feasible.
> 
> On my end I won't do any more work on this approach. Will go back
> to locks, despite Paolo's (justified) dislike for them =)

Which locks?  tb_lock during tb_find_fast?  The problem with that was
that it slowed down everything a lot, wasn't it?

To me, the RCU idea is not really about making tb_flush (the rare case)
faster; it was more about keeping the rest simple and fast.
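
The reader side stays about as simple as it gets, roughly (sketch,
modulo the actual signatures):

    TranslationBlock *tb;

    rcu_read_lock();
    tb = tb_find_fast(cpu);    /* may point into either half */
    cpu_tb_exec(cpu, tb);      /* the retiring half cannot be recycled
                                  until we leave the read section */
    rcu_read_unlock();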

Paolo


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 11:21               ` Paolo Bonzini
@ 2016-05-09 11:50                 ` Alex Bennée
  2016-05-09 13:55                   ` Paolo Bonzini
  2016-05-09 17:07                 ` Emilio G. Cota
  1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-05-09 11:50 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Paolo Bonzini <pbonzini@redhat.com> writes:

> On 30/04/2016 05:40, Emilio G. Cota wrote:
>>> The tb_flush
>>> > is a fairly rare occurrence; it's not like it's on the critical performance
>>> > path (although of course pathological cases are possible).
>> This is what I thought from the beginning, but wanted to give this
>> alternative a go anyway to see if it was feasible.
>>
>> On my end I won't do any more work on this approach. Will go back
>> to locks, despite Paolo's (justified) dislike for them =)
>
> Which locks?  tb_lock during tb_find_fast?  The problem with that was
> that it slowed down everything a lot, wasn't it?

Very much so. In the new tree (coming soon) with QHT I was able to
remove the locks from the whole hot path, which means they were only
needed for code generation.
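
The hot path then boils down to something like this (simplified sketch
of the pattern, not the exact WIP code):

    /* lock-free qht lookup first; take tb_lock only to generate code */
    tb = tb_find_physical(cpu, pc, cs_base, flags);
    if (unlikely(tb == NULL)) {
        tb_lock();
        /* recheck: someone may have generated it while we waited */
        tb = tb_find_physical(cpu, pc, cs_base, flags);
        if (tb == NULL) {
            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
        }
        tb_unlock();
    }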

> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.

I'm not sure it achieved that as there is added complexity from having
the split buffer and then ensuring you don't double-flush.

>
> Paolo


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 11:50                 ` Alex Bennée
@ 2016-05-09 13:55                   ` Paolo Bonzini
  2016-05-09 15:05                     ` Alex Bennée
  0 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2016-05-09 13:55 UTC (permalink / raw)
  To: Alex Bennée
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov



On 09/05/2016 13:50, Alex Bennée wrote:
> > Which locks?  tb_lock during tb_find_fast?  The problem with that was
> > that it slowed down everything a lot, wasn't it?
> 
> Very much so. In the new tree (coming soon) with QHT I was able to
> remove the locks from the whole hot path, which means they were only
> needed for code generation.

Okay, I'm curious now. :)

> > To me, the RCU idea is not really about making tb_flush (the rare case)
> > faster; it was more about keeping the rest simple and fast.
> 
> I'm not sure it achieved that as there is added complexity from having
> the split buffer and then ensuring you don't double-flush.

Agreed.

Paolo


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 13:55                   ` Paolo Bonzini
@ 2016-05-09 15:05                     ` Alex Bennée
  0 siblings, 0 replies; 21+ messages in thread
From: Alex Bennée @ 2016-05-09 15:05 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Paolo Bonzini <pbonzini@redhat.com> writes:

> On 09/05/2016 13:50, Alex Bennée wrote:
>> > Which locks?  tb_lock during tb_find_fast?  The problem with that was
>> > that it slowed down everything a lot, wasn't it?
>>
>> Very much so. In the new tree (coming soon) with QHT I was able to
>> remove the locks from the whole hot path, which means they were only
>> needed for code generation.
>
> Okay, I'm curious now. :)

https://github.com/stsquad/qemu/commits/mttcg/base-patches-v3 is the
current WIP, with:

https://github.com/stsquad/qemu/commit/0823f1c77f12ed5958f77484d6477ea205aee220

being the commit that clears the hot-path to run without locks.

The tree is based on tcg-next, which has made things a lot cleaner now
that a bunch of Sergey's stuff has been grabbed by rth. Obviously,
being WIP, it is subject to change. Once I'm done with my current
out-of-tree diversions I'll be back to cleaning the tree up for the
next review round.

Review comments on the posted tree are always welcome, of course ;-)

>
>> > To me, the RCU idea is not really about making tb_flush (the rare case)
>> > faster; it was more about keeping the rest simple and fast.
>>
>> I'm not sure it achieved that as there is added complexity from having
>> the split buffer and then ensuring you don't double-flush.
>
> Agreed.
>
> Paolo


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 11:21               ` Paolo Bonzini
  2016-05-09 11:50                 ` Alex Bennée
@ 2016-05-09 17:07                 ` Emilio G. Cota
  1 sibling, 0 replies; 21+ messages in thread
From: Emilio G. Cota @ 2016-05-09 17:07 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Alex Bennée, QEMU Developers, MTTCG Devel,
	Peter Crosthwaite, Richard Henderson, Sergey Fedorov

On Mon, May 09, 2016 at 13:21:50 +0200, Paolo Bonzini wrote:
> On 30/04/2016 05:40, Emilio G. Cota wrote:
> >> The tb_flush
> >> > is a fairly rare occurrence; it's not like it's on the critical performance
> >> > path (although of course pathological cases are possible).
> > This is what I thought from the beginning, but wanted to give this
> > alternative a go anyway to see if it was feasible.
> > 
> > On my end I won't do any more work on this approach. Will go back
> > to locks, despite Paolo's (justified) dislike for them =)
> 
> Which locks?  tb_lock during tb_find_fast?  The problem with that was
> that it slowed down everything a lot, wasn't it?

By "locks" I meant somehow forcing other threads/vcpus to stop, to then
perform tb_flush. This can be achieved with a bunch of primitives
such as condition variables (*gaaah*) -- these of course involve "locks" :P
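
On the flushing side it would look roughly like this (sketch, with
invented names):

    CPUState *other;

    qemu_mutex_lock(&quiesce_lock);
    pending_cpus = smp_cpus - 1;
    CPU_FOREACH(other) {
        if (other != cpu) {
            cpu_exit(other);            /* kick it out of the TB loop */
        }
    }
    while (pending_cpus) {
        /* halting vcpus decrement pending_cpus and signal the cond */
        qemu_cond_wait(&quiesce_cond, &quiesce_lock);
    }
    do_tb_flush();                      /* everyone stopped: safe */
    qemu_mutex_unlock(&quiesce_lock);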

The removal of tb_lock() when looking up tb's is orthogonal to this.

> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.

Well, I agree with this idea; I wanted to see how far we could take it.

		E.


end of thread

Thread overview: 21+ messages
2016-04-22  0:06 [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU Emilio G. Cota
2016-04-22 14:41 ` Alex Bennée
2016-04-22 14:47   ` Alex Bennée
2016-04-24  3:20   ` Emilio G. Cota
2016-04-25  8:35     ` Alex Bennée
2016-04-22 18:25 ` Richard Henderson
2016-04-24  3:27   ` [Qemu-devel] [RFC v2] " Emilio G. Cota
2016-04-24 18:12     ` Richard Henderson
2016-04-25 15:19     ` Alex Bennée
2016-04-25 15:25       ` Emilio G. Cota
2016-04-25 23:46         ` [Qemu-devel] [RFC v3] " Emilio G. Cota
2016-04-26  4:48           ` Richard Henderson
2016-04-26  6:35             ` Alex Bennée
2016-04-26 15:42               ` Richard Henderson
2016-04-26  6:32           ` Alex Bennée
2016-04-30  3:40             ` Emilio G. Cota
2016-05-09 11:21               ` Paolo Bonzini
2016-05-09 11:50                 ` Alex Bennée
2016-05-09 13:55                   ` Paolo Bonzini
2016-05-09 15:05                     ` Alex Bennée
2016-05-09 17:07                 ` Emilio G. Cota
