* [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-22  0:06 UTC
  To: QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

Known issues:
- Basically compile-tested only, since I've only run this with
  single-threaded TCG; I also tried running it with linux-user,
  but in order to trigger tb_flush I had to make code_gen_buffer
  so small that the CPU calling tb_flush would immediately fill
  the 2nd buffer, triggering the assert. If you have a working
  multi-threaded workload that would be good to test this, please
  let me know.
- Windows; not even compile-tested!

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 5 deletions(-)

diff --git a/translate-all.c b/translate-all.c
index bba9b62..4c14b4d 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
 #endif
 
 #ifdef USE_STATIC_CODE_GEN_BUFFER
-static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
+static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
     __attribute__((aligned(CODE_GEN_ALIGN)));
+static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
+    __attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
 
 # ifdef _WIN32
 static inline void do_protect(void *addr, long size, int prot)
@@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
 }
 # endif /* WIN32 */
 
-static inline void *alloc_code_gen_buffer(void)
+static void *alloc_static_code_gen_buffer(void *buf)
 {
-    void *buf = static_code_gen_buffer;
     size_t full_size, size;
 
     /* The size of the buffer, rounded down to end on a page boundary.  */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
                  & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Reserve a guard page.  */
@@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
 
     return buf;
 }
+
+static inline void *alloc_code_gen_buffer(void)
+{
+    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
+    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
+
+    assert(static_buf_mask == 1);
+    return static_buf1;
+}
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
@@ -829,8 +842,100 @@ static void page_flush_tb(void)
     }
 }
 
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    static_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+    desc->clear_bit = static_buf_mask;
+    call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+    if (static_buf_mask == 1) {
+        static_buf_mask |= 2;
+        return static_buf2;
+    }
+    static_buf_mask |= 1;
+    return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    VirtualFree(desc->buf, 0, MEM_RELEASE);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+    return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+    size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    munmap(desc->buf, desc->size + qemu_real_host_page_size);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    desc->size = tcg_ctx.code_gen_buffer_size;
+    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+    return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
 /* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
 #if defined(DEBUG_FLUSH)
@@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
+    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    tcg_prologue_init(&tcg_ctx);
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared. */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
-- 
2.5.0
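
For readers new to QEMU's call_rcu1: the scheme above hands out the idle
half on each flush and defers marking the old half reusable until a grace
period has elapsed. Below is a minimal standalone sketch of the same
flip-flop pattern, written against liburcu rather than QEMU's RCU; every
name in it is hypothetical and only mirrors the patch's structure
(compile with cc -c, link with -lurcu):

    #include <urcu.h>            /* liburcu: call_rcu */
    #include <urcu/compiler.h>   /* caa_container_of */
    #include <assert.h>
    #include <stdlib.h>

    static char buf1[4096], buf2[4096];
    static int buf_mask = 1;     /* bit 0: buf1 in use; bit 1: buf2 in use */

    struct buf_desc {
        struct rcu_head rcu;
        int clear_bit;
    };

    /* runs after a grace period: no reader can still see the old buffer */
    static void buf_clear(struct rcu_head *rcu)
    {
        struct buf_desc *desc = caa_container_of(rcu, struct buf_desc, rcu);

        buf_mask &= ~desc->clear_bit;   /* the patch takes tb_lock here */
        free(desc);
    }

    /* analogue of code_gen_buffer_replace(), called under the writer lock */
    static void *buf_replace(void)
    {
        struct buf_desc *desc = calloc(1, sizeof(*desc));

        assert(buf_mask == 1 || buf_mask == 2);
        desc->clear_bit = buf_mask;
        call_rcu(&desc->rcu, buf_clear);

        if (buf_mask == 1) {
            buf_mask |= 2;
            return buf2;
        }
        buf_mask |= 1;
        return buf1;
    }

The property relied on is that buf_clear only runs once every RCU
read-side critical section that could still be executing code from the
old half has ended.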


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-22 14:41 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> Known issues:
> - Basically compile-tested only, since I've only run this with
>   single-threaded TCG; I also tried running it with linux-user,
>   but in order to trigger tb_flush I had to make code_gen_buffer
>   so small that the CPU calling tb_flush would immediately fill
>   the 2nd buffer, triggering the assert. If you have a working
>   multi-threaded workload that would be good to test this, please
>   let me know.

With my latest mttcg unit tests:

./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
  -device virtio-serial-device -device virtconsole,chardev=ctd \
  -chardev testdev,id=ctd -display none -serial stdio \
  -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
  -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on


> (snip)


--
Alex Bennée


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-22 14:47 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Alex Bennée <alex.bennee@linaro.org> writes:

> Emilio G. Cota <cota@braap.org> writes:
>
>> This is a first attempt at making tb_flush not have to stop all CPUs.
>> There are issues as pointed out below, but this could be a good start.
>>
>> Context:
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>>
>> Known issues:
>> - Basically compile-tested only, since I've only run this with
>>   single-threaded TCG; I also tried running it with linux-user,
>>   but in order to trigger tb_flush I had to make code_gen_buffer
>>   so small that the CPU calling tb_flush would immediately fill
>>   the 2nd buffer, triggering the assert. If you have a working
>>   multi-threaded workload that would be good to test this, please
>>   let me know.
>
> With my latest mttcg unit tests:
>
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>   -chardev testdev,id=ctd -display none -serial stdio \
>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>   -append "tight smc irq mod=1 rounds=100000"  -name
>   arm,debug-threads=on

Ahh, I just realised you wanted a linux-user workload.

>> (snip)


--
Alex Bennée


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Richard Henderson @ 2016-04-22 18:25 UTC
  To: Emilio G. Cota, QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov

On 04/21/2016 05:06 PM, Emilio G. Cota wrote:
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>      __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +    __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;

I don't like this at all.

(1) This is (by default) 32MB we're adding to the RSS of the
    simulator.  Surely we can do better than this.

(2) On some hosts we require a maximum displacement from
    any point in the code gen buffer to the tcg prologue.
    That means you can't simply allocate two separate buffers.

    You have to take a single buffer, of known good size and
    alignment, and split it in half.


r~
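
A rough sketch of the single-buffer split being suggested here, under the
layout |  half 1  |guard|  half 2  |guard|; the function and its names are
illustrative assumptions, not code from any version of the patch:

    #include <stddef.h>
    #include <stdint.h>

    /* carve one allocation into two halves, each with its own guard page;
       assumes page_size is a power of two */
    static void *split_in_half(void *buf, size_t full_size, size_t page_size,
                               void **half2, size_t *half_size)
    {
        /* keep room for one guard page after each half */
        size_t usable = full_size - 2 * page_size;
        /* page-align the size of each half */
        size_t half = (usable / 2) & ~(page_size - 1);

        *half_size = half;
        *half2 = (uint8_t *)buf + half + page_size;
        return buf;   /* half 1 starts at the base of the allocation */
    }

Because both halves live in one allocation of known size and alignment,
the displacement from any point in either half to the shared prologue
stays bounded, which addresses point (2). The v2 patch below adopts this
approach for the static buffer.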


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-24  3:20 UTC
  To: Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
(snip)
> > Known issues:
> > - Basically compile-tested only, since I've only run this with
> >   single-threaded TCG; I also tried running it with linux-user,
> >   but in order to trigger tb_flush I had to make code_gen_buffer
> >   so small that the CPU calling tb_flush would immediately fill
> >   the 2nd buffer, triggering the assert. If you have a working
> >   multi-threaded workload that would be good to test this, please
> >   let me know.
> 
> With my latest mttcg unit tests:
> 
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>   -chardev testdev,id=ctd -display none -serial stdio \
>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>   -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on

This is useful. Never mind the need for testing linux-user, I can test
both code paths (i.e. dynamic allocation and static buf) with qemu-system
by simply defining USE_STATIC_CODE_GEN_BUFFER.

After applying a modified version of this patch (that I'll send in
a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
"translate-all: introduces tb_flush_safe"), I can easily trigger
this error when setting a low enough TB size, e.g. -tb-size 32:

 CPU1: online and setting up with pattern 0xa0b78cbf
 CPU2: online and setting up with pattern 0x22287c45
 CPU3: online and setting up with pattern 0x6262c5c5
 CPU0: online and setting up with pattern 0xa65e7ad6
 qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
 qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
 qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
 qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
 qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
 PASS: smc: irq: 0 errors, IRQs not checked
 Unhandled exception 3 (pabt)
 Exception frame registers:
 pc : [<e59f2028>]    lr : [<40010700>]    psr: a0000153
 sp : 400ac5c0  ip : 400ab4e8  fp : 40032ca8
 r10: 00000000  r9 : 00000000  r8 : 00000000
 r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
 r3 : 00000000  r2 : 00000000  r1 : e59f2028  r0 : 00000000
 Flags: NzCv  IRQs on  FIQs off  Mode SVC_32
 Control: 00c5107d  Table: 40060000  DAC: 00000000
 IFAR: e59f2028    IFSR: 00000205

Any input on where to look would be appreciated. Thanks,

		Emilio


* [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-24  3:27 UTC
  To: QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

[ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
reverting "translate-all: introduces tb_flush_safe". A trivial
conflict must be solved after applying. ]

This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

Changes from v1:
- When a static buffer is used, split it in two instead of using
  a second buffer.

Known issues:
- Fails Alex' unit test with low enough -tb-size, see
  https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
  Seems to work in MTTCG, although I've only tested with tb_lock
  always being taken in tb_find_fast.
- Windows; not even compile-tested!

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 translate-all.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 133 insertions(+), 13 deletions(-)

diff --git a/translate-all.c b/translate-all.c
index 8e70583..6830371 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -535,6 +535,9 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
 #ifdef USE_STATIC_CODE_GEN_BUFFER
 static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
     __attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
 
 # ifdef _WIN32
 static inline void do_protect(void *addr, long size, int prot)
@@ -577,6 +580,13 @@ static inline void map_none(void *addr, long size)
 }
 # endif /* WIN32 */
 
+static void map_static_code_gen_buffer(void *buf, size_t size)
+{
+    map_exec(buf, size);
+    map_none(buf + size, qemu_real_host_page_size);
+    qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+}
+
 static inline void *alloc_code_gen_buffer(void)
 {
     void *buf = static_code_gen_buffer;
@@ -586,28 +596,41 @@ static inline void *alloc_code_gen_buffer(void)
     full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
                  & qemu_real_host_page_mask) - (uintptr_t)buf;
 
-    /* Reserve a guard page.  */
-    size = full_size - qemu_real_host_page_size;
+    /*
+     * Reserve two guard pages, one after each of the two buffers:
+     * |     buf1     |g1|    buf2    |g2|
+     */
+    size = full_size - 2 * qemu_real_host_page_size;
 
     /* Honor a command-line option limiting the size of the buffer.  */
     if (size > tcg_ctx.code_gen_buffer_size) {
         size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
                 & qemu_real_host_page_mask) - (uintptr_t)buf;
     }
-    tcg_ctx.code_gen_buffer_size = size;
 
 #ifdef __mips__
-    if (cross_256mb(buf, size)) {
-        buf = split_cross_256mb(buf, size);
-        size = tcg_ctx.code_gen_buffer_size;
+    /*
+     * Pass 'size + page_size', since we want 'buf1 | guard1 | buf2' to be
+     * within the boundary.
+     */
+    if (cross_256mb(buf, size + qemu_real_host_page_size)) {
+        buf = split_cross_256mb(buf, size + qemu_real_host_page_size);
+        size = tcg_ctx.code_gen_buffer_size - qemu_real_host_page_size;
     }
 #endif
 
-    map_exec(buf, size);
-    map_none(buf + size, qemu_real_host_page_size);
-    qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+    /* split the buffer in two */
+    size /= 2;
+    tcg_ctx.code_gen_buffer_size = size;
 
-    return buf;
+    static_buf1 = buf;
+    static_buf2 = buf + size + qemu_real_host_page_mask;
+
+    map_static_code_gen_buffer(static_buf1, size);
+    map_static_code_gen_buffer(static_buf2, size);
+
+    assert(static_buf_mask == 1);
+    return static_buf1;
 }
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
@@ -825,10 +848,100 @@ static void page_flush_tb(void)
     }
 }
 
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    static_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+    desc->clear_bit = static_buf_mask;
+    call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+    if (static_buf_mask == 1) {
+        static_buf_mask |= 2;
+        return static_buf2;
+    }
+    static_buf_mask |= 1;
+    return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    VirtualFree(desc->buf, 0, MEM_RELEASE);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+    return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+    size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    munmap(desc->buf, desc->size + qemu_real_host_page_size);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    desc->size = tcg_ctx.code_gen_buffer_size;
+    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+    return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
 /* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe.  System emulation calls it only
- * with tb_lock taken or from safe_work, so no need to take tb_lock here.
- */
 void tb_flush(CPUState *cpu)
 {
 #if defined(DEBUG_FLUSH)
@@ -852,10 +965,17 @@ void tb_flush(CPUState *cpu)
     memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
     page_flush_tb();
 
+    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    tcg_prologue_init(&tcg_ctx);
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared. */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
-- 
2.5.0


* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Richard Henderson @ 2016-04-24 18:12 UTC
  To: Emilio G. Cota, QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov

On 04/23/2016 08:27 PM, Emilio G. Cota wrote:
> [ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
> reverting "translate-all: introduces tb_flush_safe". A trivial
> conflict must be solved after applying. ]
>
> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
>    https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>    https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

I will again say that I don't believe that wasting all of this memory is as 
good as using locks -- tb_flush doesn't happen *that* often.

> +static void map_static_code_gen_buffer(void *buf, size_t size)
> +{
> +    map_exec(buf, size);
> +    map_none(buf + size, qemu_real_host_page_size);
> +    qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
> +}

Nit: I know it's only startup, but there's no reason to make multiple map_exec 
or madvise calls.  You can cover the entire buffer in one go, and then call 
map_none on the guard pages.

> +#ifdef USE_STATIC_CODE_GEN_BUFFER
...
> +#elif defined(_WIN32)
...
> +#else /* UNIX, dynamically-allocated code buffer */
...
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */

I'm not keen on your dynamic allocation implementations.  Why not split the one 
dynamic buffer the same way as the static buffer?  We are talking about >= 
256MB here, after all.

> +    tcg_prologue_init(&tcg_ctx);

We have some global variables in the tcg backends that are initialized by 
tcg_prologue_init.  I don't think we should be calling it again without locks 
being involved.

Of course, you don't have to call it again if you split one buffer.  Then you 
also get to share the same rcu implementation.


r~
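
One possible shape for the startup nit, reusing the map_exec, map_none and
qemu_madvise helpers already in translate-all.c; the half1/half2/half_size
parameters are assumptions of this sketch:

    static void map_split_code_gen_buffer(void *buf, size_t full_size,
                                          void *half1, void *half2,
                                          size_t half_size)
    {
        /* cover the entire buffer with one map_exec/madvise call each... */
        map_exec(buf, full_size);
        qemu_madvise(buf, full_size, QEMU_MADV_HUGEPAGE);

        /* ...then map_none only the guard page after each half */
        map_none(half1 + half_size, qemu_real_host_page_size);
        map_none(half2 + half_size, qemu_real_host_page_size);
    }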


* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-25  8:35 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
>> Emilio G. Cota <cota@braap.org> writes:
> (snip)
>> > Known issues:
>> > - Basically compile-tested only, since I've only run this with
>> >   single-threaded TCG; I also tried running it with linux-user,
>> >   but in order to trigger tb_flush I had to make code_gen_buffer
>> >   so small that the CPU calling tb_flush would immediately fill
>> >   the 2nd buffer, triggering the assert. If you have a working
>> >   multi-threaded workload that would be good to test this, please
>> >   let me know.
>>
>> With my latest mttcg unit tests:
>>
>> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>>   -chardev testdev,id=ctd -display none -serial stdio \
>>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>>   -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on
>
> This is useful. Never mind the need for testing linux-user, I can test
> both code paths (i.e. dynamic allocation and static buf) with qemu-system
> by simply defining USE_STATIC_CODE_GEN_BUFFER.
>
> After applying a modified version of this patch (that I'll send in
> a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
> "translate-all: introduces tb_flush_safe"), I can easily trigger
> this error when setting a low enough TB size, e.g. -tb-size 32:
>
>  CPU1: online and setting up with pattern 0xa0b78cbf
>  CPU2: online and setting up with pattern 0x22287c45
>  CPU3: online and setting up with pattern 0x6262c5c5
>  CPU0: online and setting up with pattern 0xa65e7ad6
>  qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
>  qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
>  qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
>  qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
>  qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
>  PASS: smc: irq: 0 errors, IRQs not checked
>  Unhandled exception 3 (pabt)
>  Exception frame registers:
>  pc : [<e59f2028>]    lr : [<40010700>]    psr: a0000153
>  sp : 400ac5c0  ip : 400ab4e8  fp : 40032ca8
>  r10: 00000000  r9 : 00000000  r8 : 00000000
>  r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
>  r3 : 00000000  r2 : 00000000  r1 : e59f2028  r0 : 00000000
>  Flags: NzCv  IRQs on  FIQs off  Mode SVC_32
>  Control: 00c5107d  Table: 40060000  DAC: 00000000
>  IFAR: e59f2028    IFSR: 00000205
>
> Any input on where to look would be appreciated. Thanks,

I'll have a look and see if I can replicate.

>
> 		Emilio


--
Alex Bennée


* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Alex Bennée @ 2016-04-25 15:19 UTC
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> (snip)
> Known issues:
> - Fails Alex' unit test with low enough -tb-size, see
>   https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
>   Seems to work in MTTCG, although I've only tested with tb_lock
>   always being taken in tb_find_fast.

With --enable-debug-tcg I get it failing pretty quickly:

#4  0x00005555556d332a in tcg_global_alloc (s=0x555556007ba0 <tcg_ctx>)
 at /home/alex/lsrc/qemu/qemu.git/tcg/tcg.c:463
 463         tcg_debug_assert(s->nb_globals == s->nb_temps);
 (gdb) p s->nb_globals
 $1 = 24
 (gdb) p s->nb_temps
 $2 = 31

Seems odd though, the other threads are all waiting on the tb_lock.

> (snip)


--
Alex Bennée


* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-25 15:25 UTC
  To: Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

On Mon, Apr 25, 2016 at 16:19:59 +0100, Alex Bennée wrote:
> 
> Emilio G. Cota <cota@braap.org> writes:
> 
> > (snip)
> > Known issues:
> > - Fails Alex' unit test with low enough -tb-size, see
> >   https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
> >   Seems to work in MTTCG, although I've only tested with tb_lock
> >   always being taken in tb_find_fast.
> 
> With --enable-debug-tcg I get it failing pretty quickly:
> 
> #4  0x00005555556d332a in tcg_global_alloc (s=0x555556007ba0 <tcg_ctx>)
>  at /home/alex/lsrc/qemu/qemu.git/tcg/tcg.c:463
>  463         tcg_debug_assert(s->nb_globals == s->nb_temps);
>  (gdb) p s->nb_globals
>  $1 = 24
>  (gdb) p s->nb_temps
>  $2 = 31
> Seems odd though, the other threads are all waiting on the tb_lock.

It must be the tcg_prologue_init call, as Richard pointed out.
I'm on it, will report back.

Thanks,

		E.


* [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
From: Emilio G. Cota @ 2016-04-25 23:46 UTC
  To: QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

This seems to half-work[*], although I'm uneasy about the whole idea.
I see two major hurdles:

* If the TB size is too small, this breaks badly, because we're not
  out of the RCU read critical section when another call to tb_flush
  is made. For instance, when booting ARM with this patch applied,
  I need to at least pass -tb-size 10 for it to fully boot debian
  jessie.
* We have different tb_flush callers that should be audited:
 $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
 exec.c:            tb_flush(cpu);
 gdbstub.c:        tb_flush(cpu);
 gdbstub.c:    tb_flush(cpu);
 hw/ppc/spapr_hcall.c:            tb_flush(CPU(cpu));
 target-alpha/sys_helper.c:    tb_flush(CPU(alpha_env_get_cpu(env)));
 target-i386/translate.c:        tb_flush(CPU(x86_env_get_cpu(env)));
 translate-all.c:        tb_flush(cpu);

With two code_gen "halves", if two tb_flush calls are done in the same
RCU read critical section, we're screwed. I added a cpu_exit at the end
of tb_flush to try to mitigate this, but I haven't audited all the callers
(for instance, what does the gdbstub do?).

If we end up having a mechanism to "stop all CPUs to do something", as
I think we'll end up needing for correct LL/SC emulation, we'll probably
be better off using that mechanism for tb_flush as well -- plus, we'll avoid
wasting memory.

Other issues:
- This could be split into at least 2 patches, one that touches
  tcg/ and another to deal with translate-all.
  Note that in translate-all, the initial allocation of code_gen doesn't
  allocate extra space for the guard page; reserving guard page space is
  done instead by the added split_code_gen_buffer function.
- Windows: not even compile-tested.

[*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
    Alex' tests, however, sometimes fail with:

Unhandled exception 3 (pabt)
Exception frame registers:
pc : [<fffffb44>]    lr : [<00000001>]    psr: 20000173
sp : 4004f528  ip : 40012048  fp : 40032ca8
r10: 40032ca8  r9 : 00000000  r8 : 00000000
r7 : 0000000e  r6 : 40030000  r5 : 40032ca8  r4 : 00001ac6
r3 : 40012030  r2 : 40012030  r1 : d5ffffe7  r0 : 00000028
Flags: nzCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000
IFAR: fffffb44    IFSR: 00000205

or with:

CPU0: 16986 irqs (0 races, 11 slow,  1322 ticks avg latency)
FAIL: smc: irq: 17295 IRQs sent, 16986 received

Unhandled exception 6 (irq)
Exception frame registers:
pc : [<00000020>]    lr : [<40010800>]    psr: 60000153
sp : 400b45c0  ip : 400b34e8  fp : 40032ca8
r10: 00000000  r9 : 00000000  r8 : 00000000
r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
r3 : 00000000  r2 : 00000000  r1 : 000000ff  r0 : 00000000
Flags: nZCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000

I built with --enable-debug-tcg.

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 tcg/tcg.c       |  22 +++++---
 tcg/tcg.h       |   1 +
 translate-all.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 144 insertions(+), 34 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index b46bf1a..7db8ce9 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -380,6 +380,20 @@ void tcg_context_init(TCGContext *s)
     }
 }
 
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size)
+{
+    size_t size = s->code_gen_buffer_size - prologue_size;
+
+    s->code_gen_ptr = buf;
+    s->code_gen_buffer = buf;
+    s->code_buf = buf;
+
+    /* Compute a high-water mark, at which we voluntarily flush the buffer
+       and start over.  The size here is arbitrary, significantly larger
+       than we expect the code generation for any one opcode to require.  */
+    s->code_gen_highwater = s->code_buf + (size - 1024);
+}
+
 void tcg_prologue_init(TCGContext *s)
 {
     size_t prologue_size, total_size;
@@ -398,16 +412,10 @@ void tcg_prologue_init(TCGContext *s)
 
     /* Deduct the prologue from the buffer.  */
     prologue_size = tcg_current_code_size(s);
-    s->code_gen_ptr = buf1;
-    s->code_gen_buffer = buf1;
-    s->code_buf = buf1;
     total_size = s->code_gen_buffer_size - prologue_size;
     s->code_gen_buffer_size = total_size;
 
-    /* Compute a high-water mark, at which we voluntarily flush the buffer
-       and start over.  The size here is arbitrary, significantly larger
-       than we expect the code generation for any one opcode to require.  */
-    s->code_gen_highwater = s->code_gen_buffer + (total_size - 1024);
+    tcg_code_gen_init(s, buf1, prologue_size);
 
     tcg_register_jit(s->code_gen_buffer, total_size);
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 40c8fbe..e849d3e 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -634,6 +634,7 @@ static inline void *tcg_malloc(int size)
 
 void tcg_context_init(TCGContext *s);
 void tcg_prologue_init(TCGContext *s);
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size);
 void tcg_func_start(TCGContext *s);
 
 int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
diff --git a/translate-all.c b/translate-all.c
index bba9b62..7a83176 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -485,6 +485,11 @@ static inline PageDesc *page_find(tb_page_addr_t index)
   (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
    ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
 
+static int code_gen_buf_mask;
+static void *code_gen_buf1;
+static void *code_gen_buf2;
+static size_t code_gen_prologue_size;
+
 static inline size_t size_code_gen_buffer(size_t tb_size)
 {
     /* Size the buffer.  */
@@ -508,6 +513,43 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
     return tb_size;
 }
 
+static void *split_code_gen_buffer(void *buf, size_t size)
+{
+    /* enforce alignment of the buffer to a page boundary */
+    if (unlikely((uintptr_t)buf & ~qemu_real_host_page_mask)) {
+        uintptr_t b;
+
+        b = QEMU_ALIGN_UP((uintptr_t)buf, qemu_real_host_page_size);
+        size -= b - (uintptr_t)buf;
+        buf = (void *)b;
+    }
+    /*
+     * Make sure the size is an even number of pages so that both halves will
+     * end on a page boundary.
+     */
+    size = QEMU_ALIGN_DOWN(size, 2 * qemu_real_host_page_size);
+
+    /* split in half, making room for 2 guard pages */
+    size -= 2 * qemu_real_host_page_size;
+    size /= 2;
+    code_gen_buf1 = buf;
+    code_gen_buf2 = buf + size + qemu_real_host_page_size;
+
+    /*
+     * write the prologue into buf2. This is safe because we'll later call
+     * tcg_prologue_init on buf1, from which we'll start execution.
+     */
+    tcg_ctx.code_gen_buffer = code_gen_buf2;
+    tcg_prologue_init(&tcg_ctx);
+    code_gen_prologue_size = (void *)tcg_ctx.code_ptr - code_gen_buf2;
+
+    tcg_ctx.code_gen_buffer_size = size;
+
+    /* start execution from buf1 */
+    code_gen_buf_mask = 1;
+    return code_gen_buf1;
+}
+
 #ifdef __mips__
 /* In order to use J and JAL within the code_gen_buffer, we require
    that the buffer not cross a 256MB boundary.  */
@@ -583,21 +625,17 @@ static inline void map_none(void *addr, long size)
 static inline void *alloc_code_gen_buffer(void)
 {
     void *buf = static_code_gen_buffer;
-    size_t full_size, size;
+    size_t size;
 
     /* The size of the buffer, rounded down to end on a page boundary.  */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
-                 & qemu_real_host_page_mask) - (uintptr_t)buf;
-
-    /* Reserve a guard page.  */
-    size = full_size - qemu_real_host_page_size;
+    size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+            & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Honor a command-line option limiting the size of the buffer.  */
     if (size > tcg_ctx.code_gen_buffer_size) {
         size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
                 & qemu_real_host_page_mask) - (uintptr_t)buf;
     }
-    tcg_ctx.code_gen_buffer_size = size;
 
 #ifdef __mips__
     if (cross_256mb(buf, size)) {
@@ -607,27 +645,40 @@ static inline void *alloc_code_gen_buffer(void)
 #endif
 
     map_exec(buf, size);
-    map_none(buf + size, qemu_real_host_page_size);
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    map_none(code_gen_buf1 + size, qemu_real_host_page_size);
+    map_none(code_gen_buf2 + size, qemu_real_host_page_size);
+
     return buf;
 }
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
     size_t size = tcg_ctx.code_gen_buffer_size;
-    void *buf1, *buf2;
+    void *buf;
+    void *ret;
 
-    /* Perform the allocation in two steps, so that the guard page
-       is reserved but uncommitted.  */
-    buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
-                        MEM_RESERVE, PAGE_NOACCESS);
-    if (buf1 != NULL) {
-        buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
-        assert(buf1 == buf2);
+    /* Perform the allocation in two steps, so that the guard pages
+       are reserved but uncommitted.  */
+    ret = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+    if (ret == NULL) {
+        return NULL;
     }
 
-    return buf1;
+    ret = split_code_gen_buffer(ret, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    buf = VirtualAlloc(code_gen_buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+    buf = VirtualAlloc(code_gen_buf2, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+
+    return ret;
 }
 #else
 static inline void *alloc_code_gen_buffer(void)
@@ -665,8 +716,7 @@ static inline void *alloc_code_gen_buffer(void)
 #  endif
 # endif
 
-    buf = mmap((void *)start, size + qemu_real_host_page_size,
-               PROT_NONE, flags, -1, 0);
+    buf = mmap((void *)start, size, PROT_NONE, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
     }
@@ -676,24 +726,24 @@ static inline void *alloc_code_gen_buffer(void)
         /* Try again, with the original still mapped, to avoid re-acquiring
            that 256mb crossing.  This time don't specify an address.  */
         size_t size2;
-        void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
-                          PROT_NONE, flags, -1, 0);
+        void *buf2 = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+
         switch (buf2 != MAP_FAILED) {
         case 1:
             if (!cross_256mb(buf2, size)) {
                 /* Success!  Use the new buffer.  */
-                munmap(buf, size + qemu_real_host_page_size);
+                munmap(buf, size);
                 break;
             }
             /* Failure.  Work with what we had.  */
-            munmap(buf2, size + qemu_real_host_page_size);
+            munmap(buf2, size);
             /* fallthru */
         default:
             /* Split the original buffer.  Free the smaller half.  */
             buf2 = split_cross_256mb(buf, size);
             size2 = tcg_ctx.code_gen_buffer_size;
             if (buf == buf2) {
-                munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+                munmap(buf + size2, size - size2);
             } else {
                 munmap(buf, size - size2);
             }
@@ -704,13 +754,19 @@ static inline void *alloc_code_gen_buffer(void)
     }
 #endif
 
-    /* Make the final buffer accessible.  The guard page at the end
-       will remain inaccessible with PROT_NONE.  */
+    /* Make the final buffer accessible. */
     mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
 
     /* Request large pages for the buffer.  */
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf + 1, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    mprotect(code_gen_buf1 + size, qemu_real_host_page_size, PROT_NONE);
+    mprotect(code_gen_buf2 + size, qemu_real_host_page_size, PROT_NONE);
+
     return buf;
 }
 #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
@@ -829,10 +885,48 @@ static void page_flush_tb(void)
     }
 }
 
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void clear_code_gen_buffer(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    code_gen_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *replace_code_gen_buffer(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(code_gen_buf_mask == 1 || code_gen_buf_mask == 2);
+
+    desc->clear_bit = code_gen_buf_mask;
+    call_rcu1(&desc->rcu, clear_code_gen_buffer);
+
+    if (code_gen_buf_mask == 1) {
+        code_gen_buf_mask |= 2;
+        return code_gen_buf2 + code_gen_prologue_size;
+    }
+    code_gen_buf_mask |= 1;
+    return code_gen_buf1 + code_gen_prologue_size;
+}
+
 /* flush all the translation blocks */
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
+    void *buf;
+
 #if defined(DEBUG_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -853,10 +947,17 @@ void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
-    tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    buf = replace_code_gen_buffer();
+    tcg_code_gen_init(&tcg_ctx, buf, code_gen_prologue_size);
+
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
-- 
2.5.0


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-25 23:46         ` [Qemu-devel] [RFC v3] " Emilio G. Cota
@ 2016-04-26  4:48           ` Richard Henderson
  2016-04-26  6:35             ` Alex Bennée
  2016-04-26  6:32           ` Alex Bennée
  1 sibling, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2016-04-26  4:48 UTC (permalink / raw)
  To: Emilio G. Cota, QEMU Developers, MTTCG Devel
  Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov

On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
> +    /*
> +     * write the prologue into buf2. This is safe because we'll later call
> +     * tcg_prologue_init on buf1, from which we'll start execution.
> +     */
> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
> +    tcg_prologue_init(&tcg_ctx);
> +

Ah, no.  Write only one prologue, not one per buffer.

If they're sufficiently close (i.e. one allocation under the max size),
then the same one can be used for both halves.
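
A flush would then only reset the fill state when switching halves,
along these lines (rough sketch only; the helper is invented, not in
the patch):

    /* The prologue was generated once, outside the half being reset, so
     * tb_ret_addr and friends stay valid; no tcg_prologue_init() here.  */
    static void tcg_switch_code_gen_half(TCGContext *s, void *half,
                                         size_t half_size)
    {
        s->code_gen_buffer = half;
        s->code_gen_ptr = half;
        s->code_buf = half;
        s->code_gen_highwater = (char *)half + (half_size - 1024);
    }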

The global variables that you didn't see in this revision are:

aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];


r~


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-25 23:46         ` [Qemu-devel] [RFC v3] " Emilio G. Cota
  2016-04-26  4:48           ` Richard Henderson
@ 2016-04-26  6:32           ` Alex Bennée
  2016-04-30  3:40             ` Emilio G. Cota
  1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-04-26  6:32 UTC (permalink / raw)
  To: Emilio G. Cota
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Emilio G. Cota <cota@braap.org> writes:

> Context:
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> This seems to half-work[*], although I'm uneasy about the whole idea.
> I see two major hurdles:
>
> * If the TB size is too small, this breaks badly, because we're not
>   out of the RCU read critical section when another call to tb_flush
>   is made. For instance, when booting ARM with this patch applied,
>   I need to at least pass -tb-size 10 for it to fully boot debian
>   jessie.
> * We have different tb_flush callers that should be audited:
>  $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
>  exec.c:            tb_flush(cpu);
>  gdbstub.c:        tb_flush(cpu);
>  gdbstub.c:    tb_flush(cpu);
>  hw/ppc/spapr_hcall.c:            tb_flush(CPU(cpu));
>  target-alpha/sys_helper.c:    tb_flush(CPU(alpha_env_get_cpu(env)));
>  target-i386/translate.c:        tb_flush(CPU(x86_env_get_cpu(env)));
>  translate-all.c:        tb_flush(cpu);
>
> With two code_gen "halves", if two tb_flush calls are done in the same
> RCU read critical section, we're screwed. I added a cpu_exit at the end
> of tb_flush to try to mitigate this, but I haven't audited all the callers
> (for instance, what does the gdbstub do?).

I'm not sure we are going to get much from this approach. The tb_flush
is a fairly rare occurrence; it's not like it's on the critical
performance path (although of course pathological cases are possible).

I still think there are possibilities with a smaller TranslationRegion
approach, but that is more aimed at solving problems like bulk
invalidation of a page's worth of translations at a time and safer
inter-block patching. It doesn't do much to make tb_flush easier though.
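
Something like this is what I have in mind (hypothetical sketch, all
names invented):

    struct TranslationRegion {
        void *base;            /* start of this slice of code_gen_buffer */
        size_t size;
        unsigned tb_count;     /* TBs still live in this region */
        bool retired;          /* no new TBs here; recycle once drained */
    };

You would then invalidate or recycle one region at a time instead of
flushing the whole buffer.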

>
> If we end up having a mechanism to "stop all  CPUs to do something", as
> I think we'll end up needing for correct LL/SC emulation, we'll probably
> be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> wasting memory.

I'm fairly certain there will need to be a "stop everything" mode for
some things - I'm less certain of the best way of doing it. Did you get
a chance to look at my version of the async_safe_work mechanism?
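
The shape of it is roughly this (illustrative sketch only;
queue_safe_work is a stand-in name, not the posted API):

    /* Queue fn to run once every vCPU has left the execution loop; the
     * last vCPU to halt runs the queued work before any TB is re-entered. */
    static void async_safe_run(CPUState *src, void (*fn)(void *), void *arg)
    {
        CPUState *cpu;

        queue_safe_work(src, fn, arg);   /* hypothetical helper */
        CPU_FOREACH(cpu) {
            cpu_exit(cpu);               /* kick everyone out of TCG */
        }
    }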

>
> Other issues:
> - This could be split into at least 2 patches, one that touches
>   tcg/ and another to deal with translate-all.
>   Note that in translate-all, the initial allocation of code_gen doesn't
>   allocate extra space for the guard page; reserving guard page space is
>   done instead by the added split_code_gen_buffer function.
> - Windows: not even compile-tested.
>
> [*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
>     Alex' tests, however, sometimes fail with:
>
> Unhandled exception 3 (pabt)
> Exception frame registers:
> pc : [<fffffb44>]    lr : [<00000001>]    psr: 20000173
> sp : 4004f528  ip : 40012048  fp : 40032ca8
> r10: 40032ca8  r9 : 00000000  r8 : 00000000
> r7 : 0000000e  r6 : 40030000  r5 : 40032ca8  r4 : 00001ac6
> r3 : 40012030  r2 : 40012030  r1 : d5ffffe7  r0 : 00000028
> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32
> Control: 00c5107d  Table: 40060000  DAC: 00000000
> IFAR: fffffb44    IFSR: 00000205
>
> or with:
>
> CPU0: 16986 irqs (0 races, 11 slow,  1322 ticks avg latency)
> FAIL: smc: irq: 17295 IRQs sent, 16986 received
>
> Unhandled exception 6 (irq)
> Exception frame registers:
> pc : [<00000020>]    lr : [<40010800>]    psr: 60000153
> sp : 400b45c0  ip : 400b34e8  fp : 40032ca8
> r10: 00000000  r9 : 00000000  r8 : 00000000
> r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
> r3 : 00000000  r2 : 00000000  r1 : 000000ff  r0 : 00000000
> Flags: nZCv  IRQs on  FIQs off  Mode SVC_32
> Control: 00c5107d  Table: 40060000  DAC: 00000000
>
> I built with --enable-tcg-debug.

I'll have another go at reproducing. I could only get the asserts to
fire, which was odd because AFAICT the prologue generation that
triggered them was serialised with tb_lock.

> 
> [snip remainder of patch, unchanged from the v3 posting above]


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-26  4:48           ` Richard Henderson
@ 2016-04-26  6:35             ` Alex Bennée
  2016-04-26 15:42               ` Richard Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-04-26  6:35 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Paolo Bonzini,
	Peter Crosthwaite, Sergey Fedorov


Richard Henderson <rth@twiddle.net> writes:

> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>> +    /*
>> +     * write the prologue into buf2. This is safe because we'll later call
>> +     * tcg_prologue_init on buf1, from which we'll start execution.
>> +     */
>> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
>> +    tcg_prologue_init(&tcg_ctx);
>> +
>
> Ah, no.  Write only one prologue, not one per buffer.
>
> If they're sufficiently close (i.e. one allocation under the max size),
> then the same one can be used for both halves.
>
> The global variables that you didn't see in this revision are:
>
> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];

Aside from the existing code structure, is there any reason to have only
one prologue? It doesn't seem to be a large amount of code, and in the
case of smaller translation regions I would posit that having a
"local" prologue/epilogue would make the jumps cheaper.

>
>
> r~


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-26  6:35             ` Alex Bennée
@ 2016-04-26 15:42               ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-04-26 15:42 UTC (permalink / raw)
  To: Alex Bennée
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Paolo Bonzini,
	Peter Crosthwaite, Sergey Fedorov

On 04/25/2016 11:35 PM, Alex Bennée wrote:
> 
> Richard Henderson <rth@twiddle.net> writes:
> 
>> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>>> +    /*
>>> +     * write the prologue into buf2. This is safe because we'll later call
>>> +     * tcg_prologue_init on buf1, from which we'll start execution.
>>> +     */
>>> +    tcg_ctx.code_gen_buffer = code_gen_buf2;
>>> +    tcg_prologue_init(&tcg_ctx);
>>> +
>>
>> Ah, no.  Write only one prologue, not one per buffer.
>>
>> If they're sufficiently close (i.e. one allocation under the max size),
>> then the same one can be used for both halves.
>>
>> The global variables that you didn't see in this revision are:
>>
>> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c:    tcg_insn_unit *thunks[8] = { };
>> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];
> 
> Aside from the existing code structure, is there any reason to have only
> one prologue? 

Well, there's also the gdb jit unwind info.  But aside from those, no.

> It doesn't seem to be a large amount of code and in the
> case of having smaller translation regions I would posit having a
> "local" prologue/epilogue would make the jumps cheaper.

Not really.  The jumps are generally in range already, based on the restriction
on max buffer size.

Arm32 (and ppc32, post direct-jump atomicity patchset) are really the only
ones that require a tiny (less than 64MB) buffer.  For anything bigger than
64MB, I don't see any reason to create two independent buffers.

The other consideration not yet mentioned is that you'd like to put the
entire buffer, in the case of x86_64 and some others, within 2GB of the main
executable, so that helper calls can use a direct call insn.
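
The reachability test is the usual displacement check, sketched here
generically (not a quote of tcg/i386):

    /* x86_64 CALL rel32: the displacement is relative to the end of the
     * 5-byte insn and must fit in a signed 32 bits.  */
    intptr_t disp = (intptr_t)func - ((intptr_t)s->code_ptr + 5);
    if (disp == (int32_t)disp) {
        /* in range: emit 0xE8 plus the 32-bit displacement */
    } else {
        /* out of range: load the 64-bit address, then call via register */
    }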



r~


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-26  6:32           ` Alex Bennée
@ 2016-04-30  3:40             ` Emilio G. Cota
  2016-05-09 11:21               ` Paolo Bonzini
  0 siblings, 1 reply; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-30  3:40 UTC (permalink / raw)
  To: Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov

On Tue, Apr 26, 2016 at 07:32:39 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
> > With two code_gen "halves", if two tb_flush calls are done in the same
> > RCU read critical section, we're screwed. I added a cpu_exit at the end
> > of tb_flush to try to mitigate this, but I haven't audited all the callers
> > (for instance, what does the gdbstub do?).
> 
> I'm not sure we are going to get much from this approach. The tb_flush
> is a fairly rare occurrence its not like its on the critical performance
> path (although of course pathological cases are possible).

This is what I thought from the beginning, but wanted to give this
alternative a go anyway to see if it was feasible.

On my end I won't do any more work on this approach. Will go back
to locks, despite Paolo's (justified) dislike for them =)

> > If we end up having a mechanism to "stop all  CPUs to do something", as
> > I think we'll end up needing for correct LL/SC emulation, we'll probably
> > be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> > wasting memory.
> 
> I'm fairly certain there will need to be a "stop everything" mode for
> some things - I'm less certain of the best way of doing it. Did you get
> a chance to look at my version of the async_safe_work mechanism?

Not yet, but will get to it very soon.

Cheers,

		Emilio


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-04-30  3:40             ` Emilio G. Cota
@ 2016-05-09 11:21               ` Paolo Bonzini
  2016-05-09 11:50                 ` Alex Bennée
  2016-05-09 17:07                 ` Emilio G. Cota
  0 siblings, 2 replies; 21+ messages in thread
From: Paolo Bonzini @ 2016-05-09 11:21 UTC (permalink / raw)
  To: Emilio G. Cota, Alex Bennée
  Cc: QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov



On 30/04/2016 05:40, Emilio G. Cota wrote:
>> The tb_flush
>> > is a fairly rare occurrence; it's not like it's on the critical performance
>> > path (although of course pathological cases are possible).
> This is what I thought from the beginning, but wanted to give this
> alternative a go anyway to see if it was feasible.
> 
> On my end I won't do any more work on this approach. Will go back
> to locks, despite Paolo's (justified) dislike for them =)

Which locks?  tb_lock during tb_find_fast?  The problem with that was
that it slowed down everything a lot, wasn't it?

To me, the RCU idea is not really about making tb_flush (the rare case)
faster; it was more about keeping the rest simple and fast.
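
The reader side stays about as simple as it gets, roughly (sketch,
modulo the actual signatures):

    TranslationBlock *tb;

    rcu_read_lock();
    tb = tb_find_fast(cpu);    /* may point into either half */
    cpu_tb_exec(cpu, tb);      /* the retiring half cannot be recycled
                                  until we leave the read section */
    rcu_read_unlock();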

Paolo


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 11:21               ` Paolo Bonzini
@ 2016-05-09 11:50                 ` Alex Bennée
  2016-05-09 13:55                   ` Paolo Bonzini
  2016-05-09 17:07                 ` Emilio G. Cota
  1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-05-09 11:50 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Paolo Bonzini <pbonzini@redhat.com> writes:

> On 30/04/2016 05:40, Emilio G. Cota wrote:
>>> The tb_flush
>>> > is a fairly rare occurrence; it's not like it's on the critical performance
>>> > path (although of course pathological cases are possible).
>> This is what I thought from the beginning, but wanted to give this
>> alternative a go anyway to see if it was feasible.
>>
>> On my end I won't do any more work on this approach. Will go back
>> to locks, despite Paolo's (justified) dislike for them =)
>
> Which locks?  tb_lock during tb_find_fast?  The problem with that was
> that it slowed down everything a lot, wasn't it?

Very much so. In the new tree (coming soon) with QHT I was able to
remove the locks from the whole hot path, which means they were only
needed for code generation.
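
The hot path then boils down to something like this (simplified sketch
of the pattern, not the exact WIP code):

    /* lock-free qht lookup first; take tb_lock only to generate code */
    tb = tb_find_physical(cpu, pc, cs_base, flags);
    if (unlikely(tb == NULL)) {
        tb_lock();
        /* recheck: someone may have generated it while we waited */
        tb = tb_find_physical(cpu, pc, cs_base, flags);
        if (tb == NULL) {
            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
        }
        tb_unlock();
    }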

> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.

I'm not sure it achieved that as there is added complexity from having
the split buffer and then ensuring you don't double-flush.

>
> Paolo


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 11:50                 ` Alex Bennée
@ 2016-05-09 13:55                   ` Paolo Bonzini
  2016-05-09 15:05                     ` Alex Bennée
  0 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2016-05-09 13:55 UTC (permalink / raw)
  To: Alex Bennée
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov



On 09/05/2016 13:50, Alex Bennée wrote:
> > Which locks?  tb_lock during tb_find_fast?  The problem with that was
> > that it slowed down everything a lot, wasn't it?
> 
> Very much so. In the new tree (coming soon) with QHT I was able to
> remove the locks from the whole hot path, which means they were only
> needed for code generation.

Okay, I'm curious now. :)

> > To me, the RCU idea is not really about making tb_flush (the rare case)
> > faster; it was more about keeping the rest simple and fast.
> 
> I'm not sure it achieved that as there is added complexity from having
> the split buffer and then ensuring you don't double-flush.

Agreed.

Paolo


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 13:55                   ` Paolo Bonzini
@ 2016-05-09 15:05                     ` Alex Bennée
  0 siblings, 0 replies; 21+ messages in thread
From: Alex Bennée @ 2016-05-09 15:05 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
	Richard Henderson, Sergey Fedorov


Paolo Bonzini <pbonzini@redhat.com> writes:

> On 09/05/2016 13:50, Alex Bennée wrote:
>> > Which locks?  tb_lock during tb_find_fast?  The problem with that was
>> > that it slowed down everything a lot, wasn't it?
>>
>> Very much so. In the new tree (coming soon) with QHT I was able to
>> remove the locks from the whole hot path, which means they were only
>> needed for code generation.
>
> Okay, I'm curious now. :)

https://github.com/stsquad/qemu/commits/mttcg/base-patches-v3 is the
current WIP, with:

https://github.com/stsquad/qemu/commit/0823f1c77f12ed5958f77484d6477ea205aee220

being the commit that clears the hot-path to run without locks.

The tree is based on tcg-next, which has made things a lot cleaner now
that a bunch of Sergey's stuff has been grabbed by rth. Obviously,
being WIP, it is subject to change. Once I'm done with my current
out-of-tree diversions I'll be back to cleaning the tree up for the
next review round.

Review comments on the posted tree are always welcome, of course ;-)

>
>> > To me, the RCU idea is not really about making tb_flush (the rare case)
>> > faster; it was more about keeping the rest simple and fast.
>>
>> I'm not sure it achieved that as there is added complexity from having
>> the split buffer and then ensuring you don't double-flush.
>
> Agreed.
>
> Paolo


--
Alex Bennée


* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
  2016-05-09 11:21               ` Paolo Bonzini
  2016-05-09 11:50                 ` Alex Bennée
@ 2016-05-09 17:07                 ` Emilio G. Cota
  1 sibling, 0 replies; 21+ messages in thread
From: Emilio G. Cota @ 2016-05-09 17:07 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Alex Bennée, QEMU Developers, MTTCG Devel,
	Peter Crosthwaite, Richard Henderson, Sergey Fedorov

On Mon, May 09, 2016 at 13:21:50 +0200, Paolo Bonzini wrote:
> On 30/04/2016 05:40, Emilio G. Cota wrote:
> >> The tb_flush
> >> > is a fairly rare occurrence; it's not like it's on the critical performance
> >> > path (although of course pathological cases are possible).
> > This is what I thought from the beginning, but wanted to give this
> > alternative a go anyway to see if it was feasible.
> > 
> > On my end I won't do any more work on this approach. Will go back
> > to locks, despite Paolo's (justified) dislike for them =)
> 
> Which locks?  tb_lock during tb_find_fast?  The problem with that was
> that it slowed down everything a lot, wasn't it?

By "locks" I meant somehow forcing other threads/vcpus to stop, to then
perform tb_flush. This can be achieved with a bunch of primitives
such as condition variables (*gaaah*) -- these of course involve "locks" :P
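
On the flushing side it would look roughly like this (sketch, with
invented names):

    CPUState *other;

    qemu_mutex_lock(&quiesce_lock);
    pending_cpus = smp_cpus - 1;
    CPU_FOREACH(other) {
        if (other != cpu) {
            cpu_exit(other);            /* kick it out of the TB loop */
        }
    }
    while (pending_cpus) {
        /* halting vcpus decrement pending_cpus and signal the cond */
        qemu_cond_wait(&quiesce_cond, &quiesce_lock);
    }
    do_tb_flush();                      /* everyone stopped: safe */
    qemu_mutex_unlock(&quiesce_lock);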

The removal of tb_lock() when looking up tb's is orthogonal to this.

> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.

Well, I agree with this idea; I wanted to see how far we could take it.

		E.


end of thread

Thread overview: 21+ messages
2016-04-22  0:06 [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU Emilio G. Cota
2016-04-22 14:41 ` Alex Bennée
2016-04-22 14:47   ` Alex Bennée
2016-04-24  3:20   ` Emilio G. Cota
2016-04-25  8:35     ` Alex Bennée
2016-04-22 18:25 ` Richard Henderson
2016-04-24  3:27   ` [Qemu-devel] [RFC v2] " Emilio G. Cota
2016-04-24 18:12     ` Richard Henderson
2016-04-25 15:19     ` Alex Bennée
2016-04-25 15:25       ` Emilio G. Cota
2016-04-25 23:46         ` [Qemu-devel] [RFC v3] " Emilio G. Cota
2016-04-26  4:48           ` Richard Henderson
2016-04-26  6:35             ` Alex Bennée
2016-04-26 15:42               ` Richard Henderson
2016-04-26  6:32           ` Alex Bennée
2016-04-30  3:40             ` Emilio G. Cota
2016-05-09 11:21               ` Paolo Bonzini
2016-05-09 11:50                 ` Alex Bennée
2016-05-09 13:55                   ` Paolo Bonzini
2016-05-09 15:05                     ` Alex Bennée
2016-05-09 17:07                 ` Emilio G. Cota
