* [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
@ 2016-04-22 0:06 Emilio G. Cota
2016-04-22 14:41 ` Alex Bennée
2016-04-22 18:25 ` Richard Henderson
0 siblings, 2 replies; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-22 0:06 UTC (permalink / raw)
To: QEMU Developers, MTTCG Devel
Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.
Context:
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
Known issues:
- Basically compile-tested only, since I've only run this with
single-threaded TCG; I also tried running it with linux-user,
but in order to trigger tb_flush I had to make code_gen_buffer
so small that the CPU calling tb_flush would immediately fill
the 2nd buffer, triggering the assert. If you have a working
multi-threaded workload that would be good to test this with, please
let me know.
- Windows; not even compile-tested!
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 117 insertions(+), 5 deletions(-)
diff --git a/translate-all.c b/translate-all.c
index bba9b62..4c14b4d 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
#endif
#ifdef USE_STATIC_CODE_GEN_BUFFER
-static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
+static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
__attribute__((aligned(CODE_GEN_ALIGN)));
+static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
+ __attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
# ifdef _WIN32
static inline void do_protect(void *addr, long size, int prot)
@@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
}
# endif /* WIN32 */
-static inline void *alloc_code_gen_buffer(void)
+static void *alloc_static_code_gen_buffer(void *buf)
{
- void *buf = static_code_gen_buffer;
size_t full_size, size;
/* The size of the buffer, rounded down to end on a page boundary. */
- full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+ full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
& qemu_real_host_page_mask) - (uintptr_t)buf;
/* Reserve a guard page. */
@@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
return buf;
}
+
+static inline void *alloc_code_gen_buffer(void)
+{
+ static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
+ static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
+
+ assert(static_buf_mask == 1);
+ return static_buf1;
+}
#elif defined(_WIN32)
static inline void *alloc_code_gen_buffer(void)
{
@@ -829,8 +842,100 @@ static void page_flush_tb(void)
}
}
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+ struct rcu_head rcu;
+ int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+ struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+ tb_lock();
+ static_buf_mask &= ~desc->clear_bit;
+ tb_unlock();
+ g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+ struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+ /*
+ * If both bits are set, we're having two concurrent flushes. This
+ * can easily happen if the buffers are heavily undersized.
+ */
+ assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+ desc->clear_bit = static_buf_mask;
+ call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+ if (static_buf_mask == 1) {
+ static_buf_mask |= 2;
+ return static_buf2;
+ }
+ static_buf_mask |= 1;
+ return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+ struct rcu_head rcu;
+ void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+ struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+ VirtualFree(desc->buf, 0, MEM_RELEASE);
+ g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+ struct code_gen_desc *desc;
+
+ desc = g_malloc0(sizeof(*desc));
+ desc->buf = tcg_ctx.code_gen_buffer;
+ call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+ return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+ struct rcu_head rcu;
+ void *buf;
+ size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+ struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+ munmap(desc->buf, desc->size + qemu_real_host_page_size);
+ g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+ struct code_gen_desc *desc;
+
+ desc = g_malloc0(sizeof(*desc));
+ desc->buf = tcg_ctx.code_gen_buffer;
+ desc->size = tcg_ctx.code_gen_buffer_size;
+ call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+ return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
/* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe */
void tb_flush(CPUState *cpu)
{
#if defined(DEBUG_FLUSH)
@@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
page_flush_tb();
+ tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+ tcg_prologue_init(&tcg_ctx);
/* XXX: flush processor icache at this point if cache flush is
expensive */
tcg_ctx.tb_ctx.tb_flush_count++;
+
+ /* exit all CPUs so that the old buffer is quickly cleared. */
+ CPU_FOREACH(cpu) {
+ cpu_exit(cpu);
+ }
}
#ifdef DEBUG_TB_CHECK
--
2.5.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
2016-04-22 0:06 [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU Emilio G. Cota
@ 2016-04-22 14:41 ` Alex Bennée
2016-04-22 14:47 ` Alex Bennée
2016-04-24 3:20 ` Emilio G. Cota
2016-04-22 18:25 ` Richard Henderson
1 sibling, 2 replies; 21+ messages in thread
From: Alex Bennée @ 2016-04-22 14:41 UTC (permalink / raw)
To: Emilio G. Cota
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Emilio G. Cota <cota@braap.org> writes:
> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> Known issues:
> - Basically compile-tested only, since I've only run this with
> single-threaded TCG; I also tried running it with linux-user,
> but in order to trigger tb_flush I had to make code_gen_buffer
> so small that the CPU calling tb_flush would immediately fill
> the 2nd buffer, triggering the assert. If you have a working
> multi-threaded workload that would be good to test this, please
> let me know.
With my latest mttcg unit tests:
./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
-device virtio-serial-device -device virtconsole,chardev=ctd \
-chardev testdev,id=ctd -display none -serial stdio \
-kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
-append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on
> - Windows; not even compile-tested!
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
> translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 117 insertions(+), 5 deletions(-)
>
> diff --git a/translate-all.c b/translate-all.c
> index bba9b62..4c14b4d 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
> #endif
>
> #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
> __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> + __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;
>
> # ifdef _WIN32
> static inline void do_protect(void *addr, long size, int prot)
> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
> }
> # endif /* WIN32 */
>
> -static inline void *alloc_code_gen_buffer(void)
> +static void *alloc_static_code_gen_buffer(void *buf)
> {
> - void *buf = static_code_gen_buffer;
> size_t full_size, size;
>
> /* The size of the buffer, rounded down to end on a page boundary. */
> - full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> + full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
> & qemu_real_host_page_mask) - (uintptr_t)buf;
>
> /* Reserve a guard page. */
> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
>
> return buf;
> }
> +
> +static inline void *alloc_code_gen_buffer(void)
> +{
> + static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
> + static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
> +
> + assert(static_buf_mask == 1);
> + return static_buf1;
> +}
> #elif defined(_WIN32)
> static inline void *alloc_code_gen_buffer(void)
> {
> @@ -829,8 +842,100 @@ static void page_flush_tb(void)
> }
> }
>
> +#ifdef USE_STATIC_CODE_GEN_BUFFER
> +
> +struct code_gen_desc {
> + struct rcu_head rcu;
> + int clear_bit;
> +};
> +
> +static void code_gen_buffer_clear(struct rcu_head *rcu)
> +{
> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> + tb_lock();
> + static_buf_mask &= ~desc->clear_bit;
> + tb_unlock();
> + g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> + struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
> +
> + /*
> + * If both bits are set, we're having two concurrent flushes. This
> + * can easily happen if the buffers are heavily undersized.
> + */
> + assert(static_buf_mask == 1 || static_buf_mask == 2);
> +
> + desc->clear_bit = static_buf_mask;
> + call_rcu1(&desc->rcu, code_gen_buffer_clear);
> +
> + if (static_buf_mask == 1) {
> + static_buf_mask |= 2;
> + return static_buf2;
> + }
> + static_buf_mask |= 1;
> + return static_buf1;
> +}
> +
> +#elif defined(_WIN32)
> +
> +struct code_gen_desc {
> + struct rcu_head rcu;
> + void *buf;
> +};
> +
> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
> +{
> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> + VirtualFree(desc->buf, 0, MEM_RELEASE);
> + g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> + struct code_gen_desc *desc;
> +
> + desc = g_malloc0(sizeof(*desc));
> + desc->buf = tcg_ctx.code_gen_buffer;
> + call_rcu1(&desc->rcu, code_gen_buffer_vfree);
> +
> + return alloc_code_gen_buffer();
> +}
> +
> +#else /* UNIX, dynamically-allocated code buffer */
> +
> +struct code_gen_desc {
> + struct rcu_head rcu;
> + void *buf;
> + size_t size;
> +};
> +
> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
> +{
> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> + munmap(desc->buf, desc->size + qemu_real_host_page_size);
> + g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> + struct code_gen_desc *desc;
> +
> + desc = g_malloc0(sizeof(*desc));
> + desc->buf = tcg_ctx.code_gen_buffer;
> + desc->size = tcg_ctx.code_gen_buffer_size;
> + call_rcu1(&desc->rcu, code_gen_buffer_unmap);
> +
> + return alloc_code_gen_buffer();
> +}
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
> +
> /* flush all the translation blocks */
> -/* XXX: tb_flush is currently not thread safe */
> void tb_flush(CPUState *cpu)
> {
> #if defined(DEBUG_FLUSH)
> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
> qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
> page_flush_tb();
>
> + tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
> tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
> + tcg_prologue_init(&tcg_ctx);
> /* XXX: flush processor icache at this point if cache flush is
> expensive */
> tcg_ctx.tb_ctx.tb_flush_count++;
> +
> + /* exit all CPUs so that the old buffer is quickly cleared. */
> + CPU_FOREACH(cpu) {
> + cpu_exit(cpu);
> + }
> }
>
> #ifdef DEBUG_TB_CHECK
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
2016-04-22 14:41 ` Alex Bennée
@ 2016-04-22 14:47 ` Alex Bennée
2016-04-24 3:20 ` Emilio G. Cota
1 sibling, 0 replies; 21+ messages in thread
From: Alex Bennée @ 2016-04-22 14:47 UTC (permalink / raw)
To: Emilio G. Cota
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Alex Bennée <alex.bennee@linaro.org> writes:
> Emilio G. Cota <cota@braap.org> writes:
>
>> This is a first attempt at making tb_flush not have to stop all CPUs.
>> There are issues as pointed out below, but this could be a good start.
>>
>> Context:
>> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>>
>> Known issues:
>> - Basically compile-tested only, since I've only run this with
>> single-threaded TCG; I also tried running it with linux-user,
>> but in order to trigger tb_flush I had to make code_gen_buffer
>> so small that the CPU calling tb_flush would immediately fill
>> the 2nd buffer, triggering the assert. If you have a working
>> multi-threaded workload that would be good to test this, please
>> let me know.
>
> With my latest mttcg unit tests:
>
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
> -device virtio-serial-device -device virtconsole,chardev=ctd \
> -chardev testdev,id=ctd -display none -serial stdio \
> -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
> -append "tight smc irq mod=1 rounds=100000" -name
> arm,debug-threads=on
Ahh, I just realised you wanted a linux-user workload.
>
>
>> - Windows; not even compile-tested!
>>
>> Signed-off-by: Emilio G. Cota <cota@braap.org>
>> ---
>> translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 117 insertions(+), 5 deletions(-)
>>
>> diff --git a/translate-all.c b/translate-all.c
>> index bba9b62..4c14b4d 100644
>> --- a/translate-all.c
>> +++ b/translate-all.c
>> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
>> #endif
>>
>> #ifdef USE_STATIC_CODE_GEN_BUFFER
>> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
>> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>> __attribute__((aligned(CODE_GEN_ALIGN)));
>> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
>> + __attribute__((aligned(CODE_GEN_ALIGN)));
>> +static int static_buf_mask = 1;
>> +static void *static_buf1;
>> +static void *static_buf2;
>>
>> # ifdef _WIN32
>> static inline void do_protect(void *addr, long size, int prot)
>> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
>> }
>> # endif /* WIN32 */
>>
>> -static inline void *alloc_code_gen_buffer(void)
>> +static void *alloc_static_code_gen_buffer(void *buf)
>> {
>> - void *buf = static_code_gen_buffer;
>> size_t full_size, size;
>>
>> /* The size of the buffer, rounded down to end on a page boundary. */
>> - full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
>> + full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
>> & qemu_real_host_page_mask) - (uintptr_t)buf;
>>
>> /* Reserve a guard page. */
>> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
>>
>> return buf;
>> }
>> +
>> +static inline void *alloc_code_gen_buffer(void)
>> +{
>> + static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
>> + static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
>> +
>> + assert(static_buf_mask == 1);
>> + return static_buf1;
>> +}
>> #elif defined(_WIN32)
>> static inline void *alloc_code_gen_buffer(void)
>> {
>> @@ -829,8 +842,100 @@ static void page_flush_tb(void)
>> }
>> }
>>
>> +#ifdef USE_STATIC_CODE_GEN_BUFFER
>> +
>> +struct code_gen_desc {
>> + struct rcu_head rcu;
>> + int clear_bit;
>> +};
>> +
>> +static void code_gen_buffer_clear(struct rcu_head *rcu)
>> +{
>> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> + tb_lock();
>> + static_buf_mask &= ~desc->clear_bit;
>> + tb_unlock();
>> + g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> + struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
>> +
>> + /*
>> + * If both bits are set, we're having two concurrent flushes. This
>> + * can easily happen if the buffers are heavily undersized.
>> + */
>> + assert(static_buf_mask == 1 || static_buf_mask == 2);
>> +
>> + desc->clear_bit = static_buf_mask;
>> + call_rcu1(&desc->rcu, code_gen_buffer_clear);
>> +
>> + if (static_buf_mask == 1) {
>> + static_buf_mask |= 2;
>> + return static_buf2;
>> + }
>> + static_buf_mask |= 1;
>> + return static_buf1;
>> +}
>> +
>> +#elif defined(_WIN32)
>> +
>> +struct code_gen_desc {
>> + struct rcu_head rcu;
>> + void *buf;
>> +};
>> +
>> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
>> +{
>> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> + VirtualFree(desc->buf, 0, MEM_RELEASE);
>> + g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> + struct code_gen_desc *desc;
>> +
>> + desc = g_malloc0(sizeof(*desc));
>> + desc->buf = tcg_ctx.code_gen_buffer;
>> + call_rcu1(&desc->rcu, code_gen_buffer_vfree);
>> +
>> + return alloc_code_gen_buffer();
>> +}
>> +
>> +#else /* UNIX, dynamically-allocated code buffer */
>> +
>> +struct code_gen_desc {
>> + struct rcu_head rcu;
>> + void *buf;
>> + size_t size;
>> +};
>> +
>> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
>> +{
>> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> + munmap(desc->buf, desc->size + qemu_real_host_page_size);
>> + g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> + struct code_gen_desc *desc;
>> +
>> + desc = g_malloc0(sizeof(*desc));
>> + desc->buf = tcg_ctx.code_gen_buffer;
>> + desc->size = tcg_ctx.code_gen_buffer_size;
>> + call_rcu1(&desc->rcu, code_gen_buffer_unmap);
>> +
>> + return alloc_code_gen_buffer();
>> +}
>> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
>> +
>> /* flush all the translation blocks */
>> -/* XXX: tb_flush is currently not thread safe */
>> void tb_flush(CPUState *cpu)
>> {
>> #if defined(DEBUG_FLUSH)
>> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
>> qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
>> page_flush_tb();
>>
>> + tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
>> tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
>> + tcg_prologue_init(&tcg_ctx);
>> /* XXX: flush processor icache at this point if cache flush is
>> expensive */
>> tcg_ctx.tb_ctx.tb_flush_count++;
>> +
>> + /* exit all CPUs so that the old buffer is quickly cleared. */
>> + CPU_FOREACH(cpu) {
>> + cpu_exit(cpu);
>> + }
>> }
>>
>> #ifdef DEBUG_TB_CHECK
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
2016-04-22 0:06 [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU Emilio G. Cota
2016-04-22 14:41 ` Alex Bennée
@ 2016-04-22 18:25 ` Richard Henderson
2016-04-24 3:27 ` [Qemu-devel] [RFC v2] " Emilio G. Cota
1 sibling, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2016-04-22 18:25 UTC (permalink / raw)
To: Emilio G. Cota, QEMU Developers, MTTCG Devel
Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov
On 04/21/2016 05:06 PM, Emilio G. Cota wrote:
> #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
> __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> + __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;
I don't like this at all.
(1) This is (by default) 32MB we're adding to the RSS of the
simulator. Surely we can do better than this.
(2) On some hosts we require a maximum displacement from
any point in the code gen buffer from the tcg prologue.
That means you can't simply allocate two separate buffers.
You have to take a single buffer, of known good size and
alignment, and split it in half.
r~
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
2016-04-22 14:41 ` Alex Bennée
2016-04-22 14:47 ` Alex Bennée
@ 2016-04-24 3:20 ` Emilio G. Cota
2016-04-25 8:35 ` Alex Bennée
1 sibling, 1 reply; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-24 3:20 UTC (permalink / raw)
To: Alex Bennée
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
(snip)
> > Known issues:
> > - Basically compile-tested only, since I've only run this with
> > single-threaded TCG; I also tried running it with linux-user,
> > but in order to trigger tb_flush I had to make code_gen_buffer
> > so small that the CPU calling tb_flush would immediately fill
> > the 2nd buffer, triggering the assert. If you have a working
> > multi-threaded workload that would be good to test this, please
> > let me know.
>
> With my latest mttcg unit tests:
>
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
> -device virtio-serial-device -device virtconsole,chardev=ctd \
> -chardev testdev,id=ctd -display none -serial stdio \
> -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
> -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on
This is useful. Never mind the need for testing linux-user, I can test
both code paths (i.e. dynamic allocation and static buf) with qemu-system
by simply defining USE_STATIC_CODE_GEN_BUFFER.
After applying a modified version of this patch (that I'll send in
a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
"translate-all: introduces tb_flush_safe"), I can easily trigger
this error when setting a low enough TB size, e.g. -tb-size 32:
CPU1: online and setting up with pattern 0xa0b78cbf
CPU2: online and setting up with pattern 0x22287c45
CPU3: online and setting up with pattern 0x6262c5c5
CPU0: online and setting up with pattern 0xa65e7ad6
qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
PASS: smc: irq: 0 errors, IRQs not checked
Unhandled exception 3 (pabt)
Exception frame registers:
pc : [<e59f2028>] lr : [<40010700>] psr: a0000153
sp : 400ac5c0 ip : 400ab4e8 fp : 40032ca8
r10: 00000000 r9 : 00000000 r8 : 00000000
r7 : 00000000 r6 : 00000000 r5 : 00000000 r4 : 00000000
r3 : 00000000 r2 : 00000000 r1 : e59f2028 r0 : 00000000
Flags: NzCv IRQs on FIQs off Mode SVC_32
Control: 00c5107d Table: 40060000 DAC: 00000000
IFAR: e59f2028 IFSR: 00000205
Any input on where to look would be appreciated. Thanks,
Emilio
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
2016-04-22 18:25 ` Richard Henderson
@ 2016-04-24 3:27 ` Emilio G. Cota
2016-04-24 18:12 ` Richard Henderson
2016-04-25 15:19 ` Alex Bennée
0 siblings, 2 replies; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-24 3:27 UTC (permalink / raw)
To: QEMU Developers, MTTCG Devel
Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
[ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
reverting "translate-all: introduces tb_flush_safe". A trivial
conflict must be solved after applying. ]
This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.
Context:
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
Changes from v1:
- When a static buffer is used, split it in two instead of using
a second buffer.
Known issues:
- Fails Alex' unit test with low enough -tb-size, see
https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
Seems to work in MTTCG, although I've only tested with tb_lock
always being taken in tb_find_fast.
- Windows; not even compile-tested!
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
translate-all.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 133 insertions(+), 13 deletions(-)
diff --git a/translate-all.c b/translate-all.c
index 8e70583..6830371 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -535,6 +535,9 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
#ifdef USE_STATIC_CODE_GEN_BUFFER
static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
__attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
# ifdef _WIN32
static inline void do_protect(void *addr, long size, int prot)
@@ -577,6 +580,13 @@ static inline void map_none(void *addr, long size)
}
# endif /* WIN32 */
+static void map_static_code_gen_buffer(void *buf, size_t size)
+{
+ map_exec(buf, size);
+ map_none(buf + size, qemu_real_host_page_size);
+ qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+}
+
static inline void *alloc_code_gen_buffer(void)
{
void *buf = static_code_gen_buffer;
@@ -586,28 +596,41 @@ static inline void *alloc_code_gen_buffer(void)
full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
& qemu_real_host_page_mask) - (uintptr_t)buf;
- /* Reserve a guard page. */
- size = full_size - qemu_real_host_page_size;
+ /*
+ * Reserve two guard pages, one after each of the two buffers:
+ * | buf1 |g1| buf2 |g2|
+ */
+ size = full_size - 2 * qemu_real_host_page_size;
/* Honor a command-line option limiting the size of the buffer. */
if (size > tcg_ctx.code_gen_buffer_size) {
size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
& qemu_real_host_page_mask) - (uintptr_t)buf;
}
- tcg_ctx.code_gen_buffer_size = size;
#ifdef __mips__
- if (cross_256mb(buf, size)) {
- buf = split_cross_256mb(buf, size);
- size = tcg_ctx.code_gen_buffer_size;
+ /*
+ * Pass 'size + page_size', since we want 'buf1 | guard1 | buf2' to be
+ * within the boundary.
+ */
+ if (cross_256mb(buf, size + qemu_real_host_page_size)) {
+ buf = split_cross_256mb(buf, size + qemu_real_host_page_size);
+ size = tcg_ctx.code_gen_buffer_size - qemu_real_host_page_size;
}
#endif
- map_exec(buf, size);
- map_none(buf + size, qemu_real_host_page_size);
- qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+ /* split the buffer in two */
+ size /= 2;
+ tcg_ctx.code_gen_buffer_size = size;
- return buf;
+ static_buf1 = buf;
+ static_buf2 = buf + size + qemu_real_host_page_mask;
+
+ map_static_code_gen_buffer(static_buf1, size);
+ map_static_code_gen_buffer(static_buf2, size);
+
+ assert(static_buf_mask == 1);
+ return static_buf1;
}
#elif defined(_WIN32)
static inline void *alloc_code_gen_buffer(void)
@@ -825,10 +848,100 @@ static void page_flush_tb(void)
}
}
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+ struct rcu_head rcu;
+ int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+ struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+ tb_lock();
+ static_buf_mask &= ~desc->clear_bit;
+ tb_unlock();
+ g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+ struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+ /*
+ * If both bits are set, we're having two concurrent flushes. This
+ * can easily happen if the buffers are heavily undersized.
+ */
+ assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+ desc->clear_bit = static_buf_mask;
+ call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+ if (static_buf_mask == 1) {
+ static_buf_mask |= 2;
+ return static_buf2;
+ }
+ static_buf_mask |= 1;
+ return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+ struct rcu_head rcu;
+ void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+ struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+ VirtualFree(desc->buf, 0, MEM_RELEASE);
+ g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+ struct code_gen_desc *desc;
+
+ desc = g_malloc0(sizeof(*desc));
+ desc->buf = tcg_ctx.code_gen_buffer;
+ call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+ return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+ struct rcu_head rcu;
+ void *buf;
+ size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+ struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+ munmap(desc->buf, desc->size + qemu_real_host_page_size);
+ g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+ struct code_gen_desc *desc;
+
+ desc = g_malloc0(sizeof(*desc));
+ desc->buf = tcg_ctx.code_gen_buffer;
+ desc->size = tcg_ctx.code_gen_buffer_size;
+ call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+ return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
/* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe. System emulation calls it only
- * with tb_lock taken or from safe_work, so no need to take tb_lock here.
- */
void tb_flush(CPUState *cpu)
{
#if defined(DEBUG_FLUSH)
@@ -852,10 +965,17 @@ void tb_flush(CPUState *cpu)
memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
page_flush_tb();
+ tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+ tcg_prologue_init(&tcg_ctx);
/* XXX: flush processor icache at this point if cache flush is
expensive */
tcg_ctx.tb_ctx.tb_flush_count++;
+
+ /* exit all CPUs so that the old buffer is quickly cleared. */
+ CPU_FOREACH(cpu) {
+ cpu_exit(cpu);
+ }
}
#ifdef DEBUG_TB_CHECK
--
2.5.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
2016-04-24 3:27 ` [Qemu-devel] [RFC v2] " Emilio G. Cota
@ 2016-04-24 18:12 ` Richard Henderson
2016-04-25 15:19 ` Alex Bennée
1 sibling, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-04-24 18:12 UTC (permalink / raw)
To: Emilio G. Cota, QEMU Developers, MTTCG Devel
Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov
On 04/23/2016 08:27 PM, Emilio G. Cota wrote:
> [ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
> reverting "translate-all: introduces tb_flush_safe". A trivial
> conflict must be solved after applying. ]
>
> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
I will again say that I don't believe that wasting all of this memory is as
good as using locks -- tb_flush doesn't happen *that* often.
> +static void map_static_code_gen_buffer(void *buf, size_t size)
> +{
> + map_exec(buf, size);
> + map_none(buf + size, qemu_real_host_page_size);
> + qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
> +}
Nit: I know it's only startup, but there's no reason to make multiple map_exec
or madvise calls. You can cover the entire buffer in one go, and then call
map_none on the guard pages.
> +#ifdef USE_STATIC_CODE_GEN_BUFFER
...
> +#elif defined(_WIN32)
...
> +#else /* UNIX, dynamically-allocated code buffer */
...
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
I'm not keen on your dynamic allocation implementations. Why not split the one
dynamic buffer the same way as the static buffer? We are talking about >=
256MB here, after all.
> + tcg_prologue_init(&tcg_ctx);
We have some global variables in the tcg backends that are initialized by
tcg_prologue_init. I don't think we should be calling it again without locks
being involved.
Of course, you don't have to call it again if you split one buffer. Then you
also get to share the same rcu implementation.
r~
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU
2016-04-24 3:20 ` Emilio G. Cota
@ 2016-04-25 8:35 ` Alex Bennée
0 siblings, 0 replies; 21+ messages in thread
From: Alex Bennée @ 2016-04-25 8:35 UTC (permalink / raw)
To: Emilio G. Cota
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Emilio G. Cota <cota@braap.org> writes:
> On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
>> Emilio G. Cota <cota@braap.org> writes:
> (snip)
>> > Known issues:
>> > - Basically compile-tested only, since I've only run this with
>> > single-threaded TCG; I also tried running it with linux-user,
>> > but in order to trigger tb_flush I had to make code_gen_buffer
>> > so small that the CPU calling tb_flush would immediately fill
>> > the 2nd buffer, triggering the assert. If you have a working
>> > multi-threaded workload that would be good to test this, please
>> > let me know.
>>
>> With my latest mttcg unit tests:
>>
>> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>> -device virtio-serial-device -device virtconsole,chardev=ctd \
>> -chardev testdev,id=ctd -display none -serial stdio \
>> -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>> -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on
>
> This is useful. Never mind the need for testing linux-user, I can test
> both code paths (i.e. dynamic allocation and static buf) with qemu-system
> by simply defining USE_STATIC_CODE_GEN_BUFFER.
>
> After applying a modified version of this patch (that I'll send in
> a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
> "translate-all: introduces tb_flush_safe"), I can easily trigger
> this error when setting a low enough TB size, e.g. -tb-size 32:
>
> CPU1: online and setting up with pattern 0xa0b78cbf
> CPU2: online and setting up with pattern 0x22287c45
> CPU3: online and setting up with pattern 0x6262c5c5
> CPU0: online and setting up with pattern 0xa65e7ad6
> qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
> qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
> qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
> qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
> qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
> PASS: smc: irq: 0 errors, IRQs not checked
> Unhandled exception 3 (pabt)
> Exception frame registers:
> pc : [<e59f2028>] lr : [<40010700>] psr: a0000153
> sp : 400ac5c0 ip : 400ab4e8 fp : 40032ca8
> r10: 00000000 r9 : 00000000 r8 : 00000000
> r7 : 00000000 r6 : 00000000 r5 : 00000000 r4 : 00000000
> r3 : 00000000 r2 : 00000000 r1 : e59f2028 r0 : 00000000
> Flags: NzCv IRQs on FIQs off Mode SVC_32
> Control: 00c5107d Table: 40060000 DAC: 00000000
> IFAR: e59f2028 IFSR: 00000205
>
> Any input on where to look would be appreciated. Thanks,
I'll have a look and see if I can replicate.
>
> Emilio
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
2016-04-24 3:27 ` [Qemu-devel] [RFC v2] " Emilio G. Cota
2016-04-24 18:12 ` Richard Henderson
@ 2016-04-25 15:19 ` Alex Bennée
2016-04-25 15:25 ` Emilio G. Cota
1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-04-25 15:19 UTC (permalink / raw)
To: Emilio G. Cota
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Emilio G. Cota <cota@braap.org> writes:
> [ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
> reverting "translate-all: introduces tb_flush_safe". A trivial
> conflict must be solved after applying. ]
>
> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> Changes from v1:
> - When a static buffer is used, split it in two instead of using
> a second buffer.
>
> Known issues:
> - Fails Alex' unit test with low enough -tb-size, see
> https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
> Seems to work in MTTCG, although I've only tested with tb_lock
> always being taken in tb_find_fast.
With --enable-debug-tcg I get it failing pretty quickly:
#4 0x00005555556d332a in tcg_global_alloc (s=0x555556007ba0 <tcg_ctx>)
at /home/alex/lsrc/qemu/qemu.git/tcg/tcg.c:463
463 tcg_debug_assert(s->nb_globals == s->nb_temps);
(gdb) p s->nb_globals
$1 = 24
(gdb) p s->nb_temps
$2 = 31
Seems odd though, the other threads are all waiting on the tb_lock.
> - Windows; not even compile-tested!
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
> translate-all.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 133 insertions(+), 13 deletions(-)
>
> diff --git a/translate-all.c b/translate-all.c
> index 8e70583..6830371 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -535,6 +535,9 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
> #ifdef USE_STATIC_CODE_GEN_BUFFER
> static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;
>
> # ifdef _WIN32
> static inline void do_protect(void *addr, long size, int prot)
> @@ -577,6 +580,13 @@ static inline void map_none(void *addr, long size)
> }
> # endif /* WIN32 */
>
> +static void map_static_code_gen_buffer(void *buf, size_t size)
> +{
> + map_exec(buf, size);
> + map_none(buf + size, qemu_real_host_page_size);
> + qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
> +}
> +
> static inline void *alloc_code_gen_buffer(void)
> {
> void *buf = static_code_gen_buffer;
> @@ -586,28 +596,41 @@ static inline void *alloc_code_gen_buffer(void)
> full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> & qemu_real_host_page_mask) - (uintptr_t)buf;
>
> - /* Reserve a guard page. */
> - size = full_size - qemu_real_host_page_size;
> + /*
> + * Reserve two guard pages, one after each of the two buffers:
> + * | buf1 |g1| buf2 |g2|
> + */
> + size = full_size - 2 * qemu_real_host_page_size;
>
> /* Honor a command-line option limiting the size of the buffer. */
> if (size > tcg_ctx.code_gen_buffer_size) {
> size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
> & qemu_real_host_page_mask) - (uintptr_t)buf;
> }
> - tcg_ctx.code_gen_buffer_size = size;
>
> #ifdef __mips__
> - if (cross_256mb(buf, size)) {
> - buf = split_cross_256mb(buf, size);
> - size = tcg_ctx.code_gen_buffer_size;
> + /*
> + * Pass 'size + page_size', since we want 'buf1 | guard1 | buf2' to be
> + * within the boundary.
> + */
> + if (cross_256mb(buf, size + qemu_real_host_page_size)) {
> + buf = split_cross_256mb(buf, size + qemu_real_host_page_size);
> + size = tcg_ctx.code_gen_buffer_size - qemu_real_host_page_size;
> }
> #endif
>
> - map_exec(buf, size);
> - map_none(buf + size, qemu_real_host_page_size);
> - qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
> + /* split the buffer in two */
> + size /= 2;
> + tcg_ctx.code_gen_buffer_size = size;
>
> - return buf;
> + static_buf1 = buf;
> + static_buf2 = buf + size + qemu_real_host_page_mask;
> +
> + map_static_code_gen_buffer(static_buf1, size);
> + map_static_code_gen_buffer(static_buf2, size);
> +
> + assert(static_buf_mask == 1);
> + return static_buf1;
> }
> #elif defined(_WIN32)
> static inline void *alloc_code_gen_buffer(void)
> @@ -825,10 +848,100 @@ static void page_flush_tb(void)
> }
> }
>
> +#ifdef USE_STATIC_CODE_GEN_BUFFER
> +
> +struct code_gen_desc {
> + struct rcu_head rcu;
> + int clear_bit;
> +};
> +
> +static void code_gen_buffer_clear(struct rcu_head *rcu)
> +{
> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> + tb_lock();
> + static_buf_mask &= ~desc->clear_bit;
> + tb_unlock();
> + g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> + struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
> +
> + /*
> + * If both bits are set, we're having two concurrent flushes. This
> + * can easily happen if the buffers are heavily undersized.
> + */
> + assert(static_buf_mask == 1 || static_buf_mask == 2);
> +
> + desc->clear_bit = static_buf_mask;
> + call_rcu1(&desc->rcu, code_gen_buffer_clear);
> +
> + if (static_buf_mask == 1) {
> + static_buf_mask |= 2;
> + return static_buf2;
> + }
> + static_buf_mask |= 1;
> + return static_buf1;
> +}
> +
> +#elif defined(_WIN32)
> +
> +struct code_gen_desc {
> + struct rcu_head rcu;
> + void *buf;
> +};
> +
> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
> +{
> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> + VirtualFree(desc->buf, 0, MEM_RELEASE);
> + g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> + struct code_gen_desc *desc;
> +
> + desc = g_malloc0(sizeof(*desc));
> + desc->buf = tcg_ctx.code_gen_buffer;
> + call_rcu1(&desc->rcu, code_gen_buffer_vfree);
> +
> + return alloc_code_gen_buffer();
> +}
> +
> +#else /* UNIX, dynamically-allocated code buffer */
> +
> +struct code_gen_desc {
> + struct rcu_head rcu;
> + void *buf;
> + size_t size;
> +};
> +
> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
> +{
> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> + munmap(desc->buf, desc->size + qemu_real_host_page_size);
> + g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> + struct code_gen_desc *desc;
> +
> + desc = g_malloc0(sizeof(*desc));
> + desc->buf = tcg_ctx.code_gen_buffer;
> + desc->size = tcg_ctx.code_gen_buffer_size;
> + call_rcu1(&desc->rcu, code_gen_buffer_unmap);
> +
> + return alloc_code_gen_buffer();
> +}
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
> +
> /* flush all the translation blocks */
> -/* XXX: tb_flush is currently not thread safe. System emulation calls it only
> - * with tb_lock taken or from safe_work, so no need to take tb_lock here.
> - */
> void tb_flush(CPUState *cpu)
> {
> #if defined(DEBUG_FLUSH)
> @@ -852,10 +965,17 @@ void tb_flush(CPUState *cpu)
> memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
> page_flush_tb();
>
> + tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
> tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
> + tcg_prologue_init(&tcg_ctx);
> /* XXX: flush processor icache at this point if cache flush is
> expensive */
> tcg_ctx.tb_ctx.tb_flush_count++;
> +
> + /* exit all CPUs so that the old buffer is quickly cleared. */
> + CPU_FOREACH(cpu) {
> + cpu_exit(cpu);
> + }
> }
>
> #ifdef DEBUG_TB_CHECK
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v2] translate-all: protect code_gen_buffer with RCU
2016-04-25 15:19 ` Alex Bennée
@ 2016-04-25 15:25 ` Emilio G. Cota
2016-04-25 23:46 ` [Qemu-devel] [RFC v3] " Emilio G. Cota
0 siblings, 1 reply; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-25 15:25 UTC (permalink / raw)
To: Alex Bennée
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
On Mon, Apr 25, 2016 at 16:19:59 +0100, Alex Bennée wrote:
>
> Emilio G. Cota <cota@braap.org> writes:
>
> > [ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after
> > reverting "translate-all: introduces tb_flush_safe". A trivial
> > conflict must be solved after applying. ]
> >
> > This is a first attempt at making tb_flush not have to stop all CPUs.
> > There are issues as pointed out below, but this could be a good start.
> >
> > Context:
> > https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
> > https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
> >
> > Changes from v1:
> > - When a static buffer is used, split it in two instead of using
> > a second buffer.
> >
> > Known issues:
> > - Fails Alex' unit test with low enough -tb-size, see
> > https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html
> > Seems to work in MTTCG, although I've only tested with tb_lock
> > always being taken in tb_find_fast.
>
> With --enable-debug-tcg I get it failing pretty quickly:
>
> #4 0x00005555556d332a in tcg_global_alloc (s=0x555556007ba0 <tcg_ctx>)
> at /home/alex/lsrc/qemu/qemu.git/tcg/tcg.c:463
> 463 tcg_debug_assert(s->nb_globals == s->nb_temps);
> (gdb) p s->nb_globals
> $1 = 24
> (gdb) p s->nb_temps
> $2 = 31
> Seems odd though, the other threads are all waiting on the tb_lock.
It must be the tcg_prologue_init call, as Richard pointed out.
I'm on it, will report back.
Thanks,
E.
^ permalink raw reply [flat|nested] 21+ messages in thread
* [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-04-25 15:25 ` Emilio G. Cota
@ 2016-04-25 23:46 ` Emilio G. Cota
2016-04-26 4:48 ` Richard Henderson
2016-04-26 6:32 ` Alex Bennée
0 siblings, 2 replies; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-25 23:46 UTC (permalink / raw)
To: QEMU Developers, MTTCG Devel
Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Context:
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
This seems to half-work[*] although I'm uneasy about the whole idea.
I see two major hurdles:
* If the TB size is too small, this breaks badly, because we're not
out of the RCU read critical section when another call to tb_flush
is made. For instance, when booting ARM with this patch applied,
I need to at least pass -tb-size 10 for it to fully boot debian
jessie.
* We have different tb_flush callers that should be audited:
$ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
exec.c: tb_flush(cpu);
gdbstub.c: tb_flush(cpu);
gdbstub.c: tb_flush(cpu);
hw/ppc/spapr_hcall.c: tb_flush(CPU(cpu));
target-alpha/sys_helper.c: tb_flush(CPU(alpha_env_get_cpu(env)));
target-i386/translate.c: tb_flush(CPU(x86_env_get_cpu(env)));
translate-all.c: tb_flush(cpu);
With two code_gen "halves", if two tb_flush calls are done in the same
RCU read critical section, we're screwed. I added a cpu_exit at the end
of tb_flush to try to mitigate this, but I haven't audited all the callers
(for instance, what does the gdbstub do?).
If we end up having a mechanism to "stop all CPUs to do something", as
I think we'll end up needing for correct LL/SC emulation, we'll probably
be better off using that mechanism for tb_flush as well -- plus, we'll avoid
wasting memory.
Other issues:
- This could be split into at least 2 patches, one that touches
tcg/ and another to deal with translate-all.
Note that in translate-all, the initial allocation of code_gen doesn't
allocate extra space for the guard page; reserving guard page space is
done instead by the added split_code_gen_buffer function.
- Windows: not even compile-tested.
[*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
Alex' tests, however, sometimes fail with:
Unhandled exception 3 (pabt)
Exception frame registers:
pc : [<fffffb44>] lr : [<00000001>] psr: 20000173
sp : 4004f528 ip : 40012048 fp : 40032ca8
r10: 40032ca8 r9 : 00000000 r8 : 00000000
r7 : 0000000e r6 : 40030000 r5 : 40032ca8 r4 : 00001ac6
r3 : 40012030 r2 : 40012030 r1 : d5ffffe7 r0 : 00000028
Flags: nzCv IRQs on FIQs off Mode SVC_32
Control: 00c5107d Table: 40060000 DAC: 00000000
IFAR: fffffb44 IFSR: 00000205
or with:
CPU0: 16986 irqs (0 races, 11 slow, 1322 ticks avg latency)
FAIL: smc: irq: 17295 IRQs sent, 16986 received
Unhandled exception 6 (irq)
Exception frame registers:
pc : [<00000020>] lr : [<40010800>] psr: 60000153
sp : 400b45c0 ip : 400b34e8 fp : 40032ca8
r10: 00000000 r9 : 00000000 r8 : 00000000
r7 : 00000000 r6 : 00000000 r5 : 00000000 r4 : 00000000
r3 : 00000000 r2 : 00000000 r1 : 000000ff r0 : 00000000
Flags: nZCv IRQs on FIQs off Mode SVC_32
Control: 00c5107d Table: 40060000 DAC: 00000000
I built with --enable-debug-tcg.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/tcg.c | 22 +++++---
tcg/tcg.h | 1 +
translate-all.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++----------
3 files changed, 144 insertions(+), 34 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index b46bf1a..7db8ce9 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -380,6 +380,20 @@ void tcg_context_init(TCGContext *s)
}
}
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size)
+{
+ size_t size = s->code_gen_buffer_size - prologue_size;
+
+ s->code_gen_ptr = buf;
+ s->code_gen_buffer = buf;
+ s->code_buf = buf;
+
+ /* Compute a high-water mark, at which we voluntarily flush the buffer
+ and start over. The size here is arbitrary, significantly larger
+ than we expect the code generation for any one opcode to require. */
+ s->code_gen_highwater = s->code_buf + (size - 1024);
+}
+
void tcg_prologue_init(TCGContext *s)
{
size_t prologue_size, total_size;
@@ -398,16 +412,10 @@ void tcg_prologue_init(TCGContext *s)
/* Deduct the prologue from the buffer. */
prologue_size = tcg_current_code_size(s);
- s->code_gen_ptr = buf1;
- s->code_gen_buffer = buf1;
- s->code_buf = buf1;
total_size = s->code_gen_buffer_size - prologue_size;
s->code_gen_buffer_size = total_size;
- /* Compute a high-water mark, at which we voluntarily flush the buffer
- and start over. The size here is arbitrary, significantly larger
- than we expect the code generation for any one opcode to require. */
- s->code_gen_highwater = s->code_gen_buffer + (total_size - 1024);
+ tcg_code_gen_init(s, buf1, prologue_size);
tcg_register_jit(s->code_gen_buffer, total_size);
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 40c8fbe..e849d3e 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -634,6 +634,7 @@ static inline void *tcg_malloc(int size)
void tcg_context_init(TCGContext *s);
void tcg_prologue_init(TCGContext *s);
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size);
void tcg_func_start(TCGContext *s);
int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
diff --git a/translate-all.c b/translate-all.c
index bba9b62..7a83176 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -485,6 +485,11 @@ static inline PageDesc *page_find(tb_page_addr_t index)
(DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
+static int code_gen_buf_mask;
+static void *code_gen_buf1;
+static void *code_gen_buf2;
+static size_t code_gen_prologue_size;
+
static inline size_t size_code_gen_buffer(size_t tb_size)
{
/* Size the buffer. */
@@ -508,6 +513,43 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
return tb_size;
}
+static void *split_code_gen_buffer(void *buf, size_t size)
+{
+ /* enforce alignment of the buffer to a page boundary */
+ if (unlikely((uintptr_t)buf & ~qemu_real_host_page_mask)) {
+ uintptr_t b;
+
+ b = QEMU_ALIGN_UP((uintptr_t)buf, qemu_real_host_page_size);
+ size -= b - (uintptr_t)buf;
+ buf = (void *)b;
+ }
+ /*
+ * Make sure the size is an even number of pages so that both halves will
+ * end on a page boundary.
+ */
+ size = QEMU_ALIGN_DOWN(size, 2 * qemu_real_host_page_size);
+
+ /* split in half, making room for 2 guard pages */
+ size -= 2 * qemu_real_host_page_size;
+ size /= 2;
+ code_gen_buf1 = buf;
+ code_gen_buf2 = buf + size + qemu_real_host_page_size;
+
+ /*
+ * write the prologue into buf2. This is safe because we'll later call
+ * tcg_prologue_init on buf1, from which we'll start execution.
+ */
+ tcg_ctx.code_gen_buffer = code_gen_buf2;
+ tcg_prologue_init(&tcg_ctx);
+ code_gen_prologue_size = (void *)tcg_ctx.code_ptr - code_gen_buf2;
+
+ tcg_ctx.code_gen_buffer_size = size;
+
+ /* start execution from buf1 */
+ code_gen_buf_mask = 1;
+ return code_gen_buf1;
+}
+
#ifdef __mips__
/* In order to use J and JAL within the code_gen_buffer, we require
that the buffer not cross a 256MB boundary. */
@@ -583,21 +625,17 @@ static inline void map_none(void *addr, long size)
static inline void *alloc_code_gen_buffer(void)
{
void *buf = static_code_gen_buffer;
- size_t full_size, size;
+ size_t size;
/* The size of the buffer, rounded down to end on a page boundary. */
- full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
- & qemu_real_host_page_mask) - (uintptr_t)buf;
-
- /* Reserve a guard page. */
- size = full_size - qemu_real_host_page_size;
+ size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+ & qemu_real_host_page_mask) - (uintptr_t)buf;
/* Honor a command-line option limiting the size of the buffer. */
if (size > tcg_ctx.code_gen_buffer_size) {
size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
& qemu_real_host_page_mask) - (uintptr_t)buf;
}
- tcg_ctx.code_gen_buffer_size = size;
#ifdef __mips__
if (cross_256mb(buf, size)) {
@@ -607,27 +645,40 @@ static inline void *alloc_code_gen_buffer(void)
#endif
map_exec(buf, size);
- map_none(buf + size, qemu_real_host_page_size);
qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+ buf = split_code_gen_buffer(buf, size);
+ size = tcg_ctx.code_gen_buffer_size;
+
+ /* page guards */
+ map_none(code_gen_buf1 + size, qemu_real_host_page_size);
+ map_none(code_gen_buf2 + size, qemu_real_host_page_size);
+
return buf;
}
#elif defined(_WIN32)
static inline void *alloc_code_gen_buffer(void)
{
size_t size = tcg_ctx.code_gen_buffer_size;
- void *buf1, *buf2;
+ void *buf;
+ void *ret;
- /* Perform the allocation in two steps, so that the guard page
- is reserved but uncommitted. */
- buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
- MEM_RESERVE, PAGE_NOACCESS);
- if (buf1 != NULL) {
- buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
- assert(buf1 == buf2);
+ /* Perform the allocation in two steps, so that the guard pages
+ are reserved but uncommitted. */
+ ret = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+ if (ret == NULL) {
+ return NULL;
}
- return buf1;
+ ret = split_code_gen_buffer(ret, size);
+ size = tcg_ctx.code_gen_buffer_size;
+
+ buf = VirtualAlloc(code_gen_buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+ assert(buf);
+ buf = VirtualAlloc(code_gen_buf2, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+ assert(buf);
+
+ return ret;
}
#else
static inline void *alloc_code_gen_buffer(void)
@@ -665,8 +716,7 @@ static inline void *alloc_code_gen_buffer(void)
# endif
# endif
- buf = mmap((void *)start, size + qemu_real_host_page_size,
- PROT_NONE, flags, -1, 0);
+ buf = mmap((void *)start, size, PROT_NONE, flags, -1, 0);
if (buf == MAP_FAILED) {
return NULL;
}
@@ -676,24 +726,24 @@ static inline void *alloc_code_gen_buffer(void)
/* Try again, with the original still mapped, to avoid re-acquiring
that 256mb crossing. This time don't specify an address. */
size_t size2;
- void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
- PROT_NONE, flags, -1, 0);
+ void *buf2 = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+
switch (buf2 != MAP_FAILED) {
case 1:
if (!cross_256mb(buf2, size)) {
/* Success! Use the new buffer. */
- munmap(buf, size + qemu_real_host_page_size);
+ munmap(buf, size);
break;
}
/* Failure. Work with what we had. */
- munmap(buf2, size + qemu_real_host_page_size);
+ munmap(buf2, size);
/* fallthru */
default:
/* Split the original buffer. Free the smaller half. */
buf2 = split_cross_256mb(buf, size);
size2 = tcg_ctx.code_gen_buffer_size;
if (buf == buf2) {
- munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+ munmap(buf + size2, size - size2);
} else {
munmap(buf, size - size2);
}
@@ -704,13 +754,19 @@ static inline void *alloc_code_gen_buffer(void)
}
#endif
- /* Make the final buffer accessible. The guard page at the end
- will remain inaccessible with PROT_NONE. */
+ /* Make the final buffer accessible. */
mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
/* Request large pages for the buffer. */
qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
+ buf = split_code_gen_buffer(buf + 1, size);
+ size = tcg_ctx.code_gen_buffer_size;
+
+ /* page guards */
+ mprotect(code_gen_buf1 + size, qemu_real_host_page_size, PROT_NONE);
+ mprotect(code_gen_buf2 + size, qemu_real_host_page_size, PROT_NONE);
+
return buf;
}
#endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
@@ -829,10 +885,48 @@ static void page_flush_tb(void)
}
}
+struct code_gen_desc {
+ struct rcu_head rcu;
+ int clear_bit;
+};
+
+static void clear_code_gen_buffer(struct rcu_head *rcu)
+{
+ struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+ tb_lock();
+ code_gen_buf_mask &= ~desc->clear_bit;
+ tb_unlock();
+ g_free(desc);
+}
+
+static void *replace_code_gen_buffer(void)
+{
+ struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+ /*
+ * If both bits are set, we're having two concurrent flushes. This
+ * can easily happen if the buffers are heavily undersized.
+ */
+ assert(code_gen_buf_mask == 1 || code_gen_buf_mask == 2);
+
+ desc->clear_bit = code_gen_buf_mask;
+ call_rcu1(&desc->rcu, clear_code_gen_buffer);
+
+ if (code_gen_buf_mask == 1) {
+ code_gen_buf_mask |= 2;
+ return code_gen_buf2 + code_gen_prologue_size;
+ }
+ code_gen_buf_mask |= 1;
+ return code_gen_buf1 + code_gen_prologue_size;
+}
+
/* flush all the translation blocks */
/* XXX: tb_flush is currently not thread safe */
void tb_flush(CPUState *cpu)
{
+ void *buf;
+
#if defined(DEBUG_FLUSH)
printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
(unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -853,10 +947,17 @@ void tb_flush(CPUState *cpu)
qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
page_flush_tb();
- tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+ buf = replace_code_gen_buffer();
+ tcg_code_gen_init(&tcg_ctx, buf, code_gen_prologue_size);
+
/* XXX: flush processor icache at this point if cache flush is
expensive */
tcg_ctx.tb_ctx.tb_flush_count++;
+
+ /* exit all CPUs so that the old buffer is quickly cleared */
+ CPU_FOREACH(cpu) {
+ cpu_exit(cpu);
+ }
}
#ifdef DEBUG_TB_CHECK
--
2.5.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-04-25 23:46 ` [Qemu-devel] [RFC v3] " Emilio G. Cota
@ 2016-04-26 4:48 ` Richard Henderson
2016-04-26 6:35 ` Alex Bennée
2016-04-26 6:32 ` Alex Bennée
1 sibling, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2016-04-26 4:48 UTC (permalink / raw)
To: Emilio G. Cota, QEMU Developers, MTTCG Devel
Cc: Alex Bennée, Paolo Bonzini, Peter Crosthwaite, Sergey Fedorov
On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
> + /*
> + * write the prologue into buf2. This is safe because we'll later call
> + * tcg_prologue_init on buf1, from which we'll start execution.
> + */
> + tcg_ctx.code_gen_buffer = code_gen_buf2;
> + tcg_prologue_init(&tcg_ctx);
> +
Ah, no. Write only one prologue, not one per buffer.
If they're sufficiently close (i.e. one allocation under the max size),
then the same one can be used for both halves.
The global variables that you didn't see in this revision are:
aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ia64/tcg-target.inc.c: tcg_insn_unit *thunks[8] = { };
mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];
r~
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-04-25 23:46 ` [Qemu-devel] [RFC v3] " Emilio G. Cota
2016-04-26 4:48 ` Richard Henderson
@ 2016-04-26 6:32 ` Alex Bennée
2016-04-30 3:40 ` Emilio G. Cota
1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-04-26 6:32 UTC (permalink / raw)
To: Emilio G. Cota
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Emilio G. Cota <cota@braap.org> writes:
> Context:
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> This seems to half-work[*] although I'm uneasy about the whole idea.
> I see two major hurdles:
>
> * If the TB size is too small, this breaks badly, because we're not
> out of the RCU read critical section when another call to tb_flush
> is made. For instance, when booting ARM with this patch applied,
> I need to at least pass -tb-size 10 for it to fully boot debian
> jessie.
> * We have different tb_flush callers that should be audited:
> $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
> exec.c: tb_flush(cpu);
> gdbstub.c: tb_flush(cpu);
> gdbstub.c: tb_flush(cpu);
> hw/ppc/spapr_hcall.c: tb_flush(CPU(cpu));
> target-alpha/sys_helper.c: tb_flush(CPU(alpha_env_get_cpu(env)));
> target-i386/translate.c: tb_flush(CPU(x86_env_get_cpu(env)));
> translate-all.c: tb_flush(cpu);
>
> With two code_gen "halves", if two tb_flush calls are done in the same
> RCU read critical section, we're screwed. I added a cpu_exit at the end
> of tb_flush to try to mitigate this, but I haven't audited all the callers
> (for instance, what does the gdbstub do?).
I'm not sure we are going to get much from this approach. The tb_flush
is a fairly rare occurrence; it's not like it's on the critical performance
path (although of course pathological cases are possible).
I still think there are possibilities with a smaller TranslationRegion
approach but this is more aimed at solving problems like bulk
invalidations of a page of translations at a time and safer inter-block
patching. It doesn't do much to make the tb_flush easier though.
>
> If we end up having a mechanism to "stop all CPUs to do something", as
> I think we'll end up needing for correct LL/SC emulation, we'll probably
> be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> wasting memory.
I'm fairly certain there will need to be a "stop everything" mode for
some things - I'm less certain of the best way of doing it. Did you get
a chance to look at my version of the async_safe_work mechanism?
>
> Other issues:
> - This could be split into at least 2 patches, one that touches
> tcg/ and another to deal with translate-all.
> Note that in translate-all, the initial allocation of code_gen doesn't
> allocate extra space for the guard page; reserving guard page space is
> done instead by the added split_code_gen_buffer function.
> - Windows: not even compile-tested.
>
> [*] "Seems to half-work". At least it boots ARM OK with MTTCG and -smp 4.
> Alex' tests, however, sometimes fail with:
>
> Unhandled exception 3 (pabt)
> Exception frame registers:
> pc : [<fffffb44>] lr : [<00000001>] psr: 20000173
> sp : 4004f528 ip : 40012048 fp : 40032ca8
> r10: 40032ca8 r9 : 00000000 r8 : 00000000
> r7 : 0000000e r6 : 40030000 r5 : 40032ca8 r4 : 00001ac6
> r3 : 40012030 r2 : 40012030 r1 : d5ffffe7 r0 : 00000028
> Flags: nzCv IRQs on FIQs off Mode SVC_32
> Control: 00c5107d Table: 40060000 DAC: 00000000
> IFAR: fffffb44 IFSR: 00000205
>
> or with:
>
> CPU0: 16986 irqs (0 races, 11 slow, 1322 ticks avg latency)
> FAIL: smc: irq: 17295 IRQs sent, 16986 received
>
> Unhandled exception 6 (irq)
> Exception frame registers:
> pc : [<00000020>] lr : [<40010800>] psr: 60000153
> sp : 400b45c0 ip : 400b34e8 fp : 40032ca8
> r10: 00000000 r9 : 00000000 r8 : 00000000
> r7 : 00000000 r6 : 00000000 r5 : 00000000 r4 : 00000000
> r3 : 00000000 r2 : 00000000 r1 : 000000ff r0 : 00000000
> Flags: nZCv IRQs on FIQs off Mode SVC_32
> Control: 00c5107d Table: 40060000 DAC: 00000000
>
> I built with --enable-tcg-debug.
I'll have another go at reproducing. I could only get the asserts to
fire which was odd because AFAICT the prologue generation which
triggered it was serialised with tb_lock.
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
> tcg/tcg.c | 22 +++++---
> tcg/tcg.h | 1 +
> translate-all.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++----------
> 3 files changed, 144 insertions(+), 34 deletions(-)
>
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index b46bf1a..7db8ce9 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -380,6 +380,20 @@ void tcg_context_init(TCGContext *s)
> }
> }
>
> +void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size)
> +{
> + size_t size = s->code_gen_buffer_size - prologue_size;
> +
> + s->code_gen_ptr = buf;
> + s->code_gen_buffer = buf;
> + s->code_buf = buf;
> +
> + /* Compute a high-water mark, at which we voluntarily flush the buffer
> + and start over. The size here is arbitrary, significantly larger
> + than we expect the code generation for any one opcode to require. */
> + s->code_gen_highwater = s->code_buf + (size - 1024);
> +}
> +
> void tcg_prologue_init(TCGContext *s)
> {
> size_t prologue_size, total_size;
> @@ -398,16 +412,10 @@ void tcg_prologue_init(TCGContext *s)
>
> /* Deduct the prologue from the buffer. */
> prologue_size = tcg_current_code_size(s);
> - s->code_gen_ptr = buf1;
> - s->code_gen_buffer = buf1;
> - s->code_buf = buf1;
> total_size = s->code_gen_buffer_size - prologue_size;
> s->code_gen_buffer_size = total_size;
>
> - /* Compute a high-water mark, at which we voluntarily flush the buffer
> - and start over. The size here is arbitrary, significantly larger
> - than we expect the code generation for any one opcode to require. */
> - s->code_gen_highwater = s->code_gen_buffer + (total_size - 1024);
> + tcg_code_gen_init(s, buf1, prologue_size);
>
> tcg_register_jit(s->code_gen_buffer, total_size);
>
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 40c8fbe..e849d3e 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -634,6 +634,7 @@ static inline void *tcg_malloc(int size)
>
> void tcg_context_init(TCGContext *s);
> void tcg_prologue_init(TCGContext *s);
> +void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size);
> void tcg_func_start(TCGContext *s);
>
> int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
> diff --git a/translate-all.c b/translate-all.c
> index bba9b62..7a83176 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -485,6 +485,11 @@ static inline PageDesc *page_find(tb_page_addr_t index)
> (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
> ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
>
> +static int code_gen_buf_mask;
> +static void *code_gen_buf1;
> +static void *code_gen_buf2;
> +static size_t code_gen_prologue_size;
> +
> static inline size_t size_code_gen_buffer(size_t tb_size)
> {
> /* Size the buffer. */
> @@ -508,6 +513,43 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
> return tb_size;
> }
>
> +static void *split_code_gen_buffer(void *buf, size_t size)
> +{
> + /* enforce alignment of the buffer to a page boundary */
> + if (unlikely((uintptr_t)buf & ~qemu_real_host_page_mask)) {
> + uintptr_t b;
> +
> + b = QEMU_ALIGN_UP((uintptr_t)buf, qemu_real_host_page_size);
> + size -= b - (uintptr_t)buf;
> + buf = (void *)b;
> + }
> + /*
> + * Make sure the size is an even number of pages so that both halves will
> + * end on a page boundary.
> + */
> + size = QEMU_ALIGN_DOWN(size, 2 * qemu_real_host_page_size);
> +
> + /* split in half, making room for 2 guard pages */
> + size -= 2 * qemu_real_host_page_size;
> + size /= 2;
> + code_gen_buf1 = buf;
> + code_gen_buf2 = buf + size + qemu_real_host_page_size;
> +
> + /*
> + * write the prologue into buf2. This is safe because we'll later call
> + * tcg_prologue_init on buf1, from which we'll start execution.
> + */
> + tcg_ctx.code_gen_buffer = code_gen_buf2;
> + tcg_prologue_init(&tcg_ctx);
> + code_gen_prologue_size = (void *)tcg_ctx.code_ptr - code_gen_buf2;
> +
> + tcg_ctx.code_gen_buffer_size = size;
> +
> + /* start execution from buf1 */
> + code_gen_buf_mask = 1;
> + return code_gen_buf1;
> +}
> +
> #ifdef __mips__
> /* In order to use J and JAL within the code_gen_buffer, we require
> that the buffer not cross a 256MB boundary. */
> @@ -583,21 +625,17 @@ static inline void map_none(void *addr, long size)
> static inline void *alloc_code_gen_buffer(void)
> {
> void *buf = static_code_gen_buffer;
> - size_t full_size, size;
> + size_t size;
>
> /* The size of the buffer, rounded down to end on a page boundary. */
> - full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> - & qemu_real_host_page_mask) - (uintptr_t)buf;
> -
> - /* Reserve a guard page. */
> - size = full_size - qemu_real_host_page_size;
> + size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> + & qemu_real_host_page_mask) - (uintptr_t)buf;
>
> /* Honor a command-line option limiting the size of the buffer. */
> if (size > tcg_ctx.code_gen_buffer_size) {
> size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
> & qemu_real_host_page_mask) - (uintptr_t)buf;
> }
> - tcg_ctx.code_gen_buffer_size = size;
>
> #ifdef __mips__
> if (cross_256mb(buf, size)) {
> @@ -607,27 +645,40 @@ static inline void *alloc_code_gen_buffer(void)
> #endif
>
> map_exec(buf, size);
> - map_none(buf + size, qemu_real_host_page_size);
> qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
>
> + buf = split_code_gen_buffer(buf, size);
> + size = tcg_ctx.code_gen_buffer_size;
> +
> + /* page guards */
> + map_none(code_gen_buf1 + size, qemu_real_host_page_size);
> + map_none(code_gen_buf2 + size, qemu_real_host_page_size);
> +
> return buf;
> }
> #elif defined(_WIN32)
> static inline void *alloc_code_gen_buffer(void)
> {
> size_t size = tcg_ctx.code_gen_buffer_size;
> - void *buf1, *buf2;
> + void *buf;
> + void *ret;
>
> - /* Perform the allocation in two steps, so that the guard page
> - is reserved but uncommitted. */
> - buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
> - MEM_RESERVE, PAGE_NOACCESS);
> - if (buf1 != NULL) {
> - buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
> - assert(buf1 == buf2);
> + /* Perform the allocation in two steps, so that the guard pages
> + are reserved but uncommitted. */
> + ret = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
> + if (ret == NULL) {
> + return NULL;
> }
>
> - return buf1;
> + ret = split_code_gen_buffer(ret, size);
> + size = tcg_ctx.code_gen_buffer_size;
> +
> + buf = VirtualAlloc(code_gen_buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
> + assert(buf);
> + buf = VirtualAlloc(code_gen_buf2, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
> + assert(buf);
> +
> + return ret;
> }
> #else
> static inline void *alloc_code_gen_buffer(void)
> @@ -665,8 +716,7 @@ static inline void *alloc_code_gen_buffer(void)
> # endif
> # endif
>
> - buf = mmap((void *)start, size + qemu_real_host_page_size,
> - PROT_NONE, flags, -1, 0);
> + buf = mmap((void *)start, size, PROT_NONE, flags, -1, 0);
> if (buf == MAP_FAILED) {
> return NULL;
> }
> @@ -676,24 +726,24 @@ static inline void *alloc_code_gen_buffer(void)
> /* Try again, with the original still mapped, to avoid re-acquiring
> that 256mb crossing. This time don't specify an address. */
> size_t size2;
> - void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
> - PROT_NONE, flags, -1, 0);
> + void *buf2 = mmap(NULL, size, PROT_NONE, flags, -1, 0);
> +
> switch (buf2 != MAP_FAILED) {
> case 1:
> if (!cross_256mb(buf2, size)) {
> /* Success! Use the new buffer. */
> - munmap(buf, size + qemu_real_host_page_size);
> + munmap(buf, size);
> break;
> }
> /* Failure. Work with what we had. */
> - munmap(buf2, size + qemu_real_host_page_size);
> + munmap(buf2, size);
> /* fallthru */
> default:
> /* Split the original buffer. Free the smaller half. */
> buf2 = split_cross_256mb(buf, size);
> size2 = tcg_ctx.code_gen_buffer_size;
> if (buf == buf2) {
> - munmap(buf + size2 + qemu_real_host_page_size, size - size2);
> + munmap(buf + size2, size - size2);
> } else {
> munmap(buf, size - size2);
> }
> @@ -704,13 +754,19 @@ static inline void *alloc_code_gen_buffer(void)
> }
> #endif
>
> - /* Make the final buffer accessible. The guard page at the end
> - will remain inaccessible with PROT_NONE. */
> + /* Make the final buffer accessible. */
> mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
>
> /* Request large pages for the buffer. */
> qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
>
> + buf = split_code_gen_buffer(buf + 1, size);
> + size = tcg_ctx.code_gen_buffer_size;
> +
> + /* page guards */
> + mprotect(code_gen_buf1 + size, qemu_real_host_page_size, PROT_NONE);
> + mprotect(code_gen_buf2 + size, qemu_real_host_page_size, PROT_NONE);
> +
> return buf;
> }
> #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
> @@ -829,10 +885,48 @@ static void page_flush_tb(void)
> }
> }
>
> +struct code_gen_desc {
> + struct rcu_head rcu;
> + int clear_bit;
> +};
> +
> +static void clear_code_gen_buffer(struct rcu_head *rcu)
> +{
> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> + tb_lock();
> + code_gen_buf_mask &= ~desc->clear_bit;
> + tb_unlock();
> + g_free(desc);
> +}
> +
> +static void *replace_code_gen_buffer(void)
> +{
> + struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
> +
> + /*
> + * If both bits are set, we're having two concurrent flushes. This
> + * can easily happen if the buffers are heavily undersized.
> + */
> + assert(code_gen_buf_mask == 1 || code_gen_buf_mask == 2);
> +
> + desc->clear_bit = code_gen_buf_mask;
> + call_rcu1(&desc->rcu, clear_code_gen_buffer);
> +
> + if (code_gen_buf_mask == 1) {
> + code_gen_buf_mask |= 2;
> + return code_gen_buf2 + code_gen_prologue_size;
> + }
> + code_gen_buf_mask |= 1;
> + return code_gen_buf1 + code_gen_prologue_size;
> +}
> +
> /* flush all the translation blocks */
> /* XXX: tb_flush is currently not thread safe */
> void tb_flush(CPUState *cpu)
> {
> + void *buf;
> +
> #if defined(DEBUG_FLUSH)
> printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
> (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
> @@ -853,10 +947,17 @@ void tb_flush(CPUState *cpu)
> qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
> page_flush_tb();
>
> - tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
> + buf = replace_code_gen_buffer();
> + tcg_code_gen_init(&tcg_ctx, buf, code_gen_prologue_size);
> +
> /* XXX: flush processor icache at this point if cache flush is
> expensive */
> tcg_ctx.tb_ctx.tb_flush_count++;
> +
> + /* exit all CPUs so that the old buffer is quickly cleared */
> + CPU_FOREACH(cpu) {
> + cpu_exit(cpu);
> + }
> }
>
> #ifdef DEBUG_TB_CHECK
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-04-26 4:48 ` Richard Henderson
@ 2016-04-26 6:35 ` Alex Bennée
2016-04-26 15:42 ` Richard Henderson
0 siblings, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-04-26 6:35 UTC (permalink / raw)
To: Richard Henderson
Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Paolo Bonzini,
Peter Crosthwaite, Sergey Fedorov
Richard Henderson <rth@twiddle.net> writes:
> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>> + /*
>> + * write the prologue into buf2. This is safe because we'll later call
>> + * tcg_prologue_init on buf1, from which we'll start execution.
>> + */
>> + tcg_ctx.code_gen_buffer = code_gen_buf2;
>> + tcg_prologue_init(&tcg_ctx);
>> +
>
> Ah, no. Write only one prologue, not one per buffer.
>
> If they're sufficiently close (i.e. one allocation under the max size),
> then the same one can be used for both halves.
>
> The global variables that you didn't see in this revision are:
>
> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ia64/tcg-target.inc.c: tcg_insn_unit *thunks[8] = { };
> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];
Aside from the existing code structure is there any reason to have only
one prologue? It doesn't seem to be a large amount of code and in the
case of having smaller translation regions I would posit having a
"local" prologue/epilogue would make the jumps cheaper.
>
>
> r~
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-04-26 6:35 ` Alex Bennée
@ 2016-04-26 15:42 ` Richard Henderson
0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2016-04-26 15:42 UTC (permalink / raw)
To: Alex Bennée
Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Paolo Bonzini,
Peter Crosthwaite, Sergey Fedorov
On 04/25/2016 11:35 PM, Alex Bennée wrote:
>
> Richard Henderson <rth@twiddle.net> writes:
>
>> On 04/25/2016 04:46 PM, Emilio G. Cota wrote:
>>> + /*
>>> + * write the prologue into buf2. This is safe because we'll later call
>>> + * tcg_prologue_init on buf1, from which we'll start execution.
>>> + */
>>> + tcg_ctx.code_gen_buffer = code_gen_buf2;
>>> + tcg_prologue_init(&tcg_ctx);
>>> +
>>
>> Ah, no. Write only one prologue, not one per buffer.
>>
>> If they're sufficiently close (i.e. one allocation under the max size),
>> then the same one can be used for both halves.
>>
>> The global variables that you didn't see in this revision are:
>>
>> aarch64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> arm/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> i386/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ia64/tcg-target.inc.c: tcg_insn_unit *thunks[8] = { };
>> mips/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> ppc/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> s390/tcg-target.inc.c:static tcg_insn_unit *tb_ret_addr;
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_ld_trampoline[16];
>> sparc/tcg-target.inc.c:static tcg_insn_unit *qemu_st_trampoline[16];
>
> Aside from the existing code structure is there any reason to have only
> one prologue?
Well, there's also the gdb jit unwind info. But aside from those, no.
> It doesn't seem to be a large amount of code and in the
> case of having smaller translation regions I would posit having a
> "local" prologue/epilogue would make the jumps cheaper.
Not really. The jumps are generally in range already, based on the restriction
in max buffer size.
Really, arm32 (and ppc32, post direct jump atomicity patchset) are the only
ones that require a tiny (less than 64MB) buffer. Anything bigger than 64MB, I
don't see any reason to create two independent buffers.
The other consideration not yet mentioned is that you'd like to put the
entire buffer, in the case of x86_64 and some others, within 2GB of the main
executable, so that helper calls can use a direct call insn.
r~
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-04-26 6:32 ` Alex Bennée
@ 2016-04-30 3:40 ` Emilio G. Cota
2016-05-09 11:21 ` Paolo Bonzini
0 siblings, 1 reply; 21+ messages in thread
From: Emilio G. Cota @ 2016-04-30 3:40 UTC (permalink / raw)
To: Alex Bennée
Cc: QEMU Developers, MTTCG Devel, Paolo Bonzini, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
On Tue, Apr 26, 2016 at 07:32:39 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
> > With two code_gen "halves", if two tb_flush calls are done in the same
> > RCU read critical section, we're screwed. I added a cpu_exit at the end
> > of tb_flush to try to mitigate this, but I haven't audited all the callers
> > (for instance, what does the gdbstub do?).
>
> I'm not sure we are going to get much from this approach. The tb_flush
> is a fairly rare occurrence its not like its on the critical performance
> path (although of course pathological cases are possible).
This is what I thought from the beginning, but wanted to give this
alternative a go anyway to see if it was feasible.
On my end I won't do any more work on this approach. Will go back
to locks, despite Paolo's (justified) dislike for them =)
> > If we end up having a mechanism to "stop all CPUs to do something", as
> > I think we'll end up needing for correct LL/SC emulation, we'll probably
> > be better off using that mechanism for tb_flush as well -- plus, we'll avoid
> > wasting memory.
>
> I'm fairly certain there will need to be a "stop everything" mode for
> some things - I'm less certain of the best way of doing it. Did you get
> a chance to look at my version of the async_safe_work mechanism?
Not yet, but will get to it very soon.
Cheers,
Emilio
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-04-30 3:40 ` Emilio G. Cota
@ 2016-05-09 11:21 ` Paolo Bonzini
2016-05-09 11:50 ` Alex Bennée
2016-05-09 17:07 ` Emilio G. Cota
0 siblings, 2 replies; 21+ messages in thread
From: Paolo Bonzini @ 2016-05-09 11:21 UTC (permalink / raw)
To: Emilio G. Cota, Alex Bennée
Cc: QEMU Developers, MTTCG Devel, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
On 30/04/2016 05:40, Emilio G. Cota wrote:
>> The tb_flush
>> > is a fairly rare occurrence its not like its on the critical performance
>> > path (although of course pathological cases are possible).
> This is what I thought from the beginning, but wanted to give this
> alternative a go anyway to see if it was feasible.
>
> On my end I won't do any more work on this approach. Will go back
> to locks, despite Paolo's (justified) dislike for them =)
Which locks? tb_lock during tb_find_fast? The problem with that was
that it slowed down everything a lot, wasn't it?
To me, the RCU idea is not really about making tb_flush (the rare case)
faster; it was more about keeping the rest simple and fast.
Paolo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-05-09 11:21 ` Paolo Bonzini
@ 2016-05-09 11:50 ` Alex Bennée
2016-05-09 13:55 ` Paolo Bonzini
2016-05-09 17:07 ` Emilio G. Cota
1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2016-05-09 11:50 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Paolo Bonzini <pbonzini@redhat.com> writes:
> On 30/04/2016 05:40, Emilio G. Cota wrote:
>>> The tb_flush
>>> > is a fairly rare occurrence its not like its on the critical performance
>>> > path (although of course pathological cases are possible).
>> This is what I thought from the beginning, but wanted to give this
>> alternative a go anyway to see if it was feasible.
>>
>> On my end I won't do any more work on this approach. Will go back
>> to locks, despite Paolo's (justified) dislike for them =)
>
> Which locks? tb_lock during tb_find_fast? The problem with that was
> that it slowed down everything a lot, wasn't it?
Very much so, in the new tree (coming soon) with QHT I was able to
remove the locks from the whole hot-path, which means they were only
needed for code generation.
> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.
I'm not sure it achieved that as there is added complexity from having
the split buffer and then ensuring you don't double-flush.
>
> Paolo
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-05-09 11:50 ` Alex Bennée
@ 2016-05-09 13:55 ` Paolo Bonzini
2016-05-09 15:05 ` Alex Bennée
0 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2016-05-09 13:55 UTC (permalink / raw)
To: Alex Bennée
Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
On 09/05/2016 13:50, Alex Bennée wrote:
> > Which locks? tb_lock during tb_find_fast? The problem with that was
> > that it slowed down everything a lot, wasn't it?
>
> Very much so, in the new tree (coming soon) with QHT I was able to
> remove the locks from the whole hot-path which means they where only
> needed for code generation.
Okay, I'm curious now. :)
> > To me, the RCU idea is not really about making tb_flush (the rare case)
> > faster; it was more about keeping the rest simple and fast.
>
> I'm not sure it achieved that as there is added complexity from having
> the split buffer and then ensuring you don't double-flush.
Agreed.
Paolo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-05-09 13:55 ` Paolo Bonzini
@ 2016-05-09 15:05 ` Alex Bennée
0 siblings, 0 replies; 21+ messages in thread
From: Alex Bennée @ 2016-05-09 15:05 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Emilio G. Cota, QEMU Developers, MTTCG Devel, Peter Crosthwaite,
Richard Henderson, Sergey Fedorov
Paolo Bonzini <pbonzini@redhat.com> writes:
> On 09/05/2016 13:50, Alex Bennée wrote:
>> > Which locks? tb_lock during tb_find_fast? The problem with that was
>> > that it slowed down everything a lot, wasn't it?
>>
>> Very much so, in the new tree (coming soon) with QHT I was able to
>> remove the locks from the whole hot-path which means they where only
>> needed for code generation.
>
> Okay, I'm curious now. :)
https://github.com/stsquad/qemu/commits/mttcg/base-patches-v3 is the
current WIP, with:
https://github.com/stsquad/qemu/commit/0823f1c77f12ed5958f77484d6477ea205aee220
being the commit that clears the hot-path to run without locks.
The tree is based on tcg-next, which has made things a lot cleaner now that a
bunch of Sergey's stuff has been grabbed by rth. Obviously, being WIP, it is
subject to change. Once I'm done with my current out-of-tree diversions
I'll be back onto cleaning the tree up for the next review round.
Review comments on the posted trees are always welcome of course ;-)
>
>> > To me, the RCU idea is not really about making tb_flush (the rare case)
>> > faster; it was more about keeping the rest simple and fast.
>>
>> I'm not sure it achieved that as there is added complexity from having
>> the split buffer and then ensuring you don't double-flush.
>
> Agreed.
>
> Paolo
--
Alex Bennée
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [Qemu-devel] [RFC v3] translate-all: protect code_gen_buffer with RCU
2016-05-09 11:21 ` Paolo Bonzini
2016-05-09 11:50 ` Alex Bennée
@ 2016-05-09 17:07 ` Emilio G. Cota
1 sibling, 0 replies; 21+ messages in thread
From: Emilio G. Cota @ 2016-05-09 17:07 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Alex Bennée, QEMU Developers, MTTCG Devel,
Peter Crosthwaite, Richard Henderson, Sergey Fedorov
On Mon, May 09, 2016 at 13:21:50 +0200, Paolo Bonzini wrote:
> On 30/04/2016 05:40, Emilio G. Cota wrote:
> >> The tb_flush
> >> > is a fairly rare occurrence its not like its on the critical performance
> >> > path (although of course pathological cases are possible).
> > This is what I thought from the beginning, but wanted to give this
> > alternative a go anyway to see if it was feasible.
> >
> > On my end I won't do any more work on this approach. Will go back
> > to locks, despite Paolo's (justified) dislike for them =)
>
> Which locks? tb_lock during tb_find_fast? The problem with that was
> that it slowed down everything a lot, wasn't it?
By "locks" I meant somehow forcing other threads/vcpus to stop, to then
perform tb_flush. This can be achieved with a bunch of primitives
such as condition variables (*gaaah*) -- these of course involve "locks" :P
The removal of tb_lock() when looking up tb's is orthogonal to this.
> To me, the RCU idea is not really about making tb_flush (the rare case)
> faster; it was more about keeping the rest simple and fast.
Well I agree with this idea; I wanted to see how far we could take it.
E.
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2016-05-09 17:07 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-22 0:06 [Qemu-devel] [RFC] translate-all: protect code_gen_buffer with RCU Emilio G. Cota
2016-04-22 14:41 ` Alex Bennée
2016-04-22 14:47 ` Alex Bennée
2016-04-24 3:20 ` Emilio G. Cota
2016-04-25 8:35 ` Alex Bennée
2016-04-22 18:25 ` Richard Henderson
2016-04-24 3:27 ` [Qemu-devel] [RFC v2] " Emilio G. Cota
2016-04-24 18:12 ` Richard Henderson
2016-04-25 15:19 ` Alex Bennée
2016-04-25 15:25 ` Emilio G. Cota
2016-04-25 23:46 ` [Qemu-devel] [RFC v3] " Emilio G. Cota
2016-04-26 4:48 ` Richard Henderson
2016-04-26 6:35 ` Alex Bennée
2016-04-26 15:42 ` Richard Henderson
2016-04-26 6:32 ` Alex Bennée
2016-04-30 3:40 ` Emilio G. Cota
2016-05-09 11:21 ` Paolo Bonzini
2016-05-09 11:50 ` Alex Bennée
2016-05-09 13:55 ` Paolo Bonzini
2016-05-09 15:05 ` Alex Bennée
2016-05-09 17:07 ` Emilio G. Cota
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.