From: "Lluís Vilanova" <vilanova@ac.upc.edu>
To: qemu-devel@nongnu.org
Cc: Eric Blake <eblake@redhat.com>,
	Eduardo Habkost <ehabkost@redhat.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Peter Crosthwaite <crosthwaite.peter@gmail.com>,
	Richard Henderson <rth@twiddle.net>
Subject: [Qemu-devel] [PATCH 2/4] exec: [tcg] Use multiple physical TB caches
Date: Wed, 14 Sep 2016 23:23:28 +0200
Message-ID: <147388820802.17002.12187474866416310198.stgit@fimbulvetr.bsc.es>
In-Reply-To: <147388819720.17002.17020698136656908126.stgit@fimbulvetr.bsc.es>

The physical TB cache is split into 2^E caches, where E is the number of
trace events that have the "vcpu" property and do not have the "disable"
property.

Each vCPU's virtual TB cache then maps into one of these physical TB
caches, so different vCPUs can (potentially) use different physical caches.

This is exploited by later patches in the series to support different
tracing event states on a per-vCPU basis.
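
As an illustration of the indexing scheme, here is a minimal standalone
sketch (not part of the patch): a per-vCPU bitmap of E event-enable bits,
read as an integer, selects one of the 2^E physical caches. The event
count of 2 and all names below are made up for the example; the patch
itself uses TRACE_VCPU_EVENT_COUNT and the tb_caches_get() helper.

#include <stdio.h>

#define VCPU_EVENT_COUNT 2                        /* E: hypothetical count */
#define TB_CACHE_COUNT   (1UL << VCPU_EVENT_COUNT) /* 2^E physical caches */

int main(void)
{
    /* Stand-ins for the qht hash tables allocated in tb_htable_init();
     * one per combination of enabled vCPU events. */
    const char *htables[TB_CACHE_COUNT] = {
        "cache[0b00]", "cache[0b01]", "cache[0b10]", "cache[0b11]",
    };

    /* Per-vCPU state: bit i set <=> vCPU event i enabled.  Enabling
     * event 1 while event 0 stays disabled gives index 0b10 == 2. */
    unsigned long tb_cache_idx = 1UL << 1;

    /* The same selection tb_caches_get() performs on tb_ctx->htables. */
    printf("this vCPU looks up TBs in %s\n", htables[tb_cache_idx]);
    return 0;
}

Because the patch checks at compile time that the bitmap fits in a single
long, it can be read as one unsigned long and used directly as the index.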

Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
---
 cpu-exec.c                |    5 ++++
 include/exec/exec-all.h   |    6 +++++
 include/exec/tb-context.h |    2 +-
 include/qom/cpu.h         |    4 +++-
 qom/cpu.c                 |    1 +
 translate-all.c           |   51 +++++++++++++++++++++++++++++++++++++--------
 translate-all.h           |   17 +++++++++++++++
 translate-all.inc.h       |   13 +++++++++++
 8 files changed, 87 insertions(+), 12 deletions(-)
 create mode 100644 translate-all.inc.h

diff --git a/cpu-exec.c b/cpu-exec.c
index 5d9710a..7b2d8c6 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -33,6 +33,7 @@
 #include "hw/i386/apic.h"
 #endif
 #include "sysemu/replay.h"
+#include "translate-all.h"
 
 /* -icount align implementation. */
 
@@ -267,6 +268,7 @@ static TranslationBlock *tb_find_physical(CPUState *cpu,
     tb_page_addr_t phys_pc;
     struct tb_desc desc;
     uint32_t h;
+    struct qht *qht;
 
     desc.env = (CPUArchState *)cpu->env_ptr;
     desc.cs_base = cs_base;
@@ -275,7 +277,8 @@ static TranslationBlock *tb_find_physical(CPUState *cpu,
     phys_pc = get_page_addr_code(desc.env, pc);
     desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
     h = tb_hash_func(phys_pc, pc, flags);
-    return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
+    qht = tb_caches_get(&tcg_ctx.tb_ctx, cpu->tb_cache_idx);
+    return qht_lookup(qht, tb_cmp, &desc, h);
 }
 
 static TranslationBlock *tb_find_slow(CPUState *cpu,
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index e2124dc..4ae04f6 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -211,6 +211,10 @@ static inline void tlb_flush_by_mmuidx(CPUState *cpu, ...)
 #define USE_DIRECT_JUMP
 #endif
 
+/**
+ * TranslationBlock:
+ * @tb_cache_idx: Index of physical TB cache where this TB has been allocated.
+ */
 struct TranslationBlock {
     target_ulong pc;   /* simulated PC corresponding to this block (EIP + CS base) */
     target_ulong cs_base; /* CS base for this block */
@@ -262,6 +266,8 @@ struct TranslationBlock {
      */
     uintptr_t jmp_list_next[2];
     uintptr_t jmp_list_first;
+
+    DECLARE_BITMAP(tb_cache_idx, TRACE_VCPU_EVENT_COUNT);
 };
 
 void tb_free(TranslationBlock *tb);
diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h
index dce95d9..7728904 100644
--- a/include/exec/tb-context.h
+++ b/include/exec/tb-context.h
@@ -32,7 +32,7 @@ typedef struct TBContext TBContext;
 struct TBContext {
 
     TranslationBlock *tbs;
-    struct qht htable;
+    struct qht *htables;
     int nb_tbs;
     /* any access to the tbs or the page table must use this lock */
     QemuMutex tb_lock;
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index ce0c406..d870810 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -282,6 +282,7 @@ struct qemu_work_item {
  * @kvm_fd: vCPU file descriptor for KVM.
  * @work_mutex: Lock to prevent multiple access to queued_work_*.
  * @queued_work_first: First asynchronous work pending.
+ * @tb_cache_idx: Index of current TB cache.
  * @trace_dstate: Dynamic tracing state of events for this vCPU (bitmask).
  *
  * State of one CPU core or thread.
@@ -350,7 +351,8 @@ struct CPUState {
     struct KVMState *kvm_state;
     struct kvm_run *kvm_run;
 
-    /* Used for events with 'vcpu' and *without* the 'disabled' properties */
+    /* Used for events with 'vcpu' and *without* the 'disable' properties */
+    DECLARE_BITMAP(tb_cache_idx, TRACE_VCPU_EVENT_COUNT);
     DECLARE_BITMAP(trace_dstate, TRACE_VCPU_EVENT_COUNT);
 
     /* TODO Move common fields from CPUArchState here. */
diff --git a/qom/cpu.c b/qom/cpu.c
index 2553247..2225103 100644
--- a/qom/cpu.c
+++ b/qom/cpu.c
@@ -345,6 +345,7 @@ static void cpu_common_initfn(Object *obj)
     qemu_mutex_init(&cpu->work_mutex);
     QTAILQ_INIT(&cpu->breakpoints);
     QTAILQ_INIT(&cpu->watchpoints);
+    bitmap_zero(cpu->tb_cache_idx, TRACE_VCPU_EVENT_COUNT);
     bitmap_zero(cpu->trace_dstate, TRACE_VCPU_EVENT_COUNT);
 }
 
diff --git a/translate-all.c b/translate-all.c
index ebd9fa0..c864eee 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -733,11 +733,22 @@ static inline void code_gen_alloc(size_t tb_size)
     qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
 }
 
+/*
+ * Ensure bitmaps can be used as indexes.
+ */
+void *__error__too_many_vcpu_events[
+    (TRACE_VCPU_EVENT_COUNT + 1) <= BITS_PER_LONG ? 0 : -1];
+
 static void tb_htable_init(void)
 {
+    int cache;
     unsigned int mode = QHT_MODE_AUTO_RESIZE;
 
-    qht_init(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE, mode);
+    tcg_ctx.tb_ctx.htables = g_malloc(
+        sizeof(tcg_ctx.tb_ctx.htables[0]) * tb_caches_count());
+    for (cache = 0; cache < tb_caches_count(); cache++) {
+        qht_init(&tcg_ctx.tb_ctx.htables[cache], CODE_GEN_HTABLE_SIZE, mode);
+    }
 }
 
 /* Must be called before using the QEMU cpus. 'tb_size' is the size
@@ -834,6 +845,8 @@ static void page_flush_tb(void)
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
+    int i;
+
     if (!tcg_enabled()) {
         return;
     }
@@ -854,7 +867,9 @@ void tb_flush(CPUState *cpu)
         tb_flush_jmp_cache_all(cpu);
     }
 
-    qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
+    for (i = 0; i < tb_caches_count(); i++) {
+        qht_reset_size(&tcg_ctx.tb_ctx.htables[i], CODE_GEN_HTABLE_SIZE);
+    }
     page_flush_tb();
 
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
@@ -879,8 +894,12 @@ do_tb_invalidate_check(struct qht *ht, void *p, uint32_t hash, void *userp)
 
 static void tb_invalidate_check(target_ulong address)
 {
+    int i;
+
     address &= TARGET_PAGE_MASK;
-    qht_iter(&tcg_ctx.tb_ctx.htable, do_tb_invalidate_check, &address);
+    for (i = 0; i < tb_caches_count(); i++) {
+        qht_iter(&tcg_ctx.tb_ctx.htables[i], do_tb_invalidate_check, &address);
+    }
 }
 
 static void
@@ -900,7 +919,10 @@ do_tb_page_check(struct qht *ht, void *p, uint32_t hash, void *userp)
 /* verify that all the pages have correct rights for code */
 static void tb_page_check(void)
 {
-    qht_iter(&tcg_ctx.tb_ctx.htable, do_tb_page_check, NULL);
+    int i;
+    for (i = 0; i < tb_caches_count(); i++) {
+        qht_iter(&tcg_ctx.tb_ctx.htables[i], do_tb_page_check, NULL);
+    }
 }
 
 #endif
@@ -987,12 +1009,14 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
     CPUState *cpu;
     PageDesc *p;
     uint32_t h;
+    struct qht *qht;
     tb_page_addr_t phys_pc;
 
     /* remove the TB from the hash list */
     phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
     h = tb_hash_func(phys_pc, tb->pc, tb->flags);
-    qht_remove(&tcg_ctx.tb_ctx.htable, tb, h);
+    qht = tb_caches_get(&tcg_ctx.tb_ctx, tb->tb_cache_idx);
+    qht_remove(qht, tb, h);
 
     /* remove the TB from the page list */
     if (tb->page_addr[0] != page_addr) {
@@ -1122,10 +1146,12 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
                          tb_page_addr_t phys_page2)
 {
     uint32_t h;
+    struct qht *qht;
 
     /* add in the hash table */
     h = tb_hash_func(phys_pc, tb->pc, tb->flags);
-    qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
+    qht = tb_caches_get(&tcg_ctx.tb_ctx, tb->tb_cache_idx);
+    qht_insert(qht, tb, h);
 
     /* add in the page list */
     tb_alloc_page(tb, 0, phys_pc & TARGET_PAGE_MASK);
@@ -1175,6 +1201,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     tb->cs_base = cs_base;
     tb->flags = flags;
     tb->cflags = cflags;
+    bitmap_copy(tb->tb_cache_idx, ENV_GET_CPU(env)->tb_cache_idx,
+                TRACE_VCPU_EVENT_COUNT);
 
 #ifdef CONFIG_PROFILER
     tcg_ctx.tb_count1++; /* includes aborted translations because of
@@ -1636,6 +1664,8 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
     pc = tb->pc;
     cs_base = tb->cs_base;
     flags = tb->flags;
+    /* XXX: It is OK to invalidate only this TB, as this is the one triggering
+     * the memory access */
     tb_phys_invalidate(tb, -1);
     if (tb->cflags & CF_NOCACHE) {
         if (tb->orig_tb) {
@@ -1715,6 +1745,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     int direct_jmp_count, direct_jmp2_count, cross_page;
     TranslationBlock *tb;
     struct qht_stats hst;
+    int cache;
 
     target_code_size = 0;
     max_target_code_size = 0;
@@ -1766,9 +1797,11 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
                 tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) /
                         tcg_ctx.tb_ctx.nb_tbs : 0);
 
-    qht_statistics_init(&tcg_ctx.tb_ctx.htable, &hst);
-    print_qht_statistics(f, cpu_fprintf, hst);
-    qht_statistics_destroy(&hst);
+    for (cache = 0; cache < tb_caches_count(); cache++) {
+        qht_statistics_init(&tcg_ctx.tb_ctx.htables[cache], &hst);
+        print_qht_statistics(f, cpu_fprintf, hst);
+        qht_statistics_destroy(&hst);
+    }
 
     cpu_fprintf(f, "\nStatistics:\n");
     cpu_fprintf(f, "TB flush count      %d\n", tcg_ctx.tb_ctx.tb_flush_count);
diff --git a/translate-all.h b/translate-all.h
index ba8e4d6..d39bf32 100644
--- a/translate-all.h
+++ b/translate-all.h
@@ -20,7 +20,21 @@
 #define TRANSLATE_ALL_H
 
 #include "exec/exec-all.h"
+#include "qemu/typedefs.h"
 
+/**
+ * tb_caches_count:
+ *
+ * Number of TB caches.
+ */
+static size_t tb_caches_count(void);
+
+/**
+ * tb_caches_get:
+ *
+ * Get the TB cache for the given bitmap index.
+ */
+static struct qht *tb_caches_get(TBContext *tb_ctx, unsigned long *bitmap);
 
 /* translate-all.c */
 void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len);
@@ -33,4 +47,7 @@ void tb_check_watchpoint(CPUState *cpu);
 int page_unprotect(target_ulong address, uintptr_t pc);
 #endif
 
+
+#include "translate-all.inc.h"
+
 #endif /* TRANSLATE_ALL_H */
diff --git a/translate-all.inc.h b/translate-all.inc.h
new file mode 100644
index 0000000..c60a48e
--- /dev/null
+++ b/translate-all.inc.h
@@ -0,0 +1,13 @@
+/* Inline implementations for translate-all.h */
+
+static inline size_t tb_caches_count(void)
+{
+    return 1ULL << TRACE_VCPU_EVENT_COUNT;
+}
+
+static inline struct qht *tb_caches_get(TBContext *tb_ctx,
+                                        unsigned long *bitmap)
+{
+    unsigned long idx = *bitmap;
+    return &tb_ctx->htables[idx];
+}
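
Side note on the __error__too_many_vcpu_events declaration added to
translate-all.c above: it is the classic pre-C11 negative-array-size idiom
for compile-time assertions, here guaranteeing that the event bitmap fits
in a single long (so tb_caches_get() may dereference it as one). A
standalone sketch of the idiom, with made-up names:

/* Compile-time assertion: when COND is false the typedef'd array gets a
 * negative size, which the compiler rejects; when true it is harmless. */
#define BUILD_ASSERT(cond, name) \
    typedef char build_assert_##name[(cond) ? 1 : -1]

/* Holds on any host QEMU supports, so this compiles fine. */
BUILD_ASSERT(sizeof(unsigned long) >= 4, ulong_at_least_32_bits);

/* A false condition such as BUILD_ASSERT(1 == 2, never) would abort the
 * build with an "array size is negative" style error. */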
