[Qemu-devel] [RFC 11/11] arm/tcg-test: some basic TCG exercising tests

From: "Alex Bennée" <alex.bennee@linaro.org>
To: mttcg@listserver.greensocs.com, mark.burton@greensocs.com,
	fred.konrad@greensocs.com, a.rigo@virtualopensystems.com
Cc: peter.maydell@linaro.org, drjones@redhat.com,
	a.spyridakis@virtualopensystems.com, claudio.fontana@huawei.com,
	qemu-devel@nongnu.org, will.deacon@arm.com,
	crosthwaitepeter@gmail.com, pbonzini@redhat.com,
	"Alex Bennée" <alex.bennee@linaro.org>,
	aurelien@aurel32.net, rth@twiddle.net
Subject: [Qemu-devel] [RFC 11/11] arm/tcg-test: some basic TCG exercising tests
Date: Fri, 26 Feb 2016 13:15:33 +0000	[thread overview]
Message-ID: <1456492533-17171-12-git-send-email-alex.bennee@linaro.org> (raw)
In-Reply-To: <1456492533-17171-1-git-send-email-alex.bennee@linaro.org>

These tests are not really aimed at KVM at all but exist to stretch
QEMU's TCG code generator. In particular these exercise the ability of
the TCG to:

  * Chain TranslationBlocks together (tight)
  * Handle heavy usage of the tb_jump_cache (paged)
  * Pathological case of computed local jumps (computed)

In addition the tests can be varied by adding IPI IRQs or SMC sequences
into the mix to stress the tcg_exit and invalidation mechanisms.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
 arm/tcg-test-asm.S           | 170 +++++++++++++++++++++++++++++
 arm/tcg-test.c               | 248 +++++++++++++++++++++++++++++++++++++++++++
 arm/unittests.cfg            |  90 ++++++++++++++++
 config/config-arm-common.mak |   2 +
 4 files changed, 510 insertions(+)
 create mode 100644 arm/tcg-test-asm.S
 create mode 100644 arm/tcg-test.c

diff --git a/arm/tcg-test-asm.S b/arm/tcg-test-asm.S
new file mode 100644
index 0000000..6e823b7
--- /dev/null
+++ b/arm/tcg-test-asm.S
@@ -0,0 +1,170 @@
+/*
+ * TCG Test assembler functions for armv7 tests.
+ *
+ * Copyright (C) 2016, Linaro Ltd, Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.
+ *
+ * These helper functions are written in pure asm to control the size
+ * of the basic blocks and ensure they fit neatly into page
+ * aligned chunks. The pattern of branches they follow is determined by
+ * the 32 bit seed they are passed. It should be the same for each set.
+ *
+ * Calling convention
+ *  - r0, iterations
+ *  - r1, jump pattern
+ *  - r2-r3, scratch
+ *
+ * Returns r0
+ */
+
+.arm
+
+.section .text
+
+/* Tight - direct branches only (r0 = iterations, r1 = pattern), so all blocks
+ * should quickly be patched and run very fast unless irqs or smc get in the way
+ */
+
+.global tight_start
+tight_start:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     tight_end               /* count reached zero: return with r0 == 0 */
+
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        tst     r1, #1
+        beq     tightA                  /* bit clear: go to tightA */
+        b       tight_start             /* bit set: run this block again */
+
+tightA:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     tight_end               /* count reached zero: return with r0 == 0 */
+
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        tst     r1, #1
+        beq     tightB                  /* bit clear: go to tightB */
+        b       tight_start             /* bit set: back to tight_start */
+
+tightB:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     tight_end               /* count reached zero: return with r0 == 0 */
+
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        tst     r1, #1
+        beq     tight_start             /* bit clear: back to tight_start */
+        b       tightA                  /* bit set: go to tightA */
+
+.global tight_end
+tight_end:
+        mov     pc, lr                  /* return to the caller */
+
+/*
+ * Computed jumps cannot be hardwired into the basic blocks so each one
+ * will cause an exit for the main execution loop to look up the next block.
+ *
+ * There is some caching which should ameliorate the cost a little.
+ */
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (a multiple of 4096) */
+        .align 13
+        .global computed_start
+computed_start:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     computed_end            /* count reached zero: return with r0 == 0 */
+
+        /* Jump table: load the target address from memory, indexed by the pattern bit */
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        and     r2, r1, #1              /* r2 = table index (0 or 1) */
+        adr     r3, computed_jump_table
+        ldr     r2, [r3, r2, lsl #2]    /* r2 = table[index], 4 bytes per entry */
+        mov     pc, r2
+
+        b       computed_err            /* only reached if the jump above was not taken */
+
+computed_jump_table:
+        .word   computed_start          /* index 0: run this block again */
+        .word   computedA               /* index 1: go to computedA */
+
+computedA:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     computed_end            /* count reached zero: return with r0 == 0 */
+
+        /* Jump into code: compute the address of one of the two branches at 1: */
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        and     r2, r1, #1              /* r2 = branch index (0 or 1) */
+        adr     r3, 1f
+        add	r3, r2, lsl #2          /* r3 = 1f + index * 4 (one instruction each) */
+        mov     pc, r3
+1:      b       computed_start          /* index 0 */
+        b       computedB               /* index 1 */
+
+        b       computed_err            /* no branch targets this; error catch-all */
+
+
+computedB:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     computed_end            /* count reached zero: return with r0 == 0 */
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+
+        /* Conditional register load: pick the target address with a predicated adr */
+        adr     r3, computedA           /* default target (bit set) */
+        tst     r1, #1
+        adreq   r3, computed_start      /* bit clear: target computed_start instead */
+        mov     pc, r3
+
+        b       computed_err            /* only reached if the jump above was not taken */
+
+computed_err:
+        mov     r0, #1                  /* return 1 to flag the error */
+        .global computed_end
+computed_end:
+        mov     pc, lr                  /* return to the caller */
+
+
+/*
+ * Page hopping
+ *
+ * Each block is in a different page, hence the blocks never get joined
+ */
+        /* .align 13 == 1 << 13 == 8192 byte alignment (a multiple of 4096) */
+        .align 13
+        .global paged_start
+paged_start:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     paged_end               /* count reached zero: return with r0 == 0 */
+
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        tst     r1, #1
+        beq     pagedA                  /* bit clear: go to pagedA */
+        b       paged_start             /* bit set: run this block again */
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (a multiple of 4096) */
+        .align 13
+pagedA:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     paged_end               /* count reached zero: return with r0 == 0 */
+
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        tst     r1, #1
+        beq     pagedB                  /* bit clear: go to pagedB */
+        b       paged_start             /* bit set: back to paged_start */
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (a multiple of 4096) */
+        .align 13
+pagedB:
+        subs    r0, r0, #1              /* one iteration consumed */
+        beq     paged_end               /* count reached zero: return with r0 == 0 */
+
+        ror     r1, r1, #1              /* rotate the next pattern bit into bit 0 */
+        tst     r1, #1
+        beq     paged_start             /* bit clear: back to paged_start */
+        b       pagedA                  /* bit set: go to pagedA */
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (a multiple of 4096) */
+        .align 13
+.global paged_end
+paged_end:
+        mov     pc, lr                  /* return to the caller */
+
+.global test_code_end
+test_code_end:                          /* label placed after the last test instruction */
diff --git a/arm/tcg-test.c b/arm/tcg-test.c
new file mode 100644
index 0000000..6fa61ba
--- /dev/null
+++ b/arm/tcg-test.c
@@ -0,0 +1,248 @@
+/*
+ * ARM TCG Tests
+ *
+ * These tests are explicitly aimed at stretching the QEMU TCG engine.
+ */
+
+#include <libcflat.h>
+#include <asm/smp.h>
+#include <asm/cpumask.h>
+#include <asm/barrier.h>
+#include <asm/mmu.h>
+#include <asm/gic.h>
+
+#include <prng.h>
+
+#define MAX_CPUS 8
+
+/* These entry points are in the assembly code */
+extern int tight_start(uint32_t count, uint32_t pattern);
+extern int computed_start(uint32_t count, uint32_t pattern);
+extern int paged_start(uint32_t count, uint32_t pattern);
+extern uint32_t tight_end;		/* asm label; only its address is used */
+extern uint32_t computed_end;		/* asm label; only its address is used */
+extern uint32_t paged_end;		/* asm label; only its address is used */
+extern unsigned long test_code_end;	/* asm label after the last test block */
+
+typedef int (*test_fn)(uint32_t count, uint32_t pattern);	/* signature shared by the asm entry points */
+
+typedef struct {
+	const char *test_name;		/* matched against the command line in main() */
+	bool       should_pass;		/* set in tests[] but not read in this file */
+	test_fn    start_fn;		/* asm entry point called by do_test() */
+	uint32_t   *code_end;		/* end of the range rewritten in smc mode */
+} test_descr_t;
+
+/* Test array */
+static test_descr_t tests[] = {
+	/*
+	 * Tight chain.
+	 *
+	 * These are a bunch of basic blocks that have fixed branches in
+	 * a page aligned space. The branches taken are decided by a
+	 * pseudo-random bitmap for each CPU.
+	 *
+	 * Once the basic blocks have been chained together by the TCG they
+	 * should run until they reach their block count. This will be the
+	 * most efficient mode in which generated code is run. The only other
+	 * exits will be caused by interrupts or TB invalidation.
+	 */
+	{ "tight", true, tight_start, &tight_end },
+	/*
+	 * Computed jumps.
+	 *
+	 * A bunch of basic blocks which just do computed jumps so the basic
+	 * block is never chained but they are all within a page (maybe not
+	 * required). This will exercise the cache lookup but not the new
+	 * generation.
+	 */
+	{ "computed", true, computed_start, &computed_end },
+	/*
+	 * Page ping pong.
+	 *
+	 * The blocks are separated by at least PAGE_SIZE so they can never
+	 * be chained together.
+	 *
+	 */
+	{ "paged", true, paged_start, &paged_end}
+};
+
+static test_descr_t *test = NULL;	/* selected in main(); NULL means no test name matched */
+
+static int iterations = 100000;		/* count passed to start_fn on each call */
+static int rounds = 1000;		/* number of start_fn calls made by do_test() */
+static int mod_freq = 5;		/* smc/irq modifiers fire when (round + cpu) % mod_freq == 0 */
+static uint32_t pattern[MAX_CPUS];	/* per-CPU branch pattern from the PRNG */
+
+static int smc = 0;			/* set by the "smc" argument */
+static int irq = 0;			/* set by the "irq" argument */
+static int irq_cnt[MAX_CPUS];		/* IRQs received, counted in irq_handler() */
+static int errors[MAX_CPUS];		/* sum of start_fn return values per CPU */
+
+static cpumask_t smp_test_complete;	/* each CPU sets its bit at the end of do_test() */
+
+
+/* This triggers TCG's SMC detection by storing to every word in the
+ * range [start, end). Each word is read and written back unmodified, so
+ * the underlying code will remain unchanged. However this should trigger
+ * invalidation of the Translation Blocks
+ */
+
+void trigger_smc_detection(uint32_t *start, uint32_t *end)
+{
+	volatile uint32_t *ptr = start;	/* volatile: keep the compiler from eliding the self-store */
+	while (ptr < end) {
+		uint32_t inst = *ptr;
+		*ptr++ = inst;		/* write back the value just read */
+	}
+}
+
+/* Handler for receiving IRQs: counts the IRQ against the current CPU, then acks it */
+
+static void irq_handler(struct pt_regs *regs __unused)
+{
+	int cpu = smp_processor_id();
+	irq_cnt[cpu]++;		/* printed by do_test() when running in irq mode */
+	gic_irq_ack();
+}
+
+/* This triggers cross-CPU IRQs. Each IRQ should cause the basic block
+ * execution to finish and the main run-loop to be entered again.
+ */
+int send_cross_cpu_irqs(int this_cpu)
+{
+	int cpu, sent = 0;
+
+	for_each_present_cpu(cpu) {
+		if (cpu != this_cpu) {	/* never interrupt ourselves */
+			gic_send_sgi(cpu, 1);	/* NOTE(review): 1 is presumably the SGI number - confirm */
+			sent++;
+		}
+	}
+
+	return sent;	/* number of SGIs sent */
+}
+
+
+void do_test(void)	/* per-CPU body: run the selected test for 'rounds' rounds */
+{
+	int cpu = smp_processor_id();
+	int i;
+	int sent_irqs = 0;
+
+	printf("CPU%d: online and setting up with pattern 0x%x\n", cpu, pattern[cpu]);
+
+	if (irq) {
+		gic_enable();
+#ifdef __arm__
+		install_exception_handler(EXCPTN_IRQ, irq_handler);
+#else
+		install_irq_handler(EL1H_IRQ, irq_handler);
+#endif
+		local_irq_enable();
+	}
+
+	for (i=0; i<rounds; i++)
+	{
+		/* Enter the blocks; a non-zero return is added to this CPU's errors */
+		errors[cpu] += test->start_fn(iterations, pattern[cpu]);
+
+		if ((i + cpu) % mod_freq == 0)	/* CPUs hit this on different rounds */
+		{
+			if (smc) {
+				trigger_smc_detection((uint32_t *) test->start_fn,
+						test->code_end);
+			}
+			if (irq) {
+				sent_irqs += send_cross_cpu_irqs(cpu);
+			}
+		}
+	}
+
+	if (irq) {
+		printf("CPU%d: Done with %d irqs sent and %d received\n", cpu, sent_irqs, irq_cnt[cpu]);
+	} else {
+		printf("CPU%d: Done with %d errors\n", cpu, errors[cpu]);
+	}
+
+	cpumask_set_cpu(cpu, &smp_test_complete);	/* polled by setup_and_run_tcg_test() */
+	if (cpu != 0)
+		halt();		/* secondary CPUs call halt(); CPU 0 returns to its caller */
+}
+
+
+/* Seed the per-CPU patterns, run do_test() on every CPU and report the result */
+void setup_and_run_tcg_test(void)
+{
+	static const unsigned char seed[] = "tcg-test";
+	struct isaac_ctx prng_context;
+	int cpu, total_errors = 0;
+
+	isaac_init(&prng_context, &seed[0], sizeof(seed));
+
+	if (irq)
+		gic_enable();
+
+	/* give every CPU its pattern, then boot the secondaries */
+	for_each_present_cpu(cpu) {
+		pattern[cpu] = isaac_next_uint32(&prng_context);
+
+		if (cpu != 0)
+			smp_boot_secondary(cpu, do_test);
+	}
+
+	do_test();
+
+	while (!cpumask_full(&smp_test_complete))
+		cpu_relax();
+
+	/* all CPUs are done: pass only if no test block returned an error */
+	for_each_present_cpu(cpu)
+		total_errors += errors[cpu];
+	report("passed", total_errors == 0);
+}
+
+/* Parse the test name and modifiers ("mod=N", "smc", "irq"), then run the test */
+int main(int argc, char **argv)
+{
+	int i;
+	unsigned int j;
+
+	for (i=0; i<argc; i++) {
+		char *arg = argv[i];
+
+		for (j = 0; j < ARRAY_SIZE(tests); j++)
+			if (strcmp(arg, tests[j].test_name) == 0)
+				test = &tests[j];
+
+		/* Test modifiers; do_test() uses mod_freq as a modulus, so ignore values <= 0 */
+		if (strstr(arg, "mod=") != NULL) {
+			int freq = atol(strstr(arg, "=") + 1);
+			if (freq > 0)
+				mod_freq = freq;
+		}
+
+		if (strcmp(arg, "smc") == 0) {
+			unsigned long test_start = (unsigned long) &tight_start;
+			unsigned long test_end = (unsigned long) &test_code_end;
+
+			smc = 1;	/* NOTE(review): remap presumably makes the code writable - confirm */
+			mmu_set_range_ptes(mmu_idmap, test_start, test_start, test_end,
+					__pgprot(PTE_WBWA));
+
+			report_prefix_push("smc");
+		}
+
+		if (strcmp(arg, "irq") == 0) {
+			irq = 1;
+			report_prefix_push("irq");
+		}
+	}
+
+	if (test)
+		setup_and_run_tcg_test();
+	else
+		report("Unknown test", false);
+
+	return report_summary();
+}
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index 3f05cfa..c00be15 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -39,6 +39,11 @@ smp = $MAX_SMP
 extra_params = -append 'smp'
 groups = selftest
 
+[irq-ipi]
+file = ipi-test.flat
+smp = 2
+groups = irq
+
 # TLB Torture Tests
 [tlbflush::all_other]
 file = tlbflush-test.flat
@@ -118,3 +123,88 @@ smp = 2
 extra_params = -append 'mp_acqrel'
 groups = barrier
 accel = tcg
+
+# TCG Tests
+[tcg::tight]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'tight'
+groups = tcg
+accel = tcg
+
+[tcg::tight-smc]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'tight smc'
+groups = tcg
+accel = tcg
+
+[tcg::tight-irq]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'tight irq'
+groups = tcg
+accel = tcg
+
+[tcg::tight-smc-irq]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'tight smc irq'
+groups = tcg
+accel = tcg
+
+[tcg::computed]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'computed'
+groups = tcg
+accel = tcg
+
+[tcg::computed-smc]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'computed smc'
+groups = tcg
+accel = tcg
+
+[tcg::computed-irq]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'computed irq'
+groups = tcg
+accel = tcg
+
+[tcg::computed-smc-irq]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'computed smc irq'
+groups = tcg
+accel = tcg
+
+[tcg::paged]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'paged'
+groups = tcg
+accel = tcg
+
+[tcg::paged-smc]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'paged smc'
+groups = tcg
+accel = tcg
+
+[tcg::paged-irq]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'paged irq'
+groups = tcg
+accel = tcg
+
+[tcg::paged-smc-irq]
+file = tcg-test.flat
+smp = $MAX_SMP
+extra_params = -append 'paged smc irq'
+groups = tcg
+accel = tcg
diff --git a/config/config-arm-common.mak b/config/config-arm-common.mak
index 023a0c3..23208b0 100644
--- a/config/config-arm-common.mak
+++ b/config/config-arm-common.mak
@@ -15,6 +15,7 @@ tests-common += $(TEST_DIR)/ipi-test.flat
 tests-common += $(TEST_DIR)/tlbflush-test.flat
 tests-common += $(TEST_DIR)/locking-test.flat
 tests-common += $(TEST_DIR)/barrier-litmus-test.flat
+tests-common += $(TEST_DIR)/tcg-test.flat
 
 all: test_cases
 
@@ -78,3 +79,4 @@ $(TEST_DIR)/ipi-test.elf: $(cstart.o) $(TEST_DIR)/ipi-test.o
 $(TEST_DIR)/tlbflush-test.elf: $(cstart.o) $(TEST_DIR)/tlbflush-test.o
 $(TEST_DIR)/locking-test.elf: $(cstart.o) $(TEST_DIR)/locking-test.o
 $(TEST_DIR)/barrier-litmus-test.elf: $(cstart.o) $(TEST_DIR)/barrier-litmus-test.o
+$(TEST_DIR)/tcg-test.elf: $(cstart.o) $(TEST_DIR)/tcg-test.o $(TEST_DIR)/tcg-test-asm.o
-- 
2.7.1