All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/7] xen/arm: run on real hardware
@ 2012-10-24 15:03 Stefano Stabellini
  2012-10-24 15:03 ` [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq Stefano Stabellini
                   ` (7 more replies)
  0 siblings, 8 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: Tim Deegan, Ian Campbell, Stefano Stabellini

Hi all,
this is a collection of fixes that I wrote trying to run Xen on a real
Versatile Express Cortex A15 machine.


Stefano Stabellini (7):
      xen/arm: fix rank calculation in vgic_vcpu_inject_irq
      xen/arm: setup the fixmap in head.S
      pl011: set baud and clock_hz to the right defaults for Versatile Express
      xen/arm: set the SMP bit in the ACTLR register
      xen/arm: wake up secondary cpus
      xen/arm: flush D-cache and I-cache when appropriate
      xen/arm: get the number of cpus from device tree

 xen/arch/arm/early_printk.c     |    5 +--
 xen/arch/arm/gic.c              |    4 +--
 xen/arch/arm/gic.h              |    2 +-
 xen/arch/arm/head.S             |   55 ++++++++++++++++++++++++++++----------
 xen/arch/arm/mm.c               |   16 ++++++++++-
 xen/arch/arm/mode_switch.S      |   31 ++++++++++++++++++++++
 xen/arch/arm/setup.c            |    5 +--
 xen/arch/arm/smpboot.c          |    2 +
 xen/arch/arm/vgic.c             |    2 +-
 xen/common/device_tree.c        |   20 ++++++++++++++
 xen/drivers/char/pl011.c        |    4 +-
 xen/include/asm-arm/page.h      |   14 ++++++++++
 xen/include/asm-arm/processor.h |   30 +++++++++++++++++++++
 xen/include/xen/device_tree.h   |    1 +
 14 files changed, 161 insertions(+), 30 deletions(-)

Cheers,

Stefano

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
@ 2012-10-24 15:03 ` Stefano Stabellini
  2012-10-25  9:27   ` Ian Campbell
  2012-10-24 15:03 ` [PATCH 2/7] xen/arm: setup the fixmap in head.S Stefano Stabellini
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

Each rank holds 32 irqs, so we should divide by 32 rather than by 4.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/arch/arm/vgic.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
index 3c3983f..5eae61c 100644
--- a/xen/arch/arm/vgic.c
+++ b/xen/arch/arm/vgic.c
@@ -577,7 +577,7 @@ struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
 
 void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq, int virtual)
 {
-    int idx = irq >> 2, byte = irq & 0x3;
+    int idx = irq / 32, byte = irq & 0x3;
     uint8_t priority;
     struct vgic_irq_rank *rank = vgic_irq_rank(v, 8, idx);
     struct pending_irq *iter, *n = irq_to_pending(v, irq);
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 2/7] xen/arm: setup the fixmap in head.S
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
  2012-10-24 15:03 ` [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq Stefano Stabellini
@ 2012-10-24 15:03 ` Stefano Stabellini
  2012-10-24 15:27   ` Tim Deegan
  2012-10-25  9:33   ` Ian Campbell
  2012-10-24 15:03 ` [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express Stefano Stabellini
                   ` (5 subsequent siblings)
  7 siblings, 2 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

Setup the fixmap mapping directly in head.S rather than having a
temporary mapping only to re-do it later in C.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/arch/arm/early_printk.c |    5 ++---
 xen/arch/arm/head.S         |   32 +++++++++++++++++++-------------
 xen/arch/arm/mm.c           |    2 +-
 xen/arch/arm/setup.c        |    2 --
 4 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/xen/arch/arm/early_printk.c b/xen/arch/arm/early_printk.c
index 3e51252..bdf4c0e 100644
--- a/xen/arch/arm/early_printk.c
+++ b/xen/arch/arm/early_printk.c
@@ -17,12 +17,11 @@
 
 #ifdef EARLY_UART_ADDRESS
 
-static void __init early_putch(char c)
+void __init early_putch(char c)
 {
     volatile uint32_t *r;
 
-    r = (uint32_t *)((EARLY_UART_ADDRESS & 0x001fffff)
-                     + XEN_VIRT_START + (1 << 21));
+    r = (uint32_t *)(XEN_VIRT_START + (1 << 21));
 
     /* XXX: assuming a PL011 UART. */
     while(*(r + 0x6) & 0x8)
diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
index 9fdeda7..3d01be0 100644
--- a/xen/arch/arm/head.S
+++ b/xen/arch/arm/head.S
@@ -24,6 +24,7 @@
 #define PT_PT  0xe7f /* nG=1, AF=1, SH=10, AP=01, NS=1, ATTR=111, T=1, P=1 */
 #define PT_MEM 0xe7d /* nG=1, AF=1, SH=10, AP=01, NS=1, ATTR=111, T=0, P=1 */
 #define PT_DEV 0xe71 /* nG=1, AF=1, SH=10, AP=01, NS=1, ATTR=100, T=0, P=1 */
+#define PT_DEV_L3 0xe73 /* lev3: nG=1, AF=1, SH=10, AP=01, NS=1, ATTR=100, T=1, P=1 */
 
 #define PT_UPPER(x) (PT_##x & 0xf00)
 #define PT_LOWER(x) (PT_##x & 0x0ff)
@@ -183,6 +184,16 @@ skip_bss:
 	teq   r12, #0
 	bne   pt_ready
 	
+	/* console fixmap */
+	ldr   r1, =xen_fixmap
+	add   r1, r1, r10            /* r1 := paddr (xen_fixmap) */
+	mov   r3, #0
+	lsr   r2, r11, #12
+	lsl   r2, r2, #12            /* 4K aligned paddr of UART */
+	orr   r2, r2, #PT_UPPER(DEV_L3)
+	orr   r2, r2, #PT_LOWER(DEV_L3) /* r2:r3 := 4K dev map including UART */
+	strd  r2, r3, [r1, #0]       /* Map it in the first fixmap's slot */
+	
 	/* Build the baseline idle pagetable's first-level entries */
 	ldr   r1, =xen_second
 	add   r1, r1, r10            /* r1 := paddr (xen_second) */
@@ -205,12 +216,13 @@ skip_bss:
 	ldr   r4, =start
 	lsr   r4, #18                /* Slot for vaddr(start) */
 	strd  r2, r3, [r1, r4]       /* Map Xen there too */
+
 #ifdef EARLY_UART_ADDRESS
-	ldr   r3, =(1<<(54-32))      /* NS for device mapping */
-	lsr   r2, r11, #21
-	lsl   r2, r2, #21            /* 2MB-aligned paddr of UART */
-	orr   r2, r2, #PT_UPPER(DEV)
-	orr   r2, r2, #PT_LOWER(DEV) /* r2:r3 := 2MB dev map including UART */
+	/* xen_fixmap pagetable */
+	ldr r2, =xen_fixmap
+	add r2, r2, r10
+	orr   r2, r2, #PT_UPPER(PT)
+	orr   r2, r2, #PT_LOWER(PT) /* r2:r3 := paddr (xen_fixmap) */
 	add   r4, r4, #8
 	strd  r2, r3, [r1, r4]       /* Map it in the fixmap's slot */
 #else
@@ -236,13 +248,9 @@ pt_ready:
 	mov   pc, r1                 /* Get a proper vaddr into PC */
 paging:
 
+	
 #ifdef EARLY_UART_ADDRESS
-	/* Recover the UART address in the new address space. */
-	lsl   r11, #11
-	lsr   r11, #11               /* UART base's offset from 2MB base */
-	adr   r0, start
-	add   r0, r0, #0x200000      /* vaddr of the fixmap's 2MB slot */
-	add   r11, r11, r0           /* r11 := vaddr (UART base address) */
+	ldr r11, =FIXMAP_ADDR(FIXMAP_CONSOLE)
 #endif
 
 	PRINT("- Ready -\r\n")
@@ -261,8 +269,6 @@ paging:
 	mcr   CP32(r0, BPIALL)       /* Flush branch predictor */
 	dsb                          /* Ensure completion of TLB+BP flush */
 	isb
- 	/* Now, the UART is in its proper fixmap address */
-	ldrne r11, =FIXMAP_ADDR(FIXMAP_CONSOLE)
 
 	/* Non-boot CPUs report that they've got this far */
 	ldr   r0, =ready_cpus
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index b0068d2..d0cd2c9 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -37,7 +37,7 @@ struct domain *dom_xen, *dom_io;
 /* Static start-of-day pagetables that we use before the allocators are up */
 lpae_t xen_pgtable[LPAE_ENTRIES] __attribute__((__aligned__(4096)));
 lpae_t xen_second[LPAE_ENTRIES*4] __attribute__((__aligned__(4096*4)));
-static lpae_t xen_fixmap[LPAE_ENTRIES] __attribute__((__aligned__(4096)));
+lpae_t xen_fixmap[LPAE_ENTRIES] __attribute__((__aligned__(4096)));
 static lpae_t xen_xenmap[LPAE_ENTRIES] __attribute__((__aligned__(4096)));
 
 /* Non-boot CPUs use this to find the correct pagetables. */
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index 7568968..6fbcb81 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -194,9 +194,7 @@ void __init start_xen(unsigned long boot_phys_offset,
     setup_pagetables(boot_phys_offset, get_xen_paddr());
 
 #ifdef EARLY_UART_ADDRESS
-    /* Map the UART */
     /* TODO Need to get device tree or command line for UART address */
-    set_fixmap(FIXMAP_CONSOLE, EARLY_UART_ADDRESS >> PAGE_SHIFT, DEV_SHARED);
     pl011_init(0, FIXMAP_ADDR(FIXMAP_CONSOLE));
     console_init_preirq();
 #endif
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
  2012-10-24 15:03 ` [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq Stefano Stabellini
  2012-10-24 15:03 ` [PATCH 2/7] xen/arm: setup the fixmap in head.S Stefano Stabellini
@ 2012-10-24 15:03 ` Stefano Stabellini
  2012-10-25  9:37   ` Ian Campbell
  2012-10-24 15:03 ` [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register Stefano Stabellini
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/drivers/char/pl011.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xen/drivers/char/pl011.c b/xen/drivers/char/pl011.c
index 6af45aa..6ccb73a 100644
--- a/xen/drivers/char/pl011.c
+++ b/xen/drivers/char/pl011.c
@@ -241,8 +241,8 @@ void __init pl011_init(int index, unsigned long register_base_address)
 
     uart = &pl011_com[index];
 
-    uart->clock_hz  = 7372800;
-    uart->baud      = 115200;
+    uart->clock_hz  = 0x16e3600;
+    uart->baud      = 38400;
     uart->data_bits = 8;
     uart->parity    = PARITY_NONE;
     uart->stop_bits = 1;
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
                   ` (2 preceding siblings ...)
  2012-10-24 15:03 ` [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express Stefano Stabellini
@ 2012-10-24 15:03 ` Stefano Stabellini
  2012-10-25  9:52   ` Ian Campbell
  2012-10-24 15:03 ` [PATCH 5/7] xen/arm: wake up secondary cpus Stefano Stabellini
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

>From the Cortex A15 manual:

"Enables the processor to receive instruction cache, BTB, and TLB maintenance
operations from other processors"

...

"You must set this bit before enabling the caches and MMU, or
performing any cache and TLB maintenance operations. The only time
you must clear this bit is during a processor power-down sequence"

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/arch/arm/head.S             |    6 ++++++
 xen/include/asm-arm/processor.h |   30 ++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
index 3d01be0..c784f4d 100644
--- a/xen/arch/arm/head.S
+++ b/xen/arch/arm/head.S
@@ -148,6 +148,12 @@ skip_bss:
 
 	PRINT("- Setting up control registers -\r\n")
 	
+	/* XXX: Cortex A15 specific */
+	/* Set up the SMP bit in ACTLR */
+	mrc   CP32(r0, ACTLR)
+	orr   r0, r0, #(ACTLR_SMP) /* enable SMP bit*/
+	mcr   CP32(r0, ACTLR)
+
 	/* Set up memory attribute type tables */
 	ldr   r0, =MAIR0VAL
 	ldr   r1, =MAIR1VAL
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index 3849b23..91a5836 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -34,6 +34,36 @@
 #define SCTLR_A         (1<<1)
 #define SCTLR_M         (1<<0)
 
+/* ACTLR Auxiliary Control Register */
+#define ACTLR_SNOOP_DELAYED      (1<<31)
+#define ACTLR_MAIN_CLOCK         (1<<30)
+#define ACTLR_NEON_CLOCK         (1<<29)
+#define ACTLR_NONCACHE           (1<<24)
+#define ACTLR_INORDER_REQ        (1<<23)
+#define ACTLR_INORDER_LOAD       (1<<22)
+#define ACTLR_L2_TLB_PREFETCH    (1<<21)
+#define ACTLR_L2_IPA_PA_CACHE    (1<<20)
+#define ACTLR_L2_CACHE           (1<<19)
+#define ACTLR_L2_PA_CACHE        (1<<18)
+#define ACTLR_TLB                (1<<17)
+#define ACTLR_STRONGY_ORDERED    (1<<16)
+#define ACTLR_INORDER            (1<<15)
+#define ACTLR_FORCE_LIM          (1<<14)
+#define ACTLR_CP_FLUSH           (1<<13)
+#define ACTLR_CP_PUSH            (1<<12)
+#define ACTLR_LIM                (1<<11)
+#define ACTLR_SER                (1<<10)
+#define ACTLR_OPT                (1<<9)
+#define ACTLR_WFI                (1<<8)
+#define ACTLR_WFE                (1<<7)
+#define ACTLR_SMP                (1<<6)
+#define ACTLR_PLD                (1<<5)
+#define ACTLR_IP                 (1<<4)
+#define ACTLR_MICRO_BTB          (1<<3)
+#define ACTLR_LOOP_ONE           (1<<2)
+#define ACTLR_LOOP_DISABLE       (1<<1)
+#define ACTLR_BTB                (1<<0)
+
 #define SCTLR_BASE        0x00c50078
 #define HSCTLR_BASE       0x30c51878
 
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
                   ` (3 preceding siblings ...)
  2012-10-24 15:03 ` [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register Stefano Stabellini
@ 2012-10-24 15:03 ` Stefano Stabellini
  2012-10-24 15:38   ` Tim Deegan
  2012-10-25  9:59   ` Ian Campbell
  2012-10-24 15:03 ` [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate Stefano Stabellini
                   ` (2 subsequent siblings)
  7 siblings, 2 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

Secondary cpus are held by the firmware until we send an IPI to them.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/arch/arm/head.S        |    8 ++++++--
 xen/arch/arm/mode_switch.S |   31 +++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
index c784f4d..39c4774 100644
--- a/xen/arch/arm/head.S
+++ b/xen/arch/arm/head.S
@@ -79,12 +79,12 @@ start:
 	beq   boot_cpu               /* If we're CPU 0, boot now */
 
 	/* Non-boot CPUs wait here to be woken up one at a time. */
-1:	wfe
-	dsb
+1:	dsb
 	ldr   r0, =smp_up_cpu        /* VA of gate */
 	add   r0, r0, r10            /* PA of gate */
 	ldr   r1, [r0]               /* Which CPU is being booted? */
 	teq   r1, r12                /* Is it us? */
+	wfene
 	bne   1b
 
 boot_cpu:
@@ -98,6 +98,10 @@ boot_cpu:
 	PRINT(" booting -\r\n")
 #endif
 
+	/* Wake up secondary cpus */
+	teq   r12, #0
+	bleq  kick_cpus
+
 	/* Check that this CPU has Hyp mode */
 	mrc   CP32(r0, ID_PFR1)
 	and   r0, r0, #0xf000        /* Bits 12-15 define virt extensions */
diff --git a/xen/arch/arm/mode_switch.S b/xen/arch/arm/mode_switch.S
index 83a682b..39d80e8 100644
--- a/xen/arch/arm/mode_switch.S
+++ b/xen/arch/arm/mode_switch.S
@@ -21,6 +21,37 @@
 #include <asm/page.h>
 #include <asm/asm_defns.h>
 
+/* XXX: Versatile Express specific code */
+/* wake up secondary cpus */
+.globl kick_cpus
+kick_cpus:
+    /* write start paddr to v2m sysreg FLAGSSET register */
+	ldr   r0, =0x1c010000 /* base V2M sysreg MMIO address */
+	add   r1, r0, #0x34   /* SYS_FLAGSCLR register */
+	dsb
+	mov   r2, #0xffffffff
+	str   r2, [r1]
+	dsb
+	add   r1, r0, #0x30   /* SYS_FLAGSSET register */
+	ldr   r2, =start
+	add   r2, r2, r10
+	str   r2, [r1]
+	dsb
+	/* send an interrupt */
+	ldr   r0, =0x2c001000 /* base GICD MMIO address */
+	mov   r1, r0
+	mov   r2, #0x1        /* GICD_CTLR */
+	str   r2, [r1]        /* enable distributor */
+	add   r1, r0, #0xf00  /* GICD_SGIR */
+	mov   r2, #0xfe0000
+	str   r2, [r1]        /* send IPI to everybody */
+	dsb
+	mov   r1, r0
+	mov   r2, #0x0        /* GICD_CTLR */
+	str   r2, [r1]        /* disable distributor */
+	mov   pc, lr
+
+
 /* Get up a CPU into Hyp mode.  Clobbers r0-r3.
  *
  * Expects r12 == CPU number
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
                   ` (4 preceding siblings ...)
  2012-10-24 15:03 ` [PATCH 5/7] xen/arm: wake up secondary cpus Stefano Stabellini
@ 2012-10-24 15:03 ` Stefano Stabellini
  2012-10-24 15:59   ` Tim Deegan
  2012-10-24 15:03 ` [PATCH 7/7] xen/arm: get the number of cpus from device tree Stefano Stabellini
  2012-10-24 16:02 ` [PATCH 0/7] xen/arm: run on real hardware Tim Deegan
  7 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

- invalidate tlb after setting WXN;
- flush D-cache and I-cache after relocation;
- flush D-cache after writing to smp_up_cpu;
- flush TLB before changing HTTBR;
- flush I-cache after changing HTTBR;
- flush I-cache and branch predictor after writing Xen text ptes.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/arch/arm/head.S        |    9 +++++++++
 xen/arch/arm/mm.c          |   14 +++++++++++++-
 xen/arch/arm/smpboot.c     |    2 ++
 xen/include/asm-arm/page.h |   14 ++++++++++++++
 4 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
index 39c4774..4c420ac 100644
--- a/xen/arch/arm/head.S
+++ b/xen/arch/arm/head.S
@@ -274,8 +274,15 @@ paging:
 	ldr   r4, =boot_httbr        /* VA of HTTBR value stashed by CPU 0 */
 	add   r4, r4, r10            /* PA of it */
 	ldrd  r4, r5, [r4]           /* Actual value */
+	dsb
+	mcr   CP32(r0, TLBIALLH)     /* Flush hypervisor TLB */
+	dsb
+	isb
 	mcrr  CP64(r4, r5, HTTBR)
+	dsb
+	isb
 	mcr   CP32(r0, TLBIALLH)     /* Flush hypervisor TLB */
+	mcr   CP32(r0, ICIALLU)      /* Flush I-cache */
 	mcr   CP32(r0, BPIALL)       /* Flush branch predictor */
 	dsb                          /* Ensure completion of TLB+BP flush */
 	isb
@@ -288,6 +295,8 @@ paging:
 	teq   r2, #0
 	bne   1b
 	dsb
+	mcr   CP32(r0, DCCMVAC)      /* flush D-Cache */
+	isb
 
 	/* Here, the non-boot CPUs must wait again -- they're now running on
 	 * the boot CPU's pagetables so it's safe for the boot CPU to
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index d0cd2c9..37e49c8 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -211,6 +211,7 @@ void __init setup_pagetables(unsigned long boot_phys_offset, paddr_t xen_paddr)
     unsigned long dest_va;
     lpae_t pte, *p;
     int i;
+    unsigned long cacheline_size = READ_CP32(CCSIDR);
 
     /* Map the destination in the boot misc area. */
     dest_va = BOOT_MISC_VIRT_START;
@@ -244,10 +245,18 @@ void __init setup_pagetables(unsigned long boot_phys_offset, paddr_t xen_paddr)
 
     /* Change pagetables to the copy in the relocated Xen */
     boot_httbr = (unsigned long) xen_pgtable + phys_offset;
+    flush_xen_dcache_va((unsigned long)&boot_httbr);
+    for ( i = 0; i < _end - _start; i += cacheline_size )
+        flush_xen_dcache_va(dest_va + i);
+    flush_xen_text_tlb();
+
     asm volatile (
+        "dsb;"                        /* Ensure visibility of HTTBR update */
         STORE_CP64(0, HTTBR)          /* Change translation base */
         "dsb;"                        /* Ensure visibility of HTTBR update */
+        "isb;"
         STORE_CP32(0, TLBIALLH)       /* Flush hypervisor TLB */
+        STORE_CP32(0, ICIALLU)        /* Flush I-cache */
         STORE_CP32(0, BPIALL)         /* Flush branch predictor */
         "dsb;"                        /* Ensure completion of TLB+BP flush */
         "isb;"
@@ -292,10 +301,11 @@ void __init setup_pagetables(unsigned long boot_phys_offset, paddr_t xen_paddr)
     pte.pt.table = 1;
     write_pte(xen_second + second_linear_offset(XEN_VIRT_START), pte);
     /* Have changed a mapping used for .text. Flush everything for safety. */
-    flush_xen_text_tlb();
 
     /* From now on, no mapping may be both writable and executable. */
     WRITE_CP32(READ_CP32(HSCTLR) | SCTLR_WXN, HSCTLR);
+    isb();
+    flush_xen_text_tlb();
 }
 
 /* MMU setup for secondary CPUS (which already have paging enabled) */
@@ -303,6 +313,8 @@ void __cpuinit mmu_init_secondary_cpu(void)
 {
     /* From now on, no mapping may be both writable and executable. */
     WRITE_CP32(READ_CP32(HSCTLR) | SCTLR_WXN, HSCTLR);
+    isb();
+    flush_xen_text_tlb();
 }
 
 /* Create Xen's mappings of memory.
diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
index c0750c0..767e553 100644
--- a/xen/arch/arm/smpboot.c
+++ b/xen/arch/arm/smpboot.c
@@ -105,6 +105,7 @@ make_cpus_ready(unsigned int max_cpus, unsigned long boot_phys_offset)
         /* Tell the next CPU to get ready */
         /* TODO: handle boards where CPUIDs are not contiguous */
         *gate = i;
+        flush_xen_dcache_va((unsigned long)gate);
         asm volatile("dsb; isb; sev");
         /* And wait for it to respond */
         while ( ready_cpus < i )
@@ -201,6 +202,7 @@ int __cpu_up(unsigned int cpu)
     /* Unblock the CPU.  It should be waiting in the loop in head.S
      * for an event to arrive when smp_up_cpu matches its cpuid. */
     smp_up_cpu = cpu;
+    flush_xen_dcache_va((unsigned long)&smp_up_cpu);
     asm volatile("dsb; isb; sev");
 
     while ( !cpu_online(cpu) )
diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
index 9511c45..7d70d8c 100644
--- a/xen/include/asm-arm/page.h
+++ b/xen/include/asm-arm/page.h
@@ -232,13 +232,26 @@ static inline lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr)
 static inline void write_pte(lpae_t *p, lpae_t pte)
 {
     asm volatile (
+        "dsb;"
         /* Safely write the entry (STRD is atomic on CPUs that support LPAE) */
         "strd %0, %H0, [%1];"
+        "dsb;"
         /* Push this cacheline to the PoC so the rest of the system sees it. */
         STORE_CP32(1, DCCMVAC)
+        "isb;"
         : : "r" (pte.bits), "r" (p) : "memory");
 }
 
+static inline void flush_xen_dcache_va(unsigned long va)
+{
+    register unsigned long r0 asm ("r0") = va;
+    asm volatile (
+        "dsb;"
+        STORE_CP32(0, DCCMVAC)
+        "isb;"
+        : : "r" (r0) : "memory");
+}
+
 /*
  * Flush all hypervisor mappings from the TLB and branch predictor.
  * This is needed after changing Xen code mappings. 
@@ -249,6 +262,7 @@ static inline void flush_xen_text_tlb(void)
     asm volatile (
         "dsb;"                        /* Ensure visibility of PTE writes */
         STORE_CP32(0, TLBIALLH)       /* Flush hypervisor TLB */
+        STORE_CP32(0, ICIALLU)        /* Flush I-cache */
         STORE_CP32(0, BPIALL)         /* Flush branch predictor */
         "dsb;"                        /* Ensure completion of TLB+BP flush */
         "isb;"
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 7/7] xen/arm: get the number of cpus from device tree
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
                   ` (5 preceding siblings ...)
  2012-10-24 15:03 ` [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate Stefano Stabellini
@ 2012-10-24 15:03 ` Stefano Stabellini
  2012-10-25 10:02   ` Ian Campbell
  2012-10-24 16:02 ` [PATCH 0/7] xen/arm: run on real hardware Tim Deegan
  7 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:03 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

The system might have fewer cpus than the GIC supports.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen/arch/arm/gic.c            |    4 +---
 xen/arch/arm/gic.h            |    2 +-
 xen/arch/arm/setup.c          |    3 ++-
 xen/common/device_tree.c      |   20 ++++++++++++++++++++
 xen/include/xen/device_tree.h |    1 +
 5 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
index 5f06e08..0c6fab9 100644
--- a/xen/arch/arm/gic.c
+++ b/xen/arch/arm/gic.c
@@ -304,7 +304,7 @@ static void __cpuinit gic_hyp_disable(void)
 }
 
 /* Set up the GIC */
-int __init gic_init(void)
+void __init gic_init(void)
 {
     /* XXX FIXME get this from devicetree */
     gic.dbase = GIC_BASE_ADDRESS + GIC_DR_OFFSET;
@@ -328,8 +328,6 @@ int __init gic_init(void)
     gic.lr_mask = 0ULL;
 
     spin_unlock(&gic.lock);
-
-    return gic.cpus;
 }
 
 /* Set up the per-CPU parts of the GIC for a secondary CPU */
diff --git a/xen/arch/arm/gic.h b/xen/arch/arm/gic.h
index b8f9f201..beeaa46 100644
--- a/xen/arch/arm/gic.h
+++ b/xen/arch/arm/gic.h
@@ -146,7 +146,7 @@ extern int gic_route_irq_to_guest(struct domain *d, unsigned int irq,
 /* Accept an interrupt from the GIC and dispatch its handler */
 extern void gic_interrupt(struct cpu_user_regs *regs, int is_fiq);
 /* Bring up the interrupt controller, and report # cpus attached */
-extern int gic_init(void);
+extern void gic_init(void);
 /* Bring up a secondary CPU's per-CPU GIC interface */
 extern void gic_init_secondary_cpu(void);
 /* Take down a CPU's per-CPU GIC interface */
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index 6fbcb81..26df902 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -189,6 +189,7 @@ void __init start_xen(unsigned long boot_phys_offset,
         + (atag_paddr & ((1 << SECOND_SHIFT) - 1));
     fdt_size = device_tree_early_init(fdt);
 
+    cpus = device_tree_cpus(fdt);
     cmdline_parse(device_tree_bootargs(fdt));
 
     setup_pagetables(boot_phys_offset, get_xen_paddr());
@@ -199,7 +200,7 @@ void __init start_xen(unsigned long boot_phys_offset,
     console_init_preirq();
 #endif
 
-    cpus = gic_init();
+    gic_init();
     make_cpus_ready(cpus, boot_phys_offset);
 
     percpu_init_areas();
diff --git a/xen/common/device_tree.c b/xen/common/device_tree.c
index 3d1f0f4..3ae0f4d 100644
--- a/xen/common/device_tree.c
+++ b/xen/common/device_tree.c
@@ -153,6 +153,26 @@ const char *device_tree_bootargs(const void *fdt)
     return prop->data;
 }
 
+int device_tree_cpus(const void *fdt)
+{
+    int node, depth;
+    int cpus = 0;
+
+    node = fdt_path_offset(fdt, "/cpus");
+    if ( node < 0 )
+        return -1;
+    do {
+        node = fdt_next_node(fdt, node, &depth);
+        if ( node < 0 )
+            break;
+        if ( strncmp(fdt_get_name(fdt, node, NULL), "cpu", 3) )
+            break;
+        cpus++;
+    } while ( depth > 0 );
+
+    return cpus;
+}
+
 static int dump_node(const void *fdt, int node, const char *name, int depth,
                      u32 address_cells, u32 size_cells, void *data)
 {
diff --git a/xen/include/xen/device_tree.h b/xen/include/xen/device_tree.h
index 4d010c0..46bc0f8 100644
--- a/xen/include/xen/device_tree.h
+++ b/xen/include/xen/device_tree.h
@@ -49,6 +49,7 @@ bool_t device_tree_node_matches(const void *fdt, int node, const char *match);
 int device_tree_for_each_node(const void *fdt,
                               device_tree_node_func func, void *data);
 const char *device_tree_bootargs(const void *fdt);
+int device_tree_cpus(const void *fdt);
 void device_tree_dump(const void *fdt);
 
 #endif
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 2/7] xen/arm: setup the fixmap in head.S
  2012-10-24 15:03 ` [PATCH 2/7] xen/arm: setup the fixmap in head.S Stefano Stabellini
@ 2012-10-24 15:27   ` Tim Deegan
  2012-10-24 15:37     ` Stefano Stabellini
  2012-10-25  9:33   ` Ian Campbell
  1 sibling, 1 reply; 61+ messages in thread
From: Tim Deegan @ 2012-10-24 15:27 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian.Campbell

At 16:03 +0100 on 24 Oct (1351094622), Stefano Stabellini wrote:
> Setup the fixmap mapping directly in head.S rather than having a
> temporary mapping only to re-do it later in C.
> 
> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

I'm generally against doing anything in asm that could be in C, as
you know, but this actually looks OK.  Nits below.

> @@ -183,6 +184,16 @@ skip_bss:
>  	teq   r12, #0
>  	bne   pt_ready
>  	
> +	/* console fixmap */
> +	ldr   r1, =xen_fixmap
> +	add   r1, r1, r10            /* r1 := paddr (xen_fixmap) */
> +	mov   r3, #0
> +	lsr   r2, r11, #12
> +	lsl   r2, r2, #12            /* 4K aligned paddr of UART */
> +	orr   r2, r2, #PT_UPPER(DEV_L3)
> +	orr   r2, r2, #PT_LOWER(DEV_L3) /* r2:r3 := 4K dev map including UART */
> +	strd  r2, r3, [r1, #0]       /* Map it in the first fixmap's slot */

Can you use #(FIXMAP_CONSOLE*8) here rather than hard-coding the 0?

> +	
>  	/* Build the baseline idle pagetable's first-level entries */
>  	ldr   r1, =xen_second
>  	add   r1, r1, r10            /* r1 := paddr (xen_second) */
> @@ -205,12 +216,13 @@ skip_bss:
>  	ldr   r4, =start
>  	lsr   r4, #18                /* Slot for vaddr(start) */
>  	strd  r2, r3, [r1, r4]       /* Map Xen there too */
> +
>  #ifdef EARLY_UART_ADDRESS
> -	ldr   r3, =(1<<(54-32))      /* NS for device mapping */
> -	lsr   r2, r11, #21
> -	lsl   r2, r2, #21            /* 2MB-aligned paddr of UART */
> -	orr   r2, r2, #PT_UPPER(DEV)
> -	orr   r2, r2, #PT_LOWER(DEV) /* r2:r3 := 2MB dev map including UART */
> +	/* xen_fixmap pagetable */
> +	ldr r2, =xen_fixmap
> +	add r2, r2, r10

Please keep the operands aligned with the code above and below, and maybe
add a comment that r2 := paddr (xen_fixmap).

> +	orr   r2, r2, #PT_UPPER(PT)
> +	orr   r2, r2, #PT_LOWER(PT) /* r2:r3 := paddr (xen_fixmap) */

r2:r3 is a pagetable mapping here, not a paddr.  Also the comment's not
aligned with the ones above and below.

>  	add   r4, r4, #8
>  	strd  r2, r3, [r1, r4]       /* Map it in the fixmap's slot */
>  #else
> @@ -236,13 +248,9 @@ pt_ready:
>  	mov   pc, r1                 /* Get a proper vaddr into PC */
>  paging:
>  
> +	
>  #ifdef EARLY_UART_ADDRESS
> -	/* Recover the UART address in the new address space. */
> -	lsl   r11, #11
> -	lsr   r11, #11               /* UART base's offset from 2MB base */
> -	adr   r0, start
> -	add   r0, r0, #0x200000      /* vaddr of the fixmap's 2MB slot */
> -	add   r11, r11, r0           /* r11 := vaddr (UART base address) */
> +	ldr r11, =FIXMAP_ADDR(FIXMAP_CONSOLE)

Please leave a comment in to say what this is doing.  Also, the
alignment is wrong again.  Sorry to be picky about this but I want to
keep head.S readable.

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 2/7] xen/arm: setup the fixmap in head.S
  2012-10-24 15:27   ` Tim Deegan
@ 2012-10-24 15:37     ` Stefano Stabellini
  0 siblings, 0 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:37 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Wed, 24 Oct 2012, Tim Deegan wrote:
> At 16:03 +0100 on 24 Oct (1351094622), Stefano Stabellini wrote:
> > Setup the fixmap mapping directly in head.S rather than having a
> > temporary mapping only to re-do it later in C.
> > 
> > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> 
> I'm generally against doing anything in asm that could be in C, as
> you know, but this actually looks OK.  Nits below.

Thanks for the quick review!


> > @@ -183,6 +184,16 @@ skip_bss:
> >  	teq   r12, #0
> >  	bne   pt_ready
> >  	
> > +	/* console fixmap */
> > +	ldr   r1, =xen_fixmap
> > +	add   r1, r1, r10            /* r1 := paddr (xen_fixmap) */
> > +	mov   r3, #0
> > +	lsr   r2, r11, #12
> > +	lsl   r2, r2, #12            /* 4K aligned paddr of UART */
> > +	orr   r2, r2, #PT_UPPER(DEV_L3)
> > +	orr   r2, r2, #PT_LOWER(DEV_L3) /* r2:r3 := 4K dev map including UART */
> > +	strd  r2, r3, [r1, #0]       /* Map it in the first fixmap's slot */
> 
> Can you use #(FIXMAP_CONSOLE*8) here rather than hard-coding the 0?

Yep


> > +	
> >  	/* Build the baseline idle pagetable's first-level entries */
> >  	ldr   r1, =xen_second
> >  	add   r1, r1, r10            /* r1 := paddr (xen_second) */
> > @@ -205,12 +216,13 @@ skip_bss:
> >  	ldr   r4, =start
> >  	lsr   r4, #18                /* Slot for vaddr(start) */
> >  	strd  r2, r3, [r1, r4]       /* Map Xen there too */
> > +
> >  #ifdef EARLY_UART_ADDRESS
> > -	ldr   r3, =(1<<(54-32))      /* NS for device mapping */
> > -	lsr   r2, r11, #21
> > -	lsl   r2, r2, #21            /* 2MB-aligned paddr of UART */
> > -	orr   r2, r2, #PT_UPPER(DEV)
> > -	orr   r2, r2, #PT_LOWER(DEV) /* r2:r3 := 2MB dev map including UART */
> > +	/* xen_fixmap pagetable */
> > +	ldr r2, =xen_fixmap
> > +	add r2, r2, r10
> 
> Please keep the operands aligned with the code above and below, and maybe
> add a comment that r2 := paddr (xen_fixmap).

good point


> > +	orr   r2, r2, #PT_UPPER(PT)
> > +	orr   r2, r2, #PT_LOWER(PT) /* r2:r3 := paddr (xen_fixmap) */
> 
> r2:r3 is a pagetable mapping here, not a paddr.  Also the comment's not
> aligned with the ones above and below.

I'll fix


> >  	add   r4, r4, #8
> >  	strd  r2, r3, [r1, r4]       /* Map it in the fixmap's slot */
> >  #else
> > @@ -236,13 +248,9 @@ pt_ready:
> >  	mov   pc, r1                 /* Get a proper vaddr into PC */
> >  paging:
> >  
> > +	
> >  #ifdef EARLY_UART_ADDRESS
> > -	/* Recover the UART address in the new address space. */
> > -	lsl   r11, #11
> > -	lsr   r11, #11               /* UART base's offset from 2MB base */
> > -	adr   r0, start
> > -	add   r0, r0, #0x200000      /* vaddr of the fixmap's 2MB slot */
> > -	add   r11, r11, r0           /* r11 := vaddr (UART base address) */
> > +	ldr r11, =FIXMAP_ADDR(FIXMAP_CONSOLE)
> 
> Please leave a comment in to say what this is doing.  Also, the
> alignment is wrong again.  Sorry to be picky about this but I want to
> keep head.S readable.

You have my complete support on code style issues and comments in
assembly code!

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-24 15:03 ` [PATCH 5/7] xen/arm: wake up secondary cpus Stefano Stabellini
@ 2012-10-24 15:38   ` Tim Deegan
  2012-10-24 15:59     ` Stefano Stabellini
  2012-10-25  9:59   ` Ian Campbell
  1 sibling, 1 reply; 61+ messages in thread
From: Tim Deegan @ 2012-10-24 15:38 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian.Campbell

At 16:03 +0100 on 24 Oct (1351094625), Stefano Stabellini wrote:
> Secondary cpus are held by the firmware until we send an IPI to them.
> 
> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> ---
>  xen/arch/arm/head.S        |    8 ++++++--
>  xen/arch/arm/mode_switch.S |   31 +++++++++++++++++++++++++++++++
>  2 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> index c784f4d..39c4774 100644
> --- a/xen/arch/arm/head.S
> +++ b/xen/arch/arm/head.S
> @@ -79,12 +79,12 @@ start:
>  	beq   boot_cpu               /* If we're CPU 0, boot now */
>  
>  	/* Non-boot CPUs wait here to be woken up one at a time. */
> -1:	wfe
> -	dsb
> +1:	dsb
>  	ldr   r0, =smp_up_cpu        /* VA of gate */
>  	add   r0, r0, r10            /* PA of gate */
>  	ldr   r1, [r0]               /* Which CPU is being booted? */
>  	teq   r1, r12                /* Is it us? */
> +	wfene

Shouldn't that be wf_i_ne?

>  	bne   1b
>  
>  boot_cpu:
> @@ -98,6 +98,10 @@ boot_cpu:
>  	PRINT(" booting -\r\n")
>  #endif
>  
> +	/* Wake up secondary cpus */
> +	teq   r12, #0
> +	bleq  kick_cpus
> +
>  	/* Check that this CPU has Hyp mode */
>  	mrc   CP32(r0, ID_PFR1)
>  	and   r0, r0, #0xf000        /* Bits 12-15 define virt extensions */
> diff --git a/xen/arch/arm/mode_switch.S b/xen/arch/arm/mode_switch.S
> index 83a682b..39d80e8 100644
> --- a/xen/arch/arm/mode_switch.S
> +++ b/xen/arch/arm/mode_switch.S
> @@ -21,6 +21,37 @@
>  #include <asm/page.h>
>  #include <asm/asm_defns.h>
>  
> +/* XXX: Versatile Express specific code */
> +/* wake up secondary cpus */
> +.globl kick_cpus
> +kick_cpus:
> +    /* write start paddr to v2m sysreg FLAGSSET register */
> +	ldr   r0, =0x1c010000 /* base V2M sysreg MMIO address */
> +	add   r1, r0, #0x34   /* SYS_FLAGSCLR register */
> +	dsb
> +	mov   r2, #0xffffffff
> +	str   r2, [r1]

Why not str r2, [r0, #0x34] and save the explicit add (and similarly below)?

> +	dsb
> +	add   r1, r0, #0x30   /* SYS_FLAGSSET register */
> +	ldr   r2, =start
> +	add   r2, r2, r10
> +	str   r2, [r1]
> +	dsb
> +	/* send an interrupt */
> +	ldr   r0, =0x2c001000 /* base GICD MMIO address */
> +	mov   r1, r0
> +	mov   r2, #0x1        /* GICD_CTLR */
> +	str   r2, [r1]        /* enable distributor */
> +	add   r1, r0, #0xf00  /* GICD_SGIR */

These GIGD_* addresses and constants are already defined in config.h and
gic.h; can you reuse those definitions?

Tim.

> +	mov   r2, #0xfe0000
> +	str   r2, [r1]        /* send IPI to everybody */
> +	dsb
> +	mov   r1, r0
> +	mov   r2, #0x0        /* GICD_CTLR */
> +	str   r2, [r1]        /* disable distributor */
> +	mov   pc, lr
> +
> +
>  /* Get up a CPU into Hyp mode.  Clobbers r0-r3.
>   *
>   * Expects r12 == CPU number
> -- 
> 1.7.2.5
> 

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-24 15:03 ` [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate Stefano Stabellini
@ 2012-10-24 15:59   ` Tim Deegan
  2012-10-24 16:05     ` Ian Campbell
  2012-10-24 17:35     ` Stefano Stabellini
  0 siblings, 2 replies; 61+ messages in thread
From: Tim Deegan @ 2012-10-24 15:59 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian.Campbell

At 16:03 +0100 on 24 Oct (1351094626), Stefano Stabellini wrote:
> - invalidate tlb after setting WXN;
> - flush D-cache and I-cache after relocation;
> - flush D-cache after writing to smp_up_cpu;
> - flush TLB before changing HTTBR;
> - flush I-cache after changing HTTBR;
> - flush I-cache and branch predictor after writing Xen text ptes.
> 
> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

> @@ -244,10 +245,18 @@ void __init setup_pagetables(unsigned long boot_phys_offset, paddr_t xen_paddr)
>  
>      /* Change pagetables to the copy in the relocated Xen */
>      boot_httbr = (unsigned long) xen_pgtable + phys_offset;
> +    flush_xen_dcache_va((unsigned long)&boot_httbr);
> +    for ( i = 0; i < _end - _start; i += cacheline_size )
> +        flush_xen_dcache_va(dest_va + i);
> +    flush_xen_text_tlb();
> +
>      asm volatile (
> +        "dsb;"                        /* Ensure visibility of HTTBR update */

That comment should be changed -- this dsb is to make sure all the PT
changes are completed, right?

>          STORE_CP64(0, HTTBR)          /* Change translation base */
>          "dsb;"                        /* Ensure visibility of HTTBR update */
> +        "isb;"
>          STORE_CP32(0, TLBIALLH)       /* Flush hypervisor TLB */
> +        STORE_CP32(0, ICIALLU)        /* Flush I-cache */
>          STORE_CP32(0, BPIALL)         /* Flush branch predictor */
>          "dsb;"                        /* Ensure completion of TLB+BP flush */
>          "isb;"

> diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
> index 9511c45..7d70d8c 100644
> --- a/xen/include/asm-arm/page.h
> +++ b/xen/include/asm-arm/page.h
> @@ -232,13 +232,26 @@ static inline lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr)
>  static inline void write_pte(lpae_t *p, lpae_t pte)
>  {
>      asm volatile (
> +        "dsb;"

I guess this is to make sure all writes that used the old mapping have
completed?  Can you add a comment?

>          /* Safely write the entry (STRD is atomic on CPUs that support LPAE) */
>          "strd %0, %H0, [%1];"
> +        "dsb;"
>          /* Push this cacheline to the PoC so the rest of the system sees it. */
>          STORE_CP32(1, DCCMVAC)
> +        "isb;"

This is for code modifications, right?  I think we can drop it, and have
all the paths that modify text mappings do explicit isb()s there -- the
vast majority of PTE writes don't need it. 

>          : : "r" (pte.bits), "r" (p) : "memory");
>  }
>  
> +static inline void flush_xen_dcache_va(unsigned long va)

Three of the four users of this function cast their arguments from
pointer types - maybe it should take a void * instead?.

> +{
> +    register unsigned long r0 asm ("r0") = va;

I don't think this is necessary - why not just pass va directly to the
inline asm?  We don't care what register it's in (and if we did I'm not
convinced this would guarantee it was r0).

> +    asm volatile (
> +        "dsb;"
> +        STORE_CP32(0, DCCMVAC)
> +        "isb;"
> +        : : "r" (r0) : "memory");

Does this need a 'memory' clobber?  Can we get away with just saying it
consumes *va as an input?  All we need to be sure of is that the
particular thing we're flushing has been written out; no need to stop
any other optimizations.

I guess it might need to be re-cast as a macro so the compiler knows how
big *va is?

Tim.

> +}
> +
>  /*
>   * Flush all hypervisor mappings from the TLB and branch predictor.
>   * This is needed after changing Xen code mappings. 
> @@ -249,6 +262,7 @@ static inline void flush_xen_text_tlb(void)
>      asm volatile (
>          "dsb;"                        /* Ensure visibility of PTE writes */
>          STORE_CP32(0, TLBIALLH)       /* Flush hypervisor TLB */
> +        STORE_CP32(0, ICIALLU)        /* Flush I-cache */
>          STORE_CP32(0, BPIALL)         /* Flush branch predictor */
>          "dsb;"                        /* Ensure completion of TLB+BP flush */
>          "isb;"
> -- 
> 1.7.2.5
> 

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-24 15:38   ` Tim Deegan
@ 2012-10-24 15:59     ` Stefano Stabellini
  2012-10-24 16:05       ` Tim Deegan
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 15:59 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Wed, 24 Oct 2012, Tim Deegan wrote:
> At 16:03 +0100 on 24 Oct (1351094625), Stefano Stabellini wrote:
> > Secondary cpus are held by the firmware until we send an IPI to them.
> > 
> > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> > ---
> >  xen/arch/arm/head.S        |    8 ++++++--
> >  xen/arch/arm/mode_switch.S |   31 +++++++++++++++++++++++++++++++
> >  2 files changed, 37 insertions(+), 2 deletions(-)
> > 
> > diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> > index c784f4d..39c4774 100644
> > --- a/xen/arch/arm/head.S
> > +++ b/xen/arch/arm/head.S
> > @@ -79,12 +79,12 @@ start:
> >  	beq   boot_cpu               /* If we're CPU 0, boot now */
> >  
> >  	/* Non-boot CPUs wait here to be woken up one at a time. */
> > -1:	wfe
> > -	dsb
> > +1:	dsb
> >  	ldr   r0, =smp_up_cpu        /* VA of gate */
> >  	add   r0, r0, r10            /* PA of gate */
> >  	ldr   r1, [r0]               /* Which CPU is being booted? */
> >  	teq   r1, r12                /* Is it us? */
> > +	wfene
> 
> Shouldn't that be wf_i_ne?

Nope. We need wfe here, in fact the corresponding code is
xen/arch/arm/smpboot.c:make_cpus_ready that uses sev to wake the other
cpus up. The code running wfi on secondary cpus is actually in the
firmware.


> >  	bne   1b
> >  
> >  boot_cpu:
> > @@ -98,6 +98,10 @@ boot_cpu:
> >  	PRINT(" booting -\r\n")
> >  #endif
> >  
> > +	/* Wake up secondary cpus */
> > +	teq   r12, #0
> > +	bleq  kick_cpus
> > +
> >  	/* Check that this CPU has Hyp mode */
> >  	mrc   CP32(r0, ID_PFR1)
> >  	and   r0, r0, #0xf000        /* Bits 12-15 define virt extensions */
> > diff --git a/xen/arch/arm/mode_switch.S b/xen/arch/arm/mode_switch.S
> > index 83a682b..39d80e8 100644
> > --- a/xen/arch/arm/mode_switch.S
> > +++ b/xen/arch/arm/mode_switch.S
> > @@ -21,6 +21,37 @@
> >  #include <asm/page.h>
> >  #include <asm/asm_defns.h>
> >  
> > +/* XXX: Versatile Express specific code */
> > +/* wake up secondary cpus */
> > +.globl kick_cpus
> > +kick_cpus:
> > +    /* write start paddr to v2m sysreg FLAGSSET register */
> > +	ldr   r0, =0x1c010000 /* base V2M sysreg MMIO address */
> > +	add   r1, r0, #0x34   /* SYS_FLAGSCLR register */
> > +	dsb
> > +	mov   r2, #0xffffffff
> > +	str   r2, [r1]
> 
> Why not str r2, [r0, #0x34] and save the explicit add (and similarly below)?

Good idea


> > +	dsb
> > +	add   r1, r0, #0x30   /* SYS_FLAGSSET register */
> > +	ldr   r2, =start
> > +	add   r2, r2, r10
> > +	str   r2, [r1]
> > +	dsb
> > +	/* send an interrupt */
> > +	ldr   r0, =0x2c001000 /* base GICD MMIO address */
> > +	mov   r1, r0
> > +	mov   r2, #0x1        /* GICD_CTLR */
> > +	str   r2, [r1]        /* enable distributor */
> > +	add   r1, r0, #0xf00  /* GICD_SGIR */
> 
> These GIGD_* addresses and constants are already defined in config.h and
> gic.h; can you reuse those definitions?

Yep

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 0/7] xen/arm: run on real hardware
  2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
                   ` (6 preceding siblings ...)
  2012-10-24 15:03 ` [PATCH 7/7] xen/arm: get the number of cpus from device tree Stefano Stabellini
@ 2012-10-24 16:02 ` Tim Deegan
  7 siblings, 0 replies; 61+ messages in thread
From: Tim Deegan @ 2012-10-24 16:02 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 16:03 +0100 on 24 Oct (1351094597), Stefano Stabellini wrote:
> Hi all,
> this is a collection of fixes that I wrote trying to run Xen on a real
> Versatile Express Cortex A15 machine.

Congratulations on getting it running on real hardware!  That's great
progress.

I've replied to 2, 5 and 6.  For 1, 3, 4 and 7,
Acked-by: Tim Deegan <tim@xen.org>

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-24 15:59     ` Stefano Stabellini
@ 2012-10-24 16:05       ` Tim Deegan
  0 siblings, 0 replies; 61+ messages in thread
From: Tim Deegan @ 2012-10-24 16:05 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 16:59 +0100 on 24 Oct (1351097986), Stefano Stabellini wrote:
> On Wed, 24 Oct 2012, Tim Deegan wrote:
> > At 16:03 +0100 on 24 Oct (1351094625), Stefano Stabellini wrote:
> > > Secondary cpus are held by the firmware until we send an IPI to them.
> > > 
> > > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> > > ---
> > >  xen/arch/arm/head.S        |    8 ++++++--
> > >  xen/arch/arm/mode_switch.S |   31 +++++++++++++++++++++++++++++++
> > >  2 files changed, 37 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> > > index c784f4d..39c4774 100644
> > > --- a/xen/arch/arm/head.S
> > > +++ b/xen/arch/arm/head.S
> > > @@ -79,12 +79,12 @@ start:
> > >  	beq   boot_cpu               /* If we're CPU 0, boot now */
> > >  
> > >  	/* Non-boot CPUs wait here to be woken up one at a time. */
> > > -1:	wfe
> > > -	dsb
> > > +1:	dsb
> > >  	ldr   r0, =smp_up_cpu        /* VA of gate */
> > >  	add   r0, r0, r10            /* PA of gate */
> > >  	ldr   r1, [r0]               /* Which CPU is being booted? */
> > >  	teq   r1, r12                /* Is it us? */
> > > +	wfene
> > 
> > Shouldn't that be wf_i_ne?
> 
> Nope. We need wfe here, in fact the corresponding code is
> xen/arch/arm/smpboot.c:make_cpus_ready that uses sev to wake the other
> cpus up. The code running wfi on secondary cpus is actually in the
> firmware.

Ah, I see - thanks.

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-24 15:59   ` Tim Deegan
@ 2012-10-24 16:05     ` Ian Campbell
  2012-10-24 16:17       ` Tim Deegan
  2012-10-24 17:35     ` Stefano Stabellini
  1 sibling, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-24 16:05 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Stefano Stabellini

On Wed, 2012-10-24 at 16:59 +0100, Tim Deegan wrote:
> > +    asm volatile (
> > +        "dsb;"
> > +        STORE_CP32(0, DCCMVAC)
> > +        "isb;"
> > +        : : "r" (r0) : "memory");
> 
> Does this need a 'memory' clobber?  Can we get away with just saying it
> consumes *va as an input?  All we need to be sure of is that the
> particular thing we're flushing has been written out; no need to stop
> any other optimizations.
> 
> I guess it might need to be re-cast as a macro so the compiler knows how
> big *va is?

Does it matter that this flushes the cache line containing va, which
might be larger than whatever type va is?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-24 16:05     ` Ian Campbell
@ 2012-10-24 16:17       ` Tim Deegan
  0 siblings, 0 replies; 61+ messages in thread
From: Tim Deegan @ 2012-10-24 16:17 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Stefano Stabellini

At 17:05 +0100 on 24 Oct (1351098349), Ian Campbell wrote:
> On Wed, 2012-10-24 at 16:59 +0100, Tim Deegan wrote:
> > > +    asm volatile (
> > > +        "dsb;"
> > > +        STORE_CP32(0, DCCMVAC)
> > > +        "isb;"
> > > +        : : "r" (r0) : "memory");
> > 
> > Does this need a 'memory' clobber?  Can we get away with just saying it
> > consumes *va as an input?  All we need to be sure of is that the
> > particular thing we're flushing has been written out; no need to stop
> > any other optimizations.
> > 
> > I guess it might need to be re-cast as a macro so the compiler knows how
> > big *va is?
> 
> Does it matter that this flushes the cache line containing va, which
> might be larger than whatever type va is?

That depends why you're flushing. :)  From this CPU's PoV the flush
should have no effect at all, so the things to worry about are:

 - the write of the thing we're trying to flush gets moved by the
   compiler so it's after the flush.   That case is handled by using an 
   input memory operand of the right size.
 - something else on the cacheline that's written after the flush gets
   hoisted and written before.  It seems like anything that's relying on
   that (?!) is on very thin ice already and ought to be using (at least)
   explicit memory barriers of its own.

Another interesting case is where we're flushing a large area by
cachelines (like the new loop after relocation), so effectively we don't
know the size of the input operand (a.k.a. 1 cacheline) at compile time.
But there I think a single explicit barrier before the loop isn't too much
to ask.  We could break that and the loop out into a separate function.

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-24 15:59   ` Tim Deegan
  2012-10-24 16:05     ` Ian Campbell
@ 2012-10-24 17:35     ` Stefano Stabellini
  2012-10-26  9:01       ` Tim Deegan
  1 sibling, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-24 17:35 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Wed, 24 Oct 2012, Tim Deegan wrote:
> At 16:03 +0100 on 24 Oct (1351094626), Stefano Stabellini wrote:
> > - invalidate tlb after setting WXN;
> > - flush D-cache and I-cache after relocation;
> > - flush D-cache after writing to smp_up_cpu;
> > - flush TLB before changing HTTBR;
> > - flush I-cache after changing HTTBR;
> > - flush I-cache and branch predictor after writing Xen text ptes.
> > 
> > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> 
> > @@ -244,10 +245,18 @@ void __init setup_pagetables(unsigned long boot_phys_offset, paddr_t xen_paddr)
> >  
> >      /* Change pagetables to the copy in the relocated Xen */
> >      boot_httbr = (unsigned long) xen_pgtable + phys_offset;
> > +    flush_xen_dcache_va((unsigned long)&boot_httbr);
> > +    for ( i = 0; i < _end - _start; i += cacheline_size )
> > +        flush_xen_dcache_va(dest_va + i);
> > +    flush_xen_text_tlb();
> > +
> >      asm volatile (
> > +        "dsb;"                        /* Ensure visibility of HTTBR update */
> 
> That comment should be changed -- this dsb is to make sure all the PT
> changes are completed, right?

The comment is certainly wrong. I actually think that this dsb is
not necessary, thanks to flush_xen_text_tlb.


> >          STORE_CP64(0, HTTBR)          /* Change translation base */
> >          "dsb;"                        /* Ensure visibility of HTTBR update */
> > +        "isb;"
> >          STORE_CP32(0, TLBIALLH)       /* Flush hypervisor TLB */
> > +        STORE_CP32(0, ICIALLU)        /* Flush I-cache */
> >          STORE_CP32(0, BPIALL)         /* Flush branch predictor */
> >          "dsb;"                        /* Ensure completion of TLB+BP flush */
> >          "isb;"
> 
> > diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
> > index 9511c45..7d70d8c 100644
> > --- a/xen/include/asm-arm/page.h
> > +++ b/xen/include/asm-arm/page.h
> > @@ -232,13 +232,26 @@ static inline lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr)
> >  static inline void write_pte(lpae_t *p, lpae_t pte)
> >  {
> >      asm volatile (
> > +        "dsb;"
> 
> I guess this is to make sure all writes that used the old mapping have
> completed?  Can you add a comment?

Yes, that is exactly the reason. I'll add a comment.


> >          /* Safely write the entry (STRD is atomic on CPUs that support LPAE) */
> >          "strd %0, %H0, [%1];"
> > +        "dsb;"
> >          /* Push this cacheline to the PoC so the rest of the system sees it. */
> >          STORE_CP32(1, DCCMVAC)
> > +        "isb;"
> 
> This is for code modifications, right?  I think we can drop it, and have
> all the paths that modify text mappings do explicit isb()s there -- the
> vast majority of PTE writes don't need it. 

Yes. Thinking twice about it we should have a dsb there instead, for data
mappings too, otherwise we can't be sure that the DCCMVAC is completed
before returning.


> >          : : "r" (pte.bits), "r" (p) : "memory");
> >  }
> >  
> > +static inline void flush_xen_dcache_va(unsigned long va)
> 
> Three of the four users of this function cast their arguments from
> pointer types - maybe it should take a void * instead?.

I can do that


> > +{
> > +    register unsigned long r0 asm ("r0") = va;
> 
> I don't think this is necessary - why not just pass va directly to the
> inline asm?  We don't care what register it's in (and if we did I'm not
> convinced this would guarantee it was r0).
> 
> > +    asm volatile (
> > +        "dsb;"
> > +        STORE_CP32(0, DCCMVAC)
> > +        "isb;"
> > +        : : "r" (r0) : "memory");
> 
> Does this need a 'memory' clobber?  Can we get away with just saying it
> consumes *va as an input?  All we need to be sure of is that the
> particular thing we're flushing has been written out; no need to stop
> any other optimizations.

you are right on both points


> I guess it might need to be re-cast as a macro so the compiler knows how
> big *va is?

I don't think it is necessary, after all the size of a register has to
be the same of a virtual address

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-24 15:03 ` [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq Stefano Stabellini
@ 2012-10-25  9:27   ` Ian Campbell
  2012-10-26 16:33     ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-25  9:27 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> Each rank holds 32 irqs, so we should divide by 32 rather than by 4.

I think this is wrong, because idx isn't used in the way your reasoning
implies, when we use it we assume 8 bits-per-interrupt in the register
not 1.

The accessor function vgix_irq_rank is couched in terms of
GICD_<FOO><n>, which is convenient where we are emulating register
accesses but is very confusing in this one function where we aren't
thinking in terms of registers!

vgic_irq_rank(v, 8, idx) is saying "find me the rank containing
GICD_<FOO><idx> where FOO has 8 bits per interrupt". Since 4 lots of 8
bits goes into 32 we therefore need to divide by 4. This makes sense if
you consider that FOO here is PRIORITY and that GICD_PRIORITY0 controls
irq 0..3, GICD_PRIORITY1 irq 4..7 etc.

With idx = irq >> 2 you get:

IRQ00: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 0
IRQ01: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 1
IRQ02: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 2
IRQ03: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 3
IRQ04: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 0
IRQ05: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 1
[...]
IRQ30: idx = 07 REG_RANK_NR(08, 07) = 0, REG_RANK_INDEX(08, 07) = 7, BYTE = 2
IRQ31: idx = 07 REG_RANK_NR(08, 07) = 0, REG_RANK_INDEX(08, 07) = 7, BYTE = 3
IRQ32: idx = 08 REG_RANK_NR(08, 08) = 1, REG_RANK_INDEX(08, 08) = 0, BYTE = 0
IRQ33: idx = 08 REG_RANK_NR(08, 08) = 1, REG_RANK_INDEX(08, 08) = 0, BYTE = 1
[...]
IRQ62: idx = 15 REG_RANK_NR(08, 15) = 1, REG_RANK_INDEX(08, 15) = 7, BYTE = 2
IRQ63: idx = 15 REG_RANK_NR(08, 15) = 1, REG_RANK_INDEX(08, 15) = 7, BYTE = 3
IRQ64: idx = 16 REG_RANK_NR(08, 16) = 2, REG_RANK_INDEX(08, 16) = 0, BYTE = 0
IRQ65: idx = 16 REG_RANK_NR(08, 16) = 2, REG_RANK_INDEX(08, 16) = 0, BYTE = 1
[...]

IOW interrupts 0..31 are in rank 0, 32..63 are rank 1 etc, which is correct

With idx = irq / 32 you instead get:

IRQ00: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 0
IRQ01: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 1
IRQ02: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 2
IRQ03: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 3
IRQ04: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 0
IRQ05: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 1
[...]
IRQ30: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 2
IRQ31: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 3
IRQ32: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 0
IRQ33: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 1
[...]
IRQ62: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 2
IRQ63: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 3
IRQ64: idx = 02 REG_RANK_NR(08, 02) = 0, REG_RANK_INDEX(08, 02) = 2, BYTE = 0
IRQ65: idx = 02 REG_RANK_NR(08, 02) = 0, REG_RANK_INDEX(08, 02) = 2, BYTE = 1
[...]
IRQ255: idx = 07 REG_RANK_NR(08, 07) = 0, REG_RANK_INDEX(08, 07) = 7, BYTE = 3
IRQ256: idx = 08 REG_RANK_NR(08, 08) = 1, REG_RANK_INDEX(08, 08) = 0, BYTE = 0

Here allinterrupts up to 255 are in rank 0, which is wrong.

For idx = irq / 32 you would need to use REG_RANK_NR(1, idx) not (8,
idx). (and you'd need to trivially fix REG_RANK_NR to handle b == 1).

Perhaps the way to think of it is not as "/ 32" but rather as ">> 5". 
Since REG_RANK_NR(8, ) is effectively >> 3 when combined with the
existing >> 2 we get >> 5 overall.

If you change it as you have done then you are doing >> 5 *and* >> 3,
i.e. >> 8 aka "/ 256" -- which explains why interrupts up to 255 end up
in rank 0 with your change.

Perhaps this could be made clearer by renaming vgic_irq_rank to
vgic_register_rank? Then perhaps as a helper:
#define VGIC_IRQ_RANK(v, irq) vgic_register_rank(v, 1, irq/32) for the
case of looking up a rank from an irq rather than a register offset?

Ian.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 2/7] xen/arm: setup the fixmap in head.S
  2012-10-24 15:03 ` [PATCH 2/7] xen/arm: setup the fixmap in head.S Stefano Stabellini
  2012-10-24 15:27   ` Tim Deegan
@ 2012-10-25  9:33   ` Ian Campbell
  2012-10-25 11:00     ` Stefano Stabellini
  1 sibling, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-25  9:33 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> @@ -183,6 +184,16 @@ skip_bss:
>         teq   r12, #0
>         bne   pt_ready
>         
> +       /* console fixmap */
> +       ldr   r1, =xen_fixmap
> +       add   r1, r1, r10            /* r1 := paddr (xen_fixmap) */
> +       mov   r3, #0
> +       lsr   r2, r11, #12
> +       lsl   r2, r2, #12            /* 4K aligned paddr of UART */
> +       orr   r2, r2, #PT_UPPER(DEV_L3)
> +       orr   r2, r2, #PT_LOWER(DEV_L3) /* r2:r3 := 4K dev map including UART */
> +       strd  r2, r3, [r1, #0]       /* Map it in the first fixmap's slot */
> +       
>         /* Build the baseline idle pagetable's first-level entries */
>         ldr   r1, =xen_second
>         add   r1, r1, r10            /* r1 := paddr (xen_second) */
[...]
> @@ -205,12 +216,13 @@ skip_bss:
>  	ldr   r4, =start
>  	lsr   r4, #18                /* Slot for vaddr(start) */
>  	strd  r2, r3, [r1, r4]       /* Map Xen there too */
> +
>  #ifdef EARLY_UART_ADDRESS
> -	ldr   r3, =(1<<(54-32))      /* NS for device mapping */
> -	lsr   r2, r11, #21
> -	lsl   r2, r2, #21            /* 2MB-aligned paddr of UART */
> -	orr   r2, r2, #PT_UPPER(DEV)
> -	orr   r2, r2, #PT_LOWER(DEV) /* r2:r3 := 2MB dev map including UART */
> +	/* xen_fixmap pagetable */
> +	ldr r2, =xen_fixmap
> +	add r2, r2, r10
> +	orr   r2, r2, #PT_UPPER(PT)
> +	orr   r2, r2, #PT_LOWER(PT) /* r2:r3 := paddr (xen_fixmap) */

We should setup the fixmap infrastructure even if !EARLY_UART_ADDRESS,
unless we set it up again later?

As it stands it looks like you setup the generic fixmap mapping iff
EARLY_UART_ADDRESS (second hunk above) but setup the UART mapping within
that unconditionally (first hunk) which is exactly backwards, isn't it ?

Ian.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express
  2012-10-24 15:03 ` [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express Stefano Stabellini
@ 2012-10-25  9:37   ` Ian Campbell
  2012-10-25 10:57     ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-25  9:37 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

Can we get these (or at least the clock_hz) from DTB?

On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> ---
>  xen/drivers/char/pl011.c |    4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/drivers/char/pl011.c b/xen/drivers/char/pl011.c
> index 6af45aa..6ccb73a 100644
> --- a/xen/drivers/char/pl011.c
> +++ b/xen/drivers/char/pl011.c
> @@ -241,8 +241,8 @@ void __init pl011_init(int index, unsigned long register_base_address)
>  
>      uart = &pl011_com[index];
>  
> -    uart->clock_hz  = 7372800;
> -    uart->baud      = 115200;
> +    uart->clock_hz  = 0x16e3600;
> +    uart->baud      = 38400;
>      uart->data_bits = 8;
>      uart->parity    = PARITY_NONE;
>      uart->stop_bits = 1;

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-10-24 15:03 ` [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register Stefano Stabellini
@ 2012-10-25  9:52   ` Ian Campbell
  2012-10-25 11:57     ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-25  9:52 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> From the Cortex A15 manual:
> 
> "Enables the processor to receive instruction cache, BTB, and TLB maintenance
> operations from other processors"
> 
> ...
> 
> "You must set this bit before enabling the caches and MMU, or
> performing any cache and TLB maintenance operations. The only time
> you must clear this bit is during a processor power-down sequence"

Is it considered a bug that the firmware doesn't do this?

> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> ---
>  xen/arch/arm/head.S             |    6 ++++++
>  xen/include/asm-arm/processor.h |   30 ++++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+), 0 deletions(-)
> 
> diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> index 3d01be0..c784f4d 100644
> --- a/xen/arch/arm/head.S
> +++ b/xen/arch/arm/head.S
> @@ -148,6 +148,12 @@ skip_bss:
>  
>  	PRINT("- Setting up control registers -\r\n")
>  	
> +	/* XXX: Cortex A15 specific */

We probably need some sort of early proc info keyed of the processor id
(like Linux) has so we can push this processor specific stuff into the
right places.

> +	/* Set up the SMP bit in ACTLR */
> +	mrc   CP32(r0, ACTLR)
> +	orr   r0, r0, #(ACTLR_SMP) /* enable SMP bit*/
> +	mcr   CP32(r0, ACTLR)

Linux does this IFF it isn't already set for some reason:
#ifdef CONFIG_SMP
        ALT_SMP(mrc     p15, 0, r0, c1, c0, 1)
        ALT_UP(mov      r0, #(1 << 6))          @ fake it for UP
        tst     r0, #(1 << 6)                   @ SMP/nAMP mode enabled?
        orreq   r0, r0, #(1 << 6)               @ Enable SMP/nAMP mode
        orreq   r0, r0, r10                     @ Enable CPU-specific SMP bits
        mcreq   p15, 0, r0, c1, c0, 1
#endif

Might just relate to the "fake it up for UP" thing I suppose.

> +
>  	/* Set up memory attribute type tables */
>  	ldr   r0, =MAIR0VAL
>  	ldr   r1, =MAIR1VAL
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index 3849b23..91a5836 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -34,6 +34,36 @@
>  #define SCTLR_A         (1<<1)
>  #define SCTLR_M         (1<<0)
>  
> +/* ACTLR Auxiliary Control Register */
> +#define ACTLR_SNOOP_DELAYED      (1<<31)

If these are CortexA15 specific then I think they ought to be
ACTLR_CA15_FOO or something. And possibly in a new processor-ca15.h
header instead.

> +#define ACTLR_MAIN_CLOCK         (1<<30)
> +#define ACTLR_NEON_CLOCK         (1<<29)
> +#define ACTLR_NONCACHE           (1<<24)
> +#define ACTLR_INORDER_REQ        (1<<23)
> +#define ACTLR_INORDER_LOAD       (1<<22)
> +#define ACTLR_L2_TLB_PREFETCH    (1<<21)
> +#define ACTLR_L2_IPA_PA_CACHE    (1<<20)
> +#define ACTLR_L2_CACHE           (1<<19)
> +#define ACTLR_L2_PA_CACHE        (1<<18)
> +#define ACTLR_TLB                (1<<17)
> +#define ACTLR_STRONGY_ORDERED    (1<<16)
> +#define ACTLR_INORDER            (1<<15)
> +#define ACTLR_FORCE_LIM          (1<<14)
> +#define ACTLR_CP_FLUSH           (1<<13)
> +#define ACTLR_CP_PUSH            (1<<12)
> +#define ACTLR_LIM                (1<<11)
> +#define ACTLR_SER                (1<<10)
> +#define ACTLR_OPT                (1<<9)
> +#define ACTLR_WFI                (1<<8)
> +#define ACTLR_WFE                (1<<7)
> +#define ACTLR_SMP                (1<<6)
> +#define ACTLR_PLD                (1<<5)
> +#define ACTLR_IP                 (1<<4)
> +#define ACTLR_MICRO_BTB          (1<<3)
> +#define ACTLR_LOOP_ONE           (1<<2)
> +#define ACTLR_LOOP_DISABLE       (1<<1)
> +#define ACTLR_BTB                (1<<0)
> +
>  #define SCTLR_BASE        0x00c50078
>  #define HSCTLR_BASE       0x30c51878
>  

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-24 15:03 ` [PATCH 5/7] xen/arm: wake up secondary cpus Stefano Stabellini
  2012-10-24 15:38   ` Tim Deegan
@ 2012-10-25  9:59   ` Ian Campbell
  2012-10-25 17:45     ` Stefano Stabellini
  1 sibling, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-25  9:59 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> Secondary cpus are held by the firmware until we send an IPI to them.
> 
> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> ---
>  xen/arch/arm/head.S        |    8 ++++++--
>  xen/arch/arm/mode_switch.S |   31 +++++++++++++++++++++++++++++++
>  2 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> index c784f4d..39c4774 100644
> --- a/xen/arch/arm/head.S
> +++ b/xen/arch/arm/head.S
> @@ -79,12 +79,12 @@ start:
>  	beq   boot_cpu               /* If we're CPU 0, boot now */
>  
>  	/* Non-boot CPUs wait here to be woken up one at a time. */
> -1:	wfe
> -	dsb
> +1:	dsb
>  	ldr   r0, =smp_up_cpu        /* VA of gate */
>  	add   r0, r0, r10            /* PA of gate */
>  	ldr   r1, [r0]               /* Which CPU is being booted? */
>  	teq   r1, r12                /* Is it us? */
> +	wfene
>  	bne   1b

This is a semi-independent bug fix relating to unnecessarily waiting
(and perhaps blocking indefinitely) on the first iteration of the loop?

>  
>  boot_cpu:
> @@ -98,6 +98,10 @@ boot_cpu:
>  	PRINT(" booting -\r\n")
>  #endif
>  
> +	/* Wake up secondary cpus */
> +	teq   r12, #0
> +	bleq  kick_cpus

Does this have to be done this early? Couldn't we defer it to C land
where it would be easier to isolate the processor/platform specific
behaviour?

>  	/* Check that this CPU has Hyp mode */
>  	mrc   CP32(r0, ID_PFR1)
>  	and   r0, r0, #0xf000        /* Bits 12-15 define virt extensions */
> diff --git a/xen/arch/arm/mode_switch.S b/xen/arch/arm/mode_switch.S
> index 83a682b..39d80e8 100644
> --- a/xen/arch/arm/mode_switch.S
> +++ b/xen/arch/arm/mode_switch.S
> @@ -21,6 +21,37 @@
>  #include <asm/page.h>
>  #include <asm/asm_defns.h>
>  
> +/* XXX: Versatile Express specific code */
> +/* wake up secondary cpus */
> +.globl kick_cpus
> +kick_cpus:

My understanding was the mode_switch.S was intended to be a place to
hold the hacks and fixups required because there is no firmware on the
models and/or to address short comings in the firmware. This kick
function is a normal/expected part of booting on a vexpress so I don't
think it especially belongs here.

> +    /* write start paddr to v2m sysreg FLAGSSET register */
> +	ldr   r0, =0x1c010000 /* base V2M sysreg MMIO address */
> +	add   r1, r0, #0x34   /* SYS_FLAGSCLR register */

As well as Tim's comment about using the GICD_* registers you should
define names for the V2M registers somewhere too.

> +	dsb
> +	mov   r2, #0xffffffff
> +	str   r2, [r1]
> +	dsb
> +	add   r1, r0, #0x30   /* SYS_FLAGSSET register */
> +	ldr   r2, =start
> +	add   r2, r2, r10
> +	str   r2, [r1]
> +	dsb
> +	/* send an interrupt */
> +	ldr   r0, =0x2c001000 /* base GICD MMIO address */
> +	mov   r1, r0
> +	mov   r2, #0x1        /* GICD_CTLR */
> +	str   r2, [r1]        /* enable distributor */
> +	add   r1, r0, #0xf00  /* GICD_SGIR */
> +	mov   r2, #0xfe0000
> +	str   r2, [r1]        /* send IPI to everybody */
> +	dsb
> +	mov   r1, r0
> +	mov   r2, #0x0        /* GICD_CTLR */
> +	str   r2, [r1]        /* disable distributor */
> +	mov   pc, lr
> +
> +
>  /* Get up a CPU into Hyp mode.  Clobbers r0-r3.
>   *
>   * Expects r12 == CPU number

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 7/7] xen/arm: get the number of cpus from device tree
  2012-10-24 15:03 ` [PATCH 7/7] xen/arm: get the number of cpus from device tree Stefano Stabellini
@ 2012-10-25 10:02   ` Ian Campbell
  0 siblings, 0 replies; 61+ messages in thread
From: Ian Campbell @ 2012-10-25 10:02 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

> diff --git a/xen/common/device_tree.c b/xen/common/device_tree.c
> index 3d1f0f4..3ae0f4d 100644
> --- a/xen/common/device_tree.c
> +++ b/xen/common/device_tree.c
> @@ -153,6 +153,26 @@ const char *device_tree_bootargs(const void *fdt)
>      return prop->data;
>  }
>  
> +int device_tree_cpus(const void *fdt)
> +{
> +    int node, depth;
> +    int cpus = 0;
> +
> +    node = fdt_path_offset(fdt, "/cpus");
> +    if ( node < 0 )
> +        return -1;

With this we will stumble on with cpus == -1 (~4 billion, which would
make a nice system ;-) ). Perhaps we should either assume UP (with a big
printk) or BUG on failure here?

> +    do {
> +        node = fdt_next_node(fdt, node, &depth);
> +        if ( node < 0 )
> +            break;
> +        if ( strncmp(fdt_get_name(fdt, node, NULL), "cpu", 3) )
> +            break;

If we find /cpus/foo (for any foo != "cpu*") then we stop counting,
rather than just ignoring that node?

> +        cpus++;
> +    } while ( depth > 0 );
> +
> +    return cpus;
> +}
> +
>  static int dump_node(const void *fdt, int node, const char *name, int depth,
>                       u32 address_cells, u32 size_cells, void *data)
>  {
> diff --git a/xen/include/xen/device_tree.h b/xen/include/xen/device_tree.h
> index 4d010c0..46bc0f8 100644
> --- a/xen/include/xen/device_tree.h
> +++ b/xen/include/xen/device_tree.h
> @@ -49,6 +49,7 @@ bool_t device_tree_node_matches(const void *fdt, int node, const char *match);
>  int device_tree_for_each_node(const void *fdt,
>                                device_tree_node_func func, void *data);
>  const char *device_tree_bootargs(const void *fdt);
> +int device_tree_cpus(const void *fdt);
>  void device_tree_dump(const void *fdt);
>  
>  #endif

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express
  2012-10-25  9:37   ` Ian Campbell
@ 2012-10-25 10:57     ` Stefano Stabellini
  2012-10-25 11:00       ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-25 10:57 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Thu, 25 Oct 2012, Ian Campbell wrote:
> Can we get these (or at least the clock_hz) from DTB?

Unfortunately we cannot.


> On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> > ---
> >  xen/drivers/char/pl011.c |    4 ++--
> >  1 files changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/xen/drivers/char/pl011.c b/xen/drivers/char/pl011.c
> > index 6af45aa..6ccb73a 100644
> > --- a/xen/drivers/char/pl011.c
> > +++ b/xen/drivers/char/pl011.c
> > @@ -241,8 +241,8 @@ void __init pl011_init(int index, unsigned long register_base_address)
> >  
> >      uart = &pl011_com[index];
> >  
> > -    uart->clock_hz  = 7372800;
> > -    uart->baud      = 115200;
> > +    uart->clock_hz  = 0x16e3600;
> > +    uart->baud      = 38400;
> >      uart->data_bits = 8;
> >      uart->parity    = PARITY_NONE;
> >      uart->stop_bits = 1;
> 
> 
> 

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 2/7] xen/arm: setup the fixmap in head.S
  2012-10-25  9:33   ` Ian Campbell
@ 2012-10-25 11:00     ` Stefano Stabellini
  0 siblings, 0 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-25 11:00 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Thu, 25 Oct 2012, Ian Campbell wrote:
> On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > @@ -183,6 +184,16 @@ skip_bss:
> >         teq   r12, #0
> >         bne   pt_ready
> >         
> > +       /* console fixmap */
> > +       ldr   r1, =xen_fixmap
> > +       add   r1, r1, r10            /* r1 := paddr (xen_fixmap) */
> > +       mov   r3, #0
> > +       lsr   r2, r11, #12
> > +       lsl   r2, r2, #12            /* 4K aligned paddr of UART */
> > +       orr   r2, r2, #PT_UPPER(DEV_L3)
> > +       orr   r2, r2, #PT_LOWER(DEV_L3) /* r2:r3 := 4K dev map including UART */
> > +       strd  r2, r3, [r1, #0]       /* Map it in the first fixmap's slot */
> > +       
> >         /* Build the baseline idle pagetable's first-level entries */
> >         ldr   r1, =xen_second
> >         add   r1, r1, r10            /* r1 := paddr (xen_second) */
> [...]
> > @@ -205,12 +216,13 @@ skip_bss:
> >  	ldr   r4, =start
> >  	lsr   r4, #18                /* Slot for vaddr(start) */
> >  	strd  r2, r3, [r1, r4]       /* Map Xen there too */
> > +
> >  #ifdef EARLY_UART_ADDRESS
> > -	ldr   r3, =(1<<(54-32))      /* NS for device mapping */
> > -	lsr   r2, r11, #21
> > -	lsl   r2, r2, #21            /* 2MB-aligned paddr of UART */
> > -	orr   r2, r2, #PT_UPPER(DEV)
> > -	orr   r2, r2, #PT_LOWER(DEV) /* r2:r3 := 2MB dev map including UART */
> > +	/* xen_fixmap pagetable */
> > +	ldr r2, =xen_fixmap
> > +	add r2, r2, r10
> > +	orr   r2, r2, #PT_UPPER(PT)
> > +	orr   r2, r2, #PT_LOWER(PT) /* r2:r3 := paddr (xen_fixmap) */
> 
> We should setup the fixmap infrastructure even if !EARLY_UART_ADDRESS,
> unless we set it up again later?
> 
> As it stands it looks like you setup the generic fixmap mapping iff
> EARLY_UART_ADDRESS (second hunk above) but setup the UART mapping within
> that unconditionally (first hunk) which is exactly backwards, isn't it ?

Yes, you are right, they should be inverted.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express
  2012-10-25 10:57     ` Stefano Stabellini
@ 2012-10-25 11:00       ` Ian Campbell
  0 siblings, 0 replies; 61+ messages in thread
From: Ian Campbell @ 2012-10-25 11:00 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Thu, 2012-10-25 at 11:57 +0100, Stefano Stabellini wrote:
> On Thu, 25 Oct 2012, Ian Campbell wrote:
> > Can we get these (or at least the clock_hz) from DTB?
> 
> Unfortunately we cannot.

I suppose we ought to get them from the pl011 equivalent of the com1
command line parameter then.

Ian.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-10-25  9:52   ` Ian Campbell
@ 2012-10-25 11:57     ` Stefano Stabellini
  2012-10-25 12:04       ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-25 11:57 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Thu, 25 Oct 2012, Ian Campbell wrote:
> On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > From the Cortex A15 manual:
> > 
> > "Enables the processor to receive instruction cache, BTB, and TLB maintenance
> > operations from other processors"
> > 
> > ...
> > 
> > "You must set this bit before enabling the caches and MMU, or
> > performing any cache and TLB maintenance operations. The only time
> > you must clear this bit is during a processor power-down sequence"
> 
> Is it considered a bug that the firmware doesn't do this?

Why would it be? You can run fairly complicated pieces of software
without caches or MMU.


> > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> > ---
> >  xen/arch/arm/head.S             |    6 ++++++
> >  xen/include/asm-arm/processor.h |   30 ++++++++++++++++++++++++++++++
> >  2 files changed, 36 insertions(+), 0 deletions(-)
> > 
> > diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> > index 3d01be0..c784f4d 100644
> > --- a/xen/arch/arm/head.S
> > +++ b/xen/arch/arm/head.S
> > @@ -148,6 +148,12 @@ skip_bss:
> >  
> >  	PRINT("- Setting up control registers -\r\n")
> >  	
> > +	/* XXX: Cortex A15 specific */
> 
> We probably need some sort of early proc info keyed of the processor id
> (like Linux) has so we can push this processor specific stuff into the
> right places.

Indeed.
I wrote something simple that should be OK until we get more than
~10 different processors.


> > +	/* Set up the SMP bit in ACTLR */
> > +	mrc   CP32(r0, ACTLR)
> > +	orr   r0, r0, #(ACTLR_SMP) /* enable SMP bit*/
> > +	mcr   CP32(r0, ACTLR)
> 
> Linux does this IFF it isn't already set for some reason:
> #ifdef CONFIG_SMP
>         ALT_SMP(mrc     p15, 0, r0, c1, c0, 1)
>         ALT_UP(mov      r0, #(1 << 6))          @ fake it for UP
>         tst     r0, #(1 << 6)                   @ SMP/nAMP mode enabled?
>         orreq   r0, r0, #(1 << 6)               @ Enable SMP/nAMP mode
>         orreq   r0, r0, r10                     @ Enable CPU-specific SMP bits
>         mcreq   p15, 0, r0, c1, c0, 1
> #endif
> 
> Might just relate to the "fake it up for UP" thing I suppose.

Considering that we don't have a UP config option for Xen (I am aware
of), there is no need for us to do that, I think.


> > +
> >  	/* Set up memory attribute type tables */
> >  	ldr   r0, =MAIR0VAL
> >  	ldr   r1, =MAIR1VAL
> > diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> > index 3849b23..91a5836 100644
> > --- a/xen/include/asm-arm/processor.h
> > +++ b/xen/include/asm-arm/processor.h
> > @@ -34,6 +34,36 @@
> >  #define SCTLR_A         (1<<1)
> >  #define SCTLR_M         (1<<0)
> >  
> > +/* ACTLR Auxiliary Control Register */
> > +#define ACTLR_SNOOP_DELAYED      (1<<31)
> 
> If these are CortexA15 specific then I think they ought to be
> ACTLR_CA15_FOO or something. And possibly in a new processor-ca15.h
> header instead.

OK

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-10-25 11:57     ` Stefano Stabellini
@ 2012-10-25 12:04       ` Ian Campbell
  2012-10-26  8:56         ` Tim Deegan
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-25 12:04 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Thu, 2012-10-25 at 12:57 +0100, Stefano Stabellini wrote:
> On Thu, 25 Oct 2012, Ian Campbell wrote:
> > On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > > From the Cortex A15 manual:
> > > 
> > > "Enables the processor to receive instruction cache, BTB, and TLB maintenance
> > > operations from other processors"
> > > 
> > > ...
> > > 
> > > "You must set this bit before enabling the caches and MMU, or
> > > performing any cache and TLB maintenance operations. The only time
> > > you must clear this bit is during a processor power-down sequence"
> > 
> > Is it considered a bug that the firmware doesn't do this?
> 
> Why would it be? You can run fairly complicated pieces of software
> without caches or MMU.

True, I guess I just considered that not setting the SMP bit on the
second processor when the firmware starts it seemed a bit odd. If you
aren't caches/MMU/etc them then setting the bit is pretty much a NOP (or
maybe it isn't?).

> 
> 
> > > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> > > ---
> > >  xen/arch/arm/head.S             |    6 ++++++
> > >  xen/include/asm-arm/processor.h |   30 ++++++++++++++++++++++++++++++
> > >  2 files changed, 36 insertions(+), 0 deletions(-)
> > > 
> > > diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> > > index 3d01be0..c784f4d 100644
> > > --- a/xen/arch/arm/head.S
> > > +++ b/xen/arch/arm/head.S
> > > @@ -148,6 +148,12 @@ skip_bss:
> > >  
> > >  	PRINT("- Setting up control registers -\r\n")
> > >  	
> > > +	/* XXX: Cortex A15 specific */
> > 
> > We probably need some sort of early proc info keyed of the processor id
> > (like Linux) has so we can push this processor specific stuff into the
> > right places.
> 
> Indeed.
> I wrote something simple that should be OK until we get more than
> ~10 different processors.
> 
> 
> > > +	/* Set up the SMP bit in ACTLR */
> > > +	mrc   CP32(r0, ACTLR)
> > > +	orr   r0, r0, #(ACTLR_SMP) /* enable SMP bit*/
> > > +	mcr   CP32(r0, ACTLR)
> > 
> > Linux does this IFF it isn't already set for some reason:
> > #ifdef CONFIG_SMP
> >         ALT_SMP(mrc     p15, 0, r0, c1, c0, 1)
> >         ALT_UP(mov      r0, #(1 << 6))          @ fake it for UP
> >         tst     r0, #(1 << 6)                   @ SMP/nAMP mode enabled?
> >         orreq   r0, r0, #(1 << 6)               @ Enable SMP/nAMP mode
> >         orreq   r0, r0, r10                     @ Enable CPU-specific SMP bits
> >         mcreq   p15, 0, r0, c1, c0, 1
> > #endif
> > 
> > Might just relate to the "fake it up for UP" thing I suppose.
> 
> Considering that we don't have a UP config option for Xen (I am aware
> of), there is no need for us to do that, I think.
> 
> 
> > > +
> > >  	/* Set up memory attribute type tables */
> > >  	ldr   r0, =MAIR0VAL
> > >  	ldr   r1, =MAIR1VAL
> > > diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> > > index 3849b23..91a5836 100644
> > > --- a/xen/include/asm-arm/processor.h
> > > +++ b/xen/include/asm-arm/processor.h
> > > @@ -34,6 +34,36 @@
> > >  #define SCTLR_A         (1<<1)
> > >  #define SCTLR_M         (1<<0)
> > >  
> > > +/* ACTLR Auxiliary Control Register */
> > > +#define ACTLR_SNOOP_DELAYED      (1<<31)
> > 
> > If these are CortexA15 specific then I think they ought to be
> > ACTLR_CA15_FOO or something. And possibly in a new processor-ca15.h
> > header instead.
> 
> OK

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-25  9:59   ` Ian Campbell
@ 2012-10-25 17:45     ` Stefano Stabellini
  2012-10-26  7:30       ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-25 17:45 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Thu, 25 Oct 2012, Ian Campbell wrote:
> On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > Secondary cpus are held by the firmware until we send an IPI to them.
> > 
> > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> > ---
> >  xen/arch/arm/head.S        |    8 ++++++--
> >  xen/arch/arm/mode_switch.S |   31 +++++++++++++++++++++++++++++++
> >  2 files changed, 37 insertions(+), 2 deletions(-)
> > 
> > diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
> > index c784f4d..39c4774 100644
> > --- a/xen/arch/arm/head.S
> > +++ b/xen/arch/arm/head.S
> > @@ -79,12 +79,12 @@ start:
> >  	beq   boot_cpu               /* If we're CPU 0, boot now */
> >  
> >  	/* Non-boot CPUs wait here to be woken up one at a time. */
> > -1:	wfe
> > -	dsb
> > +1:	dsb
> >  	ldr   r0, =smp_up_cpu        /* VA of gate */
> >  	add   r0, r0, r10            /* PA of gate */
> >  	ldr   r1, [r0]               /* Which CPU is being booted? */
> >  	teq   r1, r12                /* Is it us? */
> > +	wfene
> >  	bne   1b
> 
> This is a semi-independent bug fix relating to unnecessarily waiting
> (and perhaps blocking indefinitely) on the first iteration of the loop?

yes


> >  
> >  boot_cpu:
> > @@ -98,6 +98,10 @@ boot_cpu:
> >  	PRINT(" booting -\r\n")
> >  #endif
> >  
> > +	/* Wake up secondary cpus */
> > +	teq   r12, #0
> > +	bleq  kick_cpus
> 
> Does this have to be done this early? Couldn't we defer it to C land
> where it would be easier to isolate the processor/platform specific
> behaviour?

Yes, it does because we need to send an interrupt to cpus running in
secure mode, so this has to happen before we drop off secure state and we
enter hypervisor state.


> >  	/* Check that this CPU has Hyp mode */
> >  	mrc   CP32(r0, ID_PFR1)
> >  	and   r0, r0, #0xf000        /* Bits 12-15 define virt extensions */
> > diff --git a/xen/arch/arm/mode_switch.S b/xen/arch/arm/mode_switch.S
> > index 83a682b..39d80e8 100644
> > --- a/xen/arch/arm/mode_switch.S
> > +++ b/xen/arch/arm/mode_switch.S
> > @@ -21,6 +21,37 @@
> >  #include <asm/page.h>
> >  #include <asm/asm_defns.h>
> >  
> > +/* XXX: Versatile Express specific code */
> > +/* wake up secondary cpus */
> > +.globl kick_cpus
> > +kick_cpus:
> 
> My understanding was the mode_switch.S was intended to be a place to
> hold the hacks and fixups required because there is no firmware on the
> models and/or to address short comings in the firmware. This kick
> function is a normal/expected part of booting on a vexpress so I don't
> think it especially belongs here.

I have created a processor.S file for processor specific initializations
(see ACTLR), maybe I can move it there.


> > +    /* write start paddr to v2m sysreg FLAGSSET register */
> > +	ldr   r0, =0x1c010000 /* base V2M sysreg MMIO address */
> > +	add   r1, r0, #0x34   /* SYS_FLAGSCLR register */
> 
> As well as Tim's comment about using the GICD_* registers you should
> define names for the V2M registers somewhere too.

sure

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-25 17:45     ` Stefano Stabellini
@ 2012-10-26  7:30       ` Ian Campbell
  2012-10-26 11:18         ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-26  7:30 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Thu, 2012-10-25 at 18:45 +0100, Stefano Stabellini wrote:
> > >  
> > >  boot_cpu:
> > > @@ -98,6 +98,10 @@ boot_cpu:
> > >  	PRINT(" booting -\r\n")
> > >  #endif
> > >  
> > > +	/* Wake up secondary cpus */
> > > +	teq   r12, #0
> > > +	bleq  kick_cpus
> > 
> > Does this have to be done this early? Couldn't we defer it to C land
> > where it would be easier to isolate the processor/platform specific
> > behaviour?
> 
> Yes, it does because we need to send an interrupt to cpus running in
> secure mode, so this has to happen before we drop off secure state and we
> enter hypervisor state.

Hrm, so maybe this stuff does belong in mode_switch.S after all?

Or is there perhaps some register (e.g. in the GIC?) which would allow
non-secure hyp mode to sent an event to a processor in secure monitor
mode? Or are secondary CPUs actually spinning in secure supervisor mode?

I guess this works in Linux because the boot CPU is in *secure* kernel
mode and that is allowed to send events to other secure modes? That's a
further argument that this is related to the firmware not bringing us up
in Hyp / NS mode and therefore that the fix should be in mode_switch.S.

> > >  	/* Check that this CPU has Hyp mode */
> > >  	mrc   CP32(r0, ID_PFR1)
> > >  	and   r0, r0, #0xf000        /* Bits 12-15 define virt extensions */
> > > diff --git a/xen/arch/arm/mode_switch.S b/xen/arch/arm/mode_switch.S
> > > index 83a682b..39d80e8 100644
> > > --- a/xen/arch/arm/mode_switch.S
> > > +++ b/xen/arch/arm/mode_switch.S
> > > @@ -21,6 +21,37 @@
> > >  #include <asm/page.h>
> > >  #include <asm/asm_defns.h>
> > >  
> > > +/* XXX: Versatile Express specific code */
> > > +/* wake up secondary cpus */
> > > +.globl kick_cpus
> > > +kick_cpus:
> > 
> > My understanding was the mode_switch.S was intended to be a place to
> > hold the hacks and fixups required because there is no firmware on the
> > models and/or to address short comings in the firmware. This kick
> > function is a normal/expected part of booting on a vexpress so I don't
> > think it especially belongs here.
> 
> I have created a processor.S file for processor specific initializations
> (see ACTLR), maybe I can move it there.

proc-ca15.S perhaps? So we can add proc-exynos.S etc in the future?

Ian

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-10-25 12:04       ` Ian Campbell
@ 2012-10-26  8:56         ` Tim Deegan
  2012-10-26  8:59           ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Tim Deegan @ 2012-10-26  8:56 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Stefano Stabellini

At 13:04 +0100 on 25 Oct (1351170270), Ian Campbell wrote:
> On Thu, 2012-10-25 at 12:57 +0100, Stefano Stabellini wrote:
> > On Thu, 25 Oct 2012, Ian Campbell wrote:
> > > On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > > > From the Cortex A15 manual:
> > > > 
> > > > "Enables the processor to receive instruction cache, BTB, and TLB maintenance
> > > > operations from other processors"
> > > > 
> > > > ...
> > > > 
> > > > "You must set this bit before enabling the caches and MMU, or
> > > > performing any cache and TLB maintenance operations. The only time
> > > > you must clear this bit is during a processor power-down sequence"
> > > 
> > > Is it considered a bug that the firmware doesn't do this?
> > 
> > Why would it be? You can run fairly complicated pieces of software
> > without caches or MMU.
> 
> True, I guess I just considered that not setting the SMP bit on the
> second processor when the firmware starts it seemed a bit odd. If you
> aren't caches/MMU/etc them then setting the bit is pretty much a NOP (or
> maybe it isn't?).

Well, given that you're not allowed to clear it except at power-down, I
think it would be impolite of the firmware to set it if there's _any_
reason the OS might not want it set.

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-10-26  8:56         ` Tim Deegan
@ 2012-10-26  8:59           ` Ian Campbell
  0 siblings, 0 replies; 61+ messages in thread
From: Ian Campbell @ 2012-10-26  8:59 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Stefano Stabellini

On Fri, 2012-10-26 at 09:56 +0100, Tim Deegan wrote:
> At 13:04 +0100 on 25 Oct (1351170270), Ian Campbell wrote:
> > On Thu, 2012-10-25 at 12:57 +0100, Stefano Stabellini wrote:
> > > On Thu, 25 Oct 2012, Ian Campbell wrote:
> > > > On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > > > > From the Cortex A15 manual:
> > > > > 
> > > > > "Enables the processor to receive instruction cache, BTB, and TLB maintenance
> > > > > operations from other processors"
> > > > > 
> > > > > ...
> > > > > 
> > > > > "You must set this bit before enabling the caches and MMU, or
> > > > > performing any cache and TLB maintenance operations. The only time
> > > > > you must clear this bit is during a processor power-down sequence"
> > > > 
> > > > Is it considered a bug that the firmware doesn't do this?
> > > 
> > > Why would it be? You can run fairly complicated pieces of software
> > > without caches or MMU.
> > 
> > True, I guess I just considered that not setting the SMP bit on the
> > second processor when the firmware starts it seemed a bit odd. If you
> > aren't caches/MMU/etc them then setting the bit is pretty much a NOP (or
> > maybe it isn't?).
> 
> Well, given that you're not allowed to clear it except at power-down, I
> think it would be impolite of the firmware to set it if there's _any_
> reason the OS might not want it set.

True.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-24 17:35     ` Stefano Stabellini
@ 2012-10-26  9:01       ` Tim Deegan
  2012-10-26 15:53         ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Tim Deegan @ 2012-10-26  9:01 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 18:35 +0100 on 24 Oct (1351103740), Stefano Stabellini wrote:
> > I don't think this is necessary - why not just pass va directly to the
> > inline asm?  We don't care what register it's in (and if we did I'm not
> > convinced this would guarantee it was r0).
> > 
> > > +    asm volatile (
> > > +        "dsb;"
> > > +        STORE_CP32(0, DCCMVAC)
> > > +        "isb;"
> > > +        : : "r" (r0) : "memory");
> > 
> > Does this need a 'memory' clobber?  Can we get away with just saying it
> > consumes *va as an input?  All we need to be sure of is that the
> > particular thing we're flushing has been written out; no need to stop
> > any other optimizations.
> 
> you are right on both points
> 
> > I guess it might need to be re-cast as a macro so the compiler knows how
> > big *va is?
> 
> I don't think it is necessary, after all the size of a register has to
> be the same of a virtual address

But it's the size of the thing in memory that's being flushed that
matters, not the size of the pointer to it!  E.g. after a PTE write we
need a 64-bit memory input operand to stop the compiler from hoisting
any part of the PTE write past the cache flush. (well OK we explicitly use
a 64-bit atomic write for PTE writes, but YKWIM).

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-26  7:30       ` Ian Campbell
@ 2012-10-26 11:18         ` Stefano Stabellini
  2012-10-26 12:16           ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 11:18 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Fri, 26 Oct 2012, Ian Campbell wrote:
> On Thu, 2012-10-25 at 18:45 +0100, Stefano Stabellini wrote:
> > > >  
> > > >  boot_cpu:
> > > > @@ -98,6 +98,10 @@ boot_cpu:
> > > >  	PRINT(" booting -\r\n")
> > > >  #endif
> > > >  
> > > > +	/* Wake up secondary cpus */
> > > > +	teq   r12, #0
> > > > +	bleq  kick_cpus
> > > 
> > > Does this have to be done this early? Couldn't we defer it to C land
> > > where it would be easier to isolate the processor/platform specific
> > > behaviour?
> > 
> > Yes, it does because we need to send an interrupt to cpus running in
> > secure mode, so this has to happen before we drop off secure state and we
> > enter hypervisor state.
> 
> Hrm, so maybe this stuff does belong in mode_switch.S after all?
> 
> Or is there perhaps some register (e.g. in the GIC?) which would allow
> non-secure hyp mode to sent an event to a processor in secure monitor
> mode?

Whether the target processor in secure mode receives the interrupt or
not, depends only on the GIC configuration on the target processor.
I don't think there is anything we can do from cpu0 in non-secure mode.


> Or are secondary CPUs actually spinning in secure supervisor mode?

Yes.


> I guess this works in Linux because the boot CPU is in *secure* kernel
> mode and that is allowed to send events to other secure modes? That's a
> further argument that this is related to the firmware not bringing us up
> in Hyp / NS mode and therefore that the fix should be in mode_switch.S.

That's right.


> > I have created a processor.S file for processor specific initializations
> > (see ACTLR), maybe I can move it there.
> 
> proc-ca15.S perhaps? So we can add proc-exynos.S etc in the future?
 
OK

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-26 11:18         ` Stefano Stabellini
@ 2012-10-26 12:16           ` Ian Campbell
  2012-10-26 15:24             ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-26 12:16 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Fri, 2012-10-26 at 12:18 +0100, Stefano Stabellini wrote:
> On Fri, 26 Oct 2012, Ian Campbell wrote:
> > On Thu, 2012-10-25 at 18:45 +0100, Stefano Stabellini wrote:
> > > > >  
> > > > >  boot_cpu:
> > > > > @@ -98,6 +98,10 @@ boot_cpu:
> > > > >  	PRINT(" booting -\r\n")
> > > > >  #endif
> > > > >  
> > > > > +	/* Wake up secondary cpus */
> > > > > +	teq   r12, #0
> > > > > +	bleq  kick_cpus
> > > > 
> > > > Does this have to be done this early? Couldn't we defer it to C land
> > > > where it would be easier to isolate the processor/platform specific
> > > > behaviour?
> > > 
> > > Yes, it does because we need to send an interrupt to cpus running in
> > > secure mode, so this has to happen before we drop off secure state and we
> > > enter hypervisor state.
> > 
> > Hrm, so maybe this stuff does belong in mode_switch.S after all?
> > 
> > Or is there perhaps some register (e.g. in the GIC?) which would allow
> > non-secure hyp mode to sent an event to a processor in secure monitor
> > mode?
> 
> Whether the target processor in secure mode receives the interrupt or
> not, depends only on the GIC configuration on the target processor.
> I don't think there is anything we can do from cpu0 in non-secure mode.

Isn't this controlled by the GICD_GROUP register, which can be set from
cpu0 but affects all cpus?

Or is it the case that we need to do this poke before mode_switch.S
makes all interrupts into grp1 interrupts?

Ian.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-26 12:16           ` Ian Campbell
@ 2012-10-26 15:24             ` Stefano Stabellini
  2012-10-26 15:32               ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 15:24 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Fri, 26 Oct 2012, Ian Campbell wrote:
> On Fri, 2012-10-26 at 12:18 +0100, Stefano Stabellini wrote:
> > On Fri, 26 Oct 2012, Ian Campbell wrote:
> > > On Thu, 2012-10-25 at 18:45 +0100, Stefano Stabellini wrote:
> > > > > >  
> > > > > >  boot_cpu:
> > > > > > @@ -98,6 +98,10 @@ boot_cpu:
> > > > > >  	PRINT(" booting -\r\n")
> > > > > >  #endif
> > > > > >  
> > > > > > +	/* Wake up secondary cpus */
> > > > > > +	teq   r12, #0
> > > > > > +	bleq  kick_cpus
> > > > > 
> > > > > Does this have to be done this early? Couldn't we defer it to C land
> > > > > where it would be easier to isolate the processor/platform specific
> > > > > behaviour?
> > > > 
> > > > Yes, it does because we need to send an interrupt to cpus running in
> > > > secure mode, so this has to happen before we drop off secure state and we
> > > > enter hypervisor state.
> > > 
> > > Hrm, so maybe this stuff does belong in mode_switch.S after all?
> > > 
> > > Or is there perhaps some register (e.g. in the GIC?) which would allow
> > > non-secure hyp mode to sent an event to a processor in secure monitor
> > > mode?
> > 
> > Whether the target processor in secure mode receives the interrupt or
> > not, depends only on the GIC configuration on the target processor.
> > I don't think there is anything we can do from cpu0 in non-secure mode.
> 
> Isn't this controlled by the GICD_GROUP register, which can be set from
> cpu0 but affects all cpus?

GICD_GROUP0 is banked


> Or is it the case that we need to do this poke before mode_switch.S
> makes all interrupts into grp1 interrupts?

I don't think we can do that, it is the receiver cpu that would need to
do it

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/7] xen/arm: wake up secondary cpus
  2012-10-26 15:24             ` Stefano Stabellini
@ 2012-10-26 15:32               ` Ian Campbell
  0 siblings, 0 replies; 61+ messages in thread
From: Ian Campbell @ 2012-10-26 15:32 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Fri, 2012-10-26 at 16:24 +0100, Stefano Stabellini wrote:
> On Fri, 26 Oct 2012, Ian Campbell wrote:
> > On Fri, 2012-10-26 at 12:18 +0100, Stefano Stabellini wrote:
> > > On Fri, 26 Oct 2012, Ian Campbell wrote:
> > > > On Thu, 2012-10-25 at 18:45 +0100, Stefano Stabellini wrote:
> > > > > > >  
> > > > > > >  boot_cpu:
> > > > > > > @@ -98,6 +98,10 @@ boot_cpu:
> > > > > > >  	PRINT(" booting -\r\n")
> > > > > > >  #endif
> > > > > > >  
> > > > > > > +	/* Wake up secondary cpus */
> > > > > > > +	teq   r12, #0
> > > > > > > +	bleq  kick_cpus
> > > > > > 
> > > > > > Does this have to be done this early? Couldn't we defer it to C land
> > > > > > where it would be easier to isolate the processor/platform specific
> > > > > > behaviour?
> > > > > 
> > > > > Yes, it does because we need to send an interrupt to cpus running in
> > > > > secure mode, so this has to happen before we drop off secure state and we
> > > > > enter hypervisor state.
> > > > 
> > > > Hrm, so maybe this stuff does belong in mode_switch.S after all?
> > > > 
> > > > Or is there perhaps some register (e.g. in the GIC?) which would allow
> > > > non-secure hyp mode to sent an event to a processor in secure monitor
> > > > mode?
> > > 
> > > Whether the target processor in secure mode receives the interrupt or
> > > not, depends only on the GIC configuration on the target processor.
> > > I don't think there is anything we can do from cpu0 in non-secure mode.
> > 
> > Isn't this controlled by the GICD_GROUP register, which can be set from
> > cpu0 but affects all cpus?
> 
> GICD_GROUP0 is banked

Damn, right.

> 
> 
> > Or is it the case that we need to do this poke before mode_switch.S
> > makes all interrupts into grp1 interrupts?
> 
> I don't think we can do that, it is the receiver cpu that would need to
> do it

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-26  9:01       ` Tim Deegan
@ 2012-10-26 15:53         ` Stefano Stabellini
  2012-10-26 15:55           ` Stefano Stabellini
  2012-10-26 16:45           ` Tim Deegan
  0 siblings, 2 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 15:53 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Fri, 26 Oct 2012, Tim Deegan wrote:
> At 18:35 +0100 on 24 Oct (1351103740), Stefano Stabellini wrote:
> > > I don't think this is necessary - why not just pass va directly to the
> > > inline asm?  We don't care what register it's in (and if we did I'm not
> > > convinced this would guarantee it was r0).
> > > 
> > > > +    asm volatile (
> > > > +        "dsb;"
> > > > +        STORE_CP32(0, DCCMVAC)
> > > > +        "isb;"
> > > > +        : : "r" (r0) : "memory");
> > > 
> > > Does this need a 'memory' clobber?  Can we get away with just saying it
> > > consumes *va as an input?  All we need to be sure of is that the
> > > particular thing we're flushing has been written out; no need to stop
> > > any other optimizations.
> > 
> > you are right on both points
> > 
> > > I guess it might need to be re-cast as a macro so the compiler knows how
> > > big *va is?
> > 
> > I don't think it is necessary, after all the size of a register has to
> > be the same of a virtual address
> 
> But it's the size of the thing in memory that's being flushed that
> matters, not the size of the pointer to it!
>
> E.g. after a PTE write we
> need a 64-bit memory input operand to stop the compiler from hoisting
> any part of the PTE write past the cache flush. (well OK we explicitly use
> a 64-bit atomic write for PTE writes, but YKWIM).

The implementation of write_pte is entirely in assembly so I doubt that
the compiler is going to reorder it.

However I see your point in case of flush_xen_dcache_va.
Wouldn't a barrier() at the beginning of the function be enough?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-26 15:53         ` Stefano Stabellini
@ 2012-10-26 15:55           ` Stefano Stabellini
  2012-10-26 16:03             ` Stefano Stabellini
  2012-10-26 16:45           ` Tim Deegan
  1 sibling, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 15:55 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org), Ian Campbell

On Fri, 26 Oct 2012, Stefano Stabellini wrote:
> On Fri, 26 Oct 2012, Tim Deegan wrote:
> > At 18:35 +0100 on 24 Oct (1351103740), Stefano Stabellini wrote:
> > > > I don't think this is necessary - why not just pass va directly to the
> > > > inline asm?  We don't care what register it's in (and if we did I'm not
> > > > convinced this would guarantee it was r0).
> > > > 
> > > > > +    asm volatile (
> > > > > +        "dsb;"
> > > > > +        STORE_CP32(0, DCCMVAC)
> > > > > +        "isb;"
> > > > > +        : : "r" (r0) : "memory");
> > > > 
> > > > Does this need a 'memory' clobber?  Can we get away with just saying it
> > > > consumes *va as an input?  All we need to be sure of is that the
> > > > particular thing we're flushing has been written out; no need to stop
> > > > any other optimizations.
> > > 
> > > you are right on both points
> > > 
> > > > I guess it might need to be re-cast as a macro so the compiler knows how
> > > > big *va is?
> > > 
> > > I don't think it is necessary, after all the size of a register has to
> > > be the same of a virtual address
> > 
> > But it's the size of the thing in memory that's being flushed that
> > matters, not the size of the pointer to it!
> >
> > E.g. after a PTE write we
> > need a 64-bit memory input operand to stop the compiler from hoisting
> > any part of the PTE write past the cache flush. (well OK we explicitly use
> > a 64-bit atomic write for PTE writes, but YKWIM).
> 
> The implementation of write_pte is entirely in assembly so I doubt that
> the compiler is going to reorder it.
> 
> However I see your point in case of flush_xen_dcache_va.
> Wouldn't a barrier() at the beginning of the function be enough?
>

Actually we already have a memory clobber in flush_xen_dcache_va,
shouldn't that prevent the compiler from reordering instructions?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-26 15:55           ` Stefano Stabellini
@ 2012-10-26 16:03             ` Stefano Stabellini
  2012-10-26 16:55               ` Tim Deegan
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 16:03 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org), Ian Campbell

On Fri, 26 Oct 2012, Stefano Stabellini wrote:
> On Fri, 26 Oct 2012, Stefano Stabellini wrote:
> > On Fri, 26 Oct 2012, Tim Deegan wrote:
> > > At 18:35 +0100 on 24 Oct (1351103740), Stefano Stabellini wrote:
> > > > > I don't think this is necessary - why not just pass va directly to the
> > > > > inline asm?  We don't care what register it's in (and if we did I'm not
> > > > > convinced this would guarantee it was r0).
> > > > > 
> > > > > > +    asm volatile (
> > > > > > +        "dsb;"
> > > > > > +        STORE_CP32(0, DCCMVAC)
> > > > > > +        "isb;"
> > > > > > +        : : "r" (r0) : "memory");
> > > > > 
> > > > > Does this need a 'memory' clobber?  Can we get away with just saying it
> > > > > consumes *va as an input?  All we need to be sure of is that the
> > > > > particular thing we're flushing has been written out; no need to stop
> > > > > any other optimizations.
> > > > 
> > > > you are right on both points
> > > > 
> > > > > I guess it might need to be re-cast as a macro so the compiler knows how
> > > > > big *va is?
> > > > 
> > > > I don't think it is necessary, after all the size of a register has to
> > > > be the same of a virtual address
> > > 
> > > But it's the size of the thing in memory that's being flushed that
> > > matters, not the size of the pointer to it!
> > >
> > > E.g. after a PTE write we
> > > need a 64-bit memory input operand to stop the compiler from hoisting
> > > any part of the PTE write past the cache flush. (well OK we explicitly use
> > > a 64-bit atomic write for PTE writes, but YKWIM).
> > 
> > The implementation of write_pte is entirely in assembly so I doubt that
> > the compiler is going to reorder it.
> > 
> > However I see your point in case of flush_xen_dcache_va.
> > Wouldn't a barrier() at the beginning of the function be enough?
> >
> 
> Actually we already have a memory clobber in flush_xen_dcache_va,
> shouldn't that prevent the compiler from reordering instructions?
> 

After reading again the entire thread I think that I understand what you
meant: can't we get rid of the memory clobber and specify *va as input?

However in order to do that correctly we need to make clear how big *va is
otherwise the compiler could reorder things.

The problem is that we don't know at compile time how big a cacheline
is, so I think that we need to keep the memory clobber.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-25  9:27   ` Ian Campbell
@ 2012-10-26 16:33     ` Stefano Stabellini
  2012-10-26 16:40       ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 16:33 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Thu, 25 Oct 2012, Ian Campbell wrote:
> On Wed, 2012-10-24 at 16:03 +0100, Stefano Stabellini wrote:
> > Each rank holds 32 irqs, so we should divide by 32 rather than by 4.
> 
> I think this is wrong, because idx isn't used in the way your reasoning
> implies, when we use it we assume 8 bits-per-interrupt in the register
> not 1.
> 
> The accessor function vgix_irq_rank is couched in terms of
> GICD_<FOO><n>, which is convenient where we are emulating register
> accesses but is very confusing in this one function where we aren't
> thinking in terms of registers!
> 
> vgic_irq_rank(v, 8, idx) is saying "find me the rank containing
> GICD_<FOO><idx> where FOO has 8 bits per interrupt". Since 4 lots of 8
> bits goes into 32 we therefore need to divide by 4. This makes sense if
> you consider that FOO here is PRIORITY and that GICD_PRIORITY0 controls
> irq 0..3, GICD_PRIORITY1 irq 4..7 etc.
> 
> With idx = irq >> 2 you get:
> 
> IRQ00: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 0
> IRQ01: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 1
> IRQ02: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 2
> IRQ03: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 3
> IRQ04: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 0
> IRQ05: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 1
> [...]
> IRQ30: idx = 07 REG_RANK_NR(08, 07) = 0, REG_RANK_INDEX(08, 07) = 7, BYTE = 2
> IRQ31: idx = 07 REG_RANK_NR(08, 07) = 0, REG_RANK_INDEX(08, 07) = 7, BYTE = 3
> IRQ32: idx = 08 REG_RANK_NR(08, 08) = 1, REG_RANK_INDEX(08, 08) = 0, BYTE = 0
> IRQ33: idx = 08 REG_RANK_NR(08, 08) = 1, REG_RANK_INDEX(08, 08) = 0, BYTE = 1
> [...]
> IRQ62: idx = 15 REG_RANK_NR(08, 15) = 1, REG_RANK_INDEX(08, 15) = 7, BYTE = 2
> IRQ63: idx = 15 REG_RANK_NR(08, 15) = 1, REG_RANK_INDEX(08, 15) = 7, BYTE = 3
> IRQ64: idx = 16 REG_RANK_NR(08, 16) = 2, REG_RANK_INDEX(08, 16) = 0, BYTE = 0
> IRQ65: idx = 16 REG_RANK_NR(08, 16) = 2, REG_RANK_INDEX(08, 16) = 0, BYTE = 1
> [...]
> 
> IOW interrupts 0..31 are in rank 0, 32..63 are rank 1 etc, which is correct
> 
> With idx = irq / 32 you instead get:
> 
> IRQ00: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 0
> IRQ01: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 1
> IRQ02: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 2
> IRQ03: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 3
> IRQ04: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 0
> IRQ05: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 1
> [...]
> IRQ30: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 2
> IRQ31: idx = 00 REG_RANK_NR(08, 00) = 0, REG_RANK_INDEX(08, 00) = 0, BYTE = 3
> IRQ32: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 0
> IRQ33: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 1
> [...]
> IRQ62: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 2
> IRQ63: idx = 01 REG_RANK_NR(08, 01) = 0, REG_RANK_INDEX(08, 01) = 1, BYTE = 3
> IRQ64: idx = 02 REG_RANK_NR(08, 02) = 0, REG_RANK_INDEX(08, 02) = 2, BYTE = 0
> IRQ65: idx = 02 REG_RANK_NR(08, 02) = 0, REG_RANK_INDEX(08, 02) = 2, BYTE = 1
> [...]
> IRQ255: idx = 07 REG_RANK_NR(08, 07) = 0, REG_RANK_INDEX(08, 07) = 7, BYTE = 3
> IRQ256: idx = 08 REG_RANK_NR(08, 08) = 1, REG_RANK_INDEX(08, 08) = 0, BYTE = 0
> 
> Here allinterrupts up to 255 are in rank 0, which is wrong.
> 
> For idx = irq / 32 you would need to use REG_RANK_NR(1, idx) not (8,
> idx). (and you'd need to trivially fix REG_RANK_NR to handle b == 1).
> 
> Perhaps the way to think of it is not as "/ 32" but rather as ">> 5". 
> Since REG_RANK_NR(8, ) is effectively >> 3 when combined with the
> existing >> 2 we get >> 5 overall.
> 
> If you change it as you have done then you are doing >> 5 *and* >> 3,
> i.e. >> 8 aka "/ 256" -- which explains why interrupts up to 255 end up
> in rank 0 with your change.

Your explanation is sounds, but the fact remains that the code doesn't
work as is :)
It means that this patch hides the bug rather than fixing it...

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-26 16:33     ` Stefano Stabellini
@ 2012-10-26 16:40       ` Ian Campbell
  2012-10-26 18:42         ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-26 16:40 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Fri, 2012-10-26 at 17:33 +0100, Stefano Stabellini wrote:

> Your explanation is sounds, but the fact remains that the code doesn't
> work as is :)

;-)

> It means that this patch hides the bug rather than fixing it...

What are the symptoms?

BTW I noticed that "byte = irq & 0x3;" is redundant since byte_read also
has the & in it. But that won't cause problems.

Ian.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-26 15:53         ` Stefano Stabellini
  2012-10-26 15:55           ` Stefano Stabellini
@ 2012-10-26 16:45           ` Tim Deegan
  1 sibling, 0 replies; 61+ messages in thread
From: Tim Deegan @ 2012-10-26 16:45 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 16:53 +0100 on 26 Oct (1351270394), Stefano Stabellini wrote:
> On Fri, 26 Oct 2012, Tim Deegan wrote:
> > At 18:35 +0100 on 24 Oct (1351103740), Stefano Stabellini wrote:
> > > > I don't think this is necessary - why not just pass va directly to the
> > > > inline asm?  We don't care what register it's in (and if we did I'm not
> > > > convinced this would guarantee it was r0).
> > > > 
> > > > > +    asm volatile (
> > > > > +        "dsb;"
> > > > > +        STORE_CP32(0, DCCMVAC)
> > > > > +        "isb;"
> > > > > +        : : "r" (r0) : "memory");
> > > > 
> > > > Does this need a 'memory' clobber?  Can we get away with just saying it
> > > > consumes *va as an input?  All we need to be sure of is that the
> > > > particular thing we're flushing has been written out; no need to stop
> > > > any other optimizations.
> > > 
> > > you are right on both points
> > > 
> > > > I guess it might need to be re-cast as a macro so the compiler knows how
> > > > big *va is?
> > > 
> > > I don't think it is necessary, after all the size of a register has to
> > > be the same of a virtual address
> > 
> > But it's the size of the thing in memory that's being flushed that
> > matters, not the size of the pointer to it!
> >
> > E.g. after a PTE write we
> > need a 64-bit memory input operand to stop the compiler from hoisting
> > any part of the PTE write past the cache flush. (well OK we explicitly use
> > a 64-bit atomic write for PTE writes, but YKWIM).
> 
> The implementation of write_pte is entirely in assembly so I doubt that
> the compiler is going to reorder it.

Augh!  Yes, like I said, PTE writes are fine.

> However I see your point in case of flush_xen_dcache_va.
> Wouldn't a barrier() at the beginning of the function be enough?

More than enough.  That would be exactly equivalent to the "memory"
clobber above.  What I'm arguing for is a _less_ restrictive constraint,
that only restricts delaying writes, and only affects the thing actually
being flushed (whatever size that is).

For larger regions we should have a function with a single barrier at
the top and then a loop of DCCMVAC writes.  For single objects smaller
than a cacheline we need to pass the object itself as a memory input
operand.  Probably we should also have a compile-time check that the
object is smaller than the smallest supported cache-line (i.e. one
DCCMVAC is enough).

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-26 16:03             ` Stefano Stabellini
@ 2012-10-26 16:55               ` Tim Deegan
  2012-10-26 18:40                 ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Tim Deegan @ 2012-10-26 16:55 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 17:03 +0100 on 26 Oct (1351271018), Stefano Stabellini wrote:
> On Fri, 26 Oct 2012, Stefano Stabellini wrote:
> > On Fri, 26 Oct 2012, Stefano Stabellini wrote:
> > > On Fri, 26 Oct 2012, Tim Deegan wrote:
> > > > At 18:35 +0100 on 24 Oct (1351103740), Stefano Stabellini wrote:
> > > > > > I don't think this is necessary - why not just pass va directly to the
> > > > > > inline asm?  We don't care what register it's in (and if we did I'm not
> > > > > > convinced this would guarantee it was r0).
> > > > > > 
> > > > > > > +    asm volatile (
> > > > > > > +        "dsb;"
> > > > > > > +        STORE_CP32(0, DCCMVAC)
> > > > > > > +        "isb;"
> > > > > > > +        : : "r" (r0) : "memory");
> > > > > > 
> > > > > > Does this need a 'memory' clobber?  Can we get away with just saying it
> > > > > > consumes *va as an input?  All we need to be sure of is that the
> > > > > > particular thing we're flushing has been written out; no need to stop
> > > > > > any other optimizations.
> > > > > 
> > > > > you are right on both points
> > > > > 
> > > > > > I guess it might need to be re-cast as a macro so the compiler knows how
> > > > > > big *va is?
> > > > > 
> > > > > I don't think it is necessary, after all the size of a register has to
> > > > > be the same of a virtual address
> > > > 
> > > > But it's the size of the thing in memory that's being flushed that
> > > > matters, not the size of the pointer to it!
> > > >
> > > > E.g. after a PTE write we
> > > > need a 64-bit memory input operand to stop the compiler from hoisting
> > > > any part of the PTE write past the cache flush. (well OK we explicitly use
> > > > a 64-bit atomic write for PTE writes, but YKWIM).
> > > 
> > > The implementation of write_pte is entirely in assembly so I doubt that
> > > the compiler is going to reorder it.
> > > 
> > > However I see your point in case of flush_xen_dcache_va.
> > > Wouldn't a barrier() at the beginning of the function be enough?
> > >
> > 
> > Actually we already have a memory clobber in flush_xen_dcache_va,
> > shouldn't that prevent the compiler from reordering instructions?
> > 
> 
> After reading again the entire thread I think that I understand what you
> meant: can't we get rid of the memory clobber and specify *va as input?
> 
> However in order to do that correctly we need to make clear how big *va is
> otherwise the compiler could reorder things.

Yes!

> The problem is that we don't know at compile time how big a cacheline
> is, so I think that we need to keep the memory clobber.

No!  It's always safe to flush the entire line -- regardless of what
other writes might be happening to it.  After all, the cache controller
might evict that line on its own at any time, so nothing can be relying
on its _not_ being flushed.

The only problem with not knowing how big a cacheline is is this: if the
object is _bigger_ than a cache line we might use one DCCMVAC where more
than one is needed.  We can avoid that by a compile-time check on some
sort of architectural minimum cacheline size.

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-26 16:55               ` Tim Deegan
@ 2012-10-26 18:40                 ` Stefano Stabellini
  2012-10-27 10:44                   ` Tim Deegan
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 18:40 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Fri, 26 Oct 2012, Tim Deegan wrote:
> At 17:03 +0100 on 26 Oct (1351271018), Stefano Stabellini wrote:
> > On Fri, 26 Oct 2012, Stefano Stabellini wrote:
> > > On Fri, 26 Oct 2012, Stefano Stabellini wrote:
> > > > On Fri, 26 Oct 2012, Tim Deegan wrote:
> > > > > At 18:35 +0100 on 24 Oct (1351103740), Stefano Stabellini wrote:
> > > > > > > I don't think this is necessary - why not just pass va directly to the
> > > > > > > inline asm?  We don't care what register it's in (and if we did I'm not
> > > > > > > convinced this would guarantee it was r0).
> > > > > > > 
> > > > > > > > +    asm volatile (
> > > > > > > > +        "dsb;"
> > > > > > > > +        STORE_CP32(0, DCCMVAC)
> > > > > > > > +        "isb;"
> > > > > > > > +        : : "r" (r0) : "memory");
> > > > > > > 
> > > > > > > Does this need a 'memory' clobber?  Can we get away with just saying it
> > > > > > > consumes *va as an input?  All we need to be sure of is that the
> > > > > > > particular thing we're flushing has been written out; no need to stop
> > > > > > > any other optimizations.
> > > > > > 
> > > > > > you are right on both points
> > > > > > 
> > > > > > > I guess it might need to be re-cast as a macro so the compiler knows how
> > > > > > > big *va is?
> > > > > > 
> > > > > > I don't think it is necessary, after all the size of a register has to
> > > > > > be the same of a virtual address
> > > > > 
> > > > > But it's the size of the thing in memory that's being flushed that
> > > > > matters, not the size of the pointer to it!
> > > > >
> > > > > E.g. after a PTE write we
> > > > > need a 64-bit memory input operand to stop the compiler from hoisting
> > > > > any part of the PTE write past the cache flush. (well OK we explicitly use
> > > > > a 64-bit atomic write for PTE writes, but YKWIM).
> > > > 
> > > > The implementation of write_pte is entirely in assembly so I doubt that
> > > > the compiler is going to reorder it.
> > > > 
> > > > However I see your point in case of flush_xen_dcache_va.
> > > > Wouldn't a barrier() at the beginning of the function be enough?
> > > >
> > > 
> > > Actually we already have a memory clobber in flush_xen_dcache_va,
> > > shouldn't that prevent the compiler from reordering instructions?
> > > 
> > 
> > After reading again the entire thread I think that I understand what you
> > meant: can't we get rid of the memory clobber and specify *va as input?
> > 
> > However in order to do that correctly we need to make clear how big *va is
> > otherwise the compiler could reorder things.
> 
> Yes!
> 
> > The problem is that we don't know at compile time how big a cacheline
> > is, so I think that we need to keep the memory clobber.
> 
> No!  It's always safe to flush the entire line -- regardless of what
> other writes might be happening to it.  After all, the cache controller
> might evict that line on its own at any time, so nothing can be relying
> on its _not_ being flushed.
> 
> The only problem with not knowing how big a cacheline is is this: if the
> object is _bigger_ than a cache line we might use one DCCMVAC where more
> than one is needed.  We can avoid that by a compile-time check on some
> sort of architectural minimum cacheline size.
 
If we want to do that then we need to pass a size argument to
flush_xen_dcache_va and have a loop in the function, each iteration
flushing a cacheline:


static inline void flush_xen_dcache_va(void *p, unsigned long size)
{
    unsigned long cacheline_size = READ_CP32(CCSIDR);
    unsigned long i;
    for (i = 0; i < size; i += cacheline_size, p += cacheline_size) {
        asm volatile (
            "dsb;"
            STORE_CP32(0, DCCMVAC)
            "dsb;"
            : : "r" (p));
    }
}


Even though I think that it is really unlikely that the compiler moves
the writes after the loop, in theory we would still need to specify the
correct size of the input parameter to the inline assembly.

Is there a better way?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-26 16:40       ` Ian Campbell
@ 2012-10-26 18:42         ` Stefano Stabellini
  2012-10-26 20:47           ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-26 18:42 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Fri, 26 Oct 2012, Ian Campbell wrote:
> > It means that this patch hides the bug rather than fixing it...
> 
> What are the symptoms?
> 
> BTW I noticed that "byte = irq & 0x3;" is redundant since byte_read also
> has the & in it. But that won't cause problems.

The ienable bit is wrong for irq > 32.

I think that the problem is the usage of vgic_irq_rank with registers
that have 1 bit per interrupt.
In fact the following patch fixes the issue.

---


xen/arm: pass the correct bit-per-interrupt argument to vgic_irq_rank

Use 1 for registers that have 1 bit per irq.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
index 3c3983f..92731b6 100644
--- a/xen/arch/arm/vgic.c
+++ b/xen/arch/arm/vgic.c
@@ -42,13 +42,7 @@
  */
 static inline int REG_RANK_NR(int b, uint32_t n)
 {
-    switch ( b )
-    {
-    case 8: return n >> 3;
-    case 4: return n >> 2;
-    case 2: return n >> 1;
-    default: BUG();
-    }
+    return n / b;
 }
 
 /*
@@ -199,7 +193,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ISENABLER ... GICD_ISENABLERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ISENABLER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISENABLER);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = rank->ienable;
@@ -208,7 +202,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ICENABLER ... GICD_ICENABLERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ICENABLER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICENABLER);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = rank->ienable;
@@ -217,7 +211,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ISPENDR ... GICD_ISPENDRN:
         if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ISPENDR);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISPENDR);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = byte_read(rank->ipend, dabt.sign, offset);
@@ -226,7 +220,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ICPENDR ... GICD_ICPENDRN:
         if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ICPENDR);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICPENDR);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = byte_read(rank->ipend, dabt.sign, offset);
@@ -235,7 +229,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ISACTIVER ... GICD_ISACTIVERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ISACTIVER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISACTIVER);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = rank->iactive;
@@ -244,7 +238,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ICACTIVER ... GICD_ICACTIVERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ICACTIVER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICACTIVER);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = rank->iactive;
@@ -296,7 +290,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_CPENDSGIR ... GICD_CPENDSGIRN:
         if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_CPENDSGIR);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_CPENDSGIR);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = byte_read(rank->pendsgi, dabt.sign, offset);
@@ -305,7 +299,7 @@ static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
 
     case GICD_SPENDSGIR ... GICD_SPENDSGIRN:
         if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_SPENDSGIR);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_SPENDSGIR);
         if ( rank == NULL) goto read_as_zero;
         vgic_lock_rank(v, rank);
         *r = byte_read(rank->pendsgi, dabt.sign, offset);
@@ -400,7 +394,7 @@ static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ISENABLER ... GICD_ISENABLERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ISENABLER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISENABLER);
         if ( rank == NULL) goto write_ignore;
         vgic_lock_rank(v, rank);
         tr = rank->ienable;
@@ -411,7 +405,7 @@ static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ICENABLER ... GICD_ICENABLERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ICENABLER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICENABLER);
         if ( rank == NULL) goto write_ignore;
         vgic_lock_rank(v, rank);
         rank->ienable &= ~*r;
@@ -432,7 +426,7 @@ static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ISACTIVER ... GICD_ISACTIVERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ISACTIVER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISACTIVER);
         if ( rank == NULL) goto write_ignore;
         vgic_lock_rank(v, rank);
         rank->iactive &= ~*r;
@@ -441,7 +435,7 @@ static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
 
     case GICD_ICACTIVER ... GICD_ICACTIVERN:
         if ( dabt.size != 2 ) goto bad_width;
-        rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ICACTIVER);
+        rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICACTIVER);
         if ( rank == NULL) goto write_ignore;
         vgic_lock_rank(v, rank);
         rank->iactive &= ~*r;
@@ -577,9 +571,9 @@ struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
 
 void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq, int virtual)
 {
-    int idx = irq >> 2, byte = irq & 0x3;
+    int byte = irq & 0x3;
     uint8_t priority;
-    struct vgic_irq_rank *rank = vgic_irq_rank(v, 8, idx);
+    struct vgic_irq_rank *rank = vgic_irq_rank(v, 1, irq / 32);
     struct pending_irq *iter, *n = irq_to_pending(v, irq);
     unsigned long flags;
 
@@ -587,7 +581,7 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq, int virtual)
     if (!list_empty(&n->inflight))
         return;
 
-    priority = byte_read(rank->ipriority[REG_RANK_INDEX(8, idx)], 0, byte);
+    priority = byte_read(rank->ipriority[REG_RANK_INDEX(8, irq / 4)], 0, byte);
 
     n->irq = irq;
     n->priority = priority;

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-26 18:42         ` Stefano Stabellini
@ 2012-10-26 20:47           ` Ian Campbell
  2012-10-27 10:09             ` Tim Deegan
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-10-26 20:47 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Fri, 2012-10-26 at 19:42 +0100, Stefano Stabellini wrote:
> I think that the problem is the usage of vgic_irq_rank with registers
> that have 1 bit per interrupt.

That's very plausible indeed.

> diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> index 3c3983f..92731b6 100644
> --- a/xen/arch/arm/vgic.c
> +++ b/xen/arch/arm/vgic.c
> @@ -42,13 +42,7 @@
>   */
>  static inline int REG_RANK_NR(int b, uint32_t n)
>  {
> -    switch ( b )
> -    {
> -    case 8: return n >> 3;
> -    case 4: return n >> 2;
> -    case 2: return n >> 1;
> -    default: BUG();
> -    }
> +    return n / b;

All the infrastructure will fall apart if b isn't a power of two, that's
why I used the switch. Can you just add the appropriate case 1 instead?
Probably the bug should be a call to an undefined function to make this
a compile time rather than runtime failure too.

> @@ -577,9 +571,9 @@ struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
>  
>  void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq, int virtual)

The changes to this function are all unnecessary, since it was already
consistent within itself. If you want to change it please add a helper
which takes an irq and hides the shifts.

I'd be inclined to rename the existing vgic_irq_rank to vgic_reg_rank
and then call the new helper vgic_irq_rank since that would better
reflect the purpose of both.

>  {
> -    int idx = irq >> 2, byte = irq & 0x3;
> +    int byte = irq & 0x3;

byte isn't actually needed either.

>      uint8_t priority;
> -    struct vgic_irq_rank *rank = vgic_irq_rank(v, 8, idx);
> +    struct vgic_irq_rank *rank = vgic_irq_rank(v, 1, irq / 32);
>      struct pending_irq *iter, *n = irq_to_pending(v, irq);
>      unsigned long flags;
>  
> @@ -587,7 +581,7 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq, int virtual)
>      if (!list_empty(&n->inflight))
>          return;
>  
> -    priority = byte_read(rank->ipriority[REG_RANK_INDEX(8, idx)], 0, byte);
> +    priority = byte_read(rank->ipriority[REG_RANK_INDEX(8, irq / 4)], 0, byte);
>  
>      n->irq = irq;
>      n->priority = priority;

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-26 20:47           ` Ian Campbell
@ 2012-10-27 10:09             ` Tim Deegan
  2012-10-27 10:44               ` Ian Campbell
  2012-11-13 11:57               ` Stefano Stabellini
  0 siblings, 2 replies; 61+ messages in thread
From: Tim Deegan @ 2012-10-27 10:09 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Stefano Stabellini

At 21:47 +0100 on 26 Oct (1351288040), Ian Campbell wrote:
> On Fri, 2012-10-26 at 19:42 +0100, Stefano Stabellini wrote:
> > I think that the problem is the usage of vgic_irq_rank with registers
> > that have 1 bit per interrupt.
> 
> That's very plausible indeed.
> 
> > diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> > index 3c3983f..92731b6 100644
> > --- a/xen/arch/arm/vgic.c
> > +++ b/xen/arch/arm/vgic.c
> > @@ -42,13 +42,7 @@
> >   */
> >  static inline int REG_RANK_NR(int b, uint32_t n)
> >  {
> > -    switch ( b )
> > -    {
> > -    case 8: return n >> 3;
> > -    case 4: return n >> 2;
> > -    case 2: return n >> 1;
> > -    default: BUG();
> > -    }
> > +    return n / b;
> 
> All the infrastructure will fall apart if b isn't a power of two, that's
> why I used the switch. Can you just add the appropriate case 1 instead?
> Probably the bug should be a call to an undefined function to make this
> a compile time rather than runtime failure too.

BUILD_BUG_ON((b & -b) != b); ?

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-26 18:40                 ` Stefano Stabellini
@ 2012-10-27 10:44                   ` Tim Deegan
  2012-10-27 11:54                     ` Tim Deegan
                                       ` (2 more replies)
  0 siblings, 3 replies; 61+ messages in thread
From: Tim Deegan @ 2012-10-27 10:44 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 19:40 +0100 on 26 Oct (1351280413), Stefano Stabellini wrote:
> > No!  It's always safe to flush the entire line -- regardless of what
> > other writes might be happening to it.  After all, the cache controller
> > might evict that line on its own at any time, so nothing can be relying
> > on its _not_ being flushed.
> > 
> > The only problem with not knowing how big a cacheline is is this: if the
> > object is _bigger_ than a cache line we might use one DCCMVAC where more
> > than one is needed.  We can avoid that by a compile-time check on some
> > sort of architectural minimum cacheline size.
>  
> If we want to do that then we need to pass a size argument to
> flush_xen_dcache_va and have a loop in the function, each iteration
> flushing a cacheline:
> 
> static inline void flush_xen_dcache_va(void *p, unsigned long size)
> {
>     unsigned long cacheline_size = READ_CP32(CCSIDR);
>     unsigned long i;
>     for (i = 0; i < size; i += cacheline_size, p += cacheline_size) {
>         asm volatile (
>             "dsb;"
>             STORE_CP32(0, DCCMVAC)
>             "dsb;"
>             : : "r" (p));
>     }
> }

I think we should have two functions.  One should look almost like that
and be for flushing large ranges, and one much simpler for flushing
small items.  Like this (totally untested, uncompiled even):

 #define MIN_CACHELINE_BYTES 32 // or whatever

 /* In setup.c somewhere. */
 if ( READ_CP32(CCSIDR) < MIN_CACHELINE_BYTES )
     panic("CPU has preposterously small cache lines");

 /* Function for flushing medium-sized areas.
  * if 'range' is large enough we might want to use model-specific
  * full-cache flushes. */
 static inline void flush_xen_dcache_va_range(void *p, unsigned long size)
 {
     void *end;
     unsigned long cacheline_bytes = READ_CP32(CCSIDR);
     barrier();       /* So the compiler issues all writes to the range */
     dsb();           /* So the CPU issues all writes to the range */ 
     for ( end = p + size; p < end; p += cacheline_bytes )
         WRITE_CP32(DCCMVAC, p);
     dsb();           /* So we know the flushes happen before continuing */
 }

 /* Macro for flushing a single small item.  The predicate is always 
  * compile-time constant so this will compile down to 3 instructions in
  * the common case.  Make sure to call it with the correct type of
  * pointer! */
 #define flush_xen_dcache_va(p) do {                       \
     typeof(p) _p = (p);                                   \
     if ( (sizeof *_p) > MIN_CACHELINE_BYTES )             \
         flush_xen_dcache_va_range(_p, sizeof *_p);        \
     else                                                  \
         asm volatile (                                    \
             "dsb;"   /* Finish all earlier writes */      \
             STORE_CP32(0, DCCMVAC)                        \
             "dsb;"   /* Finish flush before continuing */ \
             : : "r" (_p), "m" (*_p));                     \
 } while (0)

What do you think?

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-27 10:09             ` Tim Deegan
@ 2012-10-27 10:44               ` Ian Campbell
  2012-11-13 11:57               ` Stefano Stabellini
  1 sibling, 0 replies; 61+ messages in thread
From: Ian Campbell @ 2012-10-27 10:44 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Stefano Stabellini

On Sat, 2012-10-27 at 11:09 +0100, Tim Deegan wrote:
> At 21:47 +0100 on 26 Oct (1351288040), Ian Campbell wrote:
> > All the infrastructure will fall apart if b isn't a power of two, that's
> > why I used the switch. Can you just add the appropriate case 1 instead?
> > Probably the bug should be a call to an undefined function to make this
> > a compile time rather than runtime failure too.
> 
> BUILD_BUG_ON((b & -b) != b); ?

Sure.

Ian.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-27 10:44                   ` Tim Deegan
@ 2012-10-27 11:54                     ` Tim Deegan
  2012-10-29  9:53                       ` Stefano Stabellini
  2012-10-29  9:52                     ` Stefano Stabellini
  2012-11-13 12:01                     ` Stefano Stabellini
  2 siblings, 1 reply; 61+ messages in thread
From: Tim Deegan @ 2012-10-27 11:54 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 11:44 +0100 on 27 Oct (1351338268), Tim Deegan wrote:
>  /* Function for flushing medium-sized areas.
>   * if 'range' is large enough we might want to use model-specific
>   * full-cache flushes. */
>  static inline void flush_xen_dcache_va_range(void *p, unsigned long size)
>  {
>      void *end;
>      unsigned long cacheline_bytes = READ_CP32(CCSIDR);
>      barrier();       /* So the compiler issues all writes to the range */
>      dsb();           /* So the CPU issues all writes to the range */ 

Oh - I just noticed that the way we've defined dsb() it includes a
memory clobber.   So I guess we don't need barrier() as well there.

We might want to look at the other users of dsb() and see if we want to
drop the memory clobber from it as well.  But OTOH we may be getting
way into premature optimization already. :)

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-27 10:44                   ` Tim Deegan
  2012-10-27 11:54                     ` Tim Deegan
@ 2012-10-29  9:52                     ` Stefano Stabellini
  2012-11-13 12:01                     ` Stefano Stabellini
  2 siblings, 0 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-29  9:52 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Sat, 27 Oct 2012, Tim Deegan wrote:
> At 19:40 +0100 on 26 Oct (1351280413), Stefano Stabellini wrote:
> > > No!  It's always safe to flush the entire line -- regardless of what
> > > other writes might be happening to it.  After all, the cache controller
> > > might evict that line on its own at any time, so nothing can be relying
> > > on its _not_ being flushed.
> > > 
> > > The only problem with not knowing how big a cacheline is is this: if the
> > > object is _bigger_ than a cache line we might use one DCCMVAC where more
> > > than one is needed.  We can avoid that by a compile-time check on some
> > > sort of architectural minimum cacheline size.
> >  
> > If we want to do that then we need to pass a size argument to
> > flush_xen_dcache_va and have a loop in the function, each iteration
> > flushing a cacheline:
> > 
> > static inline void flush_xen_dcache_va(void *p, unsigned long size)
> > {
> >     unsigned long cacheline_size = READ_CP32(CCSIDR);
> >     unsigned long i;
> >     for (i = 0; i < size; i += cacheline_size, p += cacheline_size) {
> >         asm volatile (
> >             "dsb;"
> >             STORE_CP32(0, DCCMVAC)
> >             "dsb;"
> >             : : "r" (p));
> >     }
> > }
> 
> I think we should have two functions.  One should look almost like that
> and be for flushing large ranges, and one much simpler for flushing
> small items.  Like this (totally untested, uncompiled even):
> 
>  #define MIN_CACHELINE_BYTES 32 // or whatever
> 
>  /* In setup.c somewhere. */
>  if ( READ_CP32(CCSIDR) < MIN_CACHELINE_BYTES )
>      panic("CPU has preposterously small cache lines");
> 
>  /* Function for flushing medium-sized areas.
>   * if 'range' is large enough we might want to use model-specific
>   * full-cache flushes. */
>  static inline void flush_xen_dcache_va_range(void *p, unsigned long size)
>  {
>      void *end;
>      unsigned long cacheline_bytes = READ_CP32(CCSIDR);
>      barrier();       /* So the compiler issues all writes to the range */
>      dsb();           /* So the CPU issues all writes to the range */ 
>      for ( end = p + size; p < end; p += cacheline_bytes )
>          WRITE_CP32(DCCMVAC, p);
>      dsb();           /* So we know the flushes happen before continuing */
>  }
> 
>  /* Macro for flushing a single small item.  The predicate is always 
>   * compile-time constant so this will compile down to 3 instructions in
>   * the common case.  Make sure to call it with the correct type of
>   * pointer! */
>  #define flush_xen_dcache_va(p) do {                       \
>      typeof(p) _p = (p);                                   \
>      if ( (sizeof *_p) > MIN_CACHELINE_BYTES )             \
>          flush_xen_dcache_va_range(_p, sizeof *_p);        \
>      else                                                  \
>          asm volatile (                                    \
>              "dsb;"   /* Finish all earlier writes */      \
>              STORE_CP32(0, DCCMVAC)                        \
>              "dsb;"   /* Finish flush before continuing */ \
>              : : "r" (_p), "m" (*_p));                     \
>  } while (0)
> 
> What do you think?
 
I think that's OK, the macro doesn't look as bad as I thought it would
:)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-27 11:54                     ` Tim Deegan
@ 2012-10-29  9:53                       ` Stefano Stabellini
  0 siblings, 0 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-10-29  9:53 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Sat, 27 Oct 2012, Tim Deegan wrote:
> At 11:44 +0100 on 27 Oct (1351338268), Tim Deegan wrote:
> >  /* Function for flushing medium-sized areas.
> >   * if 'range' is large enough we might want to use model-specific
> >   * full-cache flushes. */
> >  static inline void flush_xen_dcache_va_range(void *p, unsigned long size)
> >  {
> >      void *end;
> >      unsigned long cacheline_bytes = READ_CP32(CCSIDR);
> >      barrier();       /* So the compiler issues all writes to the range */
> >      dsb();           /* So the CPU issues all writes to the range */ 
> 
> Oh - I just noticed that the way we've defined dsb() it includes a
> memory clobber.   So I guess we don't need barrier() as well there.
> 
> We might want to look at the other users of dsb() and see if we want to
> drop the memory clobber from it as well.

> But OTOH we may be getting way into premature optimization already. :)
 
My thoughts exactly.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-10-27 10:09             ` Tim Deegan
  2012-10-27 10:44               ` Ian Campbell
@ 2012-11-13 11:57               ` Stefano Stabellini
  2012-11-13 12:00                 ` Ian Campbell
  1 sibling, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-11-13 11:57 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Sat, 27 Oct 2012, Tim Deegan wrote:
> At 21:47 +0100 on 26 Oct (1351288040), Ian Campbell wrote:
> > On Fri, 2012-10-26 at 19:42 +0100, Stefano Stabellini wrote:
> > > I think that the problem is the usage of vgic_irq_rank with registers
> > > that have 1 bit per interrupt.
> > 
> > That's very plausible indeed.
> > 
> > > diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> > > index 3c3983f..92731b6 100644
> > > --- a/xen/arch/arm/vgic.c
> > > +++ b/xen/arch/arm/vgic.c
> > > @@ -42,13 +42,7 @@
> > >   */
> > >  static inline int REG_RANK_NR(int b, uint32_t n)
> > >  {
> > > -    switch ( b )
> > > -    {
> > > -    case 8: return n >> 3;
> > > -    case 4: return n >> 2;
> > > -    case 2: return n >> 1;
> > > -    default: BUG();
> > > -    }
> > > +    return n / b;
> > 
> > All the infrastructure will fall apart if b isn't a power of two, that's
> > why I used the switch. Can you just add the appropriate case 1 instead?
> > Probably the bug should be a call to an undefined function to make this
> > a compile time rather than runtime failure too.
> 
> BUILD_BUG_ON((b & -b) != b); ?

I can't do that:

error: expression in static assertion is not constant

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-11-13 11:57               ` Stefano Stabellini
@ 2012-11-13 12:00                 ` Ian Campbell
  2012-11-13 12:23                   ` Stefano Stabellini
  0 siblings, 1 reply; 61+ messages in thread
From: Ian Campbell @ 2012-11-13 12:00 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Tue, 2012-11-13 at 11:57 +0000, Stefano Stabellini wrote:
> On Sat, 27 Oct 2012, Tim Deegan wrote:
> > At 21:47 +0100 on 26 Oct (1351288040), Ian Campbell wrote:
> > > On Fri, 2012-10-26 at 19:42 +0100, Stefano Stabellini wrote:
> > > > I think that the problem is the usage of vgic_irq_rank with registers
> > > > that have 1 bit per interrupt.
> > > 
> > > That's very plausible indeed.
> > > 
> > > > diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> > > > index 3c3983f..92731b6 100644
> > > > --- a/xen/arch/arm/vgic.c
> > > > +++ b/xen/arch/arm/vgic.c
> > > > @@ -42,13 +42,7 @@
> > > >   */
> > > >  static inline int REG_RANK_NR(int b, uint32_t n)
> > > >  {
> > > > -    switch ( b )
> > > > -    {
> > > > -    case 8: return n >> 3;
> > > > -    case 4: return n >> 2;
> > > > -    case 2: return n >> 1;
> > > > -    default: BUG();
> > > > -    }
> > > > +    return n / b;
> > > 
> > > All the infrastructure will fall apart if b isn't a power of two, that's
> > > why I used the switch. Can you just add the appropriate case 1 instead?
> > > Probably the bug should be a call to an undefined function to make this
> > > a compile time rather than runtime failure too.
> > 
> > BUILD_BUG_ON((b & -b) != b); ?
> 
> I can't do that:
> 
> error: expression in static assertion is not constant

I suppose you could make REG_RANK_NR a macro (as it's name implies it
ought to be, oops!).

If not then given that b must be in {1,2,4,8} maybe the switch is
ok/tolerable? If we end up supporting b=16 or 32 then more thought is
probably required around the place anyway.

Ian.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-10-27 10:44                   ` Tim Deegan
  2012-10-27 11:54                     ` Tim Deegan
  2012-10-29  9:52                     ` Stefano Stabellini
@ 2012-11-13 12:01                     ` Stefano Stabellini
  2012-11-13 12:15                       ` Tim Deegan
  2 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-11-13 12:01 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Ian Campbell, Stefano Stabellini

On Sat, 27 Oct 2012, Tim Deegan wrote:
> At 19:40 +0100 on 26 Oct (1351280413), Stefano Stabellini wrote:
> > > No!  It's always safe to flush the entire line -- regardless of what
> > > other writes might be happening to it.  After all, the cache controller
> > > might evict that line on its own at any time, so nothing can be relying
> > > on its _not_ being flushed.
> > > 
> > > The only problem with not knowing how big a cacheline is is this: if the
> > > object is _bigger_ than a cache line we might use one DCCMVAC where more
> > > than one is needed.  We can avoid that by a compile-time check on some
> > > sort of architectural minimum cacheline size.
> >  
> > If we want to do that then we need to pass a size argument to
> > flush_xen_dcache_va and have a loop in the function, each iteration
> > flushing a cacheline:
> > 
> > static inline void flush_xen_dcache_va(void *p, unsigned long size)
> > {
> >     unsigned long cacheline_size = READ_CP32(CCSIDR);
> >     unsigned long i;
> >     for (i = 0; i < size; i += cacheline_size, p += cacheline_size) {
> >         asm volatile (
> >             "dsb;"
> >             STORE_CP32(0, DCCMVAC)
> >             "dsb;"
> >             : : "r" (p));
> >     }
> > }
> 
> I think we should have two functions.  One should look almost like that
> and be for flushing large ranges, and one much simpler for flushing
> small items.  Like this (totally untested, uncompiled even):
> 
>  #define MIN_CACHELINE_BYTES 32 // or whatever
> 
>  /* In setup.c somewhere. */
>  if ( READ_CP32(CCSIDR) < MIN_CACHELINE_BYTES )
>      panic("CPU has preposterously small cache lines");
> 
>  /* Function for flushing medium-sized areas.
>   * if 'range' is large enough we might want to use model-specific
>   * full-cache flushes. */
>  static inline void flush_xen_dcache_va_range(void *p, unsigned long size)
>  {
>      void *end;
>      unsigned long cacheline_bytes = READ_CP32(CCSIDR);
>      barrier();       /* So the compiler issues all writes to the range */
>      dsb();           /* So the CPU issues all writes to the range */ 
>      for ( end = p + size; p < end; p += cacheline_bytes )
>          WRITE_CP32(DCCMVAC, p);
>      dsb();           /* So we know the flushes happen before continuing */
>  }
> 
>  /* Macro for flushing a single small item.  The predicate is always 
>   * compile-time constant so this will compile down to 3 instructions in
>   * the common case.  Make sure to call it with the correct type of
>   * pointer! */
>  #define flush_xen_dcache_va(p) do {                       \
>      typeof(p) _p = (p);                                   \
>      if ( (sizeof *_p) > MIN_CACHELINE_BYTES )             \
>          flush_xen_dcache_va_range(_p, sizeof *_p);        \
>      else                                                  \
>          asm volatile (                                    \
>              "dsb;"   /* Finish all earlier writes */      \
>              STORE_CP32(0, DCCMVAC)                        \
>              "dsb;"   /* Finish flush before continuing */ \
>              : : "r" (_p), "m" (*_p));                     \
>  } while (0)
> 
> What do you think?

I think that is OK, but I would like to avoid reading CCSIDR every
single time we need to do a dcache flush, I don't know how slow that
coprocessor read is but I wouldn't want to have to find out :)

I am thinking of introducing a global variable to hold the cacheline
size and initialize it in start_xen.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate
  2012-11-13 12:01                     ` Stefano Stabellini
@ 2012-11-13 12:15                       ` Tim Deegan
  0 siblings, 0 replies; 61+ messages in thread
From: Tim Deegan @ 2012-11-13 12:15 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Ian Campbell

At 12:01 +0000 on 13 Nov (1352808094), Stefano Stabellini wrote:
> > I think we should have two functions.  One should look almost like that
> > and be for flushing large ranges, and one much simpler for flushing
> > small items.  Like this (totally untested, uncompiled even):
> > 
> >  #define MIN_CACHELINE_BYTES 32 // or whatever
> > 
> >  /* In setup.c somewhere. */
> >  if ( READ_CP32(CCSIDR) < MIN_CACHELINE_BYTES )
> >      panic("CPU has preposterously small cache lines");
> > 
> >  /* Function for flushing medium-sized areas.
> >   * if 'range' is large enough we might want to use model-specific
> >   * full-cache flushes. */
> >  static inline void flush_xen_dcache_va_range(void *p, unsigned long size)
> >  {
> >      void *end;
> >      unsigned long cacheline_bytes = READ_CP32(CCSIDR);
> >      barrier();       /* So the compiler issues all writes to the range */
> >      dsb();           /* So the CPU issues all writes to the range */ 
> >      for ( end = p + size; p < end; p += cacheline_bytes )
> >          WRITE_CP32(DCCMVAC, p);
> >      dsb();           /* So we know the flushes happen before continuing */
> >  }
> > 
> >  /* Macro for flushing a single small item.  The predicate is always 
> >   * compile-time constant so this will compile down to 3 instructions in
> >   * the common case.  Make sure to call it with the correct type of
> >   * pointer! */
> >  #define flush_xen_dcache_va(p) do {                       \
> >      typeof(p) _p = (p);                                   \
> >      if ( (sizeof *_p) > MIN_CACHELINE_BYTES )             \
> >          flush_xen_dcache_va_range(_p, sizeof *_p);        \
> >      else                                                  \
> >          asm volatile (                                    \
> >              "dsb;"   /* Finish all earlier writes */      \
> >              STORE_CP32(0, DCCMVAC)                        \
> >              "dsb;"   /* Finish flush before continuing */ \
> >              : : "r" (_p), "m" (*_p));                     \
> >  } while (0)
> > 
> > What do you think?
> 
> I think that is OK, but I would like to avoid reading CCSIDR every
> single time we need to do a dcache flush

The code above only reads it once for each large dcache flush.  When I
was writing it I did think of just stashing cacheline_bytes in a
read_mostly somewhere, but I had the opposite concern -- wouldn't
reading this constant from on-chip be quicker than going to the memory
bus for it? :)

I'm happy either way.

Tim.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq
  2012-11-13 12:00                 ` Ian Campbell
@ 2012-11-13 12:23                   ` Stefano Stabellini
  0 siblings, 0 replies; 61+ messages in thread
From: Stefano Stabellini @ 2012-11-13 12:23 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel, Tim (Xen.org), Stefano Stabellini

On Tue, 13 Nov 2012, Ian Campbell wrote:
> On Tue, 2012-11-13 at 11:57 +0000, Stefano Stabellini wrote:
> > On Sat, 27 Oct 2012, Tim Deegan wrote:
> > > At 21:47 +0100 on 26 Oct (1351288040), Ian Campbell wrote:
> > > > On Fri, 2012-10-26 at 19:42 +0100, Stefano Stabellini wrote:
> > > > > I think that the problem is the usage of vgic_irq_rank with registers
> > > > > that have 1 bit per interrupt.
> > > > 
> > > > That's very plausible indeed.
> > > > 
> > > > > diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
> > > > > index 3c3983f..92731b6 100644
> > > > > --- a/xen/arch/arm/vgic.c
> > > > > +++ b/xen/arch/arm/vgic.c
> > > > > @@ -42,13 +42,7 @@
> > > > >   */
> > > > >  static inline int REG_RANK_NR(int b, uint32_t n)
> > > > >  {
> > > > > -    switch ( b )
> > > > > -    {
> > > > > -    case 8: return n >> 3;
> > > > > -    case 4: return n >> 2;
> > > > > -    case 2: return n >> 1;
> > > > > -    default: BUG();
> > > > > -    }
> > > > > +    return n / b;
> > > > 
> > > > All the infrastructure will fall apart if b isn't a power of two, that's
> > > > why I used the switch. Can you just add the appropriate case 1 instead?
> > > > Probably the bug should be a call to an undefined function to make this
> > > > a compile time rather than runtime failure too.
> > > 
> > > BUILD_BUG_ON((b & -b) != b); ?
> > 
> > I can't do that:
> > 
> > error: expression in static assertion is not constant
> 
> I suppose you could make REG_RANK_NR a macro (as it's name implies it
> ought to be, oops!).
> 

REG_RANK_NR is called by vgic_irq_rank, so that would need to become a
macro too


> If not then given that b must be in {1,2,4,8} maybe the switch is
> ok/tolerable? If we end up supporting b=16 or 32 then more thought is
> probably required around the place anyway.

yeah I think is OK

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-11-13 15:42 ` [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register Stefano Stabellini
@ 2012-11-15 10:27   ` Ian Campbell
  0 siblings, 0 replies; 61+ messages in thread
From: Ian Campbell @ 2012-11-15 10:27 UTC (permalink / raw)
  To: Stefano Stabellini; +Cc: xen-devel, Tim (Xen.org)

On Tue, 2012-11-13 at 15:42 +0000, Stefano Stabellini wrote:

> Changes in v2:
> - detect the processor ID and call a processor specific initialization
> function;

I presume this will become a table driven lookup at some point...

> 
> Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> Acked-by: Tim Deegan <tim@xen.org>

Applied, thanks.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register
  2012-11-13 15:40 [PATCH v2 " Stefano Stabellini
@ 2012-11-13 15:42 ` Stefano Stabellini
  2012-11-15 10:27   ` Ian Campbell
  0 siblings, 1 reply; 61+ messages in thread
From: Stefano Stabellini @ 2012-11-13 15:42 UTC (permalink / raw)
  To: xen-devel; +Cc: tim, Ian.Campbell, Stefano Stabellini

>From the Cortex A15 manual:

"Enables the processor to receive instruction cache, BTB, and TLB maintenance
operations from other processors"

...

"You must set this bit before enabling the caches and MMU, or
performing any cache and TLB maintenance operations. The only time
you must clear this bit is during a processor power-down sequence"


Changes in v2:
- detect the processor ID and call a processor specific initialization
function;
- move the ACTLR initialization to the CortexA15 initialization
function;
- move the ACTLR_* defines to processor-ca15.h.


Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Tim Deegan <tim@xen.org>
---
 xen/arch/arm/Makefile                |    1 +
 xen/arch/arm/head.S                  |   10 +++++++
 xen/arch/arm/proc-ca15.S             |   28 +++++++++++++++++++++
 xen/include/asm-arm/cpregs.h         |    1 +
 xen/include/asm-arm/processor-ca15.h |   45 ++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/processor.h      |    3 ++
 6 files changed, 88 insertions(+), 0 deletions(-)
 create mode 100644 xen/arch/arm/proc-ca15.S
 create mode 100644 xen/include/asm-arm/processor-ca15.h

diff --git a/xen/arch/arm/Makefile b/xen/arch/arm/Makefile
index 634b620..d3e34bc 100644
--- a/xen/arch/arm/Makefile
+++ b/xen/arch/arm/Makefile
@@ -13,6 +13,7 @@ obj-y += irq.o
 obj-y += kernel.o
 obj-y += mm.o
 obj-y += mode_switch.o
+obj-y += proc-ca15.o
 obj-y += p2m.o
 obj-y += percpu.o
 obj-y += guestcopy.o
diff --git a/xen/arch/arm/head.S b/xen/arch/arm/head.S
index 4506244..25993d6 100644
--- a/xen/arch/arm/head.S
+++ b/xen/arch/arm/head.S
@@ -19,6 +19,7 @@
 
 #include <asm/config.h>
 #include <asm/page.h>
+#include <asm/processor-ca15.h>
 #include <asm/asm_defns.h>
 
 #define PT_PT  0xe7f /* nG=1, AF=1, SH=10, AP=01, NS=1, ATTR=111, T=1, P=1 */
@@ -148,6 +149,15 @@ skip_bss:
 
 	PRINT("- Setting up control registers -\r\n")
 	
+	/* Read CPU ID */
+	mrc   CP32(r0, MIDR)
+	ldr   r1, =(MIDR_MASK)
+	and   r0, r0, r1
+	/* Is this a Cortex A15? */
+	ldr   r1, =(CORTEX_A15_ID)
+	teq   r0, r1
+	bleq  cortex_a15_init
+
 	/* Set up memory attribute type tables */
 	ldr   r0, =MAIR0VAL
 	ldr   r1, =MAIR1VAL
diff --git a/xen/arch/arm/proc-ca15.S b/xen/arch/arm/proc-ca15.S
new file mode 100644
index 0000000..5a5bf64
--- /dev/null
+++ b/xen/arch/arm/proc-ca15.S
@@ -0,0 +1,28 @@
+/*
+ * xen/arch/arm/proc-ca15.S
+ *
+ * Cortex A15 specific initializations
+ *
+ * Copyright (c) 2011 Citrix Systems.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <asm/asm_defns.h>
+#include <asm/processor-ca15.h>
+
+.globl cortex_a15_init
+cortex_a15_init:
+	/* Set up the SMP bit in ACTLR */
+	mrc   CP32(r0, ACTLR)
+	orr   r0, r0, #(ACTLR_CA15_SMP) /* enable SMP bit*/
+	mcr   CP32(r0, ACTLR)
+	mov   pc, lr
diff --git a/xen/include/asm-arm/cpregs.h b/xen/include/asm-arm/cpregs.h
index 34a9e93..3b51845 100644
--- a/xen/include/asm-arm/cpregs.h
+++ b/xen/include/asm-arm/cpregs.h
@@ -104,6 +104,7 @@
 /* Coprocessor 15 */
 
 /* CP15 CR0: CPUID and Cache Type Registers */
+#define MIDR            p15,0,c0,c0,0   /* Main ID Register */
 #define MPIDR           p15,0,c0,c0,5   /* Multiprocessor Affinity Register */
 #define ID_PFR0         p15,0,c0,c1,0   /* Processor Feature Register 0 */
 #define ID_PFR1         p15,0,c0,c1,1   /* Processor Feature Register 1 */
diff --git a/xen/include/asm-arm/processor-ca15.h b/xen/include/asm-arm/processor-ca15.h
new file mode 100644
index 0000000..86231b3
--- /dev/null
+++ b/xen/include/asm-arm/processor-ca15.h
@@ -0,0 +1,45 @@
+#ifndef __ASM_ARM_PROCESSOR_CA15_H
+#define __ASM_ARM_PROCESSOR_CA15_H
+
+
+#define CORTEX_A15_ID     (0x410FC0F0)
+
+/* ACTLR Auxiliary Control Register, Cortex A15 */
+#define ACTLR_CA15_SNOOP_DELAYED      (1<<31)
+#define ACTLR_CA15_MAIN_CLOCK         (1<<30)
+#define ACTLR_CA15_NEON_CLOCK         (1<<29)
+#define ACTLR_CA15_NONCACHE           (1<<24)
+#define ACTLR_CA15_INORDER_REQ        (1<<23)
+#define ACTLR_CA15_INORDER_LOAD       (1<<22)
+#define ACTLR_CA15_L2_TLB_PREFETCH    (1<<21)
+#define ACTLR_CA15_L2_IPA_PA_CACHE    (1<<20)
+#define ACTLR_CA15_L2_CACHE           (1<<19)
+#define ACTLR_CA15_L2_PA_CACHE        (1<<18)
+#define ACTLR_CA15_TLB                (1<<17)
+#define ACTLR_CA15_STRONGY_ORDERED    (1<<16)
+#define ACTLR_CA15_INORDER            (1<<15)
+#define ACTLR_CA15_FORCE_LIM          (1<<14)
+#define ACTLR_CA15_CP_FLUSH           (1<<13)
+#define ACTLR_CA15_CP_PUSH            (1<<12)
+#define ACTLR_CA15_LIM                (1<<11)
+#define ACTLR_CA15_SER                (1<<10)
+#define ACTLR_CA15_OPT                (1<<9)
+#define ACTLR_CA15_WFI                (1<<8)
+#define ACTLR_CA15_WFE                (1<<7)
+#define ACTLR_CA15_SMP                (1<<6)
+#define ACTLR_CA15_PLD                (1<<5)
+#define ACTLR_CA15_IP                 (1<<4)
+#define ACTLR_CA15_MICRO_BTB          (1<<3)
+#define ACTLR_CA15_LOOP_ONE           (1<<2)
+#define ACTLR_CA15_LOOP_DISABLE       (1<<1)
+#define ACTLR_CA15_BTB                (1<<0)
+
+#endif /* __ASM_ARM_PROCESSOR_CA15_H */
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index 3849b23..e0c0beb 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -3,6 +3,9 @@
 
 #include <asm/cpregs.h>
 
+/* MIDR Main ID Register */
+#define MIDR_MASK    0xff0ffff0
+
 /* TTBCR Translation Table Base Control Register */
 #define TTBCR_EAE    0x80000000
 #define TTBCR_N_MASK 0x07
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 61+ messages in thread

end of thread, other threads:[~2012-11-15 10:27 UTC | newest]

Thread overview: 61+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-10-24 15:03 [PATCH 0/7] xen/arm: run on real hardware Stefano Stabellini
2012-10-24 15:03 ` [PATCH 1/7] xen/arm: fix rank calculation in vgic_vcpu_inject_irq Stefano Stabellini
2012-10-25  9:27   ` Ian Campbell
2012-10-26 16:33     ` Stefano Stabellini
2012-10-26 16:40       ` Ian Campbell
2012-10-26 18:42         ` Stefano Stabellini
2012-10-26 20:47           ` Ian Campbell
2012-10-27 10:09             ` Tim Deegan
2012-10-27 10:44               ` Ian Campbell
2012-11-13 11:57               ` Stefano Stabellini
2012-11-13 12:00                 ` Ian Campbell
2012-11-13 12:23                   ` Stefano Stabellini
2012-10-24 15:03 ` [PATCH 2/7] xen/arm: setup the fixmap in head.S Stefano Stabellini
2012-10-24 15:27   ` Tim Deegan
2012-10-24 15:37     ` Stefano Stabellini
2012-10-25  9:33   ` Ian Campbell
2012-10-25 11:00     ` Stefano Stabellini
2012-10-24 15:03 ` [PATCH 3/7] pl011: set baud and clock_hz to the right defaults for Versatile Express Stefano Stabellini
2012-10-25  9:37   ` Ian Campbell
2012-10-25 10:57     ` Stefano Stabellini
2012-10-25 11:00       ` Ian Campbell
2012-10-24 15:03 ` [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register Stefano Stabellini
2012-10-25  9:52   ` Ian Campbell
2012-10-25 11:57     ` Stefano Stabellini
2012-10-25 12:04       ` Ian Campbell
2012-10-26  8:56         ` Tim Deegan
2012-10-26  8:59           ` Ian Campbell
2012-10-24 15:03 ` [PATCH 5/7] xen/arm: wake up secondary cpus Stefano Stabellini
2012-10-24 15:38   ` Tim Deegan
2012-10-24 15:59     ` Stefano Stabellini
2012-10-24 16:05       ` Tim Deegan
2012-10-25  9:59   ` Ian Campbell
2012-10-25 17:45     ` Stefano Stabellini
2012-10-26  7:30       ` Ian Campbell
2012-10-26 11:18         ` Stefano Stabellini
2012-10-26 12:16           ` Ian Campbell
2012-10-26 15:24             ` Stefano Stabellini
2012-10-26 15:32               ` Ian Campbell
2012-10-24 15:03 ` [PATCH 6/7] xen/arm: flush D-cache and I-cache when appropriate Stefano Stabellini
2012-10-24 15:59   ` Tim Deegan
2012-10-24 16:05     ` Ian Campbell
2012-10-24 16:17       ` Tim Deegan
2012-10-24 17:35     ` Stefano Stabellini
2012-10-26  9:01       ` Tim Deegan
2012-10-26 15:53         ` Stefano Stabellini
2012-10-26 15:55           ` Stefano Stabellini
2012-10-26 16:03             ` Stefano Stabellini
2012-10-26 16:55               ` Tim Deegan
2012-10-26 18:40                 ` Stefano Stabellini
2012-10-27 10:44                   ` Tim Deegan
2012-10-27 11:54                     ` Tim Deegan
2012-10-29  9:53                       ` Stefano Stabellini
2012-10-29  9:52                     ` Stefano Stabellini
2012-11-13 12:01                     ` Stefano Stabellini
2012-11-13 12:15                       ` Tim Deegan
2012-10-26 16:45           ` Tim Deegan
2012-10-24 15:03 ` [PATCH 7/7] xen/arm: get the number of cpus from device tree Stefano Stabellini
2012-10-25 10:02   ` Ian Campbell
2012-10-24 16:02 ` [PATCH 0/7] xen/arm: run on real hardware Tim Deegan
2012-11-13 15:40 [PATCH v2 " Stefano Stabellini
2012-11-13 15:42 ` [PATCH 4/7] xen/arm: set the SMP bit in the ACTLR register Stefano Stabellini
2012-11-15 10:27   ` Ian Campbell

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.