[U-Boot] [Patch v2 3/5] armv8/fsl-lsch3: Release secondary cores from boot hold off with Boot Page

From: York Sun <yorksun@freescale.com>
To: u-boot@lists.denx.de
Subject: [U-Boot] [Patch v2 3/5] armv8/fsl-lsch3: Release secondary cores from boot hold off with Boot Page
Date: Thu, 21 Aug 2014 11:32:53 -0700	[thread overview]
Message-ID: <53F63B55.3040900@freescale.com> (raw)
In-Reply-To: <20140821134724.GM21734@leverpostej>

On 08/21/2014 06:47 AM, Mark Rutland wrote:
> Hi York,
> 
> I have mostly minor comments this time; this is looking pretty good.
> 
> On Tue, Aug 19, 2014 at 09:28:00PM +0100, York Sun wrote:
>> Secondary cores need to be released from holdoff by boot release
>> registers. With GPP bootrom, they can boot from main memory
>> directly. Individual spin table is used for each core. If a single
>> release address is needed, defining macro CONFIG_FSL_SMP_RELEASE_ALL
>> will use the CPU_RELEASE_ADDR. Spin table and the boot page is reserved
>> in device tree so OS won't overwrite.
>>
>> Signed-off-by: York Sun <yorksun@freescale.com>
>> Signed-off-by: Arnab Basu <arnab.basu@freescale.com>
>> ---
>>  v2: Removed copying boot page. Use u-boot image as is in memory.
>>      Added dealing with different size of addr_cell in device tree.
>>      Added dealing with big- and little-endian.
>>      Added flushing spin table after cpu_release().
>>
>>  arch/arm/cpu/armv8/fsl-lsch3/Makefile             |    2 +
>>  arch/arm/cpu/armv8/fsl-lsch3/cpu.c                |   13 ++
>>  arch/arm/cpu/armv8/fsl-lsch3/cpu.h                |    1 +
>>  arch/arm/cpu/armv8/fsl-lsch3/fdt.c                |   56 +++++++
>>  arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S           |  128 ++++++++++++---
>>  arch/arm/cpu/armv8/fsl-lsch3/mp.c                 |  172 +++++++++++++++++++++
>>  arch/arm/cpu/armv8/fsl-lsch3/mp.h                 |   36 +++++
>>  arch/arm/cpu/armv8/transition.S                   |   63 +-------
>>  arch/arm/include/asm/arch-fsl-lsch3/config.h      |    3 +-
>>  arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h |   35 +++++
>>  arch/arm/include/asm/macro.h                      |   92 +++++++++++
>>  arch/arm/lib/gic_64.S                             |   10 +-
>>  common/board_f.c                                  |    2 +-
>>  13 files changed, 525 insertions(+), 88 deletions(-)
>>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c
>>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c
>>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h
>>
>> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/Makefile b/arch/arm/cpu/armv8/fsl-lsch3/Makefile
>> index 9249537..f920eeb 100644
>> --- a/arch/arm/cpu/armv8/fsl-lsch3/Makefile
>> +++ b/arch/arm/cpu/armv8/fsl-lsch3/Makefile
>> @@ -7,3 +7,5 @@
>>  obj-y += cpu.o
>>  obj-y += lowlevel.o
>>  obj-y += speed.o
>> +obj-$(CONFIG_MP) += mp.o
>> +obj-$(CONFIG_OF_LIBFDT) += fdt.o
>> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
>> index c129d03..47b947f 100644
>> --- a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
>> +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
>> @@ -11,6 +11,7 @@
>>  #include <asm/io.h>
>>  #include <asm/arch-fsl-lsch3/immap_lsch3.h>
>>  #include "cpu.h"
>> +#include "mp.h"
>>  #include "speed.h"
>>  #include <fsl_mc.h>
>>
>> @@ -434,3 +435,15 @@ int cpu_eth_init(bd_t *bis)
>>  #endif
>>         return error;
>>  }
>> +
>> +
>> +int arch_early_init_r(void)
>> +{
>> +       int rv;
>> +       rv = fsl_lsch3_wake_seconday_cores();
>> +
>> +       if (rv)
>> +               printf("Did not wake secondary cores\n");
>> +
>> +       return 0;
>> +}
>> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.h b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
>> index 28544d7..2e3312b 100644
>> --- a/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
>> +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
>> @@ -5,3 +5,4 @@
>>   */
>>
>>  int fsl_qoriq_core_to_cluster(unsigned int core);
>> +u32 cpu_mask(void);
>> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
>> new file mode 100644
>> index 0000000..2dbcdcb
>> --- /dev/null
>> +++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
>> @@ -0,0 +1,56 @@
>> +/*
>> + * Copyright 2014 Freescale Semiconductor, Inc.
>> + *
>> + * SPDX-License-Identifier:    GPL-2.0+
>> + */
>> +
>> +#include <common.h>
>> +#include <libfdt.h>
>> +#include <fdt_support.h>
>> +#include "mp.h"
>> +
>> +#ifdef CONFIG_MP
>> +void ft_fixup_cpu(void *blob)
>> +{
>> +       int off;
>> +       __maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
>> +       fdt32_t *reg;
>> +       int addr_cells;
>> +       u64 val;
>> +       size_t *boot_code_size = &(__secondary_boot_code_size);
>> +
>> +       off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpus", 4);
> 
> I didn't think /cpus had device_type = "cpus". I can't see any
> instances in any DTs I have to hand. Can we not find /cpus by path?

I will let Arnab to comment on this. He is coordinating with Linux device tree.

> 
>> +       of_bus_default_count_cells(blob, off, &addr_cells, NULL);
>> +
>> +       off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
>> +       while (off != -FDT_ERR_NOTFOUND) {
>> +               reg = (fdt32_t *)fdt_getprop(blob, off, "reg", 0);
>> +               if (reg) {
>> +                       val = spin_tbl_addr;
>> +#ifndef CONFIG_FSL_SMP_RELEASE_ALL
>> +                       val += id_to_core(of_read_number(reg, addr_cells))
>> +                               * SPIN_TABLE_ELEM_SIZE;
>> +#endif
>> +                       val = cpu_to_fdt64(val);
>> +                       fdt_setprop_string(blob, off, "enable-method",
>> +                                          "spin-table");
>> +                       fdt_setprop(blob, off, "cpu-release-addr",
>> +                                   &val, sizeof(val));
>> +               } else {
>> +                       puts("cpu NULL\n");
> 
> Could we change that to "Warning: found cpu node without reg property"
> or something like that which makes the problem clear?

Yes, we can.

> 
>> +               }
>> +               off = fdt_node_offset_by_prop_value(blob, off, "device_type",
>> +                                                   "cpu", 4);
>> +       }
>> +
>> +       fdt_add_mem_rsv(blob, (uintptr_t)&secondary_boot_code,
>> +                       *boot_code_size);
>> +}
>> +#endif
>> +
>> +void ft_cpu_setup(void *blob, bd_t *bd)
>> +{
>> +#ifdef CONFIG_MP
>> +       ft_fixup_cpu(blob);
>> +#endif
>> +}
>> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
>> index ad32b6c..629e6d4 100644
>> --- a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
>> +++ b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
>> @@ -8,7 +8,9 @@
>>
>>  #include <config.h>
>>  #include <linux/linkage.h>
>> +#include <asm/gic.h>
>>  #include <asm/macro.h>
>> +#include "mp.h"
>>
>>  ENTRY(lowlevel_init)
>>         mov     x29, lr                 /* Save LR */
>> @@ -37,29 +39,119 @@ ENTRY(lowlevel_init)
>>
>>         branch_if_master x0, x1, 1f
>>
>> +       ldr     x0, =secondary_boot_func
>> +       blr     x0
>> +
>> +1:
>> +2:
> 
> Isn't the '2' label redundant?

We have some internal code dealing with trust zone between the 1 and 2. It is
not likely to be used in long term since we are trying to move them into
security monitor. I can drop label 2 here.

> 
>> +       mov     lr, x29                 /* Restore LR */
>> +       ret
>> +ENDPROC(lowlevel_init)
>> +
>> +       /* Keep literals not used by the secondary boot code outside it */
>> +       .ltorg
>> +
>> +       /* Using 64 bit alignment since the spin table is accessed as data */
>> +       .align 4
>> +       .global secondary_boot_code
>> +       /* Secondary Boot Code starts here */
>> +secondary_boot_code:
>> +       .global __spin_table
>> +__spin_table:
>> +       .space CONFIG_MAX_CPUS*SPIN_TABLE_ELEM_SIZE
>> +
>> +       .align 2
>> +ENTRY(secondary_boot_func)
>>         /*
>> -        * Slave should wait for master clearing spin table.
>> -        * This sync prevent salves observing incorrect
>> -        * value of spin table and jumping to wrong place.
>> +        * MPIDR_EL1 Fields:
>> +        * MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
>> +        * MPIDR[7:2] = AFF0_RES
>> +        * MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
>> +        * MPIDR[23:16] = AFF2_CLUSTERID
>> +        * MPIDR[24] = MT
>> +        * MPIDR[29:25] = RES0
>> +        * MPIDR[30] = U
>> +        * MPIDR[31] = ME
>> +        * MPIDR[39:32] = AFF3
>> +        *
>> +        * Linear Processor ID (LPID) calculation from MPIDR_EL1:
>> +        * (We only use AFF0_CPUID and AFF1_CLUSTERID for now
>> +        * until AFF2_CLUSTERID and AFF3 have non-zero values)
>> +        *
>> +        * LPID = MPIDR[15:8] | MPIDR[1:0]
>>          */
>> -#if defined(CONFIG_GICV2) || defined(CONFIG_GICV3)
>> -#ifdef CONFIG_GICV2
>> -       ldr     x0, =GICC_BASE
>> +       mrs     x0, mpidr_el1
>> +       ubfm    x1, x0, #8, #15
>> +       ubfm    x2, x0, #0, #1
>> +       orr     x10, x2, x1, lsl #2     /* x10 has LPID */
>> +       ubfm    x9, x0, #0, #15         /* x9 contains MPIDR[15:0] */
>> +       /*
>> +        * offset of the spin table element for this core from start of spin
>> +        * table (each elem is padded to 64 bytes)
>> +        */
>> +       lsl     x1, x10, #6
>> +       ldr     x0, =__spin_table
>> +       /* physical address of this cpus spin table element */
>> +       add     x11, x1, x0
>> +
>> +       str     x9, [x11, #16]  /* LPID */
>> +       mov     x4, #1
>> +       str     x4, [x11, #8]   /* STATUS */
>> +       dsb     sy
>> +#if defined(CONFIG_GICV3)
>> +       gic_wait_for_interrupt_m x0
>>  #endif
>> -       bl      gic_wait_for_interrupt
> 
> Why do we only need to wait for GICv3? We previously did so for GICv2.

I think this is leftover from the attempt to boot all cores simultaneously. I
will let Arnab to comment.

> 
>> +
>> +       bl secondary_switch_to_el2
>> +#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
>> +       secondary_switch_to_el1
>>  #endif
> 
> Missing bl?

Looks so.

> 
> Is there any reason to have the SWITCH_TO_EL1 option other than for
> debugging?

Good question. I will let Arnab to comment here.

> 
> EL2 is the preferred EL to boot at for Linux and Xen (it gives far more
> flexibility), and if dropping to EL1 is necessary I think it would make
> more sense as a run-time option than a compile-time option.
> 
>>
>> -       /*
>> -        * All processors will enter EL2 and optionally EL1.
>> +slave_cpu:
>> +       wfe
>> +#ifdef CONFIG_FSL_SMP_RELEASE_ALL
>> +       /* All cores are released from the address in the 1st spin table
>> +        * element
>>          */
>> -       bl      armv8_switch_to_el2
>> -#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
>> -       bl      armv8_switch_to_el1
>> +       ldr     x1, =__spin_table
>> +       ldr     x0, [x1]
>> +#else
>> +       ldr     x0, [x11]
>> +#endif
>> +       cbz     x0, slave_cpu
> 
> Similarly is there any reason to have the option of a single release
> addr if we can support unique addresses?

I think it was used by Linux for some ARM parts. I personally not a fun of using
single release. But if it makes everyone happy, I can keep it.

> 
> [...]
> 
>> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/mp.c b/arch/arm/cpu/armv8/fsl-lsch3/mp.c
>> new file mode 100644
>> index 0000000..b29eda9
>> --- /dev/null
>> +++ b/arch/arm/cpu/armv8/fsl-lsch3/mp.c
>> @@ -0,0 +1,172 @@
>> +/*
>> + * Copyright 2014 Freescale Semiconductor, Inc.
>> + *
>> + * SPDX-License-Identifier:    GPL-2.0+
>> + */
>> +
>> +#include <common.h>
>> +#include <asm/io.h>
>> +#include <asm/system.h>
>> +#include <asm/io.h>
>> +#include <asm/arch-fsl-lsch3/immap_lsch3.h>
>> +#include "mp.h"
>> +
>> +DECLARE_GLOBAL_DATA_PTR;
>> +
>> +void *get_spin_tbl_addr(void)
>> +{
>> +       void *ptr = (void *)&__spin_table;
>> +
>> +       return ptr;
>> +}
> 
> I don't think the cast is necessary here.  As far as I can see this
> could be just:
> 
> void *get_spin_tbl_addr(void)
> {
> 	return &__spin_table;
> }
> 

Let me try that.

> [...]
> 
>> diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h
>> index f77e4b8..0009c28 100644
>> --- a/arch/arm/include/asm/macro.h
>> +++ b/arch/arm/include/asm/macro.h
>> @@ -105,6 +105,98 @@ lr .req    x30
>>         cbz     \xreg1, \master_label
>>  .endm
>>
>> +.macro armv8_switch_to_el2_m, xreg1
>> +       mov     \xreg1, #0x5b1  /* Non-secure EL0/EL1 | HVC | 64bit EL2 */
>> +       msr     scr_el3, \xreg1
> 
> When dropping to EL1 from EL2 we disable HVC via HCR_EL2; presumably due
> to lack of a handler. Would it make sense to do similarly here and
> disable SMC here until we have a user (e.g. PSCI)?

I will let Arnab to comment here.

York