Re: [Qemu-devel] [PATCH 2/3] pseries: Allow KVM Book3S-HV on PPC970 CPUS

From: Alexander Graf <agraf@suse.de>
To: David Gibson <david@gibson.dropbear.id.au>
Cc: qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH 2/3] pseries: Allow KVM Book3S-HV on PPC970 CPUS
Date: Thu, 29 Sep 2011 15:25:37 +0200	[thread overview]
Message-ID: <E4FDF83B-E29E-4242-86FE-6FFF59A27689@suse.de> (raw)
In-Reply-To: <1317278706-16105-3-git-send-email-david@gibson.dropbear.id.au>

On 29.09.2011, at 08:45, David Gibson wrote:

> At present, using the hypervisor-aware Book3S-HV KVM will only work
> with qemu on POWER7 CPUs.  PPC970 CPUs also have hypervisor
> capability, but they lack the VRMA feature which makes assigning guest
> memory easier.
> 
> In order to allow KVM Book3S-HV on PPC970, we need to specially
> allocate the first chunk of guest memory (the "Real Mode Area" or
> RMA), so that it is physically contiguous.
> 
> Sufficiently recent host kernels allow such contiguous RMAs to be
> allocated, with a kvm capability advertising whether the feature is
> available and/or necessary on this hardware.  This patch enables qemu
> to use this support, thus allowing kvm acceleration of pseries qemu
> machines on PPC970 hardware.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> hw/spapr.c           |   50 ++++++++++++++++++++++++++++++++++++++++--------
> target-ppc/kvm.c     |   51 ++++++++++++++++++++++++++++++++++++++++++++++++++
> target-ppc/kvm_ppc.h |    6 +++++
> 3 files changed, 98 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/spapr.c b/hw/spapr.c
> index ba9ae1c..d51425a 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -89,6 +89,7 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num)
> }
> 
> static void *spapr_create_fdt_skel(const char *cpu_model,
> +                                   target_phys_addr_t rma_size,
>                                    target_phys_addr_t initrd_base,
>                                    target_phys_addr_t initrd_size,
>                                    const char *boot_device,
> @@ -97,7 +98,9 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
> {
>     void *fdt;
>     CPUState *env;
> -    uint64_t mem_reg_property[] = { 0, cpu_to_be64(ram_size) };
> +    uint64_t mem_reg_property_rma[] = { 0, cpu_to_be64(rma_size) };
> +    uint64_t mem_reg_property_nonrma[] = { cpu_to_be64(rma_size),
> +                                           cpu_to_be64(ram_size - rma_size) };
>     uint32_t start_prop = cpu_to_be32(initrd_base);
>     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>     uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
> @@ -143,15 +146,25 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
> 
>     _FDT((fdt_end_node(fdt)));
> 
> -    /* memory node */
> +    /* memory node(s) */
>     _FDT((fdt_begin_node(fdt, "memory@0")));
> 
>     _FDT((fdt_property_string(fdt, "device_type", "memory")));
> -    _FDT((fdt_property(fdt, "reg",
> -                       mem_reg_property, sizeof(mem_reg_property))));
> -
> +    _FDT((fdt_property(fdt, "reg", mem_reg_property_rma,
> +                       sizeof(mem_reg_property_rma))));
>     _FDT((fdt_end_node(fdt)));
> 
> +    if (ram_size > rma_size) {
> +        char mem_name[32];
> +
> +	sprintf(mem_name, "memory@%" PRIx64, (uint64_t)rma_size);
> +	_FDT((fdt_begin_node(fdt, mem_name)));
> +	_FDT((fdt_property_string(fdt, "device_type", "memory")));
> +        _FDT((fdt_property(fdt, "reg", mem_reg_property_nonrma,
> +                           sizeof(mem_reg_property_nonrma))));
> +        _FDT((fdt_end_node(fdt)));
> +    }        
> +
>     /* cpus */
>     _FDT((fdt_begin_node(fdt, "cpus")));
> 
> @@ -341,6 +354,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
> {
>     CPUState *env;
>     int i;
> +    target_phys_addr_t rma_alloc_size, rma_size;
>     ram_addr_t ram_offset;
>     uint32_t initrd_base;
>     long kernel_size, initrd_size, fw_size;
> @@ -350,10 +364,23 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>     spapr = g_malloc(sizeof(*spapr));
>     cpu_ppc_hypercall = emulate_spapr_hypercall;
> 
> +    /* Allocate RMA if necessary */
> +    rma_alloc_size = kvmppc_alloc_rma("ppc_spapr.rma");
> +
> +    if (rma_alloc_size == -1) {
> +        hw_error("qemu: Unable to create RMA\n");
> +        exit(1);
> +    }
> +    if (rma_alloc_size && (rma_alloc_size < ram_size)) {
> +        rma_size = rma_alloc_size;
> +    } else {
> +        rma_size = ram_size;
> +    }
> +
>     /* We place the device tree just below either the top of RAM, or
>      * 2GB, so that it can be processed with 32-bit code if
>      * necessary */
> -    spapr->fdt_addr = MIN(ram_size, 0x80000000) - FDT_MAX_SIZE;
> +    spapr->fdt_addr = MIN(rma_size, 0x80000000) - FDT_MAX_SIZE;

The change looks sane, but the code comment just above it still says "top of RAM", so I'd assume that comment is now wrong and should refer to the top of the RMA instead :)

>     spapr->rtas_addr = spapr->fdt_addr - RTAS_MAX_SIZE;
> 
>     /* init CPUs */
> @@ -378,8 +405,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
> 
>     /* allocate RAM */
>     spapr->ram_limit = ram_size;
> -    ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", spapr->ram_limit);
> -    cpu_register_physical_memory(0, ram_size, ram_offset);
> +    if (spapr->ram_limit > rma_alloc_size) {
> +        ram_addr_t nonrma_base = rma_alloc_size;
> +        ram_addr_t nonrma_size = spapr->ram_limit - rma_alloc_size;
> +
> +        ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", nonrma_size);
> +        cpu_register_physical_memory(nonrma_base, nonrma_size, ram_offset);
> +    }
> 
>     /* allocate hash page table.  For now we always make this 16mb,
>      * later we should probably make it scale to the size of guest
> @@ -503,7 +535,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>     }
> 
>     /* Prepare the device tree */
> -    spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
> +    spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
>                                             initrd_base, initrd_size,
>                                             boot_device, kernel_cmdline,
>                                             pteg_shift + 7);
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 2c1bc7a..37ee902 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -55,6 +55,9 @@ static int cap_interrupt_level = false;
> static int cap_segstate;
> static int cap_booke_sregs;
> static int cap_ppc_smt = 0;
> +#ifdef KVM_CAP_PPC_RMA

No need for these #ifdefs anymore, thanks to QEMU's local copy of the KVM headers :)

Alex