Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions

From: Marcelo Tosatti <mtosatti@redhat.com>
To: Igor Mammedov <imammedo@redhat.com>
Cc: aarcange@redhat.com, peter.maydell@linaro.org, gleb@redhat.com,
	quintela@redhat.com, jan.kiszka@siemens.com,
	qemu-devel@nongnu.org, aliguori@amazon.com, pbonzini@redhat.com,
	afaerber@suse.de, rth@twiddle.net
Subject: Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
Date: Tue, 29 Oct 2013 19:38:44 -0200	[thread overview]
Message-ID: <20131029213844.GB32615@amt.cnet> (raw)
In-Reply-To: <1383070729-19427-1-git-send-email-imammedo@redhat.com>

On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> Otherwise 1GB TLBs cannot be cached for the range.

This fails to use 2MB large pages to back gpas that are 2MB-aligned but
not 1GB-aligned.

Since hugetlbfs allocation is static, it requires the user to specify
separate 1GB and 2MB hugetlbfs mount points (each with the proper number
of corresponding hugetlbfs pages allocated). This is incompatible with
the current command line, and I'd like to see this problem handled in a
way that is command-line backwards compatible.

Also, if the argument for one-to-one mapping between DIMMs and linear host
virtual address sections holds, it means virtual DIMMs must be
partitioned into whatever hugepage alignment is necessary (and in
that case, why can't they be partitioned similarly with the memory
region aliases?).

> PS:
> as side effect we are not wasting ~1Gb of memory if
> 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"

This is how hugetlbfs works. You waste a 1GB hugepage if an extra
byte is requested.

> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
> PS2:
> As RFC it's yet without compatibility changes noted by Paolo
> 
> ---
>  exec.c                    |    8 ++++++-
>  hw/i386/pc.c              |   50 ++++++++++++++++++++++++++++----------------
>  include/exec/cpu-common.h |    1 +
>  3 files changed, 40 insertions(+), 19 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 9b6ea50..a4e5c80 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -882,7 +882,7 @@ void qemu_mutex_unlock_ramlist(void)
>  
>  #define HUGETLBFS_MAGIC       0x958458f6
>  
> -static long gethugepagesize(const char *path)
> +long gethugepagesize(const char *path)
>  {
>      struct statfs fs;
>      int ret;
> @@ -925,6 +925,12 @@ static void *file_ram_alloc(RAMBlock *block,
>          return NULL;
>      }
>  
> +    /* refuse to use huge pages if requested size isn't page aligned
> +     * to avoid wasting memory */
> +    if (memory != (memory & ~(hpagesize-1))) {
> +        return NULL;
> +    }
> +
>      if (kvm_enabled() && !kvm_has_sync_mmu()) {
>          fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
>          return NULL;
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 0c313fe..1611fa7 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1116,32 +1116,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
>      FWCfgState *fw_cfg;
> +    unsigned long hpagesize = gethugepagesize(mem_path);
> +    ram_addr_t below_4g_mem_size_alined, below_4g_mem_size_tail, above_4g_mem_size_alined, above_4g_mem_size_tail;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> -    /* Allocate RAM.  We allocate it as a single memory region and use
> -     * aliases to address portions of it, mostly for backwards compatibility
> -     * with older qemus that used qemu_ram_alloc().
> -     */
> +    *ram_memory = g_malloc(sizeof(**ram_memory));
> +    memory_region_init(*ram_memory, NULL, "pc.ram",
> +                       above_4g_mem_size == 0 ? below_4g_mem_size: 0x100000000ULL + above_4g_mem_size);
> +    memory_region_add_subregion(system_memory, 0, *ram_memory);
> +
> +    below_4g_mem_size_alined = below_4g_mem_size & ~(hpagesize - 1);
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +    memory_region_init_ram(ram, NULL, "pc.ram.low.aligned", below_4g_mem_size_alined);
> +    memory_region_add_subregion(*ram_memory, 0, ram);
>      vmstate_register_ram_global(ram);
> -    *ram_memory = ram;
> -    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> -    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> -                             0, below_4g_mem_size);
> -    memory_region_add_subregion(system_memory, 0, ram_below_4g);
> -    if (above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
> +
> +    below_4g_mem_size_tail = below_4g_mem_size - below_4g_mem_size_alined;
> +    if (below_4g_mem_size_tail) {
> +        ram = g_malloc(sizeof(*ram));
> +        memory_region_init_ram(ram, NULL, "pc.ram.low.unaligned", below_4g_mem_size_tail);
> +        memory_region_add_subregion(*ram_memory, below_4g_mem_size_alined, ram);
> +        vmstate_register_ram_global(ram);
>      }
>  
> +    if (above_4g_mem_size > 0) {
> +        above_4g_mem_size_alined = above_4g_mem_size & ~(hpagesize - 1);
> +        ram = g_malloc(sizeof(*ram));
> +        memory_region_init_ram(ram, NULL, "pc.ram.high.aligned", above_4g_mem_size_alined);
> +        memory_region_add_subregion(*ram_memory, 0x100000000ULL, ram);
> +        vmstate_register_ram_global(ram);
> +
> +        above_4g_mem_size_tail = above_4g_mem_size - above_4g_mem_size_alined;
> +        if (above_4g_mem_size_tail) {
> +            ram = g_malloc(sizeof(*ram));
> +            memory_region_init_ram(ram, NULL, "pc.ram.high.unaligned", above_4g_mem_size_tail);
> +            memory_region_add_subregion(*ram_memory, 0x100000000ULL + above_4g_mem_size_alined, ram);
> +            vmstate_register_ram_global(ram);
> +	}
> +    }
>  
>      /* Initialize PC system firmware */
>      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index 40e15e4..f89a37c 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -57,6 +57,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
>  #ifdef __linux__
>  uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
>  #endif
> +long gethugepagesize(const char *path);
>  
>  void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
>                              int len, int is_write);
> -- 
> 1.7.1