All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [patch 0/2] align >4GB guest RAM to a 1GB boundary, in case of 1GB-sized hugetlbfs
@ 2013-10-24 21:11 Marcelo Tosatti
  2013-10-24 21:11 ` [Qemu-devel] [patch 1/2] exec: add qemu_get_ram_hpagesize Marcelo Tosatti
  2013-10-24 21:12 ` [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary Marcelo Tosatti
  0 siblings, 2 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-24 21:11 UTC (permalink / raw)
  To: qemu-devel; +Cc: aarcange, pbonzini, gleb

See individual patches for details.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [Qemu-devel] [patch 1/2] exec: add qemu_get_ram_hpagesize
  2013-10-24 21:11 [Qemu-devel] [patch 0/2] align >4GB guest RAM to a 1GB boundary, in case of 1GB-sized hugetlbfs Marcelo Tosatti
@ 2013-10-24 21:11 ` Marcelo Tosatti
  2013-10-24 21:12 ` [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary Marcelo Tosatti
  1 sibling, 0 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-24 21:11 UTC (permalink / raw)
  To: qemu-devel; +Cc: aarcange, pbonzini, Marcelo Tosatti, gleb

[-- Attachment #1: add-hpagesize-to-ramblock --]
[-- Type: text/plain, Size: 1975 bytes --]

Add helper to retrieve page size of RAM backing, when hugetlbfs 
is used.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: git/qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -961,8 +961,27 @@ static void *file_ram_alloc(RAMBlock *bl
         return (NULL);
     }
     block->fd = fd;
+    block->hpagesize = hpagesize;
     return area;
 }
+
+uint32_t qemu_get_ram_hpagesize(ram_addr_t addr)
+{
+    RAMBlock *block;
+
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (addr - block->offset < block->length) {
+            return block->hpagesize;
+        }
+    }
+
+    fprintf(stderr, "%s: Bad ram offset %" PRIx64 "\n", __func__,
+                    (uint64_t)addr);
+    abort();
+
+    return 0;
+}
+
 #else
 static void *file_ram_alloc(RAMBlock *block,
                             ram_addr_t memory,
Index: qemu/include/exec/cpu-all.h
===================================================================
--- qemu.orig/include/exec/cpu-all.h
+++ qemu/include/exec/cpu-all.h
@@ -448,6 +448,7 @@ typedef struct RAMBlock {
     ram_addr_t offset;
     ram_addr_t length;
     uint32_t flags;
+    uint32_t hpagesize;
     char idstr[256];
     /* Reads can take either the iothread or the ramlist lock.
      * Writes must take both locks.
Index: git/qemu/include/exec/cpu-common.h
===================================================================
--- qemu.orig/include/exec/cpu-common.h
+++ qemu/include/exec/cpu-common.h
@@ -54,6 +54,9 @@ void qemu_ram_remap(ram_addr_t addr, ram
 /* This should not be used by devices.  */
 MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
+#ifdef __linux__
+uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
+#endif
 
 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                             int len, int is_write);

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-24 21:11 [Qemu-devel] [patch 0/2] align >4GB guest RAM to a 1GB boundary, in case of 1GB-sized hugetlbfs Marcelo Tosatti
  2013-10-24 21:11 ` [Qemu-devel] [patch 1/2] exec: add qemu_get_ram_hpagesize Marcelo Tosatti
@ 2013-10-24 21:12 ` Marcelo Tosatti
  2013-10-24 21:55   ` Peter Maydell
                     ` (2 more replies)
  1 sibling, 3 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-24 21:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: aarcange, pbonzini, Marcelo Tosatti, gleb

[-- Attachment #1: align-ram-to-1gb-pagesize --]
[-- Type: text/plain, Size: 3515 bytes --]

Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/hw/i386/pc.c
===================================================================
--- qemu.orig/hw/i386/pc.c
+++ qemu/hw/i386/pc.c
@@ -1116,8 +1116,9 @@ FWCfgState *pc_memory_init(MemoryRegion
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
+    unsigned long hpagesize;
 
     linux_boot = (kernel_filename != NULL);
 
@@ -1129,6 +1130,7 @@ FWCfgState *pc_memory_init(MemoryRegion
     memory_region_init_ram(ram, NULL, "pc.ram",
                            below_4g_mem_size + above_4g_mem_size);
     vmstate_register_ram_global(ram);
+    hpagesize = qemu_get_ram_hpagesize(ram->ram_addr);
     *ram_memory = ram;
     ram_below_4g = g_malloc(sizeof(*ram_below_4g));
     memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
@@ -1136,10 +1138,46 @@ FWCfgState *pc_memory_init(MemoryRegion
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (hpagesize == (1<<30)) {
+            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
+
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    0x100000000ULL,
+                                    above_4g_mem_size - holesize);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                    ram_above_4g);
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, holesize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - holesize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+        }
     }
 
 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-24 21:12 ` [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary Marcelo Tosatti
@ 2013-10-24 21:55   ` Peter Maydell
  2013-10-24 22:48     ` Marcelo Tosatti
  2013-10-24 23:55   ` Paolo Bonzini
  2013-11-06  1:49   ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v2) Marcelo Tosatti
  2 siblings, 1 reply; 53+ messages in thread
From: Peter Maydell @ 2013-10-24 21:55 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Andrea Arcangeli, Paolo Bonzini, QEMU Developers, Gleb Natapov

On 24 October 2013 22:12, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used.
>
> Otherwise 1GB TLBs cannot be cached for the range.

> +        if (hpagesize == (1<<30)) {
> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    0x100000000ULL,
> +                                    above_4g_mem_size - holesize);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g);
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, holesize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - holesize,
> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);

This looks pretty weird. Presence or absence of host OS features
shouldn't affect how we model the guest hardware and RAM.
Conversely, if hugetlbs have performance related requirements
then a patch which only touches the x86 pc model seems rather
limited.

-- PMM

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-24 21:55   ` Peter Maydell
@ 2013-10-24 22:48     ` Marcelo Tosatti
  0 siblings, 0 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-24 22:48 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Andrea Arcangeli, Paolo Bonzini, QEMU Developers, Gleb Natapov

On Thu, Oct 24, 2013 at 10:55:54PM +0100, Peter Maydell wrote:
> On 24 October 2013 22:12, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > Align guest physical address and host physical address
> > beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used.
> >
> > Otherwise 1GB TLBs cannot be cached for the range.
> 
> > +        if (hpagesize == (1<<30)) {
> > +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> > +
> > +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > +                                    0x100000000ULL,
> > +                                    above_4g_mem_size - holesize);
> > +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +                                    ram_above_4g);
> > +
> > +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> > +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> > +                                     "ram-above-4g-piecetwo", ram,
> > +                                     0x100000000ULL - holesize, holesize);
> > +            memory_region_add_subregion(system_memory,
> > +                                        0x100000000ULL +
> > +                                        above_4g_mem_size - holesize,
> > +                                        ram_above_4g_piecetwo);
> > +        } else {
> > +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > +                                    below_4g_mem_size, above_4g_mem_size);
> > +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> >                                      ram_above_4g);
> 
> This looks pretty weird. Presence or absence of host OS features
> shouldn't affect how we model the guest hardware and RAM.

This is not visible to the guest (read the comment in the patch).

> Conversely, if hugetlbs have performance related requirements
> then a patch which only touches the x86 pc model seems rather
> limited.

The requirement is that gpas and hpas must be aligned on hugepage
boundaries. The memory region API allows gpas to be registered at custom
hpas (via the offset parameter).

It is not entirely clear what your request is.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-24 21:12 ` [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary Marcelo Tosatti
  2013-10-24 21:55   ` Peter Maydell
@ 2013-10-24 23:55   ` Paolo Bonzini
  2013-10-25  4:58     ` Marcelo Tosatti
  2013-11-06  1:49   ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v2) Marcelo Tosatti
  2 siblings, 1 reply; 53+ messages in thread
From: Paolo Bonzini @ 2013-10-24 23:55 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, qemu-devel, gleb

Il 24/10/2013 22:12, Marcelo Tosatti ha scritto:
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Index: qemu/hw/i386/pc.c
> ===================================================================
> --- qemu.orig/hw/i386/pc.c
> +++ qemu/hw/i386/pc.c
> @@ -1116,8 +1116,9 @@ FWCfgState *pc_memory_init(MemoryRegion
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
> +    unsigned long hpagesize;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> @@ -1129,6 +1130,7 @@ FWCfgState *pc_memory_init(MemoryRegion
>      memory_region_init_ram(ram, NULL, "pc.ram",
>                             below_4g_mem_size + above_4g_mem_size);
>      vmstate_register_ram_global(ram);
> +    hpagesize = qemu_get_ram_hpagesize(ram->ram_addr);
>      *ram_memory = ram;
>      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
>      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> @@ -1136,10 +1138,46 @@ FWCfgState *pc_memory_init(MemoryRegion
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (hpagesize == (1<<30)) {
> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    0x100000000ULL,
> +                                    above_4g_mem_size - holesize);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g);
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, holesize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - holesize,
> +                                        ram_above_4g_piecetwo);

Why break it in two?  You can just allocate extra holesize bytes in the
"ram" MemoryRegion, and not map the part that corresponds to
[0x100000000ULL - holesize, 0x100000000ULL).

Also, as Peter said this cannot depend on host considerations.  Just do
it unconditionally, but only for new machine types (pc-1.8 and q35-1.8,
since unfortunately we're too close to hard freeze).

Paolo

> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +        }
>      }

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-24 23:55   ` Paolo Bonzini
@ 2013-10-25  4:58     ` Marcelo Tosatti
  2013-10-25  8:52       ` Paolo Bonzini
  2013-10-25  9:57       ` Igor Mammedov
  0 siblings, 2 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-25  4:58 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: aarcange, qemu-devel, gleb

On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote:
> > +        if (hpagesize == (1<<30)) {
> > +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> > +
> > +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > +                                    0x100000000ULL,
> > +                                    above_4g_mem_size - holesize);
> > +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +                                    ram_above_4g);
> > +
> > +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> > +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> > +                                     "ram-above-4g-piecetwo", ram,
> > +                                     0x100000000ULL - holesize, holesize);
> > +            memory_region_add_subregion(system_memory,
> > +                                        0x100000000ULL +
> > +                                        above_4g_mem_size - holesize,
> > +                                        ram_above_4g_piecetwo);
> 
> Why break it in two?  You can just allocate extra holesize bytes in the
> "ram" MemoryRegion, and not map the part that corresponds to
> [0x100000000ULL - holesize, 0x100000000ULL).

- If the "ram" MemoryRegion is backed with 1GB hugepages, you might not 
want to allocate extra holesize bytes (which might require an entire
1GB page).

- 1GB backed RAM can be mapped with 2MB pages.

> Also, as Peter said this cannot depend on host considerations.  Just do
> it unconditionally, but only for new machine types (pc-1.8 and q35-1.8,
> since unfortunately we're too close to hard freeze).

Why the description of memory subregions and aliases are part of machine
types?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-25  4:58     ` Marcelo Tosatti
@ 2013-10-25  8:52       ` Paolo Bonzini
  2013-10-25 19:50         ` Marcelo Tosatti
  2013-10-25  9:57       ` Igor Mammedov
  1 sibling, 1 reply; 53+ messages in thread
From: Paolo Bonzini @ 2013-10-25  8:52 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, qemu-devel, gleb

Il 25/10/2013 05:58, Marcelo Tosatti ha scritto:
> On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote:
>>> +        if (hpagesize == (1<<30)) {
>>> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
>>> +
>>> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>>> +                                    0x100000000ULL,
>>> +                                    above_4g_mem_size - holesize);
>>> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>>> +                                    ram_above_4g);
>>> +
>>> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
>>> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
>>> +                                     "ram-above-4g-piecetwo", ram,
>>> +                                     0x100000000ULL - holesize, holesize);
>>> +            memory_region_add_subregion(system_memory,
>>> +                                        0x100000000ULL +
>>> +                                        above_4g_mem_size - holesize,
>>> +                                        ram_above_4g_piecetwo);
>>
>> Why break it in two?  You can just allocate extra holesize bytes in the
>> "ram" MemoryRegion, and not map the part that corresponds to
>> [0x100000000ULL - holesize, 0x100000000ULL).
> 
> - If the "ram" MemoryRegion is backed with 1GB hugepages, you might not 
> want to allocate extra holesize bytes (which might require an entire
> 1GB page).
> 
> - 1GB backed RAM can be mapped with 2MB pages.
> 
>> Also, as Peter said this cannot depend on host considerations.  Just do
>> it unconditionally, but only for new machine types (pc-1.8 and q35-1.8,
>> since unfortunately we're too close to hard freeze).
> 
> Why the description of memory subregions and aliases are part of machine
> types?

It affects the migration stream, which stores RAM offsets instead of
physical addresses.

Let's say you have an 8 GB guest and the hole size is 0.25 GB.

If the huge page size is 2MB, you have:

       Physical address            Length           RAM offsets
       0                           3.75 GB          pc.ram @ 0
       4 GB                        4.25 GB          pc.ram @ 3.75 GB

If the huge page size is 1GB, you have:

       Physical address            Length           RAM offsets
       0                           3.75 GB          pc.ram @ 0
       4 GB                        4 GB             pc.ram @ 4 GB
       8 GB                        0.25 GB          pc.ram @ 3.75 GB

So your memory "rotates" around the 3.75 GB boundary when you migrate
from a non-gbpages host to a gbpages host or vice versa.

If we're doing it only for new machine types, it's even simpler to just
have two RAM regions:

       Physical address            Length           RAM offsets
       0                           3.75 GB          pc.ram-below-4g @ 0
       4 GB                        4.25 GB          pc.ram-above-4g @ 0

Because offsets are zero, and lengths match the RAM block lengths, you
do not need any complication with aliasing.  This still has to be done
only for new machine types.

Paolo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-25  4:58     ` Marcelo Tosatti
  2013-10-25  8:52       ` Paolo Bonzini
@ 2013-10-25  9:57       ` Igor Mammedov
  2013-10-25 13:34         ` Marcelo Tosatti
  1 sibling, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-10-25  9:57 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, Paolo Bonzini, qemu-devel, gleb

On Fri, 25 Oct 2013 02:58:05 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote:
> > > +        if (hpagesize == (1<<30)) {
> > > +            unsigned long holesize = 0x100000000ULL -
> > > below_4g_mem_size; +
> > > +            memory_region_init_alias(ram_above_4g, NULL,
> > > "ram-above-4g", ram,
> > > +                                    0x100000000ULL,
> > > +                                    above_4g_mem_size -
> > > holesize);
> > > +            memory_region_add_subregion(system_memory,
> > > 0x100000000ULL,
> > > +                                    ram_above_4g);
> > > +
> > > +            ram_above_4g_piecetwo =
> > > g_malloc(sizeof(*ram_above_4g_piecetwo));
> > > +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> > > +                                     "ram-above-4g-piecetwo",
> > > ram,
> > > +                                     0x100000000ULL - holesize,
> > > holesize);
> > > +            memory_region_add_subregion(system_memory,
> > > +                                        0x100000000ULL +
> > > +                                        above_4g_mem_size -
> > > holesize,
> > > +                                        ram_above_4g_piecetwo);
> > 
> > Why break it in two?  You can just allocate extra holesize bytes in
> > the "ram" MemoryRegion, and not map the part that corresponds to
> > [0x100000000ULL - holesize, 0x100000000ULL).
> 
> - If the "ram" MemoryRegion is backed with 1GB hugepages, you might
> not want to allocate extra holesize bytes (which might require an
> entire 1GB page).
From POV of modelling current "ram" as dimm devices, aliasing
wouldn't work nice. But breaking one block in two or more is fine since
then blocks could be represented as several dimm devices.

A 3Gb backend ram could be split in blocks like this:

  [ 3Gb (1Gb pages backed) ]
  [tail1 (below_4gb - 3Gb) (2mb pages backed) ]
  [above_4gb whole X Gb pages (1Gb pages backed)]
  [tail2 (2mb pages backed)]

> - 1GB backed RAM can be mapped with 2MB pages.
> 
> > Also, as Peter said this cannot depend on host considerations.
> > Just do it unconditionally, but only for new machine types (pc-1.8
> > and q35-1.8, since unfortunately we're too close to hard freeze).
> 
> Why the description of memory subregions and aliases are part of
> machine types?
> 
> 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-25  9:57       ` Igor Mammedov
@ 2013-10-25 13:34         ` Marcelo Tosatti
  2013-10-27 15:20           ` Igor Mammedov
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-25 13:34 UTC (permalink / raw)
  To: Igor Mammedov; +Cc: aarcange, Paolo Bonzini, qemu-devel, gleb

On Fri, Oct 25, 2013 at 11:57:18AM +0200, Igor Mammedov wrote:
> On Fri, 25 Oct 2013 02:58:05 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote:
> > > > +        if (hpagesize == (1<<30)) {
> > > > +            unsigned long holesize = 0x100000000ULL -
> > > > below_4g_mem_size; +
> > > > +            memory_region_init_alias(ram_above_4g, NULL,
> > > > "ram-above-4g", ram,
> > > > +                                    0x100000000ULL,
> > > > +                                    above_4g_mem_size -
> > > > holesize);
> > > > +            memory_region_add_subregion(system_memory,
> > > > 0x100000000ULL,
> > > > +                                    ram_above_4g);
> > > > +
> > > > +            ram_above_4g_piecetwo =
> > > > g_malloc(sizeof(*ram_above_4g_piecetwo));
> > > > +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> > > > +                                     "ram-above-4g-piecetwo",
> > > > ram,
> > > > +                                     0x100000000ULL - holesize,
> > > > holesize);
> > > > +            memory_region_add_subregion(system_memory,
> > > > +                                        0x100000000ULL +
> > > > +                                        above_4g_mem_size -
> > > > holesize,
> > > > +                                        ram_above_4g_piecetwo);
> > > 
> > > Why break it in two?  You can just allocate extra holesize bytes in
> > > the "ram" MemoryRegion, and not map the part that corresponds to
> > > [0x100000000ULL - holesize, 0x100000000ULL).
> > 
> > - If the "ram" MemoryRegion is backed with 1GB hugepages, you might
> > not want to allocate extra holesize bytes (which might require an
> > entire 1GB page).
> From POV of modelling current "ram" as dimm devices, aliasing
> wouldn't work nice. But breaking one block in two or more is fine since
> then blocks could be represented as several dimm devices.
> 
> A 3Gb backend ram could be split in blocks like this:
> 
>   [ 3Gb (1Gb pages backed) ]
>   [tail1 (below_4gb - 3Gb) (2mb pages backed) ]
>   [above_4gb whole X Gb pages (1Gb pages backed)]
>   [tail2 (2mb pages backed)]

Yes, thought of that, unfortunately its cumbersome to add an interface
for the user to supply both 2MB and 1GB hugetlbfs pages.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-25  8:52       ` Paolo Bonzini
@ 2013-10-25 19:50         ` Marcelo Tosatti
  2013-10-25 22:53           ` Paolo Bonzini
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-25 19:50 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: aarcange, qemu-devel, gleb

On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote:
> Because offsets are zero, and lengths match the RAM block lengths, you
> do not need any complication with aliasing.  This still has to be done
> only for new machine types.

Not possible because you just wasted holesize bytes (if number of
additional bytes due to huge page alignment is smaller than holesize, a
new hugepage is required, which is not acceptable).

Is there a tree the new machine types can live until 1.8 opens up?

Can you pick up the MAP_POPULATE patch?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-25 19:50         ` Marcelo Tosatti
@ 2013-10-25 22:53           ` Paolo Bonzini
  2013-10-30 11:07             ` Gerd Hoffmann
  0 siblings, 1 reply; 53+ messages in thread
From: Paolo Bonzini @ 2013-10-25 22:53 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, qemu-devel, gleb, Michael S. Tsirkin

Il 25/10/2013 20:50, Marcelo Tosatti ha scritto:
> On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote:
>> Because offsets are zero, and lengths match the RAM block lengths, you
>> do not need any complication with aliasing.  This still has to be done
>> only for new machine types.
> 
> Not possible because you just wasted holesize bytes (if number of
> additional bytes due to huge page alignment is smaller than holesize, a
> new hugepage is required, which is not acceptable).

Ok.  Thanks for explaining---the patch seems good with the proper
compatibility option in the machine type.  Please run the
guest_memory_dump_analysis test in autotest too.

> Is there a tree the new machine types can live until 1.8 opens up?
> 
> Can you pick up the MAP_POPULATE patch?

Yes, I can pick that one up next week.

Michael is usually gathering hw/i386/pc* patches in his PCI tree, you
can Cc him on v2 of this one.

Paolo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-25 13:34         ` Marcelo Tosatti
@ 2013-10-27 15:20           ` igor Mammedov
  2013-10-28 14:04             ` Marcelo Tosatti
  0 siblings, 1 reply; 53+ messages in thread
From: igor Mammedov @ 2013-10-27 15:20 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, Paolo Bonzini, qemu-devel, gleb

On Fri, 25 Oct 2013 11:34:22 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Fri, Oct 25, 2013 at 11:57:18AM +0200, igor Mammedov wrote:
> > On Fri, 25 Oct 2013 02:58:05 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
> > > On Fri, Oct 25, 2013 at 12:55:36AM +0100, Paolo Bonzini wrote:
> > > > > +        if (hpagesize == (1<<30)) {
> > > > > +            unsigned long holesize = 0x100000000ULL -
> > > > > below_4g_mem_size; +
> > > > > +            memory_region_init_alias(ram_above_4g, NULL,
> > > > > "ram-above-4g", ram,
> > > > > +                                    0x100000000ULL,
> > > > > +                                    above_4g_mem_size -
> > > > > holesize);
> > > > > +            memory_region_add_subregion(system_memory,
> > > > > 0x100000000ULL,
> > > > > +                                    ram_above_4g);
> > > > > +
> > > > > +            ram_above_4g_piecetwo =
> > > > > g_malloc(sizeof(*ram_above_4g_piecetwo));
> > > > > +            memory_region_init_alias(ram_above_4g_piecetwo,
> > > > > NULL,
> > > > > +                                     "ram-above-4g-piecetwo",
> > > > > ram,
> > > > > +                                     0x100000000ULL -
> > > > > holesize, holesize);
> > > > > +            memory_region_add_subregion(system_memory,
> > > > > +                                        0x100000000ULL +
> > > > > +                                        above_4g_mem_size -
> > > > > holesize,
> > > > > +
> > > > > ram_above_4g_piecetwo);
> > > > 
> > > > Why break it in two?  You can just allocate extra holesize
> > > > bytes in the "ram" MemoryRegion, and not map the part that
> > > > corresponds to [0x100000000ULL - holesize, 0x100000000ULL).
> > > 
> > > - If the "ram" MemoryRegion is backed with 1GB hugepages, you
> > > might not want to allocate extra holesize bytes (which might
> > > require an entire 1GB page).
> > From POV of moddeling current "ram" as dimm devices, aliasing
> > wouldn't work nice. But breaking one block in two or more is fine
> > since then blocks could be represented as several dimm devices.
> > 
> > +3Gb backend ram it could be split in blocks like this:
> > 
> >   [ 3Gb (1Gb pages backed) ]
> >   [tail1 (below_4gb - 3Gb) (2mb pages backed) ]
> >   [above_4gb whole X Gb pages (1Gb pages backed)]
> >   [tail2 (2mb pages backed)]
> 
> Yes, thought of that, unfortunately its cumbersome to add an interface
> for the user to supply both 2MB and 1GB hugetlbfs pages.
Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and
there is/are tail/s, QEMU should be able to figure out alignment
issues and allocate with appropriate pages.

Goal is separate host part allocation aspect from guest related one,
aliasing 32-bit hole size at the end doesn't help it at all, it's quite
opposite, it's making current code more complicated and harder to fix
in the future.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-27 15:20           ` igor Mammedov
@ 2013-10-28 14:04             ` Marcelo Tosatti
  2013-10-28 14:20               ` Marcelo Tosatti
                                 ` (2 more replies)
  0 siblings, 3 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-28 14:04 UTC (permalink / raw)
  To: igor Mammedov; +Cc: aarcange, Paolo Bonzini, qemu-devel, gleb

On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote:
> > Yes, thought of that, unfortunately its cumbersome to add an interface
> > for the user to supply both 2MB and 1GB hugetlbfs pages.
> Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and
> there is/are tail/s, QEMU should be able to figure out alignment
> issues and allocate with appropriate pages.

Yes that would be ideal but the problem with hugetlbfs is that pages are
preallocated.

So in the end you'd have to expose the split of guest RAM in 2MB/1GB types
to the user (it would be necessary for the user to calculate the size of
the hole, etc).

> Goal is separate host part allocation aspect from guest related one,
> aliasing 32-bit hole size at the end doesn't help it at all, it's quite
> opposite, it's making current code more complicated and harder to fix
> in the future.

You can simply back the 1GB areas which the hole reside with 2MB pages.
Can't see why having the tail of RAM map to the hole is problematic.

Understand your concern, but the complication is necessary: the host
virtual/physical address and guest physical addresses must be aligned on
largepage boundaries.

Do you foresee any problem with memory hotplug?

Could add a warning to memory API: if memory region is larger than 1GB
and RAM is 1GB backed, and not properly aligned, warn.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-28 14:04             ` Marcelo Tosatti
@ 2013-10-28 14:20               ` Marcelo Tosatti
  2013-10-29 18:00               ` Igor Mammedov
  2013-10-29 18:18               ` [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions Igor Mammedov
  2 siblings, 0 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-28 14:20 UTC (permalink / raw)
  To: igor Mammedov; +Cc: aarcange, Paolo Bonzini, qemu-devel, gleb

On Mon, Oct 28, 2013 at 12:04:06PM -0200, Marcelo Tosatti wrote:
> On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote:
> > > Yes, thought of that, unfortunately its cumbersome to add an interface
> > > for the user to supply both 2MB and 1GB hugetlbfs pages.
> > Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and
> > there is/are tail/s, QEMU should be able to figure out alignment
> > issues and allocate with appropriate pages.
> 
> Yes that would be ideal but the problem with hugetlbfs is that pages are
> preallocated.
> 
> So in the end you'd have to expose the split of guest RAM in 2MB/1GB types
> to the user (it would be necessary for the user to calculate the size of
> the hole, etc).

Note the assumption here is this: its easier for the hugetlbfs user to
manage 

number of 1GB hugepages = size of guest RAM

Than for him to calculate the size of the hole (which depends on machine
type), allocate 1GB/2MB hugepages accordingly. And the benefit of that
would be to save one 1GB hugepage (which is preallocated during boot, in
the first place).

So matching number of 1GB hugepages and guest RAM seems the easier
choice.

> > Goal is separate host part allocation aspect from guest related one,
> > aliasing 32-bit hole size at the end doesn't help it at all, it's quite
> > opposite, it's making current code more complicated and harder to fix
> > in the future.

What is the problem to be fixed, exactly?

> You can simply back the 1GB areas which the hole reside with 2MB pages.
> Can't see why having the tail of RAM map to the hole is problematic.
> 
> Understand your concern, but the complication is necessary: the host
> virtual/physical address and guest physical addresses must be aligned on
> largepage boundaries.
> 
> Do you foresee any problem with memory hotplug?
> 
> Could add a warning to memory API: if memory region is larger than 1GB
> and RAM is 1GB backed, and not properly aligned, warn.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-28 14:04             ` Marcelo Tosatti
  2013-10-28 14:20               ` Marcelo Tosatti
@ 2013-10-29 18:00               ` Igor Mammedov
  2013-10-29 21:21                 ` Marcelo Tosatti
  2013-10-29 18:18               ` [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions Igor Mammedov
  2 siblings, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-10-29 18:00 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, Paolo Bonzini, qemu-devel, gleb

On Mon, 28 Oct 2013 12:04:06 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote:
> > > Yes, thought of that, unfortunately its cumbersome to add an interface
> > > for the user to supply both 2MB and 1GB hugetlbfs pages.
> > Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and
> > there is/are tail/s, QEMU should be able to figure out alignment
> > issues and allocate with appropriate pages.
> 
> Yes that would be ideal but the problem with hugetlbfs is that pages are
> preallocated.
> 
> So in the end you'd have to expose the split of guest RAM in 2MB/1GB types
> to the user (it would be necessary for the user to calculate the size of
> the hole, etc).
exposing it to the user might be not necessary,
QEMU could allocate 5Gb+3Mb ram without user intervention:
 3Gb low.ram.aligned.region // using huge pages
 1mb low.ram.unaligned.region if below_4g_ram_size - 3Gb; // so not to waste precious low ram, using fallback allocation
 //hypothetically hole starts at 3Gb+1mb
 2Gb high.ram.aligned.region // using huge pages
 2Mb high.ram.unaligned.region // so that not to waste 1Gb on memory using huge page
> 
> > Goal is separate host part allocation aspect from guest related one,
> > aliasing 32-bit hole size at the end doesn't help it at all, it's quite
> > opposite, it's making current code more complicated and harder to fix
> > in the future.
> 
> You can simply back the 1GB areas which the hole reside with 2MB pages.
I'm not getting what do you mean here.

> Can't see why having the tail of RAM map to the hole is problematic.
Problem I see is that with proposed aliasing there is no one-one
mapping to future "memdev" where each Dimm device (guest/model visible memory block)
has a corresponding memdev backend (host memory block).

> Moreover with current hugepages handling in QEMU including this patch and usage of
> 1Gb hugepages, QEMU might lose ~1Gb if -m "hpagesize*n+1", which by itself is a
> good reason to use several allocations with different allocator backends.


> Understand your concern, but the complication is necessary: the host
> virtual/physical address and guest physical addresses must be aligned on
> largepage boundaries.
I don't argue against it, only about the best way to achieve it.

If we assume possible conversion from adhoc way of allocating initial RAM to
DIMM devices in the future then changing region layout several times in
incompatible way doesn't seems to be the best approach. If we are going to
change it, let at least minimize compatibility issues and do it right
in the first place.

I'll post RFC patch as reply to this thread.

> 
> Do you foresee any problem with memory hotplug?
I don't see any problem with memory hotplug so far, but as noted above
there will be problems with converting initial ram to DIMM devices.

> 
> Could add a warning to memory API: if memory region is larger than 1GB
> and RAM is 1GB backed, and not properly aligned, warn.
Perhaps it would be better to abort and ask user to fix configuration,
and on hugepage allocation failure not fallback to malloc but abort and
tell user amount of hugepages needed to run guest with hugepage backend.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-28 14:04             ` Marcelo Tosatti
  2013-10-28 14:20               ` Marcelo Tosatti
  2013-10-29 18:00               ` Igor Mammedov
@ 2013-10-29 18:18               ` Igor Mammedov
  2013-10-29 21:38                 ` Marcelo Tosatti
  2 siblings, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-10-29 18:18 UTC (permalink / raw)
  To: qemu-devel
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, mtosatti,
	aliguori, pbonzini, afaerber, rth

Otherwise 1GB TLBs cannot be cached for the range.

PS:
as side effect we are not wasting ~1Gb of memory if
1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
PS2:
As RFC it's yet without compatibility changes noted by Paolo

---
 exec.c                    |    8 ++++++-
 hw/i386/pc.c              |   50 ++++++++++++++++++++++++++++----------------
 include/exec/cpu-common.h |    1 +
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/exec.c b/exec.c
index 9b6ea50..a4e5c80 100644
--- a/exec.c
+++ b/exec.c
@@ -882,7 +882,7 @@ void qemu_mutex_unlock_ramlist(void)
 
 #define HUGETLBFS_MAGIC       0x958458f6
 
-static long gethugepagesize(const char *path)
+long gethugepagesize(const char *path)
 {
     struct statfs fs;
     int ret;
@@ -925,6 +925,12 @@ static void *file_ram_alloc(RAMBlock *block,
         return NULL;
     }
 
+    /* refuse to use huge pages if requested size isn't page aligned
+     * to avoid wasting memory */
+    if (memory != (memory & ~(hpagesize-1))) {
+        return NULL;
+    }
+
     if (kvm_enabled() && !kvm_has_sync_mmu()) {
         fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
         return NULL;
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0c313fe..1611fa7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1116,32 +1116,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
     FWCfgState *fw_cfg;
+    unsigned long hpagesize = gethugepagesize(mem_path);
+    ram_addr_t below_4g_mem_size_alined, below_4g_mem_size_tail, above_4g_mem_size_alined, above_4g_mem_size_tail;
 
     linux_boot = (kernel_filename != NULL);
 
-    /* Allocate RAM.  We allocate it as a single memory region and use
-     * aliases to address portions of it, mostly for backwards compatibility
-     * with older qemus that used qemu_ram_alloc().
-     */
+    *ram_memory = g_malloc(sizeof(**ram_memory));
+    memory_region_init(*ram_memory, NULL, "pc.ram",
+                       above_4g_mem_size == 0 ? below_4g_mem_size: 0x100000000ULL + above_4g_mem_size);
+    memory_region_add_subregion(system_memory, 0, *ram_memory);
+
+    below_4g_mem_size_alined = below_4g_mem_size & ~(hpagesize - 1);
     ram = g_malloc(sizeof(*ram));
-    memory_region_init_ram(ram, NULL, "pc.ram",
-                           below_4g_mem_size + above_4g_mem_size);
+    memory_region_init_ram(ram, NULL, "pc.ram.low.aligned", below_4g_mem_size_alined);
+    memory_region_add_subregion(*ram_memory, 0, ram);
     vmstate_register_ram_global(ram);
-    *ram_memory = ram;
-    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
-    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
-                             0, below_4g_mem_size);
-    memory_region_add_subregion(system_memory, 0, ram_below_4g);
-    if (above_4g_mem_size > 0) {
-        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
-                                    ram_above_4g);
+
+    below_4g_mem_size_tail = below_4g_mem_size - below_4g_mem_size_alined;
+    if (below_4g_mem_size_tail) {
+        ram = g_malloc(sizeof(*ram));
+        memory_region_init_ram(ram, NULL, "pc.ram.low.unaligned", below_4g_mem_size_tail);
+        memory_region_add_subregion(*ram_memory, below_4g_mem_size_alined, ram);
+        vmstate_register_ram_global(ram);
     }
 
+    if (above_4g_mem_size > 0) {
+        above_4g_mem_size_alined = above_4g_mem_size & ~(hpagesize - 1);
+        ram = g_malloc(sizeof(*ram));
+        memory_region_init_ram(ram, NULL, "pc.ram.high.aligned", above_4g_mem_size_alined);
+        memory_region_add_subregion(*ram_memory, 0x100000000ULL, ram);
+        vmstate_register_ram_global(ram);
+
+        above_4g_mem_size_tail = above_4g_mem_size - above_4g_mem_size_alined;
+        if (above_4g_mem_size_tail) {
+            ram = g_malloc(sizeof(*ram));
+            memory_region_init_ram(ram, NULL, "pc.ram.high.unaligned", above_4g_mem_size_tail);
+            memory_region_add_subregion(*ram_memory, 0x100000000ULL + above_4g_mem_size_alined, ram);
+            vmstate_register_ram_global(ram);
+	}
+    }
 
     /* Initialize PC system firmware */
     pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 40e15e4..f89a37c 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -57,6 +57,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
 #ifdef __linux__
 uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
 #endif
+long gethugepagesize(const char *path);
 
 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                             int len, int is_write);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-29 18:00               ` Igor Mammedov
@ 2013-10-29 21:21                 ` Marcelo Tosatti
  2013-10-30  8:48                   ` Gleb Natapov
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-29 21:21 UTC (permalink / raw)
  To: Igor Mammedov; +Cc: aarcange, Paolo Bonzini, qemu-devel, gleb

On Tue, Oct 29, 2013 at 07:00:54PM +0100, Igor Mammedov wrote:
> On Mon, 28 Oct 2013 12:04:06 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Sun, Oct 27, 2013 at 04:20:44PM +0100, igor Mammedov wrote:
> > > > Yes, thought of that, unfortunately its cumbersome to add an interface
> > > > for the user to supply both 2MB and 1GB hugetlbfs pages.
> > > Could 2Mb tails be automated, meaning if host uses 1Gb hugepages and
> > > there is/are tail/s, QEMU should be able to figure out alignment
> > > issues and allocate with appropriate pages.
> > 
> > Yes that would be ideal but the problem with hugetlbfs is that pages are
> > preallocated.
> > 
> > So in the end you'd have to expose the split of guest RAM in 2MB/1GB types
> > to the user (it would be necessary for the user to calculate the size of
> > the hole, etc).
> exposing it to the user might be not necessary,
> QEMU could allocate 5Gb+3Mb ram without user intervention:

It is necessary because the user has to allocate hugetlbfs pages 
(see the end of the email).

>  3Gb low.ram.aligned.region // using huge pages
>  1mb low.ram.unaligned.region if below_4g_ram_size - 3Gb; // so not to waste precious low ram, using fallback allocation
>  //hypothetically hole starts at 3Gb+1mb
>  2Gb high.ram.aligned.region // using huge pages
>  2Mb high.ram.unaligned.region // so that not to waste 1Gb on memory using huge page

You want memory areas not backed by 1GB pages to be backed by 2MB pages
(so that possibility of creation of TLB entries per physical address
range is similar, or matches, physical hardware) (*)

> > > Goal is separate host part allocation aspect from guest related one,
> > > aliasing 32-bit hole size at the end doesn't help it at all, it's quite
> > > opposite, it's making current code more complicated and harder to fix
> > > in the future.
> > 
> > You can simply back the 1GB areas which the hole reside with 2MB pages.
> I'm not getting what do you mean here.

The 1GB memory backed areas which can't be mapped with 1GB TLBs, such as the
[3GB,4GB] guest physical address range can be mapped with 2MB TLBs.

> > Can't see why having the tail of RAM map to the hole is problematic.
> Problem I see is that with proposed aliasing there is no one-one
> mapping to future "memdev" where each Dimm device (guest/model visible memory block)
> has a corresponding memdev backend (host memory block).

1) What is the dependency of memdev on linear host memory block? (that is,
i can't see the reasoning behind a one-to-one mapping).

2) Why can't memdev access host memory via mappings? (that
is, why does memdev require each DIMM to be mapped linearly in
QEMU's virtual address space?).

> Moreover with current hugepages handling in QEMU including this patch and usage of
> 1Gb hugepages, QEMU might lose ~1Gb if -m "hpagesize*n+1", which by itself is a
> good reason to use several allocations with different allocator backends.
> > Understand your concern, but the complication is necessary: the host
> > virtual/physical address and guest physical addresses must be aligned on
> > largepage boundaries.
> I don't argue against it, only about the best way to achieve it.
> 
> If we assume possible conversion from adhoc way of allocating initial RAM to
> DIMM devices in the future then changing region layout several times in
> incompatible way doesn't seems to be the best approach. If we are going to
> change it, let at least minimize compatibility issues and do it right
> in the first place.
> 
> I'll post RFC patch as reply to this thread.
> 
> > 
> > Do you foresee any problem with memory hotplug?
> I don't see any problem with memory hotplug so far, but as noted above
> there will be problems with converting initial ram to DIMM devices.
> 
> > 
> > Could add a warning to memory API: if memory region is larger than 1GB
> > and RAM is 1GB backed, and not properly aligned, warn.
> Perhaps it would be better to abort and ask user to fix configuration,
> and on hugepage allocation failure not fallback to malloc but abort and
> tell user amount of hugepages needed to run guest with hugepage backend.

You want to back your guest with 1GB hugepages. You get 1 such page at a
time, worst case.

You either 

1) map the guest physical address space region (1GB sized) where
the hole is located with smaller page sizes, which must be 2MB, see *,
which requires the user to specify a different hugetlbfs mount path with
sufficient 2MB huge pages.

2) move the pieces of memory which can't be 1GB mapped backed into
1GB hugepages, and map the remaining 1GB-aligned regions to individual 1GB 
pages.

I am trying to avoid 1) as it complicates management (and fixes a bug).

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-29 18:18               ` [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions Igor Mammedov
@ 2013-10-29 21:38                 ` Marcelo Tosatti
  2013-10-30 16:49                   ` Igor Mammedov
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-29 21:38 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> Otherwise 1GB TLBs cannot be cached for the range.

This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
pages.

Since hugetlbfs allocation is static, it requires the user to inform
different 1GB and 2MB sized hugetlbfs mount points (with proper number
of corresponding hugetlbfs pages allocated). This is incompatible with
the current command line, and i'd like to see this problem handled in a
way that is command line backwards compatible.

Also, if the argument for one-to-one mapping between dimms and linear host
virtual address sections holds, it means virtual DIMMs must be
partitioned into whatever hugepage alignment is necessary (and in
that case, why they can't be partitioned similarly with the memory
region aliases?).

> PS:
> as side effect we are not wasting ~1Gb of memory if
> 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"

This is how hugetlbfs works. You waste 1GB hugepage if an extra
byte is requested.

> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
> PS2:
> As RFC it's yet without compatibility changes noted by Paolo
> 
> ---
>  exec.c                    |    8 ++++++-
>  hw/i386/pc.c              |   50 ++++++++++++++++++++++++++++----------------
>  include/exec/cpu-common.h |    1 +
>  3 files changed, 40 insertions(+), 19 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 9b6ea50..a4e5c80 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -882,7 +882,7 @@ void qemu_mutex_unlock_ramlist(void)
>  
>  #define HUGETLBFS_MAGIC       0x958458f6
>  
> -static long gethugepagesize(const char *path)
> +long gethugepagesize(const char *path)
>  {
>      struct statfs fs;
>      int ret;
> @@ -925,6 +925,12 @@ static void *file_ram_alloc(RAMBlock *block,
>          return NULL;
>      }
>  
> +    /* refuse to use huge pages if requested size isn't page aligned
> +     * to avoid wasting memory */
> +    if (memory != (memory & ~(hpagesize-1))) {
> +        return NULL;
> +    }
> +
>      if (kvm_enabled() && !kvm_has_sync_mmu()) {
>          fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
>          return NULL;
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 0c313fe..1611fa7 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1116,32 +1116,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
>      FWCfgState *fw_cfg;
> +    unsigned long hpagesize = gethugepagesize(mem_path);
> +    ram_addr_t below_4g_mem_size_alined, below_4g_mem_size_tail, above_4g_mem_size_alined, above_4g_mem_size_tail;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> -    /* Allocate RAM.  We allocate it as a single memory region and use
> -     * aliases to address portions of it, mostly for backwards compatibility
> -     * with older qemus that used qemu_ram_alloc().
> -     */
> +    *ram_memory = g_malloc(sizeof(**ram_memory));
> +    memory_region_init(*ram_memory, NULL, "pc.ram",
> +                       above_4g_mem_size == 0 ? below_4g_mem_size: 0x100000000ULL + above_4g_mem_size);
> +    memory_region_add_subregion(system_memory, 0, *ram_memory);
> +
> +    below_4g_mem_size_alined = below_4g_mem_size & ~(hpagesize - 1);
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +    memory_region_init_ram(ram, NULL, "pc.ram.low.aligned", below_4g_mem_size_alined);
> +    memory_region_add_subregion(*ram_memory, 0, ram);
>      vmstate_register_ram_global(ram);
> -    *ram_memory = ram;
> -    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> -    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> -                             0, below_4g_mem_size);
> -    memory_region_add_subregion(system_memory, 0, ram_below_4g);
> -    if (above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
> +
> +    below_4g_mem_size_tail = below_4g_mem_size - below_4g_mem_size_alined;
> +    if (below_4g_mem_size_tail) {
> +        ram = g_malloc(sizeof(*ram));
> +        memory_region_init_ram(ram, NULL, "pc.ram.low.unaligned", below_4g_mem_size_tail);
> +        memory_region_add_subregion(*ram_memory, below_4g_mem_size_alined, ram);
> +        vmstate_register_ram_global(ram);
>      }
>  
> +    if (above_4g_mem_size > 0) {
> +        above_4g_mem_size_alined = above_4g_mem_size & ~(hpagesize - 1);
> +        ram = g_malloc(sizeof(*ram));
> +        memory_region_init_ram(ram, NULL, "pc.ram.high.aligned", above_4g_mem_size_alined);
> +        memory_region_add_subregion(*ram_memory, 0x100000000ULL, ram);
> +        vmstate_register_ram_global(ram);
> +
> +        above_4g_mem_size_tail = above_4g_mem_size - above_4g_mem_size_alined;
> +        if (above_4g_mem_size_tail) {
> +            ram = g_malloc(sizeof(*ram));
> +            memory_region_init_ram(ram, NULL, "pc.ram.high.unaligned", above_4g_mem_size_tail);
> +            memory_region_add_subregion(*ram_memory, 0x100000000ULL + above_4g_mem_size_alined, ram);
> +            vmstate_register_ram_global(ram);
> +	}
> +    }
>  
>      /* Initialize PC system firmware */
>      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index 40e15e4..f89a37c 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -57,6 +57,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
>  #ifdef __linux__
>  uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
>  #endif
> +long gethugepagesize(const char *path);
>  
>  void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
>                              int len, int is_write);
> -- 
> 1.7.1

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-29 21:21                 ` Marcelo Tosatti
@ 2013-10-30  8:48                   ` Gleb Natapov
  2013-10-30 18:30                     ` Marcelo Tosatti
  0 siblings, 1 reply; 53+ messages in thread
From: Gleb Natapov @ 2013-10-30  8:48 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, Igor Mammedov, qemu-devel, Paolo Bonzini

On Tue, Oct 29, 2013 at 07:21:59PM -0200, Marcelo Tosatti wrote:
> > > Could add a warning to memory API: if memory region is larger than 1GB
> > > and RAM is 1GB backed, and not properly aligned, warn.
> > Perhaps it would be better do abort and ask user to fix configuration,
> > and on hugepage allocation failure not fallback to malloc but abort and
> > tell user amount of hugepages needed to run guest with hugepage backend.
> 
> You want to back your guest with 1GB hugepages. You get 1 such page at a
> time, worst case.
> 
> You either 
> 
> 1) map the guest physical address space region (1GB sized) where
> the hole is located with smaller page sizes, which must be 2MB, see *,
> which requires the user to specify a different hugetlbfs mount path with
> sufficient 2MB huge pages.
> 
Why not rely on THP to do the work?

> 2) move the pieces of memory which can't be 1GB mapped backed into
> 1GB hugepages, and map the remaining 1GB-aligned regions to individual 1GB 
> pages.
> 
> I am trying to avoid 1) as it complicates management (and fixes a bug).

--
			Gleb.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-25 22:53           ` Paolo Bonzini
@ 2013-10-30 11:07             ` Gerd Hoffmann
  2013-10-30 11:47               ` Paolo Bonzini
  0 siblings, 1 reply; 53+ messages in thread
From: Gerd Hoffmann @ 2013-10-30 11:07 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: aarcange, Marcelo Tosatti, qemu-devel, gleb, Michael S. Tsirkin

On Fr, 2013-10-25 at 23:53 +0100, Paolo Bonzini wrote:
> Il 25/10/2013 20:50, Marcelo Tosatti ha scritto:
> > On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote:
> >> Because offsets are zero, and lengths match the RAM block lengths, you
> >> do not need any complication with aliasing.  This still has to be done
> >> only for new machine types.
> > 
> > Not possible because you just wasted holesize bytes (if number of
> > additional bytes due to huge page alignment is smaller than holesize, a
> > new hugepage is required, which is not acceptable).
> 
> Ok.  Thanks for explaining---the patch seems good with the proper
> compatibility option in the machine type.  Please run the
> guest_memory_dump_analysis test in autotest too.

As the whole thing must depend on machine type anyway for live migration
compatibility we can also simply change the memory split, i.e. map 2GB
(-M q35) or 3GB (-M pc) below 4G instead of the odd sizes (2.75 / 3.5)
we have now.

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-30 11:07             ` Gerd Hoffmann
@ 2013-10-30 11:47               ` Paolo Bonzini
  0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2013-10-30 11:47 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: aarcange, Marcelo Tosatti, qemu-devel, gleb, Michael S. Tsirkin

Il 30/10/2013 12:07, Gerd Hoffmann ha scritto:
> On Fr, 2013-10-25 at 23:53 +0100, Paolo Bonzini wrote:
>> Il 25/10/2013 20:50, Marcelo Tosatti ha scritto:
>>> On Fri, Oct 25, 2013 at 09:52:34AM +0100, Paolo Bonzini wrote:
>>>> Because offsets are zero, and lengths match the RAM block lengths, you
>>>> do not need any complication with aliasing.  This still has to be done
>>>> only for new machine types.
>>>
>>> Not possible because you just wasted holesize bytes (if number of
>>> additional bytes due to huge page alignment is smaller than holesize, a
>>> new hugepage is required, which is not acceptable).
>>
>> Ok.  Thanks for explaining---the patch seems good with the proper
>> compatibility option in the machine type.  Please run the
>> guest_memory_dump_analysis test in autotest too.
> 
> As the whole thing must depend on machine type anyway for live migration
> compatibility we can also simply change the memory split, i.e. map 2GB
> (-M q35) or 3GB (-M pc) below 4G instead of the odd sizes (2.75 / 3.5)
> we have now.

For q35 it could be a possibility.  For pc, I'm worried about old
Windows systems that ignore all memory outside the first 4GB.  They
would lose 512 MB of memory.

Paolo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-29 21:38                 ` Marcelo Tosatti
@ 2013-10-30 16:49                   ` Igor Mammedov
  2013-10-30 18:51                     ` Marcelo Tosatti
  2013-10-30 19:31                     ` Marcelo Tosatti
  0 siblings, 2 replies; 53+ messages in thread
From: Igor Mammedov @ 2013-10-30 16:49 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Tue, 29 Oct 2013 19:38:44 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > Otherwise 1GB TLBs cannot be cached for the range.
> 
> This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> pages.
With current command line only one hugetlbfs mount point is possible, so it
will back with whatever alignment specified hugetlbfs mount point has.
Anything that doesn't fit into page aligned region goes to tail using
non hugepage backed phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.

> 
> Since hugetlbfs allocation is static, it requires the user to inform
> different 1GB and 2MB sized hugetlbfs mount points (with proper number
> of corresponding hugetlbfs pages allocated). This is incompatible with
> the current command line, and i'd like to see this problem handled in a
> way that is command line backwards compatible.
patch doesn't change that, it uses provided hugetlbfs and fallbacks (hunk 2)
to phys_mem_alloc if requested memory region is not hugepage size aligned.
So there is no any CLI change, only fixing memory leak.

> Also, if the argument for one-to-one mapping between dimms and linear host
> virtual address sections holds, it means virtual DIMMs must be
> partitioned into whatever hugepage alignment is necessary (and in
> that case, why they can't be partitioned similarly with the memory
> region aliases?).
Because during hotplug a new memory region of desired size is allocated
and it could be mapped directly without any aliasing. And if some day we
convert adhoc initial memory allocation to dimm devices there is no reason to
alloc one huge block and then invent means how to alias hole somewhere else,
we could just reuse memdev/dimm and allocate several memory regions
with desired properties each represented by a memdev/dimm pair.

one-one mapping simplifies design and interface with ACPI part during memory
hotplug.

for hotplug case flow could look like:
 memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
 #memdev could enforce size to be backend aligned
 device_add dimm,id=y1,backend=x1,addr=xxxxxx
 #dimm could get alignment from associated memdev or fail if addr
 #doesn't meet alignment of memdev backend

 memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
 device_add dimm,id=y2,backend=x2,addr=yyyyyyy

 memdev_add id=x3,size=1mb
 device_add dimm,id=y3,backend=x3,addr=xxxxxxx

linear memory block is allocated at runtime (user has to make sure that enough
hugepages are available) by each memdev_add command and that RAM memory region
is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
aliasing.

Now back to initial memory and the bright future we are looking forward to (i.e.
ability to create machine from configuration file without adhoc coding
like(pc_memory_init)):

legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
translated into:

-memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
-memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
-memdev id=x3,size=512m -device dimm,backend=x3,addr=5g

or user could drop legacy CLI and assume fine grained control over memory
configuration:

-memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
-memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
-memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g

So if we are going to break migration compatibility for new machine type
lets do a way that could painlessly changed to memdev/device in future.

> > PS:
> > as side effect we are not wasting ~1Gb of memory if
> > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> 
> This is how hugetlbfs works. You waste 1GB hugepage if an extra
> byte is requested.
it looks more like a bug than a feature,
why do it if leak could be avoided as shown below?

> 
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> > PS2:
> > As RFC it's yet without compatibility changes noted by Paolo
> > 
> > ---
> >  exec.c                    |    8 ++++++-
> >  hw/i386/pc.c              |   50 ++++++++++++++++++++++++++++----------------
> >  include/exec/cpu-common.h |    1 +
> >  3 files changed, 40 insertions(+), 19 deletions(-)
> > 
> > diff --git a/exec.c b/exec.c
> > index 9b6ea50..a4e5c80 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -882,7 +882,7 @@ void qemu_mutex_unlock_ramlist(void)
> >  
> >  #define HUGETLBFS_MAGIC       0x958458f6
> >  
> > -static long gethugepagesize(const char *path)
> > +long gethugepagesize(const char *path)
> >  {
> >      struct statfs fs;
> >      int ret;
> > @@ -925,6 +925,12 @@ static void *file_ram_alloc(RAMBlock *block,
> >          return NULL;
> >      }
> >  
> > +    /* refuse to use huge pages if requested size isn't page aligned
> > +     * to avoid wasting memory */
> > +    if (memory != (memory & ~(hpagesize-1))) {
> > +        return NULL;
> > +    }
> > +
> >      if (kvm_enabled() && !kvm_has_sync_mmu()) {
> >          fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
> >          return NULL;
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 0c313fe..1611fa7 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1116,32 +1116,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >  {
> >      int linux_boot, i;
> >      MemoryRegion *ram, *option_rom_mr;
> > -    MemoryRegion *ram_below_4g, *ram_above_4g;
> >      FWCfgState *fw_cfg;
> > +    unsigned long hpagesize = gethugepagesize(mem_path);
> > +    ram_addr_t below_4g_mem_size_alined, below_4g_mem_size_tail, above_4g_mem_size_alined, above_4g_mem_size_tail;
> >  
> >      linux_boot = (kernel_filename != NULL);
> >  
> > -    /* Allocate RAM.  We allocate it as a single memory region and use
> > -     * aliases to address portions of it, mostly for backwards compatibility
> > -     * with older qemus that used qemu_ram_alloc().
> > -     */
> > +    *ram_memory = g_malloc(sizeof(**ram_memory));
> > +    memory_region_init(*ram_memory, NULL, "pc.ram",
> > +                       above_4g_mem_size == 0 ? below_4g_mem_size: 0x100000000ULL + above_4g_mem_size);
> > +    memory_region_add_subregion(system_memory, 0, *ram_memory);
> > +
> > +    below_4g_mem_size_alined = below_4g_mem_size & ~(hpagesize - 1);
> >      ram = g_malloc(sizeof(*ram));
> > -    memory_region_init_ram(ram, NULL, "pc.ram",
> > -                           below_4g_mem_size + above_4g_mem_size);
> > +    memory_region_init_ram(ram, NULL, "pc.ram.low.aligned", below_4g_mem_size_alined);
> > +    memory_region_add_subregion(*ram_memory, 0, ram);
> >      vmstate_register_ram_global(ram);
> > -    *ram_memory = ram;
> > -    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> > -    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> > -                             0, below_4g_mem_size);
> > -    memory_region_add_subregion(system_memory, 0, ram_below_4g);
> > -    if (above_4g_mem_size > 0) {
> > -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > -                                 below_4g_mem_size, above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > -                                    ram_above_4g);
> > +
> > +    below_4g_mem_size_tail = below_4g_mem_size - below_4g_mem_size_alined;
> > +    if (below_4g_mem_size_tail) {
> > +        ram = g_malloc(sizeof(*ram));
> > +        memory_region_init_ram(ram, NULL, "pc.ram.low.unaligned", below_4g_mem_size_tail);
> > +        memory_region_add_subregion(*ram_memory, below_4g_mem_size_alined, ram);
> > +        vmstate_register_ram_global(ram);
> >      }
> >  
> > +    if (above_4g_mem_size > 0) {
> > +        above_4g_mem_size_alined = above_4g_mem_size & ~(hpagesize - 1);
> > +        ram = g_malloc(sizeof(*ram));
> > +        memory_region_init_ram(ram, NULL, "pc.ram.high.aligned", above_4g_mem_size_alined);
> > +        memory_region_add_subregion(*ram_memory, 0x100000000ULL, ram);
> > +        vmstate_register_ram_global(ram);
> > +
> > +        above_4g_mem_size_tail = above_4g_mem_size - above_4g_mem_size_alined;
> > +        if (above_4g_mem_size_tail) {
> > +            ram = g_malloc(sizeof(*ram));
> > +            memory_region_init_ram(ram, NULL, "pc.ram.high.unaligned", above_4g_mem_size_tail);
> > +            memory_region_add_subregion(*ram_memory, 0x100000000ULL + above_4g_mem_size_alined, ram);
> > +            vmstate_register_ram_global(ram);
> > +	}
> > +    }
> >  
> >      /* Initialize PC system firmware */
> >      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
> > diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> > index 40e15e4..f89a37c 100644
> > --- a/include/exec/cpu-common.h
> > +++ b/include/exec/cpu-common.h
> > @@ -57,6 +57,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
> >  #ifdef __linux__
> >  uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
> >  #endif
> > +long gethugepagesize(const char *path);
> >  
> >  void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
> >                              int len, int is_write);
> > -- 
> > 1.7.1

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary
  2013-10-30  8:48                   ` Gleb Natapov
@ 2013-10-30 18:30                     ` Marcelo Tosatti
  0 siblings, 0 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-30 18:30 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: aarcange, Igor Mammedov, qemu-devel, Paolo Bonzini

On Wed, Oct 30, 2013 at 10:48:13AM +0200, Gleb Natapov wrote:
> On Tue, Oct 29, 2013 at 07:21:59PM -0200, Marcelo Tosatti wrote:
> > > > Could add a warning to memory API: if memory region is larger than 1GB
> > > > and RAM is 1GB backed, and not properly aligned, warn.
> > > Perhaps it would be better do abort and ask user to fix configuration,
> > > and on hugepage allocation failure not fallback to malloc but abort and
> > > tell user amount of hugepages needed to run guest with hugepage backend.
> > 
> > You want to back your guest with 1GB hugepages. You get 1 such page at a
> > time, worst case.
> > 
> > You either 
> > 
> > 1) map the guest physical address space region (1GB sized) where
> > the hole is located with smaller page sizes, which must be 2MB, see *,
> > which requires the user to specify a different hugetlbfs mount path with
> > sufficient 2MB huge pages.
> > 
> Why not rely on THP to do the work?

Because it might be the case that static hugepage assignment is desired
(no guarantees of backing with THP).

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-30 16:49                   ` Igor Mammedov
@ 2013-10-30 18:51                     ` Marcelo Tosatti
  2013-10-30 19:03                       ` Marcelo Tosatti
  2013-10-30 19:56                       ` Igor Mammedov
  2013-10-30 19:31                     ` Marcelo Tosatti
  1 sibling, 2 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-30 18:51 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> On Tue, 29 Oct 2013 19:38:44 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > pages.
> With current command line only one hugetlbfs mount point is possible, so it
> will back with whatever alignment specified hugetlbfs mount point has.
> Anything that doesn't fit into page aligned region goes to tail using
> non hugepage backed phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.

The patch you propose allocates the non-1GB aligned tail of RAM with 4k
pages. As mentioned, this is not acceptable (2MB pages should be used
whenever 1GB alignment is not possible).

I believe its easier for the user to allocate enough 1GB pages to back
all of guest RAM, since allocation is static, than for him to allocate
mixed 1GB/2MB pages in hugetlbfs.

> > Since hugetlbfs allocation is static, it requires the user to inform
> > different 1GB and 2MB sized hugetlbfs mount points (with proper number
> > of corresponding hugetlbfs pages allocated). This is incompatible with
> > the current command line, and i'd like to see this problem handled in a
> > way that is command line backwards compatible.
> patch doesn't change that, it uses provided hugetlbfs and fallbacks (hunk 2)
> to phys_mem_alloc if requested memory region is not hugepage size aligned.
> So there is no any CLI change, only fixing memory leak.
> 
> > Also, if the argument for one-to-one mapping between dimms and linear host
> > virtual address sections holds, it means virtual DIMMs must be
> > partitioned into whatever hugepage alignment is necessary (and in
> > that case, why they can't be partitioned similarly with the memory
> > region aliases?).
> Because during hotplug a new memory region of desired size is allocated
> and it could be mapped directly without any aliasing. And if some day we
> convert adhoc initial memory allocation to dimm devices there is no reason to
> alloc one huge block and then invent means how to alias hole somewhere else,
> we could just reuse memdev/dimm and allocate several memory regions
> with desired properties each represented by a memdev/dimm pair.
> 
> one-one mapping simplifies design and interface with ACPI part during memory
> hotplug.
> 
> for hotplug case flow could look like:
>  memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
>  #memdev could enforce size to be backend aligned
>  device_add dimm,id=y1,backend=x1,addr=xxxxxx
>  #dimm could get alignment from associated memdev or fail if addr
>  #doesn't meet alignment of memdev backend
> 
>  memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
>  device_add dimm,id=y2,backend=x2,addr=yyyyyyy
> 
>  memdev_add id=x3,size=1mb
>  device_add dimm,id=y3,backend=x3,addr=xxxxxxx
> 
> linear memory block is allocated at runtime (user has to make sure that enough
> hugepages are available) by each memdev_add command and that RAM memory region
> is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
> aliasing.
> 
> Now back to intial memory and bright future we are looking forward to (i.e.
> ability to create machine from configuration file without adhoc codding
> like(pc_memory_init)):
> 
> legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
> translated into:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m -device dimm,backend=x3,addr=5g
> 
> or user could drop legacy CLI and assume fine grained control over memory
> configuration:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g
> 
> So if we are going to break migration compatibility for new machine type
> lets do a way that could painlessly changed to memdev/device in future.

Ok then please improve your proposal to allow for multiple hugetlbfs
mount points.

> > > PS:
> > > as side effect we are not wasting ~1Gb of memory if
> > > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> > 
> > This is how hugetlbfs works. You waste 1GB hugepage if an extra
> > byte is requested.
> it looks more a bug than feature,
> why do it if leak could be avoided as shown below?

Because IMO it is confusing for the user, since hugetlbfs allocation is
static. But if you have a necessity for the one-to-one relationship, 
feel free to support mixed hugetlbfs page sizes.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-30 18:51                     ` Marcelo Tosatti
@ 2013-10-30 19:03                       ` Marcelo Tosatti
  2013-10-30 19:56                       ` Igor Mammedov
  1 sibling, 0 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-30 19:03 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Wed, Oct 30, 2013 at 04:51:29PM -0200, Marcelo Tosatti wrote:
> On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> > On Tue, 29 Oct 2013 19:38:44 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
> > > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > 
> > > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > > pages.
> > With current command line only one hugetlbfs mount point is possible, so it
> > will back with whatever alignment specified hugetlbfs mount point has.
> > Anything that doesn't fit into page aligned region goes to tail using
> > non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.
> 
> The patch you propose allocates the non-1GB aligned tail of RAM with 4k
> pages. As mentioned, this is not acceptable (2MB pages should be used
> whenever 1GB alignment is not possible).

Tail and 3-4GB region.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-30 16:49                   ` Igor Mammedov
  2013-10-30 18:51                     ` Marcelo Tosatti
@ 2013-10-30 19:31                     ` Marcelo Tosatti
  2013-10-30 20:28                       ` Paolo Bonzini
  1 sibling, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-30 19:31 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> On Tue, 29 Oct 2013 19:38:44 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > pages.
> With current command line only one hugetlbfs mount point is possible, so it
> will back with whatever alignment specified hugetlbfs mount point has.
> Anything that doesn't fit into page aligned region goes to tail using
> non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.

Thats why it fails to back non-1GB-aligned gpas with 2MB large pages.

> > 
> > Since hugetlbfs allocation is static, it requires the user to inform
> > different 1GB and 2MB sized hugetlbfs mount points (with proper number
> > of corresponding hugetlbfs pages allocated). This is incompatible with
> > the current command line, and i'd like to see this problem handled in a
> > way that is command line backwards compatible.
> patch doesn't change that, it uses provided hugetlbfs and fallbacks (hunk 2)
> to phys_mem_alloc if requested memory region is not hugepage size aligned.
> So there is no any CLI change, only fixing memory leak.
> 
> > Also, if the argument for one-to-one mapping between dimms and linear host
> > virtual address sections holds, it means virtual DIMMs must be
> > partitioned into whatever hugepage alignment is necessary (and in
> > that case, why they can't be partitioned similarly with the memory
> > region aliases?).
> Because during hotplug a new memory region of desired size is allocated
> and it could be mapped directly without any aliasing. And if some day we
> convert adhoc initial memory allocation to dimm devices there is no reason to
> alloc one huge block and then invent means how to alias hole somewhere else,
> we could just reuse memdev/dimm and allocate several memory regions
> with desired properties each represented by a memdev/dimm pair.
> 
> one-one mapping simplifies design and interface with ACPI part during memory
> hotplug.
> 
> for hotplug case flow could look like:
>  memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
>  #memdev could enforce size to be backend aligned
>  device_add dimm,id=y1,backend=x1,addr=xxxxxx
>  #dimm could get alignment from associated memdev or fail if addr
>  #doesn't meet alignment of memdev backend
> 
>  memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
>  device_add dimm,id=y2,backend=x2,addr=yyyyyyy
> 
>  memdev_add id=x3,size=1mb
>  device_add dimm,id=y3,backend=x3,addr=xxxxxxx
> 
> linear memory block is allocated at runtime (user has to make sure that enough
> hugepages are available) by each memdev_add command and that RAM memory region
> is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
> aliasing.

Actually, it is awkward to require the user to supply mixed 1GB/2MB huge
page sizes. The number of 1GB pages and 2MB pages must be exact, because
they are preallocated, so for different machine types you have different
numbers.

Also, dealing with inability to migrate between machine types is a
matter of reinitialization of the guest.

I do not believe that inability to deal with memory aliases on the
future ability to create machine from configuration is strong enough
reason to justify pushing mixed 1GB and 2MB pages size to the user (1GB
pages are allocated on the host kernel command line).

> Now back to intial memory and bright future we are looking forward to (i.e.
> ability to create machine from configuration file without adhoc codding
> like(pc_memory_init)):
> 
> legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
> translated into:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m -device dimm,backend=x3,addr=5g
> 
> or user could drop legacy CLI and assume fine grained control over memory
> configuration:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g
> 
> So if we are going to break migration compatibility for new machine type
> lets do a way that could painlessly changed to memdev/device in future.

You can just separate backing device and dimm device creation into two 
commands with offsets.

Which might be a good idea anyway, BTW.

> > > PS:
> > > as side effect we are not wasting ~1Gb of memory if
> > > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> > 
> > This is how hugetlbfs works. You waste 1GB hugepage if an extra
> > byte is requested.
> it looks more a bug than feature,
> why do it if leak could be avoided as shown below?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-30 18:51                     ` Marcelo Tosatti
  2013-10-30 19:03                       ` Marcelo Tosatti
@ 2013-10-30 19:56                       ` Igor Mammedov
  2013-10-30 23:44                         ` Marcelo Tosatti
  1 sibling, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-10-30 19:56 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Wed, 30 Oct 2013 16:51:29 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> > On Tue, 29 Oct 2013 19:38:44 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
> > > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > 
> > > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > > pages.
> > With current command line only one hugetlbfs mount point is possible, so it
> > will back with whatever alignment specified hugetlbfs mount point has.
> > Anything that doesn't fit into page aligned region goes to tail using
> > non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.
> 
> The patch you propose allocates the non-1GB aligned tail of RAM with 4k
> pages. As mentioned, this is not acceptable (2MB pages should be used
> whenever 1GB alignment is not possible).
> 
> I believe its easier for the user to allocate enough 1GB pages to back
> all of guest RAM, since allocation is static, than for him to allocate
> mixed 1GB/2MB pages in hugetlbfs.
the tail, if it's present at all, is always less than 1Gb.
Why is a tail allocated with 4K pages not acceptable?

btw: now if QEMU can't allocate hugepages for the whole guest size it will fall back
to 4k pages anyway for whole guest size, with warning that isn't visible if
user doesn't start QEMU manually.

> 
> > > Since hugetlbfs allocation is static, it requires the user to inform
> > > different 1GB and 2MB sized hugetlbfs mount points (with proper number
> > > of corresponding hugetlbfs pages allocated). This is incompatible with
> > > the current command line, and i'd like to see this problem handled in a
> > > way that is command line backwards compatible.
> > patch doesn't change that, it uses provided hugetlbfs and fallbacks (hunk 2)
> > to phys_mem_alloc if requested memory region is not hugepage size aligned.
> > So there is no any CLI change, only fixing memory leak.
> > 
> > > Also, if the argument for one-to-one mapping between dimms and linear host
> > > virtual address sections holds, it means virtual DIMMs must be
> > > partitioned into whatever hugepage alignment is necessary (and in
> > > that case, why they can't be partitioned similarly with the memory
> > > region aliases?).
> > Because during hotplug a new memory region of desired size is allocated
> > and it could be mapped directly without any aliasing. And if some day we
> > convert adhoc initial memory allocation to dimm devices there is no reason to
> > alloc one huge block and then invent means how to alias hole somewhere else,
> > we could just reuse memdev/dimm and allocate several memory regions
> > with desired properties each represented by a memdev/dimm pair.
> > 
> > one-one mapping simplifies design and interface with ACPI part during memory
> > hotplug.
> > 
> > for hotplug case flow could look like:
> >  memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
> >  #memdev could enforce size to be backend aligned
> >  device_add dimm,id=y1,backend=x1,addr=xxxxxx
> >  #dimm could get alignment from associated memdev or fail if addr
> >  #doesn't meet alignment of memdev backend
> > 
> >  memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
> >  device_add dimm,id=y2,backend=x2,addr=yyyyyyy
> > 
> >  memdev_add id=x3,size=1mb
> >  device_add dimm,id=y3,backend=x3,addr=xxxxxxx
> > 
> > linear memory block is allocated at runtime (user has to make sure that enough
> > hugepages are available) by each memdev_add command and that RAM memory region
> > is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
> > aliasing.
> > 
> > Now back to intial memory and bright future we are looking forward to (i.e.
> > ability to create machine from configuration file without adhoc codding
> > like(pc_memory_init)):
> > 
> > legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
> > translated into:
> > 
> > -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> > -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> > -memdev id=x3,size=512m -device dimm,backend=x3,addr=5g
> > 
> > or user could drop legacy CLI and assume fine grained control over memory
> > configuration:
> > 
> > -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> > -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> > -memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g
> > 
> > So if we are going to break migration compatibility for new machine type
> > lets do a way that could painlessly changed to memdev/device in future.
> 
> Ok then please improve your proposal to allow for multiple hugetlbfs
> mount points.
> 
> > > > PS:
> > > > as side effect we are not wasting ~1Gb of memory if
> > > > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> > > 
> > > This is how hugetlbfs works. You waste 1GB hugepage if an extra
> > > byte is requested.
> > it looks more a bug than feature,
> > why do it if leak could be avoided as shown below?
> 
> Because IMO it is confusing for the user, since hugetlbfs allocation is
> static. But if you have a necessity for the one-to-one relationship, 
> feel free to support mixed hugetlbfs page sizes.
> 
> 


-- 
Regards,
  Igor

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-30 19:31                     ` Marcelo Tosatti
@ 2013-10-30 20:28                       ` Paolo Bonzini
  0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2013-10-30 20:28 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, Igor Mammedov, afaerber, rth

Il 30/10/2013 20:31, Marcelo Tosatti ha scritto:
> 
> I do not believe that the inability to deal with memory aliases in the
> future ability to create a machine from configuration is a strong enough
> reason to justify pushing mixed 1GB and 2MB page sizes to the user (1GB
> pages are allocated on the host kernel command line).

I agree.  The memory allocation in your patch is a bit awkward, but from
the user's point of view it is by far the cleanest.

Paolo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-30 19:56                       ` Igor Mammedov
@ 2013-10-30 23:44                         ` Marcelo Tosatti
  2013-11-07 15:25                           ` Igor Mammedov
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-10-30 23:44 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Wed, Oct 30, 2013 at 08:56:17PM +0100, Igor Mammedov wrote:
> On Wed, 30 Oct 2013 16:51:29 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> > > On Tue, 29 Oct 2013 19:38:44 -0200
> > > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > 
> > > > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > > 
> > > > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > > > pages.
> > > With current command line only one hugetlbfs mount point is possible, so it
> > > will back with whatever alignment specified hugetlbfs mount point has.
> > > Anything that doesn't fit into page aligned region goes to tail using
> > > non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.
> > 
> > The patch you propose allocates the non-1GB aligned tail of RAM with 4k
> > pages. As mentioned, this is not acceptable (2MB pages should be used
> > whenever 1GB alignment is not possible).
> > 
> > I believe it's easier for the user to allocate enough 1GB pages to back
> > all of guest RAM, since allocation is static, than for him to allocate
> > mixed 1GB/2MB pages in hugetlbfs.
> The tail, if it's present at all, is always less than 1Gb.
> Why is a tail allocated with 4K pages not acceptable?

Depends on workload.

> btw: now if QEMU can't allocate hugepages for the whole guest size it will fall back
> to 4k pages anyway for the whole guest size, with a warning that isn't visible if the
> user doesn't start QEMU manually.

Not with -mem-prealloc.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v2)
  2013-10-24 21:12 ` [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary Marcelo Tosatti
  2013-10-24 21:55   ` Peter Maydell
  2013-10-24 23:55   ` Paolo Bonzini
@ 2013-11-06  1:49   ` Marcelo Tosatti
  2013-11-06  1:55     ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3) Marcelo Tosatti
  2 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-06  1:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: aarcange, pbonzini, Gerd Hoffmann, gleb, Igor Mammedov


Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary, in case hugetlbfs is used.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0c313fe..534e067 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1116,7 +1116,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
 
     linux_boot = (kernel_filename != NULL);
@@ -1136,10 +1136,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (guest_info->gb_align) {
+            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
+
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    0x100000000ULL,
+                                    above_4g_mem_size - holesize);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, holesize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - holesize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                    ram_above_4g);
+        }
     }
 
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index c6042c7..305a4cd 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -59,6 +59,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
 
 static bool has_pvpanic;
 static bool has_pci_info = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -124,6 +125,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
     guest_info = pc_guest_info_init(below_4g_mem_size, above_4g_mem_size);
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -236,8 +238,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
     pc_init1(args, 1, 1);
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+    gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
 }
@@ -269,6 +277,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
     disable_kvm_pv_eoi();
 }
 
+static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -339,13 +353,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
     .desc = "Standard PC (i440FX + PIIX, 1996)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
+static QEMUMachine pc_i440fx_machine_v1_8 = {
+    PC_I440FX_1_8_MACHINE_OPTIONS,
+    .name = "pc-i440fx-1.8",
+    .alias = "pc",
+    .init = pc_init_pci,
+    .is_default = 1,
+};
+
 #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
 static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .alias = "pc",
-    .init = pc_init_pci,
-    .is_default = 1,
+    .init = pc_init_pci_1_7,
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
@@ -747,6 +769,7 @@ static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_i440fx_machine_v1_8);
     qemu_register_machine(&pc_i440fx_machine_v1_7);
     qemu_register_machine(&pc_i440fx_machine_v1_6);
     qemu_register_machine(&pc_i440fx_machine_v1_5);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index ca84e1c..45bec19 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -49,6 +49,7 @@
 
 static bool has_pvpanic;
 static bool has_pci_info = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -111,6 +112,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info = pc_guest_info_init(below_4g_mem_size, above_4g_mem_size);
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -220,8 +222,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     }
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+   gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
 }
@@ -240,6 +248,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
     x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
 }
 
+static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_q35_init(args);
+}
+
 static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -263,13 +277,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
     .desc = "Standard PC (Q35 + ICH9, 2009)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
+
+static QEMUMachine pc_q35_machine_v1_8 = {
+    PC_Q35_1_8_MACHINE_OPTIONS,
+    .name = "pc-q35-1.8",
+    .alias = "q35",
+    .init = pc_q35_init,
+};
+
 #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
 
 static QEMUMachine pc_q35_machine_v1_7 = {
     PC_Q35_1_7_MACHINE_OPTIONS,
     .name = "pc-q35-1.7",
     .alias = "q35",
-    .init = pc_q35_init,
+    .init = pc_q35_init_1_7,
 };
 
 #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
@@ -310,6 +333,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
 
 static void pc_q35_machine_init(void)
 {
+    qemu_register_machine(&pc_q35_machine_v1_8);
     qemu_register_machine(&pc_q35_machine_v1_7);
     qemu_register_machine(&pc_q35_machine_v1_6);
     qemu_register_machine(&pc_q35_machine_v1_5);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 6083839..00afe4a 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -20,6 +20,7 @@ typedef struct PcPciInfo {
 struct PcGuestInfo {
     bool has_pci_info;
     bool isapc_ram_fw;
+    bool gb_align;
     FWCfgState *fw_cfg;
 };
 

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3)
  2013-11-06  1:49   ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v2) Marcelo Tosatti
@ 2013-11-06  1:55     ` Marcelo Tosatti
  2013-11-06 11:59       ` Igor Mammedov
  2013-11-06 21:31       ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4) Marcelo Tosatti
  0 siblings, 2 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-06  1:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: aarcange, pbonzini, Gerd Hoffmann, gleb, Igor Mammedov



v2: condition enablement of new mapping to new machine types (Paolo)
v3: fix changelog

-----


Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0c313fe..534e067 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1116,7 +1116,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
 
     linux_boot = (kernel_filename != NULL);
@@ -1136,10 +1136,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (guest_info->gb_align) {
+            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
+
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    0x100000000ULL,
+                                    above_4g_mem_size - holesize);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, holesize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - holesize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                    ram_above_4g);
+        }
     }
 
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index c6042c7..305a4cd 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -59,6 +59,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
 
 static bool has_pvpanic;
 static bool has_pci_info = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -124,6 +125,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
     guest_info = pc_guest_info_init(below_4g_mem_size, above_4g_mem_size);
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -236,8 +238,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
     pc_init1(args, 1, 1);
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+    gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
 }
@@ -269,6 +277,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
     disable_kvm_pv_eoi();
 }
 
+static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -339,13 +353,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
     .desc = "Standard PC (i440FX + PIIX, 1996)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
+static QEMUMachine pc_i440fx_machine_v1_8 = {
+    PC_I440FX_1_8_MACHINE_OPTIONS,
+    .name = "pc-i440fx-1.8",
+    .alias = "pc",
+    .init = pc_init_pci,
+    .is_default = 1,
+};
+
 #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
 static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .alias = "pc",
-    .init = pc_init_pci,
-    .is_default = 1,
+    .init = pc_init_pci_1_7,
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
@@ -747,6 +769,7 @@ static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_i440fx_machine_v1_8);
     qemu_register_machine(&pc_i440fx_machine_v1_7);
     qemu_register_machine(&pc_i440fx_machine_v1_6);
     qemu_register_machine(&pc_i440fx_machine_v1_5);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index ca84e1c..45bec19 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -49,6 +49,7 @@
 
 static bool has_pvpanic;
 static bool has_pci_info = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -111,6 +112,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info = pc_guest_info_init(below_4g_mem_size, above_4g_mem_size);
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -220,8 +222,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     }
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+   gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
 }
@@ -240,6 +248,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
     x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
 }
 
+static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_q35_init(args);
+}
+
 static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -263,13 +277,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
     .desc = "Standard PC (Q35 + ICH9, 2009)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
+
+static QEMUMachine pc_q35_machine_v1_8 = {
+    PC_Q35_1_8_MACHINE_OPTIONS,
+    .name = "pc-q35-1.8",
+    .alias = "q35",
+    .init = pc_q35_init,
+};
+
 #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
 
 static QEMUMachine pc_q35_machine_v1_7 = {
     PC_Q35_1_7_MACHINE_OPTIONS,
     .name = "pc-q35-1.7",
     .alias = "q35",
-    .init = pc_q35_init,
+    .init = pc_q35_init_1_7,
 };
 
 #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
@@ -310,6 +333,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
 
 static void pc_q35_machine_init(void)
 {
+    qemu_register_machine(&pc_q35_machine_v1_8);
     qemu_register_machine(&pc_q35_machine_v1_7);
     qemu_register_machine(&pc_q35_machine_v1_6);
     qemu_register_machine(&pc_q35_machine_v1_5);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 6083839..00afe4a 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -20,6 +20,7 @@ typedef struct PcPciInfo {
 struct PcGuestInfo {
     bool has_pci_info;
     bool isapc_ram_fw;
+    bool gb_align;
     FWCfgState *fw_cfg;
 };
 

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3)
  2013-11-06  1:55     ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3) Marcelo Tosatti
@ 2013-11-06 11:59       ` Igor Mammedov
  2013-11-06 12:07         ` Paolo Bonzini
  2013-11-06 21:31       ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4) Marcelo Tosatti
  1 sibling, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-11-06 11:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: aarcange, pbonzini, qemu-devel, gleb, Gerd Hoffmann

On Tue, 5 Nov 2013 23:55:43 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> 
> 
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> 
> -----
> 
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 0c313fe..534e067 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1116,7 +1116,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
[...]
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (guest_info->gb_align) {
'gb_align' is one shot usage, it would be better to just add it as an argument
to pc_memory_init(). That would allow to avoid extending PcGuestInfo needlessly,
since gb_align isn't reused.

> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    0x100000000ULL,
> +                                    above_4g_mem_size - holesize);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
[...]
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 6083839..00afe4a 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -20,6 +20,7 @@ typedef struct PcPciInfo {
>  struct PcGuestInfo {
>      bool has_pci_info;
>      bool isapc_ram_fw;
> +    bool gb_align;
>      FWCfgState *fw_cfg;
>  };
it doesn't apply anymore.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3)
  2013-11-06 11:59       ` Igor Mammedov
@ 2013-11-06 12:07         ` Paolo Bonzini
  2013-11-06 12:22           ` Igor Mammedov
  0 siblings, 1 reply; 53+ messages in thread
From: Paolo Bonzini @ 2013-11-06 12:07 UTC (permalink / raw)
  To: Igor Mammedov; +Cc: aarcange, Marcelo Tosatti, qemu-devel, gleb, Gerd Hoffmann

Il 06/11/2013 12:59, Igor Mammedov ha scritto:
> On Tue, 5 Nov 2013 23:55:43 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
>>
>>
>> v2: condition enablement of new mapping to new machine types (Paolo)
>> v3: fix changelog
>>
>> -----
>>
>>
>> Align guest physical address and host physical address
>> beyond guest 4GB on a 1GB boundary.
>>
>> Otherwise 1GB TLBs cannot be cached for the range.
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>>
>> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
>> index 0c313fe..534e067 100644
>> --- a/hw/i386/pc.c
>> +++ b/hw/i386/pc.c
>> @@ -1116,7 +1116,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> [...]
>> +        /*
>> +         *
>> +         * If 1GB hugepages are used to back guest RAM, map guest address
>> +         * space in the range [ramsize,ramsize+holesize] to the ram block
>> +         * range [holestart, 4GB]
>> +         *
>> +         *                      0      h     4G     [ramsize,ramsize+holesize]
>> +         *
>> +         * guest-addr-space     [      ]     [      ][xxx]
>> +         *                                /----------/
>> +         * contiguous-ram-block [      ][xxx][     ]
>> +         *
>> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
>> +         * at the host physical address space.
>> +         *
>> +         */
>> +        if (guest_info->gb_align) {
> 'gb_align' is one shot usage, it would be better to just add it as an argument
> to pc_memory_init(). That would allow to avoid extending PcGuestInfo needlessly,
> since gb_align isn't reused.

No, Marcelo's way is better.  pc_memory_init already has too many
arguments, moving them to PcGuestInfo (which ultimately might become
properties of the /machine QOM object) is the right thing to do.

Paolo

>> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
>> +
>> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>> +                                    0x100000000ULL,
>> +                                    above_4g_mem_size - holesize);
>> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>>                                      ram_above_4g);
> [...]
>> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
>> index 6083839..00afe4a 100644
>> --- a/include/hw/i386/pc.h
>> +++ b/include/hw/i386/pc.h
>> @@ -20,6 +20,7 @@ typedef struct PcPciInfo {
>>  struct PcGuestInfo {
>>      bool has_pci_info;
>>      bool isapc_ram_fw;
>> +    bool gb_align;
>>      FWCfgState *fw_cfg;
>>  };
> it doesn't apply anymore.
> 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3)
  2013-11-06 12:07         ` Paolo Bonzini
@ 2013-11-06 12:22           ` Igor Mammedov
  2013-11-06 12:24             ` Paolo Bonzini
  0 siblings, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-11-06 12:22 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: aarcange, Marcelo Tosatti, qemu-devel, gleb, Gerd Hoffmann

On Wed, 06 Nov 2013 13:07:26 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 06/11/2013 12:59, Igor Mammedov ha scritto:
> > On Tue, 5 Nov 2013 23:55:43 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
> >>
> >>
> >> v2: condition enablement of new mapping to new machine types (Paolo)
> >> v3: fix changelog
> >>
> >> -----
> >>
> >>
> >> Align guest physical address and host physical address
> >> beyond guest 4GB on a 1GB boundary.
> >>
> >> Otherwise 1GB TLBs cannot be cached for the range.
> >>
> >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >>
> >> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> >> index 0c313fe..534e067 100644
> >> --- a/hw/i386/pc.c
> >> +++ b/hw/i386/pc.c
> >> @@ -1116,7 +1116,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> > [...]
> >> +        /*
> >> +         *
> >> +         * If 1GB hugepages are used to back guest RAM, map guest address
> >> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> >> +         * range [holestart, 4GB]
> >> +         *
> >> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> >> +         *
> >> +         * guest-addr-space     [      ]     [      ][xxx]
> >> +         *                                /----------/
> >> +         * contiguous-ram-block [      ][xxx][     ]
> >> +         *
> >> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> >> +         * at the host physical address space.
> >> +         *
> >> +         */
> >> +        if (guest_info->gb_align) {
> > 'gb_align' is one shot usage, it would be better to just add it as an argument
> > to pc_memory_init(). That would allow to avoid extending PcGuestInfo needlessly,
> > since gb_align isn't reused.
> 
> No, Marcelo's way is better.  pc_memory_init already has too many
> arguments, moving them to PcGuestInfo (which ultimately might become
> properties of the /machine QOM object) is the right thing to do.
In general I agree. But unless there are plans to reuse gb_align in the future,
it doesn't really belong in PcGuestInfo (this change however looks like
cannibalizing the structure for argument passing only).

> 
> Paolo
> 
> >> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> >> +
> >> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> >> +                                    0x100000000ULL,
> >> +                                    above_4g_mem_size - holesize);
> >> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> >>                                      ram_above_4g);
> > [...]
> >> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> >> index 6083839..00afe4a 100644
> >> --- a/include/hw/i386/pc.h
> >> +++ b/include/hw/i386/pc.h
> >> @@ -20,6 +20,7 @@ typedef struct PcPciInfo {
> >>  struct PcGuestInfo {
> >>      bool has_pci_info;
> >>      bool isapc_ram_fw;
> >> +    bool gb_align;
> >>      FWCfgState *fw_cfg;
> >>  };
> > it doesn't apply anymore.
> > 
> 
> 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3)
  2013-11-06 12:22           ` Igor Mammedov
@ 2013-11-06 12:24             ` Paolo Bonzini
  0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2013-11-06 12:24 UTC (permalink / raw)
  To: Igor Mammedov; +Cc: aarcange, Marcelo Tosatti, qemu-devel, gleb, Gerd Hoffmann

Il 06/11/2013 13:22, Igor Mammedov ha scritto:
>>> > > 'gb_align' is one shot usage, it would be better to just add it as an argument
>>> > > to pc_memory_init(). That would allow to avoid extending PcGuestInfo needlessly,
>>> > > since gb_align isn't reused.
>> > 
>> > No, Marcelo's way is better.  pc_memory_init already has too many
>> > arguments, moving them to PcGuestInfo (which ultimately might become
>> > properties of the /machine QOM object) is the right thing to do.
> In general I agree. But unless there is plans to reuse gb_align in future,

The plan is to turn it (and also has_acpi_build etc.) into a global
property that can be set using the usual compat-property machinery.

> it doesn't really belong to PcGuestInfo (this change however looks like
> cannibalizing structure for argument passing only).

For now, it doesn't just look like that, it is like that. :)  But it is
still a step in the right direction.

Paolo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4)
  2013-11-06  1:55     ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3) Marcelo Tosatti
  2013-11-06 11:59       ` Igor Mammedov
@ 2013-11-06 21:31       ` Marcelo Tosatti
  2013-11-06 21:40         ` Michael S. Tsirkin
  2013-11-07 15:24         ` Igor Mammedov
  1 sibling, 2 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-06 21:31 UTC (permalink / raw)
  To: qemu-devel
  Cc: aarcange, gleb, Michael S. Tsirkin, Gerd Hoffmann, Igor Mammedov,
	pbonzini


v2: condition enablement of new mapping to new machine types (Paolo)
v3: fix changelog
v4: rebase

-----


Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>


diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 12c436e..d29196d 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1156,7 +1156,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
 
     linux_boot = (kernel_filename != NULL);
@@ -1177,10 +1177,45 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (guest_info->gb_align) {
+            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
+
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    0x100000000ULL,
+                                    above_4g_mem_size - holesize);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, holesize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - holesize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                    ram_above_4g);
+        }
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4fdb7b6..686736e 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
 
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
     pc_init1(args, 1, 1);
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+    gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
     disable_kvm_pv_eoi();
 }
 
+static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
     .desc = "Standard PC (i440FX + PIIX, 1996)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
+static QEMUMachine pc_i440fx_machine_v1_8 = {
+    PC_I440FX_1_8_MACHINE_OPTIONS,
+    .name = "pc-i440fx-1.8",
+    .alias = "pc",
+    .init = pc_init_pci,
+    .is_default = 1,
+};
+
 #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
 static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .alias = "pc",
-    .init = pc_init_pci,
-    .is_default = 1,
+    .init = pc_init_pci_1_7,
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
@@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_i440fx_machine_v1_8);
     qemu_register_machine(&pc_i440fx_machine_v1_7);
     qemu_register_machine(&pc_i440fx_machine_v1_6);
     qemu_register_machine(&pc_i440fx_machine_v1_5);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 4c191d3..c2eb568 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -50,6 +50,7 @@
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
     guest_info->has_acpi_build = has_acpi_build;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     }
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+   gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
     x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
 }
 
+static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_q35_init(args);
+}
+
 static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
     .desc = "Standard PC (Q35 + ICH9, 2009)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
+
+static QEMUMachine pc_q35_machine_v1_8 = {
+    PC_Q35_1_8_MACHINE_OPTIONS,
+    .name = "pc-q35-1.8",
+    .alias = "q35",
+    .init = pc_q35_init,
+};
+
 #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
 
 static QEMUMachine pc_q35_machine_v1_7 = {
     PC_Q35_1_7_MACHINE_OPTIONS,
     .name = "pc-q35-1.7",
     .alias = "q35",
-    .init = pc_q35_init,
+    .init = pc_q35_init_1_7,
 };
 
 #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
@@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
 
 static void pc_q35_machine_init(void)
 {
+    qemu_register_machine(&pc_q35_machine_v1_8);
     qemu_register_machine(&pc_q35_machine_v1_7);
     qemu_register_machine(&pc_q35_machine_v1_6);
     qemu_register_machine(&pc_q35_machine_v1_5);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 03cc0ba..35a6885 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -41,6 +41,7 @@ struct PcGuestInfo {
     uint64_t *node_cpu;
     FWCfgState *fw_cfg;
     bool has_acpi_build;
+    bool gb_align;
 };
 
 /* parallel.c */

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4)
  2013-11-06 21:31       ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4) Marcelo Tosatti
@ 2013-11-06 21:40         ` Michael S. Tsirkin
  2013-11-06 21:53           ` Marcelo Tosatti
  2013-11-07 15:24         ` Igor Mammedov
  1 sibling, 1 reply; 53+ messages in thread
From: Michael S. Tsirkin @ 2013-11-06 21:40 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, qemu-devel, Gerd Hoffmann, Igor Mammedov, pbonzini

On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> 
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> 
> -----
> 
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Um. This will conflict with:
    pc: map PCI address space as catchall region for not mapped addresses

I think we really should stop using the hacked hole thing
and just use priorities like that patch does.
Do you agree? If yes I'm afraid your patch will have to be
rebased on top of that yet again, sorry to give you a
run-around like that :(


Also - do you think this is 1.7 material?

> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 12c436e..d29196d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1156,7 +1156,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
>  
>      linux_boot = (kernel_filename != NULL);
> @@ -1177,10 +1177,45 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (guest_info->gb_align) {
> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    0x100000000ULL,
> +                                    above_4g_mem_size - holesize);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, holesize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - holesize,
> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 4fdb7b6..686736e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>      pc_init1(args, 1, 1);
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +    gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
>      disable_kvm_pv_eoi();
>  }
>  
> +static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_init_pci(args);
> +}
> +
>  static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (i440FX + PIIX, 1996)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> +static QEMUMachine pc_i440fx_machine_v1_8 = {
> +    PC_I440FX_1_8_MACHINE_OPTIONS,
> +    .name = "pc-i440fx-1.8",
> +    .alias = "pc",
> +    .init = pc_init_pci,
> +    .is_default = 1,
> +};
> +
>  #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
>  static QEMUMachine pc_i440fx_machine_v1_7 = {
>      PC_I440FX_1_7_MACHINE_OPTIONS,
>      .name = "pc-i440fx-1.7",
>      .alias = "pc",
> -    .init = pc_init_pci,
> -    .is_default = 1,
> +    .init = pc_init_pci_1_7,
>  };
>  
>  #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> @@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
>  
>  static void pc_machine_init(void)
>  {
> +    qemu_register_machine(&pc_i440fx_machine_v1_8);
>      qemu_register_machine(&pc_i440fx_machine_v1_7);
>      qemu_register_machine(&pc_i440fx_machine_v1_6);
>      qemu_register_machine(&pc_i440fx_machine_v1_5);
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 4c191d3..c2eb568 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -50,6 +50,7 @@
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      }
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +   gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
>      x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
>  }
>  
> +static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_q35_init(args);
> +}
> +
>  static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (Q35 + ICH9, 2009)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> +
> +static QEMUMachine pc_q35_machine_v1_8 = {
> +    PC_Q35_1_8_MACHINE_OPTIONS,
> +    .name = "pc-q35-1.8",
> +    .alias = "q35",
> +    .init = pc_q35_init,
> +};
> +
>  #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
>  
>  static QEMUMachine pc_q35_machine_v1_7 = {
>      PC_Q35_1_7_MACHINE_OPTIONS,
>      .name = "pc-q35-1.7",
>      .alias = "q35",
> -    .init = pc_q35_init,
> +    .init = pc_q35_init_1_7,
>  };
>  
>  #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> @@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
>  
>  static void pc_q35_machine_init(void)
>  {
> +    qemu_register_machine(&pc_q35_machine_v1_8);
>      qemu_register_machine(&pc_q35_machine_v1_7);
>      qemu_register_machine(&pc_q35_machine_v1_6);
>      qemu_register_machine(&pc_q35_machine_v1_5);
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 03cc0ba..35a6885 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4)
  2013-11-06 21:40         ` Michael S. Tsirkin
@ 2013-11-06 21:53           ` Marcelo Tosatti
  2013-11-06 22:15             ` Michael S. Tsirkin
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-06 21:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: aarcange, gleb, qemu-devel, Gerd Hoffmann, Igor Mammedov, pbonzini

On Wed, Nov 06, 2013 at 11:40:34PM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> > 
> > v2: condition enablement of new mapping to new machine types (Paolo)
> > v3: fix changelog
> > v4: rebase
> > 
> > -----
> > 
> > 
> > Align guest physical address and host physical address
> > beyond guest 4GB on a 1GB boundary.
> > 
> > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Um. This will conflict with:
>     pc: map PCI address space as catchall region for not mapped addresses
> 
> I think we really should stop using the hacked hole thing
> and just use priorities like that patch does.

Sorry hacked in what way?

This patch is necessary to enable 1GB hugepages beyond 4GB of RAM on the
current machine types.

> Do you agree? If yes I'm afraid your patch will have to be
> rebased on top of that yet again, sorry to give you a
> run-around like that :(

I don't see what exactly is the suggestion (or why the proposed 
patch should conflict with "pc: map PCI address space as catchall region
for not mapped addresses").

> Also - do you think this is 1.7 material?

No. Paolo mentioned you have a tree with 1.8 material, correct?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4)
  2013-11-06 21:53           ` Marcelo Tosatti
@ 2013-11-06 22:15             ` Michael S. Tsirkin
  2013-11-06 22:24               ` Marcelo Tosatti
  0 siblings, 1 reply; 53+ messages in thread
From: Michael S. Tsirkin @ 2013-11-06 22:15 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, qemu-devel, Gerd Hoffmann, Igor Mammedov, pbonzini

On Wed, Nov 06, 2013 at 07:53:51PM -0200, Marcelo Tosatti wrote:
> On Wed, Nov 06, 2013 at 11:40:34PM +0200, Michael S. Tsirkin wrote:
> > On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> > > 
> > > v2: condition enablement of new mapping to new machine types (Paolo)
> > > v3: fix changelog
> > > v4: rebase
> > > 
> > > -----
> > > 
> > > 
> > > Align guest physical address and host physical address
> > > beyond guest 4GB on a 1GB boundary.
> > > 
> > > Otherwise 1GB TLBs cannot be cached for the range.
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > Um. This will conflict with:
> >     pc: map PCI address space as catchall region for not mapped addresses
> > 
> > I think we really should stop using the hacked hole thing
> > and just use priorities like that patch does.
> 
> Sorry hacked in what way?
> This patch is necessary to enable 1GB hugepages beyond 4GB of RAM on the
> current machine types.


Sorry if I wasn't clear. When I said "hacked" I was talking about the
pci hole concept generally in upstream qemu, not about your patch.

It's hacked because there's no "pci hole" on PIIX.
pci hole is where pci was hiding some ram behind it
on some systems. AFAIK this is not what happens on piix though.
What happens really is that everything not covered by RAM memory is PCI.

We implemented this using two aliases of RAM but
the natural thing is really just making PCI lower
priority than RAM and let it overlap.

> > Do you agree? If yes I'm afraid your patch will have to be
> > rebased on top of that yet again, sorry to give you a
> > run-around like that :(
> 
> I don't see what exactly is the suggestion (or why the proposed 
> patch should conflict with "pc: map PCI address space as catchall region
> for not mapped addresses").

It seemed to me that they will conflict but it's after midnight
so maybe I'm confused.
You are saying you apply yours on top and there's no conflict?
In that case I'll recheck, sorry.

> > Also - do you think this is 1.7 material?
> 
> No. Paolo mentioned you have a tree with 1.8 material, correct?

Yes

git://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git pci

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4)
  2013-11-06 22:15             ` Michael S. Tsirkin
@ 2013-11-06 22:24               ` Marcelo Tosatti
  0 siblings, 0 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-06 22:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: aarcange, gleb, qemu-devel, Gerd Hoffmann, Igor Mammedov, pbonzini

On Thu, Nov 07, 2013 at 12:15:59AM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 06, 2013 at 07:53:51PM -0200, Marcelo Tosatti wrote:
> > On Wed, Nov 06, 2013 at 11:40:34PM +0200, Michael S. Tsirkin wrote:
> > > On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> > > > 
> > > > v2: condition enablement of new mapping to new machine types (Paolo)
> > > > v3: fix changelog
> > > > v4: rebase
> > > > 
> > > > -----
> > > > 
> > > > 
> > > > Align guest physical address and host physical address
> > > > beyond guest 4GB on a 1GB boundary.
> > > > 
> > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > > 
> > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > Um. This will conflict with:
> > >     pc: map PCI address space as catchall region for not mapped addresses
> > > 
> > > I think we really should stop using the hacked hole thing
> > > and just use priorities like that patch does.
> > 
> > Sorry hacked in what way?
> > This patch is necessary to enable 1GB hugepages beyond 4GB of RAM on the
> > current machine types.
> 
> 
> Sorry if I wasn't clear. When I said "hacked" I was talking about the
> pci hole concept generally in upstream qemu, not about your patch.
> 
> It's hacked because there's no "pci hole" on PIIX.
> pci hole is where pci was hiding some ram behind it
> on some systems. AFAIK this is not what happens on piix though.
> What happens really is that everything not covered by RAM memory is PCI.
> 
> We implemented this using two aliases of RAM but
> the natural thing is really just making PCI lower
> priority than RAM and let it overlap.
>
> > > Do you agree? If yes I'm afraid your patch will have to be
> > > rebased on top of that yet again, sorry to give you a
> > > run-around like that :(
> > 
> > I don't see what exactly is the suggestion (or why the proposed 
> > patch should conflict with "pc: map PCI address space as catchall region
> > for not mapped addresses").
> 
> It seemed to me that they will conflict but it's after midnight
> so maybe I'm confused.
> You are saying you apply yours on top and there's no conflict?
> In that case I'll recheck, sorry.

No conflict between "pc: map PCI address space as catchall region" 
and the proposed patch.

> > > Also - do you think this is 1.7 material?
> > 
> > No. Paolo mentioned you have a tree with 1.8 material, correct?
> 
> Yes
> 
> git://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git pci

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4)
  2013-11-06 21:31       ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4) Marcelo Tosatti
  2013-11-06 21:40         ` Michael S. Tsirkin
@ 2013-11-07 15:24         ` Igor Mammedov
  2013-11-07 21:53           ` Marcelo Tosatti
  1 sibling, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-11-07 15:24 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann, pbonzini

On Wed, 6 Nov 2013 19:31:19 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> 
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> 
> -----
> 
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 12c436e..d29196d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1156,7 +1156,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
>  
>      linux_boot = (kernel_filename != NULL);
> @@ -1177,10 +1177,45 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
I did some corner cases testing and there is alias overlapping in case 
  -m 4096 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB

  0000000100000000-000000011fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff
  0000000100000000-0000000100000000 (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-0000000100000000

perhaps zero sized ram-above-4g shouldn't be created at all?

in addition ram-above-4g-piecetwo starts at half page offset 00000000e0000000 but the guest sees it at a 4GB offset;
wouldn't it cause the same issue the patch tries to solve, but only for the ram-above-4g-piecetwo tail, since host/guest offsets
are not 1GB aligned?

there is more misalignment with:
 -m 4097 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB

  0000000100000000-00000001000fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001000fffff
  0000000100100000-00000001200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff

where ram-above-4g-piecetwo is aligned at a 1GB+1MB GPA offset, in addition to a 500MB HPA offset,
which would perhaps cause the same issue as above?

> +        if (guest_info->gb_align) {
> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    0x100000000ULL,
> +                                    above_4g_mem_size - holesize);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, holesize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - holesize,
> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 4fdb7b6..686736e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>      pc_init1(args, 1, 1);
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +    gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
>      disable_kvm_pv_eoi();
>  }
>  
> +static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_init_pci(args);
> +}
> +
>  static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (i440FX + PIIX, 1996)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> +static QEMUMachine pc_i440fx_machine_v1_8 = {
> +    PC_I440FX_1_8_MACHINE_OPTIONS,
> +    .name = "pc-i440fx-1.8",
> +    .alias = "pc",
> +    .init = pc_init_pci,
> +    .is_default = 1,
> +};
> +
>  #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
>  static QEMUMachine pc_i440fx_machine_v1_7 = {
>      PC_I440FX_1_7_MACHINE_OPTIONS,
>      .name = "pc-i440fx-1.7",
>      .alias = "pc",
> -    .init = pc_init_pci,
> -    .is_default = 1,
> +    .init = pc_init_pci_1_7,
>  };
>  
>  #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> @@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
>  
>  static void pc_machine_init(void)
>  {
> +    qemu_register_machine(&pc_i440fx_machine_v1_8);
>      qemu_register_machine(&pc_i440fx_machine_v1_7);
>      qemu_register_machine(&pc_i440fx_machine_v1_6);
>      qemu_register_machine(&pc_i440fx_machine_v1_5);
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 4c191d3..c2eb568 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -50,6 +50,7 @@
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      }
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +   gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
>      x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
>  }
>  
> +static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_q35_init(args);
> +}
> +
>  static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (Q35 + ICH9, 2009)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> +
> +static QEMUMachine pc_q35_machine_v1_8 = {
> +    PC_Q35_1_8_MACHINE_OPTIONS,
> +    .name = "pc-q35-1.8",
> +    .alias = "q35",
> +    .init = pc_q35_init,
> +};
> +
>  #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
>  
>  static QEMUMachine pc_q35_machine_v1_7 = {
>      PC_Q35_1_7_MACHINE_OPTIONS,
>      .name = "pc-q35-1.7",
>      .alias = "q35",
> -    .init = pc_q35_init,
> +    .init = pc_q35_init_1_7,
>  };
>  
>  #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> @@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
>  
>  static void pc_q35_machine_init(void)
>  {
> +    qemu_register_machine(&pc_q35_machine_v1_8);
>      qemu_register_machine(&pc_q35_machine_v1_7);
>      qemu_register_machine(&pc_q35_machine_v1_6);
>      qemu_register_machine(&pc_q35_machine_v1_5);
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 03cc0ba..35a6885 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions
  2013-10-30 23:44                         ` Marcelo Tosatti
@ 2013-11-07 15:25                           ` Igor Mammedov
  0 siblings, 0 replies; 53+ messages in thread
From: Igor Mammedov @ 2013-11-07 15:25 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, peter.maydell, gleb, quintela, jan.kiszka, qemu-devel,
	aliguori, pbonzini, afaerber, rth

On Wed, 30 Oct 2013 21:44:12 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Wed, Oct 30, 2013 at 08:56:17PM +0100, Igor Mammedov wrote:
> > On Wed, 30 Oct 2013 16:51:29 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
[...]
> 
> > btw: now if QEMU can't allocate hugepages for whole guest size it will fallback
> > to 4k pages anyway for whole guest size, with warning that isn't visible if
> > user doesn't start QEMU manually.
> 
> Not with -mem-prealloc.
tested with it; it does not do much, just prints an error message to stderr
and then falls back to the default allocator, continuing guest boot.

> 
> 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4)
  2013-11-07 15:24         ` Igor Mammedov
@ 2013-11-07 21:53           ` Marcelo Tosatti
  2013-11-10 20:47             ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v5) Marcelo Tosatti
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-07 21:53 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann, pbonzini

On Thu, Nov 07, 2013 at 04:24:59PM +0100, Igor Mammedov wrote:
> On Wed, 6 Nov 2013 19:31:19 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > 
> > v2: condition enablement of new mapping to new machine types (Paolo)
> > v3: fix changelog
> > v4: rebase
> > 
> > -----
> > 
> > 
> > Align guest physical address and host physical address
> > beyond guest 4GB on a 1GB boundary.
> > 
> > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 12c436e..d29196d 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1156,7 +1156,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >  {
> >      int linux_boot, i;
> >      MemoryRegion *ram, *option_rom_mr;
> > -    MemoryRegion *ram_below_4g, *ram_above_4g;
> > +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
> >      FWCfgState *fw_cfg;
> >  
> >      linux_boot = (kernel_filename != NULL);
> > @@ -1177,10 +1177,45 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >      e820_add_entry(0, below_4g_mem_size, E820_RAM);
> >      if (above_4g_mem_size > 0) {
> >          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > -                                 below_4g_mem_size, above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +        /*
> > +         *
> > +         * If 1GB hugepages are used to back guest RAM, map guest address
> > +         * space in the range [ramsize,ramsize+holesize] to the ram block
> > +         * range [holestart, 4GB]
> > +         *
> > +         *                      0      h     4G     [ramsize,ramsize+holesize]
> > +         *
> > +         * guest-addr-space     [      ]     [      ][xxx]
> > +         *                                /----------/
> > +         * contiguous-ram-block [      ][xxx][     ]
> > +         *
> > +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> > +         * at the host physical address space.
> > +         *
> > +         */
> I did some corner cases testing and there is alias overlapping in case 
>   -m 4096 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB
> 
>   0000000100000000-000000011fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff
>   0000000100000000-0000000100000000 (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-0000000100000000
> 
> perhaps zero sized ram-above-4g shouldn't be created at all?

Right.

> in addition, ram-above-4g-piecetwo starts at a half-page offset 00000000e0000000 but the guest sees it at a 4GB offset;
> wouldn't it cause the same issue the patch tries to solve, but only for the ram-above-4g-piecetwo tail, since host/guest
> offsets are not 1GB aligned?

Piece 1 is aligned. Piece 2 maps from tail of RAM (gpa) to start of hole
(ramblock).

> there is more misalignment with:
>  -m 4097 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB
> 
>   0000000100000000-00000001000fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001000fffff

Piece 1 is aligned.

>   0000000100100000-00000001200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff

Piece 2 is not. Should allocate one extra MB of RAM to align that. I'll
resend, thanks.

> where ram-above-4g-piecetwo is aligned at a 1GB+1MB GPA offset, in addition to a 500MB HPA offset,
> which would cause the same issue as above, perhaps?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v5)
  2013-11-07 21:53           ` Marcelo Tosatti
@ 2013-11-10 20:47             ` Marcelo Tosatti
  2013-11-12 12:45               ` Igor Mammedov
  2013-11-12 21:16               ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6) Marcelo Tosatti
  0 siblings, 2 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-10 20:47 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann, pbonzini

v2: condition enablement of new mapping to new machine types (Paolo)
v3: fix changelog
v4: rebase
v5: ensure alignment of piecetwo on 2MB GPA (Igor)
    do not register zero-sized piece-one    (Igor)

-----

Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 12c436e..abd6b81 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1156,8 +1156,9 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
+    uint64_t memsize, align_offset;
 
     linux_boot = (kernel_filename != NULL);
 
@@ -1166,8 +1167,12 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
      * with older qemus that used qemu_ram_alloc().
      */
     ram = g_malloc(sizeof(*ram));
-    memory_region_init_ram(ram, NULL, "pc.ram",
-                           below_4g_mem_size + above_4g_mem_size);
+
+    memsize = ROUND_UP(below_4g_mem_size + above_4g_mem_size, 1UL << 21);
+    align_offset = memsize - (below_4g_mem_size + above_4g_mem_size);
+
+    memory_region_init_ram(ram, NULL, "pc.ram", memsize);
+
     vmstate_register_ram_global(ram);
     *ram_memory = ram;
     ram_below_4g = g_malloc(sizeof(*ram_below_4g));
@@ -1177,10 +1182,50 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (guest_info->gb_align) {
+            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
+            uint64_t piecetwosize = holesize - align_offset;
+
+            assert(piecetwosize <= holesize);
+
+            if ((above_4g_mem_size - piecetwosize) > 0) {
+                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
+                                         ram, 0x100000000ULL,
+                                         above_4g_mem_size - piecetwosize);
+                memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                         ram_above_4g);
+            }
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, piecetwosize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - piecetwosize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+        }
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4fdb7b6..686736e 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
 
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
     pc_init1(args, 1, 1);
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+    gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
     disable_kvm_pv_eoi();
 }
 
+static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
     .desc = "Standard PC (i440FX + PIIX, 1996)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
+static QEMUMachine pc_i440fx_machine_v1_8 = {
+    PC_I440FX_1_8_MACHINE_OPTIONS,
+    .name = "pc-i440fx-1.8",
+    .alias = "pc",
+    .init = pc_init_pci,
+    .is_default = 1,
+};
+
 #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
 static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .alias = "pc",
-    .init = pc_init_pci,
-    .is_default = 1,
+    .init = pc_init_pci_1_7,
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
@@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_i440fx_machine_v1_8);
     qemu_register_machine(&pc_i440fx_machine_v1_7);
     qemu_register_machine(&pc_i440fx_machine_v1_6);
     qemu_register_machine(&pc_i440fx_machine_v1_5);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 4c191d3..c2eb568 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -50,6 +50,7 @@
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
     guest_info->has_acpi_build = has_acpi_build;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     }
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+   gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
     x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
 }
 
+static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_q35_init(args);
+}
+
 static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
     .desc = "Standard PC (Q35 + ICH9, 2009)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
+
+static QEMUMachine pc_q35_machine_v1_8 = {
+    PC_Q35_1_8_MACHINE_OPTIONS,
+    .name = "pc-q35-1.8",
+    .alias = "q35",
+    .init = pc_q35_init,
+};
+
 #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
 
 static QEMUMachine pc_q35_machine_v1_7 = {
     PC_Q35_1_7_MACHINE_OPTIONS,
     .name = "pc-q35-1.7",
     .alias = "q35",
-    .init = pc_q35_init,
+    .init = pc_q35_init_1_7,
 };
 
 #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
@@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
 
 static void pc_q35_machine_init(void)
 {
+    qemu_register_machine(&pc_q35_machine_v1_8);
     qemu_register_machine(&pc_q35_machine_v1_7);
     qemu_register_machine(&pc_q35_machine_v1_6);
     qemu_register_machine(&pc_q35_machine_v1_5);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 03cc0ba..35a6885 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -41,6 +41,7 @@ struct PcGuestInfo {
     uint64_t *node_cpu;
     FWCfgState *fw_cfg;
     bool has_acpi_build;
+    bool gb_align;
 };
 
 /* parallel.c */

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v5)
  2013-11-10 20:47             ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v5) Marcelo Tosatti
@ 2013-11-12 12:45               ` Igor Mammedov
  2013-11-12 20:32                 ` Marcelo Tosatti
  2013-11-12 21:16               ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6) Marcelo Tosatti
  1 sibling, 1 reply; 53+ messages in thread
From: Igor Mammedov @ 2013-11-12 12:45 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann, pbonzini

On Sun, 10 Nov 2013 18:47:53 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

[...]

> @@ -1177,10 +1182,50 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
it is a memory leak when "ram-above-4g" is not created

> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (guest_info->gb_align) {
> +            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
> +            uint64_t piecetwosize = holesize - align_offset;
> +
> +            assert(piecetwosize <= holesize);
> +
> +            if ((above_4g_mem_size - piecetwosize) > 0) {
here is an integer overflow,
reproducible with:  -mem-path /var/lib/hugetlbfs/global/pagesize-1GB -m 3600


> +                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> +                                         ram, 0x100000000ULL,
> +                                         above_4g_mem_size - piecetwosize);
> +                memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                         ram_above_4g);
> +            }
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, piecetwosize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - piecetwosize,
is there a guarantee that "ram-above-4g-piecetwo" will be mapped immediately
after "ram-above-4g" without any gap?

if there is no then you might need to change how e820_add_entry() for high ram
is handled and possibly CMOS value as well. 

> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
[...]
-- 
Regards,
  Igor

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v5)
  2013-11-12 12:45               ` Igor Mammedov
@ 2013-11-12 20:32                 ` Marcelo Tosatti
  0 siblings, 0 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-12 20:32 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann, pbonzini

On Tue, Nov 12, 2013 at 01:45:51PM +0100, Igor Mammedov wrote:
> On Sun, 10 Nov 2013 18:47:53 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> [...]
> 
> > @@ -1177,10 +1182,50 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >      e820_add_entry(0, below_4g_mem_size, E820_RAM);
> >      if (above_4g_mem_size > 0) {
> >          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> it is a memory leak when "ram-above-4g" is not created
> 
> > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > -                                 below_4g_mem_size, above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +        /*
> > +         *
> > +         * If 1GB hugepages are used to back guest RAM, map guest address
> > +         * space in the range [ramsize,ramsize+holesize] to the ram block
> > +         * range [holestart, 4GB]
> > +         *
> > +         *                      0      h     4G     [ramsize,ramsize+holesize]
> > +         *
> > +         * guest-addr-space     [      ]     [      ][xxx]
> > +         *                                /----------/
> > +         * contiguous-ram-block [      ][xxx][     ]
> > +         *
> > +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> > +         * at the host physical address space.
> > +         *
> > +         */
> > +        if (guest_info->gb_align) {
> > +            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
> > +            uint64_t piecetwosize = holesize - align_offset;
> > +
> > +            assert(piecetwosize <= holesize);
> > +
> > +            if ((above_4g_mem_size - piecetwosize) > 0) {
> here is integer overflow,
> reproducable with:  -mem-path /var/lib/hugetlbfs/global/pagesize-1GB -m 3600

Gah.

> 
> 
> > +                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> > +                                         ram, 0x100000000ULL,
> > +                                         above_4g_mem_size - piecetwosize);
> > +                memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +                                         ram_above_4g);
> > +            }
> > +
> > +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> > +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> > +                                     "ram-above-4g-piecetwo", ram,
> > +                                     0x100000000ULL - holesize, piecetwosize);
> > +            memory_region_add_subregion(system_memory,
> > +                                        0x100000000ULL +
> > +                                        above_4g_mem_size - piecetwosize,
> is there a guaranty that "ram-above-4g-piecetwo" will be mapped immediately
> after "ram-above-4g" without any gap?

Yes:

Length of piece one alias: above_4g_mem_size - piecetwosize
Start address of piecetwo: 0x100000000ULL + above_4g_mem_size - piecetwosize

> if there is no then you might need to change how e820_add_entry() for high ram
> is handled and possibly CMOS value as well. 

> 
> > +                                        ram_above_4g_piecetwo);
> > +        } else {
> > +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > +                                    below_4g_mem_size, above_4g_mem_size);
> > +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> >                                      ram_above_4g);
> > +        }
> >          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
> >      }
> [...]
> -- 
> Regards,
>   Igor

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6)
  2013-11-10 20:47             ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v5) Marcelo Tosatti
  2013-11-12 12:45               ` Igor Mammedov
@ 2013-11-12 21:16               ` Marcelo Tosatti
  2013-11-13 13:59                 ` Igor Mammedov
                                   ` (2 more replies)
  1 sibling, 3 replies; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-12 21:16 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann, pbonzini


v2: condition enablement of new mapping to new machine types (Paolo)
v3: fix changelog
v4: rebase
v5: ensure alignment of piecetwo on 2MB GPA (Igor)
    do not register zero-sized piece-one    (Igor)
v6: fix memory leak                         (Igor)
    fix integer overflow                    (Igor)

----

Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 12c436e..9cf5109 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1156,8 +1156,9 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
+    uint64_t memsize, align_offset;
 
     linux_boot = (kernel_filename != NULL);
 
@@ -1166,8 +1167,12 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
      * with older qemus that used qemu_ram_alloc().
      */
     ram = g_malloc(sizeof(*ram));
-    memory_region_init_ram(ram, NULL, "pc.ram",
-                           below_4g_mem_size + above_4g_mem_size);
+
+    memsize = ROUND_UP(below_4g_mem_size + above_4g_mem_size, 1UL << 21);
+    align_offset = memsize - (below_4g_mem_size + above_4g_mem_size);
+
+    memory_region_init_ram(ram, NULL, "pc.ram", memsize);
+
     vmstate_register_ram_global(ram);
     *ram_memory = ram;
     ram_below_4g = g_malloc(sizeof(*ram_below_4g));
@@ -1177,10 +1182,53 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (guest_info->gb_align) {
+            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
+            uint64_t piecetwosize = holesize - align_offset;
+
+            assert(piecetwosize <= holesize);
+
+            piecetwosize = MIN(above_4g_mem_size, piecetwosize);
+            if ((above_4g_mem_size - piecetwosize) > 0) {
+                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
+                                         ram, 0x100000000ULL,
+                                         above_4g_mem_size - piecetwosize);
+                memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                         ram_above_4g);
+            } else {
+                g_free(ram_above_4g);
+            }
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, piecetwosize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - piecetwosize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+        }
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4fdb7b6..686736e 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
 
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
     pc_init1(args, 1, 1);
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+    gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
     disable_kvm_pv_eoi();
 }
 
+static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
     .desc = "Standard PC (i440FX + PIIX, 1996)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
+static QEMUMachine pc_i440fx_machine_v1_8 = {
+    PC_I440FX_1_8_MACHINE_OPTIONS,
+    .name = "pc-i440fx-1.8",
+    .alias = "pc",
+    .init = pc_init_pci,
+    .is_default = 1,
+};
+
 #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
 static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .alias = "pc",
-    .init = pc_init_pci,
-    .is_default = 1,
+    .init = pc_init_pci_1_7,
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
@@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_i440fx_machine_v1_8);
     qemu_register_machine(&pc_i440fx_machine_v1_7);
     qemu_register_machine(&pc_i440fx_machine_v1_6);
     qemu_register_machine(&pc_i440fx_machine_v1_5);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 4c191d3..c2eb568 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -50,6 +50,7 @@
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
     guest_info->has_acpi_build = has_acpi_build;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
     }
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+   gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
     x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
 }
 
+static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_q35_init(args);
+}
+
 static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
     .desc = "Standard PC (Q35 + ICH9, 2009)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
+
+static QEMUMachine pc_q35_machine_v1_8 = {
+    PC_Q35_1_8_MACHINE_OPTIONS,
+    .name = "pc-q35-1.8",
+    .alias = "q35",
+    .init = pc_q35_init,
+};
+
 #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
 
 static QEMUMachine pc_q35_machine_v1_7 = {
     PC_Q35_1_7_MACHINE_OPTIONS,
     .name = "pc-q35-1.7",
     .alias = "q35",
-    .init = pc_q35_init,
+    .init = pc_q35_init_1_7,
 };
 
 #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
@@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
 
 static void pc_q35_machine_init(void)
 {
+    qemu_register_machine(&pc_q35_machine_v1_8);
     qemu_register_machine(&pc_q35_machine_v1_7);
     qemu_register_machine(&pc_q35_machine_v1_6);
     qemu_register_machine(&pc_q35_machine_v1_5);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 03cc0ba..35a6885 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -41,6 +41,7 @@ struct PcGuestInfo {
     uint64_t *node_cpu;
     FWCfgState *fw_cfg;
     bool has_acpi_build;
+    bool gb_align;
 };
 
 /* parallel.c */

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6)
  2013-11-12 21:16               ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6) Marcelo Tosatti
@ 2013-11-13 13:59                 ` Igor Mammedov
  2013-11-13 17:13                 ` Paolo Bonzini
  2013-11-13 19:08                 ` Michael S. Tsirkin
  2 siblings, 0 replies; 53+ messages in thread
From: Igor Mammedov @ 2013-11-13 13:59 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann, pbonzini

On Tue, 12 Nov 2013 19:16:37 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> 
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
>     do not register zero-sized piece-one    (Igor)
> v6: fix memory leak                         (Igor)
>     fix integer overflow                    (Igor)
> 
> ----
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Reviewed-By: Igor Mammedov <imammedo@redhat.com>

PS:
all these alignment calculations look very fragile, and if this code is
touched it's easy to regress.

It would be nice for `make check` to catch a regression here when it happens.

> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 12c436e..9cf5109 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1156,8 +1156,9 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
> +    uint64_t memsize, align_offset;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> @@ -1166,8 +1167,12 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>       * with older qemus that used qemu_ram_alloc().
>       */
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +
> +    memsize = ROUND_UP(below_4g_mem_size + above_4g_mem_size, 1UL << 21);
> +    align_offset = memsize - (below_4g_mem_size + above_4g_mem_size);
> +
> +    memory_region_init_ram(ram, NULL, "pc.ram", memsize);
> +
>      vmstate_register_ram_global(ram);
>      *ram_memory = ram;
>      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> @@ -1177,10 +1182,53 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (guest_info->gb_align) {
> +            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
> +            uint64_t piecetwosize = holesize - align_offset;
> +
> +            assert(piecetwosize <= holesize);
> +
> +            piecetwosize = MIN(above_4g_mem_size, piecetwosize);
> +            if ((above_4g_mem_size - piecetwosize) > 0) {
> +                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> +                                         ram, 0x100000000ULL,
> +                                         above_4g_mem_size - piecetwosize);
> +                memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                         ram_above_4g);
> +            } else {
> +                g_free(ram_above_4g);
> +            }
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, piecetwosize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - piecetwosize,
> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 4fdb7b6..686736e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>      pc_init1(args, 1, 1);
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +    gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
>      disable_kvm_pv_eoi();
>  }
>  
> +static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_init_pci(args);
> +}
> +
>  static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (i440FX + PIIX, 1996)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> +static QEMUMachine pc_i440fx_machine_v1_8 = {
> +    PC_I440FX_1_8_MACHINE_OPTIONS,
> +    .name = "pc-i440fx-1.8",
> +    .alias = "pc",
> +    .init = pc_init_pci,
> +    .is_default = 1,
> +};
> +
>  #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
>  static QEMUMachine pc_i440fx_machine_v1_7 = {
>      PC_I440FX_1_7_MACHINE_OPTIONS,
>      .name = "pc-i440fx-1.7",
>      .alias = "pc",
> -    .init = pc_init_pci,
> -    .is_default = 1,
> +    .init = pc_init_pci_1_7,
>  };
>  
>  #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> @@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
>  
>  static void pc_machine_init(void)
>  {
> +    qemu_register_machine(&pc_i440fx_machine_v1_8);
>      qemu_register_machine(&pc_i440fx_machine_v1_7);
>      qemu_register_machine(&pc_i440fx_machine_v1_6);
>      qemu_register_machine(&pc_i440fx_machine_v1_5);
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 4c191d3..c2eb568 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -50,6 +50,7 @@
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      }
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +   gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
>      x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
>  }
>  
> +static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_q35_init(args);
> +}
> +
>  static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (Q35 + ICH9, 2009)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> +
> +static QEMUMachine pc_q35_machine_v1_8 = {
> +    PC_Q35_1_8_MACHINE_OPTIONS,
> +    .name = "pc-q35-1.8",
> +    .alias = "q35",
> +    .init = pc_q35_init,
> +};
> +
>  #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
>  
>  static QEMUMachine pc_q35_machine_v1_7 = {
>      PC_Q35_1_7_MACHINE_OPTIONS,
>      .name = "pc-q35-1.7",
>      .alias = "q35",
> -    .init = pc_q35_init,
> +    .init = pc_q35_init_1_7,
>  };
>  
>  #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> @@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
>  
>  static void pc_q35_machine_init(void)
>  {
> +    qemu_register_machine(&pc_q35_machine_v1_8);
>      qemu_register_machine(&pc_q35_machine_v1_7);
>      qemu_register_machine(&pc_q35_machine_v1_6);
>      qemu_register_machine(&pc_q35_machine_v1_5);
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 03cc0ba..35a6885 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */
> 


-- 
Regards,
  Igor

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6)
  2013-11-12 21:16               ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6) Marcelo Tosatti
  2013-11-13 13:59                 ` Igor Mammedov
@ 2013-11-13 17:13                 ` Paolo Bonzini
       [not found]                   ` <20131113195832.GA29433@amt.cnet>
  2013-11-13 19:08                 ` Michael S. Tsirkin
  2 siblings, 1 reply; 53+ messages in thread
From: Paolo Bonzini @ 2013-11-13 17:13 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann,
	Igor Mammedov

>             assert(piecetwosize <= holesize);
> 
>             piecetwosize = MIN(above_4g_mem_size, piecetwosize);
>             if ((above_4g_mem_size - piecetwosize) > 0) {
>                 memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
>                                          ram, 0x100000000ULL,
>                                          above_4g_mem_size - piecetwosize);
>                 memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                          ram_above_4g);
>             } else {
>                 g_free(ram_above_4g);
>             }
>             memory_region_init_alias(ram_above_4g_piecetwo, NULL,
>                                      "ram-above-4g-piecetwo", ram,
>                                      0x100000000ULL - holesize, piecetwosize);
>             memory_region_add_subregion(system_memory,
>                                         0x100000000ULL +
>                                         above_4g_mem_size - piecetwosize,
>                                         ram_above_4g_piecetwo);

There is still a small problem in that the 2MB rounding must not be
done for old machine types.

I did a really careful review of the code and everything else looks okay
to me.  However, it grew by accretion from v1 and now it took me really a
long time to figure it out...  I adjusted it a bit and the result seems
easier to understand to me.

Here's the hw/i386/pc.c part of the patch (the patch from v6 is unreadable):

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 12c436e..f2fd138 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1156,8 +1156,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g_pieceone, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
+    uint64_t holesize, pieceonesize, piecetwosize;
+    uint64_t memsize, align_offset;
 
     linux_boot = (kernel_filename != NULL);
 
@@ -1165,26 +1167,74 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
      * aliases to address portions of it, mostly for backwards compatibility
      * with older qemus that used qemu_ram_alloc().
      */
+    memsize = below_4g_mem_size + above_4g_mem_size;
+    holesize = 0x100000000ULL - below_4g_mem_size;
+
+    /* If 1GB hugepages are used to back guest RAM, we want the
+     * physical address 4GB to map to 4GB in the RAM, so that
+     * memory beyond 4GB is aligned on a 1GB boundary, at the
+     * host physical address space.  Thus, the ram block range
+     * [holestart, 4GB] is mapped to the last holesize bytes of RAM:
+     *
+     *                      0      h     4G     memsize-holesize
+     *
+     * contiguous-ram-block [xxxxxx][yyy][zzzzz]
+     *                                '-----------.
+     * guest-addr-space     [xxxxxx]     [zzzzz][yyy]
+     *
+     * This is only done in new-enough machine types, and of course
+     * it is only necessary if the [zzzzz] block exists at all.
+     */
+    if (guest_info->gb_align && above_4g_mem_size > holesize) {
+        /* Round the allocation up to 2 MB to use more hugepages.
+         * Remove the slack from the [yyy] piece so that pieceonesize
+         * (and thus the start of piecetwo) remains aligned.
+         */
+        align_offset = ROUND_UP(memsize, 1UL << 21) - memsize;
+        piecetwosize = holesize - align_offset;
+    } else {
+        /* There's no "piece one", all memory above 4G starts
+         * at below_4g_mem_size in the RAM block.  Also no need
+         * to align anything.
+         */
+        align_offset = 0;
+        piecetwosize = above_4g_mem_size;
+    }
+
     ram = g_malloc(sizeof(*ram));
-    memory_region_init_ram(ram, NULL, "pc.ram",
-                           below_4g_mem_size + above_4g_mem_size);
+    memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset);
     vmstate_register_ram_global(ram);
     *ram_memory = ram;
+
     ram_below_4g = g_malloc(sizeof(*ram_below_4g));
     memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
                              0, below_4g_mem_size);
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
+
+    pieceonesize = above_4g_mem_size - piecetwosize;
+    if (pieceonesize) {
+        ram_above_4g_pieceone = g_malloc(sizeof(*ram_above_4g_pieceone));
+        memory_region_init_alias(ram_above_4g_pieceone, NULL,
+                                 "ram-above-4g-pieceone", ram,
+                                 0x100000000ULL, pieceonesize);
+        memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                    ram_above_4g_pieceone);
+    }
+    if (piecetwosize) {
+        ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+        memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                 "ram-above-4g-piecetwo", ram,
+                                 below_4g_mem_size, piecetwosize);
+        memory_region_add_subregion(system_memory,
+                                    0x100000000ULL + pieceonesize,
+                                    ram_above_4g_piecetwo);
+    }
+
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
-        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
-                                    ram_above_4g);
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
-
     /* Initialize PC system firmware */
     pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
 

Tests:

-m 3585 (without gb_align)
  0000000100000000-00000001000fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000e00fffff

-m 4096 (without gb_align)
  0000000100000000-000000011fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff

-m 4097 (without gb_align)
  0000000100000000-00000001200fffff (prio 0, RW): alias ram-above-4g @pc.ram 00000000e0000000-00000001000fffff

-m 4097 (with gb_align, both high regions 2MB aligned, unused MB of RAM at 0xfff00000)
  0000000100000000-00000001001fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001001fffff
  0000000100200000-00000001200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffefffff

-m 8192 (without gb_align)
  0000000100000000-000000021fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000001ffffffff

-m 8192 (with gb_align)
  0000000100000000-00000001ffffffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001ffffffff
  0000000200000000-000000021fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff

-m 8193 (without gb_align)
  0000000100000000-00000002200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000002000fffff

-m 8193 (with gb_align)
  0000000100000000-00000002001fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000002001fffff
  0000000200200000-00000002200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffefffff

Ok to apply this version?

Paolo

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6)
  2013-11-12 21:16               ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6) Marcelo Tosatti
  2013-11-13 13:59                 ` Igor Mammedov
  2013-11-13 17:13                 ` Paolo Bonzini
@ 2013-11-13 19:08                 ` Michael S. Tsirkin
  2 siblings, 0 replies; 53+ messages in thread
From: Michael S. Tsirkin @ 2013-11-13 19:08 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, qemu-devel, Gerd Hoffmann, pbonzini, Igor Mammedov

On Tue, Nov 12, 2013 at 07:16:37PM -0200, Marcelo Tosatti wrote:
> 
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
>     do not register zero-sized piece-one    (Igor)
> v6: fix memory leak                         (Igor)
>     fix integer overflow                    (Igor)

BTW if you respin anyway, and if you want me to queue this for you,
could you make a patch against pci branch please?
We have the 1.8 place-holders there already.

Thanks!

> ----
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 12c436e..9cf5109 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1156,8 +1156,9 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
> +    uint64_t memsize, align_offset;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> @@ -1166,8 +1167,12 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>       * with older qemus that used qemu_ram_alloc().
>       */
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +
> +    memsize = ROUND_UP(below_4g_mem_size + above_4g_mem_size, 1UL << 21);
> +    align_offset = memsize - (below_4g_mem_size + above_4g_mem_size);
> +
> +    memory_region_init_ram(ram, NULL, "pc.ram", memsize);
> +
>      vmstate_register_ram_global(ram);
>      *ram_memory = ram;
>      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> @@ -1177,10 +1182,53 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (guest_info->gb_align) {
> +            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
> +            uint64_t piecetwosize = holesize - align_offset;
> +
> +            assert(piecetwosize <= holesize);
> +
> +            piecetwosize = MIN(above_4g_mem_size, piecetwosize);
> +            if ((above_4g_mem_size - piecetwosize) > 0) {
> +                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> +                                         ram, 0x100000000ULL,
> +                                         above_4g_mem_size - piecetwosize);
> +                memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                         ram_above_4g);
> +            } else {
> +                g_free(ram_above_4g);
> +            }
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, piecetwosize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - piecetwosize,
> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 4fdb7b6..686736e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>      pc_init1(args, 1, 1);
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +    gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
>      disable_kvm_pv_eoi();
>  }
>  
> +static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_init_pci(args);
> +}
> +
>  static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (i440FX + PIIX, 1996)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> +static QEMUMachine pc_i440fx_machine_v1_8 = {
> +    PC_I440FX_1_8_MACHINE_OPTIONS,
> +    .name = "pc-i440fx-1.8",
> +    .alias = "pc",
> +    .init = pc_init_pci,
> +    .is_default = 1,
> +};
> +
>  #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
>  static QEMUMachine pc_i440fx_machine_v1_7 = {
>      PC_I440FX_1_7_MACHINE_OPTIONS,
>      .name = "pc-i440fx-1.7",
>      .alias = "pc",
> -    .init = pc_init_pci,
> -    .is_default = 1,
> +    .init = pc_init_pci_1_7,
>  };
>  
>  #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> @@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
>  
>  static void pc_machine_init(void)
>  {
> +    qemu_register_machine(&pc_i440fx_machine_v1_8);
>      qemu_register_machine(&pc_i440fx_machine_v1_7);
>      qemu_register_machine(&pc_i440fx_machine_v1_6);
>      qemu_register_machine(&pc_i440fx_machine_v1_5);
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 4c191d3..c2eb568 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -50,6 +50,7 @@
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      }
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +   gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
>      x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
>  }
>  
> +static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_q35_init(args);
> +}
> +
>  static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (Q35 + ICH9, 2009)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> +
> +static QEMUMachine pc_q35_machine_v1_8 = {
> +    PC_Q35_1_8_MACHINE_OPTIONS,
> +    .name = "pc-q35-1.8",
> +    .alias = "q35",
> +    .init = pc_q35_init,
> +};
> +
>  #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
>  
>  static QEMUMachine pc_q35_machine_v1_7 = {
>      PC_Q35_1_7_MACHINE_OPTIONS,
>      .name = "pc-q35-1.7",
>      .alias = "q35",
> -    .init = pc_q35_init,
> +    .init = pc_q35_init_1_7,
>  };
>  
>  #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> @@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
>  
>  static void pc_q35_machine_init(void)
>  {
> +    qemu_register_machine(&pc_q35_machine_v1_8);
>      qemu_register_machine(&pc_q35_machine_v1_7);
>      qemu_register_machine(&pc_q35_machine_v1_6);
>      qemu_register_machine(&pc_q35_machine_v1_5);
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 03cc0ba..35a6885 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6)
       [not found]                   ` <20131113195832.GA29433@amt.cnet>
@ 2013-11-13 20:39                     ` Marcelo Tosatti
  2013-11-13 21:49                       ` Paolo Bonzini
  0 siblings, 1 reply; 53+ messages in thread
From: Marcelo Tosatti @ 2013-11-13 20:39 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann,
	Igor Mammedov

On Wed, Nov 13, 2013 at 05:58:32PM -0200, Marcelo Tosatti wrote:
> On Wed, Nov 13, 2013 at 06:13:15PM +0100, Paolo Bonzini wrote:
> > >             assert(piecetwosize <= holesize);
> > > 
> > >             piecetwosize = MIN(above_4g_mem_size, piecetwosize);
> > >             if ((above_4g_mem_size - piecetwosize) > 0) {
> > >                 memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> > >                                          ram, 0x100000000ULL,
> > >                                          above_4g_mem_size - piecetwosize);
> > >                 memory_region_add_subregion(system_memory, 0x100000000ULL,
> > >                                          ram_above_4g);
> > >             } else {
> > >                 g_free(ram_above_4g);
> > >             }
> > >             memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> > >                                      "ram-above-4g-piecetwo", ram,
> > >                                      0x100000000ULL - holesize, piecetwosize);
> > >             memory_region_add_subregion(system_memory,
> > >                                         0x100000000ULL +
> > >                                         above_4g_mem_size - piecetwosize,
> > >                                         ram_above_4g_piecetwo);
> > 
> > There is still a small problem in that the 2MB rounding must not be
> > done for old machine types.
> >
> > I did a really careful review of the code and everything else looks okay
> > to me.  However, it grew by accretion from v1 and now it took me really a
> > long time to figure it out...  I adjusted it a bit and the result seems
> > easier to understand to me.
> > 
> > Here's the hw/i386/pc.c part of the patch (the patch from v6 is unreadable):
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 12c436e..f2fd138 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1156,8 +1156,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >  {
> >      int linux_boot, i;
> >      MemoryRegion *ram, *option_rom_mr;
> > -    MemoryRegion *ram_below_4g, *ram_above_4g;
> > +    MemoryRegion *ram_below_4g, *ram_above_4g_pieceone, *ram_above_4g_piecetwo;
> >      FWCfgState *fw_cfg;
> > +    uint64_t holesize, pieceonesize, piecetwosize;
> > +    uint64_t memsize, align_offset;
> >  
> >      linux_boot = (kernel_filename != NULL);
> >  
> > @@ -1165,26 +1167,74 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >       * aliases to address portions of it, mostly for backwards compatibility
> >       * with older qemus that used qemu_ram_alloc().
> >       */
> > +    memsize = below_4g_mem_size + above_4g_mem_size;
> > +    holesize = 0x100000000ULL - below_4g_mem_size;
> > +
> > +    /* If 1GB hugepages are used to back guest RAM, we want the
> > +     * physical address 4GB to map to 4GB in the RAM, so that
> > +     * memory beyond 4GB is aligned on a 1GB boundary, at the
> > +     * host physical address space.  Thus, the ram block range
> > +     * [holestart, 4GB] is mapped to the last holesize bytes of RAM:
> > +     *
> > +     *                      0      h     4G     memsize-holesize
> > +     *
> > +     * contiguous-ram-block [xxxxxx][yyy][zzzzz]
> > +     *                                '-----------.
> > +     * guest-addr-space     [xxxxxx]     [zzzzz][yyy]
> > +     *
> > +     * This is only done in new-enough machine types, and of course
> > +     * it is only necessary if the [zzzzz] block exists at all.
> > +     */
> > +    if (guest_info->gb_align && above_4g_mem_size > holesize) {
> > +        /* Round the allocation up to 2 MB to use more hugepages.

To align to 2MB boundary, the number of hugepages is the same.

> > +         * Remove the slack from the [yyy] piece so that pieceonesize
> > +         * (and thus the start of piecetwo) remains aligned.
> > +         */
> > +        align_offset = ROUND_UP(memsize, 1UL << 21) - memsize;
> > +        piecetwosize = holesize - align_offset;
> > +    } else {
> > +        /* There's no "piece one", all memory above 4G starts

Piece two.

> > +         * at below_4g_mem_size in the RAM block.  Also no need
> > +         * to align anything.
> > +         */
> > +        align_offset = 0;
> > +        piecetwosize = above_4g_mem_size;
> > +    }
> > +
> >      ram = g_malloc(sizeof(*ram));
> > -    memory_region_init_ram(ram, NULL, "pc.ram",
> > -                           below_4g_mem_size + above_4g_mem_size);
> > +    memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset);
> >      vmstate_register_ram_global(ram);
> >      *ram_memory = ram;
> > +
> >      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> >      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> >                               0, below_4g_mem_size);
> >      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> > +
> > +    pieceonesize = above_4g_mem_size - piecetwosize;
> > +    if (pieceonesize) {
> > +        ram_above_4g_pieceone = g_malloc(sizeof(*ram_above_4g_pieceone));
> > +        memory_region_init_alias(ram_above_4g_pieceone, NULL,
> > +                                 "ram-above-4g-pieceone", ram,
> > +                                 0x100000000ULL, pieceonesize);
> > +        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +                                    ram_above_4g_pieceone);
> > +    }
> 
> Can you change the name of aliases and subregions without breaking
> migration?
> 
> Its much simpler, i'm fine with it.

Test with Q35?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6)
  2013-11-13 20:39                     ` Marcelo Tosatti
@ 2013-11-13 21:49                       ` Paolo Bonzini
  0 siblings, 0 replies; 53+ messages in thread
From: Paolo Bonzini @ 2013-11-13 21:49 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: aarcange, gleb, Michael S. Tsirkin, qemu-devel, Gerd Hoffmann,
	Igor Mammedov

> > > +    if (guest_info->gb_align && above_4g_mem_size > holesize) {
> > > +        /* Round the allocation up to 2 MB to use more hugepages.
> 
> To align to 2MB boundary, the number of hugepages is the same.

Right.

> > > +         * Remove the slack from the [yyy] piece so that pieceonesize
> > > +         * (and thus the start of piecetwo) remains aligned.
> > > +         */
> > > +        align_offset = ROUND_UP(memsize, 1UL << 21) - memsize;
> > > +        piecetwosize = holesize - align_offset;
> > > +    } else {
> > > +        /* There's no "piece one", all memory above 4G starts
> 
> Piece two.

I'm calling "piece one" the part that is aligned at 0x100000000 in the
RAM block, and "piece two" the part that starts at below_4g_mem_size.
I'll change to "there's no [zzzzz] region".

> > > +         * at below_4g_mem_size in the RAM block.  Also no need
> > > +         * to align anything.
> > > +         */
> > > +        align_offset = 0;
> > > +        piecetwosize = above_4g_mem_size;
> > > +    }
> > > +
> > >      ram = g_malloc(sizeof(*ram));
> > > -    memory_region_init_ram(ram, NULL, "pc.ram",
> > > -                           below_4g_mem_size + above_4g_mem_size);
> > > +    memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset);
> > >      vmstate_register_ram_global(ram);
> > >      *ram_memory = ram;
> > > +
> > >      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> > >      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> > >                               0, below_4g_mem_size);
> > >      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> > > +
> > > +    pieceonesize = above_4g_mem_size - piecetwosize;
> > > +    if (pieceonesize) {
> > > +        ram_above_4g_pieceone =
> > > g_malloc(sizeof(*ram_above_4g_pieceone));
> > > +        memory_region_init_alias(ram_above_4g_pieceone, NULL,
> > > +                                 "ram-above-4g-pieceone", ram,
> > > +                                 0x100000000ULL, pieceonesize);
> > > +        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > > +                                    ram_above_4g_pieceone);
> > > +    }
> > 
> > Can you change the name of aliases and subregions without breaking
> > migration?

Yes, memory regions are invisible except for RAM regions.

> Test with Q35?

Will do.  Thanks for the review!

Paolo

^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread, other threads:[~2013-11-13 21:50 UTC | newest]

Thread overview: 53+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-24 21:11 [Qemu-devel] [patch 0/2] align >4GB guest RAM to a 1GB boundary, in case of 1GB-sized hugetlbfs Marcelo Tosatti
2013-10-24 21:11 ` [Qemu-devel] [patch 1/2] exec: add qemu_get_ram_hpagesize Marcelo Tosatti
2013-10-24 21:12 ` [Qemu-devel] [patch 2/2] i386: pc: align gpa<->hpa on 1GB boundary Marcelo Tosatti
2013-10-24 21:55   ` Peter Maydell
2013-10-24 22:48     ` Marcelo Tosatti
2013-10-24 23:55   ` Paolo Bonzini
2013-10-25  4:58     ` Marcelo Tosatti
2013-10-25  8:52       ` Paolo Bonzini
2013-10-25 19:50         ` Marcelo Tosatti
2013-10-25 22:53           ` Paolo Bonzini
2013-10-30 11:07             ` Gerd Hoffmann
2013-10-30 11:47               ` Paolo Bonzini
2013-10-25  9:57       ` Igor Mammedov
2013-10-25 13:34         ` Marcelo Tosatti
2013-10-27 15:20           ` Igor Mammedov
2013-10-28 14:04             ` Marcelo Tosatti
2013-10-28 14:20               ` Marcelo Tosatti
2013-10-29 18:00               ` Igor Mammedov
2013-10-29 21:21                 ` Marcelo Tosatti
2013-10-30  8:48                   ` Gleb Natapov
2013-10-30 18:30                     ` Marcelo Tosatti
2013-10-29 18:18               ` [Qemu-devel] [RFC PATCH] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions Igor Mammedov
2013-10-29 21:38                 ` Marcelo Tosatti
2013-10-30 16:49                   ` Igor Mammedov
2013-10-30 18:51                     ` Marcelo Tosatti
2013-10-30 19:03                       ` Marcelo Tosatti
2013-10-30 19:56                       ` Igor Mammedov
2013-10-30 23:44                         ` Marcelo Tosatti
2013-11-07 15:25                           ` Igor Mammedov
2013-10-30 19:31                     ` Marcelo Tosatti
2013-10-30 20:28                       ` Paolo Bonzini
2013-11-06  1:49   ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v2) Marcelo Tosatti
2013-11-06  1:55     ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v3) Marcelo Tosatti
2013-11-06 11:59       ` Igor Mammedov
2013-11-06 12:07         ` Paolo Bonzini
2013-11-06 12:22           ` Igor Mammedov
2013-11-06 12:24             ` Paolo Bonzini
2013-11-06 21:31       ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v4) Marcelo Tosatti
2013-11-06 21:40         ` Michael S. Tsirkin
2013-11-06 21:53           ` Marcelo Tosatti
2013-11-06 22:15             ` Michael S. Tsirkin
2013-11-06 22:24               ` Marcelo Tosatti
2013-11-07 15:24         ` Igor Mammedov
2013-11-07 21:53           ` Marcelo Tosatti
2013-11-10 20:47             ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v5) Marcelo Tosatti
2013-11-12 12:45               ` Igor Mammedov
2013-11-12 20:32                 ` Marcelo Tosatti
2013-11-12 21:16               ` [Qemu-devel] i386: pc: align gpa<->hpa on 1GB boundary (v6) Marcelo Tosatti
2013-11-13 13:59                 ` Igor Mammedov
2013-11-13 17:13                 ` Paolo Bonzini
     [not found]                   ` <20131113195832.GA29433@amt.cnet>
2013-11-13 20:39                     ` Marcelo Tosatti
2013-11-13 21:49                       ` Paolo Bonzini
2013-11-13 19:08                 ` Michael S. Tsirkin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.