qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 1/5] pci-assign: remove the duplicate function name in debug message
@ 2013-05-23  8:47 Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 2/5] memory: check if the total numa memory size is equal to ram_size Wanlong Gao
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-23  8:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: Wanlong Gao

DEBUG() already includes the function name, so remove the duplicate name from the debug messages.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/i386/kvm/pci-assign.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
index ff85590..9896c28 100644
--- a/hw/i386/kvm/pci-assign.c
+++ b/hw/i386/kvm/pci-assign.c
@@ -226,7 +226,7 @@ static uint32_t slow_bar_readb(void *opaque, hwaddr addr)
     uint32_t r;
 
     r = *in;
-    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
 
     return r;
 }
@@ -238,7 +238,7 @@ static uint32_t slow_bar_readw(void *opaque, hwaddr addr)
     uint32_t r;
 
     r = *in;
-    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
 
     return r;
 }
@@ -250,7 +250,7 @@ static uint32_t slow_bar_readl(void *opaque, hwaddr addr)
     uint32_t r;
 
     r = *in;
-    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
 
     return r;
 }
@@ -260,7 +260,7 @@ static void slow_bar_writeb(void *opaque, hwaddr addr, uint32_t val)
     AssignedDevRegion *d = opaque;
     uint8_t *out = d->u.r_virtbase + addr;
 
-    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
+    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
     *out = val;
 }
 
@@ -269,7 +269,7 @@ static void slow_bar_writew(void *opaque, hwaddr addr, uint32_t val)
     AssignedDevRegion *d = opaque;
     uint16_t *out = (uint16_t *)(d->u.r_virtbase + addr);
 
-    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
+    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
     *out = val;
 }
 
@@ -278,7 +278,7 @@ static void slow_bar_writel(void *opaque, hwaddr addr, uint32_t val)
     AssignedDevRegion *d = opaque;
     uint32_t *out = (uint32_t *)(d->u.r_virtbase + addr);
 
-    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
+    DEBUG("addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
     *out = val;
 }
 
-- 
1.8.3.rc2.10.g0c2b1cf

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH 2/5] memory: check if the total numa memory size is equal to ram_size
  2013-05-23  8:47 [Qemu-devel] [PATCH 1/5] pci-assign: remove the duplicate function name in debug message Wanlong Gao
@ 2013-05-23  8:47 ` Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 3/5] memory: do not assign node_mem[] to 0 twice Wanlong Gao
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-23  8:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: Wanlong Gao

If the sum of the memory sizes assigned to the numa nodes is not
equal to the total ram size, the guest will place
all memory in a single node.

eg:
-m 1024 -smp sockets=2,cores=1,threads=1 -numa node,cpus=0,nodeid=0,mem=512 \
-numa node,nodeid=1,cpus=1,mem=256

(qemu) info numa
2 nodes
node 0 cpus: 0
node 0 size: 512 MB
node 1 cpus: 1
node 1 size: 256 MB

$ numactl -H
available: 1 nodes (0)
node 0 cpus: 0 1
node 0 size: 1023 MB
node 0 free: 821 MB
node distances:
node	0
0:     10

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 vl.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vl.c b/vl.c
index 59dc0b4..ce24bcd 100644
--- a/vl.c
+++ b/vl.c
@@ -4238,6 +4238,15 @@ int main(int argc, char **argv, char **envp)
             node_mem[i] = ram_size - usedmem;
         }
 
+        uint64_t numa_total = 0;
+        for (i = 0; i < nb_numa_nodes; i++)
+            numa_total += node_mem[i];
+        if (numa_total != ram_size) {
+            fprintf(stderr, "qemu: numa nodes total memory size "
+                            "should equal ram size\n");
+            exit(1);
+        }
+
         for (i = 0; i < nb_numa_nodes; i++) {
             if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) {
                 break;
-- 
1.8.3.rc2.10.g0c2b1cf

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH 3/5] memory: do not assign node_mem[] to 0 twice
  2013-05-23  8:47 [Qemu-devel] [PATCH 1/5] pci-assign: remove the duplicate function name in debug message Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 2/5] memory: check if the total numa memory size is equal to ram_size Wanlong Gao
@ 2013-05-23  8:47 ` Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 4/5] Add qemu_mbind interface for pinning memory to host node Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually Wanlong Gao
  3 siblings, 0 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-23  8:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: Wanlong Gao

node_mem[] is already initialized to 0 before the numa nodes are added,
so it is unnecessary to assign 0 to it again.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 vl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vl.c b/vl.c
index ce24bcd..5555b1d 100644
--- a/vl.c
+++ b/vl.c
@@ -1381,9 +1381,7 @@ static void numa_add(const char *optarg)
             exit(1);
         }
 
-        if (get_param_value(option, 128, "mem", optarg) == 0) {
-            node_mem[nodenr] = 0;
-        } else {
+        if (get_param_value(option, 128, "mem", optarg) != 0) {
             int64_t sval;
             sval = strtosz(option, &endptr);
             if (sval < 0 || *endptr) {
-- 
1.8.3.rc2.10.g0c2b1cf

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH 4/5] Add qemu_mbind interface for pinning memory to host node
  2013-05-23  8:47 [Qemu-devel] [PATCH 1/5] pci-assign: remove the duplicate function name in debug message Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 2/5] memory: check if the total numa memory size is equal to ram_size Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 3/5] memory: do not assign node_mem[] to 0 twice Wanlong Gao
@ 2013-05-23  8:47 ` Wanlong Gao
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually Wanlong Gao
  3 siblings, 0 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-23  8:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: Wanlong Gao

Add a qemu_mbind() interface for pinning memory to a host node
manually. It uses the mbind() syscall wrapper, which is defined
in libnuma.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 configure            | 18 ++++++++++++++++++
 include/qemu/osdep.h | 26 ++++++++++++++++++++++++++
 util/osdep.c         | 15 +++++++++++++++
 3 files changed, 59 insertions(+)

diff --git a/configure b/configure
index 5ae7e4a..5364d01 100755
--- a/configure
+++ b/configure
@@ -3141,6 +3141,20 @@ if compile_prog "" "" ; then
 fi
 
 ##########################################
+# check if we have mbind
+
+mbind=no
+cat > $TMPC << EOF
+#include <numaif.h>
+int main(void) { return mbind(0, 0, MPOL_BIND, 0, 0, 0); }
+EOF
+if compile_prog "" "-lnuma"; then
+    mbind=yes
+    LIBS="-lnuma $LIBS"
+    libs_qga="-lnuma $libs_qga"
+fi
+
+##########################################
 # check if we have usable SIGEV_THREAD_ID
 
 sigev_thread_id=no
@@ -3560,6 +3574,7 @@ echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "madvise           $madvise"
 echo "posix_madvise     $posix_madvise"
+echo "mbind             $mbind"
 echo "sigev_thread_id   $sigev_thread_id"
 echo "uuid support      $uuid"
 echo "libcap-ng support $cap_ng"
@@ -3875,6 +3890,9 @@ fi
 if test "$posix_madvise" = "yes" ; then
   echo "CONFIG_POSIX_MADVISE=y" >> $config_host_mak
 fi
+if test "$mbind" = "yes"; then
+  echo "CONFIG_MBIND=y" >> $config_host_mak
+fi
 if test "$sigev_thread_id" = "yes" ; then
   echo "CONFIG_SIGEV_THREAD_ID=y" >> $config_host_mak
 fi
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 57d7b1f..82a790e 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -152,6 +152,32 @@ int qemu_madvise(void *addr, size_t len, int advice);
 int qemu_open(const char *name, int flags, ...);
 int qemu_close(int fd);
 
+#define QEMU_MPOL_INVALID -1
+
+#if defined(CONFIG_MBIND)
+#include <numaif.h>
+/* Policies */
+#define QEMU_MPOL_DEFAULT MPOL_DEFAULT
+#define QEMU_MPOL_PREFERRED MPOL_PREFERRED
+#define QEMU_MPOL_BIND MPOL_BIND
+#define QEMU_MPOL_INTERLEAVE MPOL_INTERLEAVE
+/* Flags for qemu_mbind */
+#define QEMU_MPOL_MF_STRICT MPOL_MF_STRICT
+#define QEMU_MPOL_MF_MOVE MPOL_MF_MOVE
+#define QEMU_MPOL_MF_MOVE_ALL MPOL_MF_MOVE_ALL
+#else
+#define QEMU_MPOL_DEFAULT QEMU_MPOL_INVALID
+#define QEMU_MPOL_PREFERRED QEMU_MPOL_INVALID
+#define QEMU_MPOL_BIND QEMU_MPOL_INVALID
+#define QEMU_MPOL_INTERLEAVE QEMU_MPOL_INVALID
+#define QEMU_MPOL_MF_STRICT QEMU_MPOL_INVALID
+#define QEMU_MPOL_MF_MOVE QEMU_MPOL_INVALID
+#define QEMU_MPOL_MF_MOVE_ALL QEMU_MPOL_INVALID
+#endif
+int qemu_mbind(void *addr, unsigned long len, int mode,
+               unsigned long *nodemask, unsigned long maxnode,
+               unsigned flags);
+
 #if defined(__HAIKU__) && defined(__i386__)
 #define FMT_pid "%ld"
 #elif defined(WIN64)
diff --git a/util/osdep.c b/util/osdep.c
index 685c8ae..70f33c7 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -37,6 +37,10 @@
 #include <sys/mman.h>
 #endif
 
+#if defined(CONFIG_MBIND)
+#include <numaif.h>
+#endif
+
 #ifdef CONFIG_SOLARIS
 #include <sys/types.h>
 #include <sys/statvfs.h>
@@ -472,3 +476,14 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
     return readv_writev(fd, iov, iov_cnt, true);
 }
 #endif
+
+int qemu_mbind(void *addr, unsigned long len, int mode,
+               unsigned long *nodemask, unsigned long maxnode,
+               unsigned flags)
+{
+#if defined(CONFIG_MBIND)
+    return mbind(addr, len, mode, nodemask, maxnode, flags);
+#else
+    return 0;
+#endif
+}
-- 
1.8.3.rc2.10.g0c2b1cf

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually
  2013-05-23  8:47 [Qemu-devel] [PATCH 1/5] pci-assign: remove the duplicate function name in debug message Wanlong Gao
                   ` (2 preceding siblings ...)
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 4/5] Add qemu_mbind interface for pinning memory to host node Wanlong Gao
@ 2013-05-23  8:47 ` Wanlong Gao
  2013-05-24  7:10   ` Wanlong Gao
                     ` (2 more replies)
  3 siblings, 3 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-23  8:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: Wanlong Gao

Use mbind to pin guest numa node memory to host nodes manually.

If we are not able to pin memory to a host node, we may suffer a
performance regression caused by cross-node memory access.

With this patch, we can add manual pinning host node like this:
-m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1

And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
between device and qemu process. All pages of the guest will be pinned by get_user_pages().

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
    =>kvm_assign_device()
      => kvm_iommu_map_memslots()
        => kvm_iommu_map_pages()
           => kvm_pin_pages()

So, with a direct-attached device, the page count of every guest page will be +1 and
page migration will not work. AutoNUMA won't work either, and the direction given by libvirt is *ignored*.

Given all of the above, we need to manually pin memory to host nodes to avoid
such a cross-node memory access performance regression.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 exec.c                  | 21 +++++++++++++++++++++
 include/sysemu/sysemu.h |  1 +
 vl.c                    | 13 +++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/exec.c b/exec.c
index aec65c5..fe929ef 100644
--- a/exec.c
+++ b/exec.c
@@ -36,6 +36,8 @@
 #include "qemu/config-file.h"
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/sysemu.h"
+#include "qemu/bitops.h"
 #include "exec/address-spaces.h"
 #if defined(CONFIG_USER_ONLY)
 #include <qemu.h>
@@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
             memory_try_enable_merging(new_block->host, size);
         }
     }
+
+    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
+        int i;
+        uint64_t nodes_mem = 0;
+        unsigned long *maskp = g_malloc0(sizeof(*maskp));
+        for (i = 0; i < nb_numa_nodes; i++) {
+            *maskp = 0;
+            if (node_pin[i] != -1) {
+                set_bit(node_pin[i], maskp);
+                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
+                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
+                    perror("qemu_mbind");
+                    exit(1);
+                }
+            }
+            nodes_mem += node_mem[i];
+        }
+    }
+
     new_block->length = size;
 
     /* Keep the list sorted from biggest to smallest block.  */
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 2fb71af..ebf6580 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
 #define MAX_CPUMASK_BITS 255
 extern int nb_numa_nodes;
 extern uint64_t node_mem[MAX_NODES];
+extern int node_pin[MAX_NODES];
 extern unsigned long *node_cpumask[MAX_NODES];
 
 #define MAX_OPTION_ROMS 16
diff --git a/vl.c b/vl.c
index 5555b1d..3768002 100644
--- a/vl.c
+++ b/vl.c
@@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
 
 int nb_numa_nodes;
 uint64_t node_mem[MAX_NODES];
+int node_pin[MAX_NODES];
 unsigned long *node_cpumask[MAX_NODES];
 
 uint8_t qemu_uuid[16];
@@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
             }
             node_mem[nodenr] = sval;
         }
+
+        if (get_param_value(option, 128, "pin", optarg) != 0) {
+            int unsigned long long pin_node;
+            if (parse_uint_full(option, &pin_node, 10) < 0) {
+                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
+                exit(1);
+            } else {
+                node_pin[nodenr] = pin_node;
+            }
+        }
+
         if (get_param_value(option, 128, "cpus", optarg) != 0) {
             numa_node_parse_cpus(nodenr, option);
         }
@@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
 
     for (i = 0; i < MAX_NODES; i++) {
         node_mem[i] = 0;
+        node_pin[i] = -1;
         node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
     }
 
-- 
1.8.3.rc2.10.g0c2b1cf

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually Wanlong Gao
@ 2013-05-24  7:10   ` Wanlong Gao
  2013-05-27  2:57     ` Wanlong Gao
  2013-05-28  2:27   ` Wanlong Gao
  2013-05-30  9:57   ` Wanlong Gao
  2 siblings, 1 reply; 11+ messages in thread
From: Wanlong Gao @ 2013-05-24  7:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: Wanlong Gao

On 05/23/2013 04:47 PM, Wanlong Gao wrote:
> Use mbind to pin guest numa node memory to host nodes manually.
> 
> If we are not able to pin memory to host node, we may meet the
> cross node memory access performance regression.
> 
> With this patch, we can add manual pinning host node like this:
> -m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1
> 
> And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
> between device and qemu process. All pages of the guest will be pinned by get_user_pages().
> 
> KVM_ASSIGN_PCI_DEVICE ioctl
>   kvm_vm_ioctl_assign_device()
>     =>kvm_assign_device()
>       => kvm_iommu_map_memslots()
>         => kvm_iommu_map_pages()
>            => kvm_pin_pages()
> 
> So, with direct-attached-device, all guest page's page count will be +1 and
> any page migration will not work. AutoNUMA won't too. And direction by libvirt is *ignored*.
> 
> Above all, we need manual pinning memory to host node to avoid
> such cross nodes memmory access performance regression.

Any comments ?

Thanks,
Wanlong Gao

> 
> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
> ---
>  exec.c                  | 21 +++++++++++++++++++++
>  include/sysemu/sysemu.h |  1 +
>  vl.c                    | 13 +++++++++++++
>  3 files changed, 35 insertions(+)
> 
> diff --git a/exec.c b/exec.c
> index aec65c5..fe929ef 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -36,6 +36,8 @@
>  #include "qemu/config-file.h"
>  #include "exec/memory.h"
>  #include "sysemu/dma.h"
> +#include "sysemu/sysemu.h"
> +#include "qemu/bitops.h"
>  #include "exec/address-spaces.h"
>  #if defined(CONFIG_USER_ONLY)
>  #include <qemu.h>
> @@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>              memory_try_enable_merging(new_block->host, size);
>          }
>      }
> +
> +    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
> +        int i;
> +        uint64_t nodes_mem = 0;
> +        unsigned long *maskp = g_malloc0(sizeof(*maskp));
> +        for (i = 0; i < nb_numa_nodes; i++) {
> +            *maskp = 0;
> +            if (node_pin[i] != -1) {
> +                set_bit(node_pin[i], maskp);
> +                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
> +                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
> +                    perror("qemu_mbind");
> +                    exit(1);
> +                }
> +            }
> +            nodes_mem += node_mem[i];
> +        }
> +    }
> +
>      new_block->length = size;
>  
>      /* Keep the list sorted from biggest to smallest block.  */
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 2fb71af..ebf6580 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
>  #define MAX_CPUMASK_BITS 255
>  extern int nb_numa_nodes;
>  extern uint64_t node_mem[MAX_NODES];
> +extern int node_pin[MAX_NODES];
>  extern unsigned long *node_cpumask[MAX_NODES];
>  
>  #define MAX_OPTION_ROMS 16
> diff --git a/vl.c b/vl.c
> index 5555b1d..3768002 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>  
>  int nb_numa_nodes;
>  uint64_t node_mem[MAX_NODES];
> +int node_pin[MAX_NODES];
>  unsigned long *node_cpumask[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
>              }
>              node_mem[nodenr] = sval;
>          }
> +
> +        if (get_param_value(option, 128, "pin", optarg) != 0) {
> +            int unsigned long long pin_node;
> +            if (parse_uint_full(option, &pin_node, 10) < 0) {
> +                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
> +                exit(1);
> +            } else {
> +                node_pin[nodenr] = pin_node;
> +            }
> +        }
> +
>          if (get_param_value(option, 128, "cpus", optarg) != 0) {
>              numa_node_parse_cpus(nodenr, option);
>          }
> @@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          node_mem[i] = 0;
> +        node_pin[i] = -1;
>          node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
>      }
>  
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually
  2013-05-24  7:10   ` Wanlong Gao
@ 2013-05-27  2:57     ` Wanlong Gao
  0 siblings, 0 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-27  2:57 UTC (permalink / raw)
  To: qemu-devel; +Cc: gaowanlong

Ping............



> On 05/23/2013 04:47 PM, Wanlong Gao wrote:
>> Use mbind to pin guest numa node memory to host nodes manually.
>>
>> If we are not able to pin memory to host node, we may meet the
>> cross node memory access performance regression.
>>
>> With this patch, we can add manual pinning host node like this:
>> -m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1
>>
>> And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
>> between device and qemu process. All pages of the guest will be pinned by get_user_pages().
>>
>> KVM_ASSIGN_PCI_DEVICE ioctl
>>   kvm_vm_ioctl_assign_device()
>>     =>kvm_assign_device()
>>       => kvm_iommu_map_memslots()
>>         => kvm_iommu_map_pages()
>>            => kvm_pin_pages()
>>
>> So, with direct-attached-device, all guest page's page count will be +1 and
>> any page migration will not work. AutoNUMA won't too. And direction by libvirt is *ignored*.
>>
>> Above all, we need manual pinning memory to host node to avoid
>> such cross nodes memmory access performance regression.
> 
> Any comments ?
> 
> Thanks,
> Wanlong Gao
> 
>>
>> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
>> ---
>>  exec.c                  | 21 +++++++++++++++++++++
>>  include/sysemu/sysemu.h |  1 +
>>  vl.c                    | 13 +++++++++++++
>>  3 files changed, 35 insertions(+)
>>
>> diff --git a/exec.c b/exec.c
>> index aec65c5..fe929ef 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -36,6 +36,8 @@
>>  #include "qemu/config-file.h"
>>  #include "exec/memory.h"
>>  #include "sysemu/dma.h"
>> +#include "sysemu/sysemu.h"
>> +#include "qemu/bitops.h"
>>  #include "exec/address-spaces.h"
>>  #if defined(CONFIG_USER_ONLY)
>>  #include <qemu.h>
>> @@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>>              memory_try_enable_merging(new_block->host, size);
>>          }
>>      }
>> +
>> +    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
>> +        int i;
>> +        uint64_t nodes_mem = 0;
>> +        unsigned long *maskp = g_malloc0(sizeof(*maskp));
>> +        for (i = 0; i < nb_numa_nodes; i++) {
>> +            *maskp = 0;
>> +            if (node_pin[i] != -1) {
>> +                set_bit(node_pin[i], maskp);
>> +                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
>> +                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
>> +                    perror("qemu_mbind");
>> +                    exit(1);
>> +                }
>> +            }
>> +            nodes_mem += node_mem[i];
>> +        }
>> +    }
>> +
>>      new_block->length = size;
>>  
>>      /* Keep the list sorted from biggest to smallest block.  */
>> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
>> index 2fb71af..ebf6580 100644
>> --- a/include/sysemu/sysemu.h
>> +++ b/include/sysemu/sysemu.h
>> @@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
>>  #define MAX_CPUMASK_BITS 255
>>  extern int nb_numa_nodes;
>>  extern uint64_t node_mem[MAX_NODES];
>> +extern int node_pin[MAX_NODES];
>>  extern unsigned long *node_cpumask[MAX_NODES];
>>  
>>  #define MAX_OPTION_ROMS 16
>> diff --git a/vl.c b/vl.c
>> index 5555b1d..3768002 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>>  
>>  int nb_numa_nodes;
>>  uint64_t node_mem[MAX_NODES];
>> +int node_pin[MAX_NODES];
>>  unsigned long *node_cpumask[MAX_NODES];
>>  
>>  uint8_t qemu_uuid[16];
>> @@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
>>              }
>>              node_mem[nodenr] = sval;
>>          }
>> +
>> +        if (get_param_value(option, 128, "pin", optarg) != 0) {
>> +            int unsigned long long pin_node;
>> +            if (parse_uint_full(option, &pin_node, 10) < 0) {
>> +                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
>> +                exit(1);
>> +            } else {
>> +                node_pin[nodenr] = pin_node;
>> +            }
>> +        }
>> +
>>          if (get_param_value(option, 128, "cpus", optarg) != 0) {
>>              numa_node_parse_cpus(nodenr, option);
>>          }
>> @@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
>>  
>>      for (i = 0; i < MAX_NODES; i++) {
>>          node_mem[i] = 0;
>> +        node_pin[i] = -1;
>>          node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
>>      }
>>  
>>
> 
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually Wanlong Gao
  2013-05-24  7:10   ` Wanlong Gao
@ 2013-05-28  2:27   ` Wanlong Gao
  2013-05-30  9:57   ` Wanlong Gao
  2 siblings, 0 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-28  2:27 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini, Eduardo Habkost, Wanlong Gao

Any comments?


> Use mbind to pin guest numa node memory to host nodes manually.
> 
> If we are not able to pin memory to host node, we may meet the
> cross node memory access performance regression.
> 
> With this patch, we can add manual pinning host node like this:
> -m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1
> 
> And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
> between device and qemu process. All pages of the guest will be pinned by get_user_pages().
> 
> KVM_ASSIGN_PCI_DEVICE ioctl
>   kvm_vm_ioctl_assign_device()
>     =>kvm_assign_device()
>       => kvm_iommu_map_memslots()
>         => kvm_iommu_map_pages()
>            => kvm_pin_pages()
> 
> So, with direct-attached-device, all guest page's page count will be +1 and
> any page migration will not work. AutoNUMA won't too. And direction by libvirt is *ignored*.
> 
> Above all, we need manual pinning memory to host node to avoid
> such cross nodes memmory access performance regression.
> 
> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
> ---
>  exec.c                  | 21 +++++++++++++++++++++
>  include/sysemu/sysemu.h |  1 +
>  vl.c                    | 13 +++++++++++++
>  3 files changed, 35 insertions(+)
> 
> diff --git a/exec.c b/exec.c
> index aec65c5..fe929ef 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -36,6 +36,8 @@
>  #include "qemu/config-file.h"
>  #include "exec/memory.h"
>  #include "sysemu/dma.h"
> +#include "sysemu/sysemu.h"
> +#include "qemu/bitops.h"
>  #include "exec/address-spaces.h"
>  #if defined(CONFIG_USER_ONLY)
>  #include <qemu.h>
> @@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>              memory_try_enable_merging(new_block->host, size);
>          }
>      }
> +
> +    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
> +        int i;
> +        uint64_t nodes_mem = 0;
> +        unsigned long *maskp = g_malloc0(sizeof(*maskp));
> +        for (i = 0; i < nb_numa_nodes; i++) {
> +            *maskp = 0;
> +            if (node_pin[i] != -1) {
> +                set_bit(node_pin[i], maskp);
> +                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
> +                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
> +                    perror("qemu_mbind");
> +                    exit(1);
> +                }
> +            }
> +            nodes_mem += node_mem[i];
> +        }
> +    }
> +
>      new_block->length = size;
>  
>      /* Keep the list sorted from biggest to smallest block.  */
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 2fb71af..ebf6580 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
>  #define MAX_CPUMASK_BITS 255
>  extern int nb_numa_nodes;
>  extern uint64_t node_mem[MAX_NODES];
> +extern int node_pin[MAX_NODES];
>  extern unsigned long *node_cpumask[MAX_NODES];
>  
>  #define MAX_OPTION_ROMS 16
> diff --git a/vl.c b/vl.c
> index 5555b1d..3768002 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>  
>  int nb_numa_nodes;
>  uint64_t node_mem[MAX_NODES];
> +int node_pin[MAX_NODES];
>  unsigned long *node_cpumask[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
>              }
>              node_mem[nodenr] = sval;
>          }
> +
> +        if (get_param_value(option, 128, "pin", optarg) != 0) {
> +            int unsigned long long pin_node;
> +            if (parse_uint_full(option, &pin_node, 10) < 0) {
> +                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
> +                exit(1);
> +            } else {
> +                node_pin[nodenr] = pin_node;
> +            }
> +        }
> +
>          if (get_param_value(option, 128, "cpus", optarg) != 0) {
>              numa_node_parse_cpus(nodenr, option);
>          }
> @@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          node_mem[i] = 0;
> +        node_pin[i] = -1;
>          node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
>      }
>  
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually
  2013-05-23  8:47 ` [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually Wanlong Gao
  2013-05-24  7:10   ` Wanlong Gao
  2013-05-28  2:27   ` Wanlong Gao
@ 2013-05-30  9:57   ` Wanlong Gao
  2013-05-30 18:22     ` Eduardo Habkost
  2 siblings, 1 reply; 11+ messages in thread
From: Wanlong Gao @ 2013-05-30  9:57 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini, Eduardo Habkost, Wanlong Gao

Any comments?


> Use mbind to pin guest numa node memory to host nodes manually.
> 
> If we are not able to pin memory to host node, we may meet the
> cross node memory access performance regression.
> 
> With this patch, we can add manual pinning host node like this:
> -m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1
> 
> And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
> between device and qemu process. All pages of the guest will be pinned by get_user_pages().
> 
> KVM_ASSIGN_PCI_DEVICE ioctl
>   kvm_vm_ioctl_assign_device()
>     =>kvm_assign_device()
>       => kvm_iommu_map_memslots()
>         => kvm_iommu_map_pages()
>            => kvm_pin_pages()
> 
> So, with direct-attached-device, all guest page's page count will be +1 and
> any page migration will not work. AutoNUMA won't too. And direction by libvirt is *ignored*.
> 
> Above all, we need manual pinning memory to host node to avoid
> such cross nodes memmory access performance regression.
> 
> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
> ---
>  exec.c                  | 21 +++++++++++++++++++++
>  include/sysemu/sysemu.h |  1 +
>  vl.c                    | 13 +++++++++++++
>  3 files changed, 35 insertions(+)
> 
> diff --git a/exec.c b/exec.c
> index aec65c5..fe929ef 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -36,6 +36,8 @@
>  #include "qemu/config-file.h"
>  #include "exec/memory.h"
>  #include "sysemu/dma.h"
> +#include "sysemu/sysemu.h"
> +#include "qemu/bitops.h"
>  #include "exec/address-spaces.h"
>  #if defined(CONFIG_USER_ONLY)
>  #include <qemu.h>
> @@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>              memory_try_enable_merging(new_block->host, size);
>          }
>      }
> +
> +    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
> +        int i;
> +        uint64_t nodes_mem = 0;
> +        unsigned long *maskp = g_malloc0(sizeof(*maskp));
> +        for (i = 0; i < nb_numa_nodes; i++) {
> +            *maskp = 0;
> +            if (node_pin[i] != -1) {
> +                set_bit(node_pin[i], maskp);
> +                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
> +                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
> +                    perror("qemu_mbind");
> +                    exit(1);
> +                }
> +            }
> +            nodes_mem += node_mem[i];
> +        }
> +    }
> +
>      new_block->length = size;
>  
>      /* Keep the list sorted from biggest to smallest block.  */
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 2fb71af..ebf6580 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
>  #define MAX_CPUMASK_BITS 255
>  extern int nb_numa_nodes;
>  extern uint64_t node_mem[MAX_NODES];
> +extern int node_pin[MAX_NODES];
>  extern unsigned long *node_cpumask[MAX_NODES];
>  
>  #define MAX_OPTION_ROMS 16
> diff --git a/vl.c b/vl.c
> index 5555b1d..3768002 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>  
>  int nb_numa_nodes;
>  uint64_t node_mem[MAX_NODES];
> +int node_pin[MAX_NODES];
>  unsigned long *node_cpumask[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
>              }
>              node_mem[nodenr] = sval;
>          }
> +
> +        if (get_param_value(option, 128, "pin", optarg) != 0) {
> +            int unsigned long long pin_node;
> +            if (parse_uint_full(option, &pin_node, 10) < 0) {
> +                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
> +                exit(1);
> +            } else {
> +                node_pin[nodenr] = pin_node;
> +            }
> +        }
> +
>          if (get_param_value(option, 128, "cpus", optarg) != 0) {
>              numa_node_parse_cpus(nodenr, option);
>          }
> @@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          node_mem[i] = 0;
> +        node_pin[i] = -1;
>          node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
>      }
>  
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually
  2013-05-30  9:57   ` Wanlong Gao
@ 2013-05-30 18:22     ` Eduardo Habkost
  2013-05-31  8:45       ` Wanlong Gao
  0 siblings, 1 reply; 11+ messages in thread
From: Eduardo Habkost @ 2013-05-30 18:22 UTC (permalink / raw)
  To: Wanlong Gao; +Cc: Paolo Bonzini, qemu-devel

On Thu, May 30, 2013 at 05:57:21PM +0800, Wanlong Gao wrote:
> > Use mbind to pin guest numa node memory to host nodes manually.
> > 
> > If we are not able to pin memory to host node, we may meet the
> > cross node memory access performance regression.
> > 
> > With this patch, we can add manual pinning host node like this:
> > -m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1
> > 
> > And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
> > between device and qemu process. All pages of the guest will be pinned by get_user_pages().
> > 
> > KVM_ASSIGN_PCI_DEVICE ioctl
> >   kvm_vm_ioctl_assign_device()
> >     =>kvm_assign_device()
> >       => kvm_iommu_map_memslots()
> >         => kvm_iommu_map_pages()
> >            => kvm_pin_pages()
> > 
> > So, with direct-attached-device, all guest page's page count will be +1 and
> > any page migration will not work. AutoNUMA won't too. And direction by libvirt is *ignored*.
> > 
> > Above all, we need manual pinning memory to host node to avoid
> > such cross nodes memmory access performance regression.

I believe a similar approach (letting QEMU do the pinning itself) was
already proposed and rejected. See:
http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835
http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684

An alternative approach was proposed at:
http://article.gmane.org/gmane.comp.emulators.qemu/123001
(exporting virtual address information directly)

and another one at:
http://article.gmane.org/gmane.comp.emulators.qemu/157741
(keeping the files inside -mem-path-dir so they could be pinned manually
by other programs)

The approach I was planning to implement was the one proposed at:
http://article.gmane.org/gmane.comp.emulators.kvm.devel/93476
(exporting memory backing information through QMP, instead of depending
on predictable filenames on -mem-path-dir)


> > 
> > Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
> > ---
> >  exec.c                  | 21 +++++++++++++++++++++
> >  include/sysemu/sysemu.h |  1 +
> >  vl.c                    | 13 +++++++++++++
> >  3 files changed, 35 insertions(+)
> > 
> > diff --git a/exec.c b/exec.c
> > index aec65c5..fe929ef 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -36,6 +36,8 @@
> >  #include "qemu/config-file.h"
> >  #include "exec/memory.h"
> >  #include "sysemu/dma.h"
> > +#include "sysemu/sysemu.h"
> > +#include "qemu/bitops.h"
> >  #include "exec/address-spaces.h"
> >  #if defined(CONFIG_USER_ONLY)
> >  #include <qemu.h>
> > @@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
> >              memory_try_enable_merging(new_block->host, size);
> >          }
> >      }
> > +
> > +    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
> > +        int i;
> > +        uint64_t nodes_mem = 0;
> > +        unsigned long *maskp = g_malloc0(sizeof(*maskp));
> > +        for (i = 0; i < nb_numa_nodes; i++) {
> > +            *maskp = 0;
> > +            if (node_pin[i] != -1) {
> > +                set_bit(node_pin[i], maskp);
> > +                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
> > +                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
> > +                    perror("qemu_mbind");
> > +                    exit(1);
> > +                }
> > +            }
> > +            nodes_mem += node_mem[i];
> > +        }
> > +    }
> > +
> >      new_block->length = size;
> >  
> >      /* Keep the list sorted from biggest to smallest block.  */
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index 2fb71af..ebf6580 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
> >  #define MAX_CPUMASK_BITS 255
> >  extern int nb_numa_nodes;
> >  extern uint64_t node_mem[MAX_NODES];
> > +extern int node_pin[MAX_NODES];
> >  extern unsigned long *node_cpumask[MAX_NODES];
> >  
> >  #define MAX_OPTION_ROMS 16
> > diff --git a/vl.c b/vl.c
> > index 5555b1d..3768002 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
> >  
> >  int nb_numa_nodes;
> >  uint64_t node_mem[MAX_NODES];
> > +int node_pin[MAX_NODES];
> >  unsigned long *node_cpumask[MAX_NODES];
> >  
> >  uint8_t qemu_uuid[16];
> > @@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
> >              }
> >              node_mem[nodenr] = sval;
> >          }
> > +
> > +        if (get_param_value(option, 128, "pin", optarg) != 0) {
> > +            int unsigned long long pin_node;
> > +            if (parse_uint_full(option, &pin_node, 10) < 0) {
> > +                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
> > +                exit(1);
> > +            } else {
> > +                node_pin[nodenr] = pin_node;
> > +            }
> > +        }
> > +
> >          if (get_param_value(option, 128, "cpus", optarg) != 0) {
> >              numa_node_parse_cpus(nodenr, option);
> >          }
> > @@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
> >  
> >      for (i = 0; i < MAX_NODES; i++) {
> >          node_mem[i] = 0;
> > +        node_pin[i] = -1;
> >          node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
> >      }
> >  
> > 
> 
> 
> 

-- 
Eduardo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually
  2013-05-30 18:22     ` Eduardo Habkost
@ 2013-05-31  8:45       ` Wanlong Gao
  0 siblings, 0 replies; 11+ messages in thread
From: Wanlong Gao @ 2013-05-31  8:45 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: aarcange, qemu-devel, Blue Swirl, Anthony Liguori, Paolo Bonzini,
	Wanlong Gao

On 05/31/2013 02:22 AM, Eduardo Habkost wrote:
> On Thu, May 30, 2013 at 05:57:21PM +0800, Wanlong Gao wrote:
>>> Use mbind to pin guest numa node memory to host nodes manually.
>>>
>>> If we are not able to pin memory to host node, we may meet the
>>> cross node memory access performance regression.
>>>
>>> With this patch, we can add manual pinning host node like this:
>>> -m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1
>>>
>>> And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
>>> between device and qemu process. All pages of the guest will be pinned by get_user_pages().
>>>
>>> KVM_ASSIGN_PCI_DEVICE ioctl
>>>   kvm_vm_ioctl_assign_device()
>>>     =>kvm_assign_device()
>>>       => kvm_iommu_map_memslots()
>>>         => kvm_iommu_map_pages()
>>>            => kvm_pin_pages()
>>>
>>> So, with direct-attached-device, all guest page's page count will be +1 and
>>> any page migration will not work. AutoNUMA won't too. And direction by libvirt is *ignored*.
>>>
>>> Above all, we need manual pinning memory to host node to avoid
>>> such cross nodes memmory access performance regression.
> 
> I believe a similar approach (letting QEMU do the pinning itself) was
> already proposed and rejected. See:
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684
> 
> An alternative approach was proposed at;
> http://article.gmane.org/gmane.comp.emulators.qemu/123001
> (exporting virtual address information directly)
> 
> and another one at:
> http://article.gmane.org/gmane.comp.emulators.qemu/157741
> (keeping the files inside -mem-path-dir so they could be pinned manually
> by other programs)
> 
> The approach I was planning to implement was the one proposed at:
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/93476
> (exporting memory backing information through QMP, instead of depending
> on predictable filenames on -mem-path-dir)

Your proposal seems good, but as I said above, when PCI-passthrough is used,
the direct-attached device uses DMA transfers between the device and the qemu
process. All pages of the guest will be pinned by get_user_pages(). Then the
"numactl" directions through hugetlbfs files will not work, and neither will
AutoNUMA.  We should set the NUMA binding directions manually before the
PCI-passthrough devices are assigned. No external tool can resolve this
problem; only pinning memory manually inside QEMU can.
So, IMO, we should both support manually pinning memory nodes inside QEMU
and provide interfaces that allow external tools to decide the memory binding policy.

Thanks,
Wanlong Gao

> 
> 
>>>
>>> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
>>> ---
>>>  exec.c                  | 21 +++++++++++++++++++++
>>>  include/sysemu/sysemu.h |  1 +
>>>  vl.c                    | 13 +++++++++++++
>>>  3 files changed, 35 insertions(+)
>>>
>>> diff --git a/exec.c b/exec.c
>>> index aec65c5..fe929ef 100644
>>> --- a/exec.c
>>> +++ b/exec.c
>>> @@ -36,6 +36,8 @@
>>>  #include "qemu/config-file.h"
>>>  #include "exec/memory.h"
>>>  #include "sysemu/dma.h"
>>> +#include "sysemu/sysemu.h"
>>> +#include "qemu/bitops.h"
>>>  #include "exec/address-spaces.h"
>>>  #if defined(CONFIG_USER_ONLY)
>>>  #include <qemu.h>
>>> @@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>>>              memory_try_enable_merging(new_block->host, size);
>>>          }
>>>      }
>>> +
>>> +    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
>>> +        int i;
>>> +        uint64_t nodes_mem = 0;
>>> +        unsigned long *maskp = g_malloc0(sizeof(*maskp));
>>> +        for (i = 0; i < nb_numa_nodes; i++) {
>>> +            *maskp = 0;
>>> +            if (node_pin[i] != -1) {
>>> +                set_bit(node_pin[i], maskp);
>>> +                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
>>> +                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
>>> +                    perror("qemu_mbind");
>>> +                    exit(1);
>>> +                }
>>> +            }
>>> +            nodes_mem += node_mem[i];
>>> +        }
>>> +    }
>>> +
>>>      new_block->length = size;
>>>  
>>>      /* Keep the list sorted from biggest to smallest block.  */
>>> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
>>> index 2fb71af..ebf6580 100644
>>> --- a/include/sysemu/sysemu.h
>>> +++ b/include/sysemu/sysemu.h
>>> @@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
>>>  #define MAX_CPUMASK_BITS 255
>>>  extern int nb_numa_nodes;
>>>  extern uint64_t node_mem[MAX_NODES];
>>> +extern int node_pin[MAX_NODES];
>>>  extern unsigned long *node_cpumask[MAX_NODES];
>>>  
>>>  #define MAX_OPTION_ROMS 16
>>> diff --git a/vl.c b/vl.c
>>> index 5555b1d..3768002 100644
>>> --- a/vl.c
>>> +++ b/vl.c
>>> @@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>>>  
>>>  int nb_numa_nodes;
>>>  uint64_t node_mem[MAX_NODES];
>>> +int node_pin[MAX_NODES];
>>>  unsigned long *node_cpumask[MAX_NODES];
>>>  
>>>  uint8_t qemu_uuid[16];
>>> @@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
>>>              }
>>>              node_mem[nodenr] = sval;
>>>          }
>>> +
>>> +        if (get_param_value(option, 128, "pin", optarg) != 0) {
>>> +            int unsigned long long pin_node;
>>> +            if (parse_uint_full(option, &pin_node, 10) < 0) {
>>> +                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
>>> +                exit(1);
>>> +            } else {
>>> +                node_pin[nodenr] = pin_node;
>>> +            }
>>> +        }
>>> +
>>>          if (get_param_value(option, 128, "cpus", optarg) != 0) {
>>>              numa_node_parse_cpus(nodenr, option);
>>>          }
>>> @@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
>>>  
>>>      for (i = 0; i < MAX_NODES; i++) {
>>>          node_mem[i] = 0;
>>> +        node_pin[i] = -1;
>>>          node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
>>>      }
>>>  
>>>
>>
>>
>>
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2013-05-31  8:48 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-23  8:47 [Qemu-devel] [PATCH 1/5] pci-assign: remove the duplicate function name in debug message Wanlong Gao
2013-05-23  8:47 ` [Qemu-devel] [PATCH 2/5] memory: check if the total numa memory size is equal to ram_size Wanlong Gao
2013-05-23  8:47 ` [Qemu-devel] [PATCH 3/5] memory: do not assign node_mem[] to 0 twice Wanlong Gao
2013-05-23  8:47 ` [Qemu-devel] [PATCH 4/5] Add qemu_mbind interface for pinning memory to host node Wanlong Gao
2013-05-23  8:47 ` [Qemu-devel] [PATCH 5/5] memory: able to pin guest node memory to host node manually Wanlong Gao
2013-05-24  7:10   ` Wanlong Gao
2013-05-27  2:57     ` Wanlong Gao
2013-05-28  2:27   ` Wanlong Gao
2013-05-30  9:57   ` Wanlong Gao
2013-05-30 18:22     ` Eduardo Habkost
2013-05-31  8:45       ` Wanlong Gao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).