* [PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest
@ 2013-08-27  8:52 Elena Ufimtseva
  2013-08-27  8:52 ` [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest Elena Ufimtseva
                   ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Elena Ufimtseva @ 2013-08-27  8:52 UTC (permalink / raw)
  To: xen-devel; +Cc: Elena Ufimtseva

Enables a PV guest to discover the NUMA topology provided by Xen
and to initialize this topology on boot.
Xen provides NUMA memblocks aligned for the guest, the distance table,
and the CPU setup per node.
On failure the guest defaults to the dummy Linux NUMA init.

TODO:
Change XENMEM hypercall subop to SYSCTL.

Elena Ufimtseva (2):
  vNUMA PV Guest Linux support
  Enables NUMA for PV guest

 arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
 arch/x86/mm/numa.c                   |    8 +++
 arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
 arch/x86/xen/setup.c                 |    2 +-
 include/xen/interface/memory.h       |    1 +
 5 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h

-- 
1.7.10.4


* [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-27  8:52 [PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest Elena Ufimtseva
@ 2013-08-27  8:52 ` Elena Ufimtseva
  2013-08-27 13:52   ` David Vrabel
                     ` (2 more replies)
  2013-08-27  8:53 ` [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest Elena Ufimtseva
  2013-08-27 13:48 ` [PATCH RFC 0/2] linux/vnuma: vnuma topology " David Vrabel
  2 siblings, 3 replies; 29+ messages in thread
From: Elena Ufimtseva @ 2013-08-27  8:52 UTC (permalink / raw)
  To: xen-devel; +Cc: Elena Ufimtseva

Uses a subop hypercall to request the vNUMA topology from Xen.
Sets up the memory blocks (aligned by Xen), CPUs, and distance table
on boot. NUMA support must be compiled into the kernel.

Memory blocks are aligned by Xen, which is aware of the guest's initial
RAM layout.
If the memory blocks are incorrect, fall back to the default Linux
dummy NUMA init.

Requires XEN with vNUMA support.

Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376

TODO:
Change subop from XENMEM to SYSCTL.

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
 arch/x86/mm/numa.c                   |    8 +++
 arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
 include/xen/interface/memory.h       |    1 +
 4 files changed, 160 insertions(+)
 create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h

diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h
new file mode 100644
index 0000000..73c1bde
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma-xen.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_VNUMA_XEN_H
+#define _ASM_X86_VNUMA_XEN_H
+
+#ifdef CONFIG_XEN
+int __initdata xen_numa_init(void);
+
+struct vnuma_memblk {
+	          uint64_t start, end;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);
+
+struct vnuma_topology_info {
+	domid_t domid;
+	uint16_t nr_nodes;
+	GUEST_HANDLE(vnuma_memblk) vmemblk;
+	GUEST_HANDLE(int) vdistance;
+	GUEST_HANDLE(int) cpu_to_node;
+	GUEST_HANDLE(int) vnode_to_pnode;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
+
+struct xen_domctl {
+	uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
+	domid_t  domain;
+	union {
+		struct vnuma_topology_info vnuma;
+		} u;
+};
+typedef struct xen_domctl xen_domctl_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);
+
+#else
+int __init xen_numa_init(void) {}
+#endif
+
+#endif /* _ASM_X86_VNUMA_XEN_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93ba..3ec7855 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -20,6 +20,10 @@
 
 #include "numa_internal.h"
 
+#ifdef CONFIG_XEN
+#include "asm/xen/vnuma-xen.h"
+#endif
+
 int __initdata numa_off;
 nodemask_t numa_nodes_parsed __initdata;
 
@@ -621,6 +625,10 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
 	if (!numa_off) {
+#ifdef CONFIG_XEN
+		if (xen_pv_domain() && !numa_init(xen_numa_init))
+			return;
+#endif
 #ifdef CONFIG_X86_NUMAQ
 		if (!numa_init(numaq_numa_init))
 			return;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 193097e..4658e9d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
 #include <linux/memblock.h>
 #include <linux/edd.h>
 
+#include <asm/numa.h>
 #include <xen/xen.h>
 #include <xen/events.h>
 #include <xen/interface/xen.h>
@@ -69,6 +70,7 @@
 #include <asm/mwait.h>
 #include <asm/pci_x86.h>
 #include <asm/pat.h>
+#include <asm/xen/vnuma-xen.h>
 
 #ifdef CONFIG_ACPI
 #include <linux/acpi.h>
@@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
 	.x2apic_available	= xen_x2apic_para_available,
 };
 EXPORT_SYMBOL(x86_hyper_xen_hvm);
+
+/* Xen PV NUMA topology initialization */
+int __initdata xen_numa_init(void)
+{
+	struct vnuma_memblk *vblk;
+	struct vnuma_topology_info numa_topo;
+	uint64_t size, start, end;
+	int i, j, cpu, rc;
+	u64 phys, physd, physc;
+	u64 new_end_pfn;
+	size_t mem_size;
+
+	int *vdistance, *cpu_to_node;
+	unsigned long dist_size, cpu_to_node_size;
+	numa_topo.domid = DOMID_SELF;
+	rc = -EINVAL;
+	/* block mem reservation for memblks */
+	mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk);
+	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE);
+	if (!phys) {
+		pr_info("vNUMA: can't allocate memory for membloks size %zu\n", mem_size);
+		rc = -ENOMEM;
+		goto errout;
+	}
+	if (memblock_reserve(phys, mem_size) < 0) {
+		pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size);
+		rc = -ENOMEM;
+		goto errout;
+	}
+	vblk = __va(phys);
+	set_xen_guest_handle(numa_topo.vmemblk, vblk);
+
+	dist_size = num_possible_cpus() * num_possible_cpus() * sizeof(*numa_topo.vdistance);
+	physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, PAGE_SIZE);
+	if (!physd) {
+		pr_info("vNUMA: can't allocate memory for distance size %zu\n", dist_size);
+		rc = -ENOMEM;
+		goto errout;
+	}
+	if (memblock_reserve(physd, dist_size) < 0) {
+		pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", dist_size);
+		rc = -ENOMEM;
+		goto errout;
+	}
+	vdistance  = __va(physd);
+	set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance);
+
+	/* allocate memblock for vcpu to node mask of max size */
+	cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node);
+	physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), cpu_to_node_size, PAGE_SIZE);
+	if (!physc) {
+		pr_info("vNUMA: can't allocate memory for distance size %zu\n", cpu_to_node_size);
+		rc = -ENOMEM;
+		goto errout;
+	}
+	if (memblock_reserve(physc, cpu_to_node_size) < 0) {
+		pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", cpu_to_node_size);
+		goto errout;
+	}
+	cpu_to_node  = __va(physc);
+	set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node);
+	if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)
+		goto errout;
+	if (numa_topo.nr_nodes == 0)
+		/* Pass to DUMMY numa */
+		goto errout;
+	if (numa_topo.nr_nodes > num_possible_cpus()) {
+		pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes);
+		goto errout;
+	}
+	new_end_pfn = 0;
+	/* We have sizes in pages received from hypervisor, no holes and ordered */
+	for (i = 0; i < numa_topo.nr_nodes; i++) {
+		start = vblk[i].start;
+		end = vblk[i].end;
+		size = end - start;
+		pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n",
+			i, size, start, end);
+		if (numa_add_memblk(i, start, end)) {
+			pr_info("vNUMA: Could not add memblk[%d]\n", i);
+			rc = -EAGAIN;
+			goto errout;
+		}
+		node_set(i, numa_nodes_parsed);
+	}
+	setup_nr_node_ids();
+	/* Setting the cpu, apicid to node */
+	for_each_cpu(cpu, cpu_possible_mask) {
+		pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]);
+		set_apicid_to_node(cpu, cpu_to_node[cpu]);
+		numa_set_node(cpu, cpu_to_node[cpu]);
+		__apicid_to_node[cpu] = cpu_to_node[cpu];
+		cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
+	}
+	setup_nr_node_ids();
+	rc = 0;
+	for (i = 0; i < numa_topo.nr_nodes; i++)
+		for (j = 0; j < numa_topo.nr_nodes; j++) {
+			numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i));
+			pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i));
+		}
+errout:
+	if (phys)
+		memblock_free(__pa(phys), mem_size);
+	if (physd)
+		memblock_free(__pa(physd), dist_size);
+	if (physc)
+		memblock_free(__pa(physc), cpu_to_node_size);
+	if (rc)
+		pr_debug("XEN: hypercall failed to get vNUMA topology.\n");
+	return rc;
+}
+
 #endif
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 2ecfe4f..16b8d87 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -263,4 +263,5 @@ struct xen_remove_from_physmap {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
 
+#define XENMEM_get_vnuma_info	25
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
-- 
1.7.10.4


* [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
  2013-08-27  8:52 [PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest Elena Ufimtseva
  2013-08-27  8:52 ` [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest Elena Ufimtseva
@ 2013-08-27  8:53 ` Elena Ufimtseva
  2013-08-27 13:35   ` David Vrabel
  2013-08-27 14:17   ` Konrad Rzeszutek Wilk
  2013-08-27 13:48 ` [PATCH RFC 0/2] linux/vnuma: vnuma topology " David Vrabel
  2 siblings, 2 replies; 29+ messages in thread
From: Elena Ufimtseva @ 2013-08-27  8:53 UTC (permalink / raw)
  To: xen-devel; +Cc: Elena Ufimtseva

Enables NUMA support for a PV guest and uses the vNUMA
interface to discover the topology provided by Xen.

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 arch/x86/xen/setup.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 056d11f..7e0c93b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
 	WARN_ON(xen_set_default_idle());
 	fiddle_vdso();
 #ifdef CONFIG_NUMA
-	numa_off = 1;
+	numa_off = 0;
 #endif
 }
-- 
1.7.10.4


* Re: [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
  2013-08-27  8:53 ` [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest Elena Ufimtseva
@ 2013-08-27 13:35   ` David Vrabel
  2013-08-27 15:36     ` Konrad Rzeszutek Wilk
  2013-08-27 14:17   ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 29+ messages in thread
From: David Vrabel @ 2013-08-27 13:35 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: xen-devel

On 27/08/13 09:53, Elena Ufimtseva wrote:
> Enables NUMA support for PV guest and uses vNUMA
> interface to discover topology provided by XEN.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  arch/x86/xen/setup.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 056d11f..7e0c93b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
>  	WARN_ON(xen_set_default_idle());
>  	fiddle_vdso();
>  #ifdef CONFIG_NUMA
> -	numa_off = 1;
> +	numa_off = 0;
>  #endif
>  }

Hmm. This is either safe to turn on now or it will break on hosts
without vNUMA support as it will cause a fallback to try all the
hardware specific NUMA initialization.

Which is it?

David


* Re: [PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest
  2013-08-27  8:52 [PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest Elena Ufimtseva
  2013-08-27  8:52 ` [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest Elena Ufimtseva
  2013-08-27  8:53 ` [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest Elena Ufimtseva
@ 2013-08-27 13:48 ` David Vrabel
  2 siblings, 0 replies; 29+ messages in thread
From: David Vrabel @ 2013-08-27 13:48 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: Boris Ostrovsky, xen-devel

On 27/08/13 09:52, Elena Ufimtseva wrote:
> Enables PV guest to discover NUMA topology provided by XEN
> and inits this topology on boot.
> Xen provides numa memblocks aligned for guest, distance table,
> cpu setup per node.
> On failure defaults to dummy linux numa init.
> 
> TODO:
> Change XENMEM hypercall subop to SYSCTL.

Please Cc the Xen maintainers in future: Konrad, Boris and myself.  You
will also need to Cc the x86 maintainers when it's no longer an RFC.

David

> 
> Elena Ufimtseva (2):
>   vNUMA PV Guest Linux support
>   Enables NUMA for PV guest
> 
>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>  arch/x86/mm/numa.c                   |    8 +++
>  arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
>  arch/x86/xen/setup.c                 |    2 +-
>  include/xen/interface/memory.h       |    1 +
>  5 files changed, 161 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
> 


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-27  8:52 ` [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest Elena Ufimtseva
@ 2013-08-27 13:52   ` David Vrabel
  2013-08-27 14:33   ` Konrad Rzeszutek Wilk
  2013-08-28  1:27   ` Matt Wilson
  2 siblings, 0 replies; 29+ messages in thread
From: David Vrabel @ 2013-08-27 13:52 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: xen-devel

On 27/08/13 09:52, Elena Ufimtseva wrote:
> Uses subop hypercall to request XEN about vnuma topology.
> Sets the memory blocks (aligned by XEN), cpus, distance table
> on boot. NUMA support should be compiled in kernel.
> 
> NUMA support should be compiled in kernel.
> 
> Memory blocks are aligned by xen. Xen is aware of guest initial
> RAM layout.
> If the memory blocks are incorrect, call for default linux numa
> dummy init.
> 
> Requires XEN with vNUMA support.
> 
> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376
> 
> TODO:
> Change subop from XENMEM to SYSCTL.

XENMEM subop seems right here.

> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>  arch/x86/mm/numa.c                   |    8 +++
>  arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
>  include/xen/interface/memory.h       |    1 +
>  4 files changed, 160 insertions(+)
>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
> 
> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h
> new file mode 100644
> index 0000000..73c1bde
> --- /dev/null
> +++ b/arch/x86/include/asm/xen/vnuma-xen.h
> @@ -0,0 +1,36 @@
> +#ifndef _ASM_X86_VNUMA_XEN_H
> +#define _ASM_X86_VNUMA_XEN_H
> +
> +#ifdef CONFIG_XEN
> +int __initdata xen_numa_init(void);
> +
> +struct vnuma_memblk {
> +	          uint64_t start, end;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);
> +
> +struct vnuma_topology_info {
> +	domid_t domid;
> +	uint16_t nr_nodes;
> +	GUEST_HANDLE(vnuma_memblk) vmemblk;
> +	GUEST_HANDLE(int) vdistance;
> +	GUEST_HANDLE(int) cpu_to_node;
> +	GUEST_HANDLE(int) vnode_to_pnode;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
> +
> +struct xen_domctl {
> +	uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
> +	domid_t  domain;
> +	union {
> +		struct vnuma_topology_info vnuma;
> +		} u;
> +};
> +typedef struct xen_domctl xen_domctl_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);

Most of this is the public interface provided by Xen; it should be in
include/xen/interface/memory.h, with only the xen_numa_init() prototype
left in this header.
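
Roughly like this (just a sketch, reusing the definitions from this
patch as-is):

/* include/xen/interface/memory.h */
#define XENMEM_get_vnuma_info	25

struct vnuma_memblk {
	uint64_t start, end;
};
DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);

struct vnuma_topology_info {
	domid_t domid;
	uint16_t nr_nodes;
	GUEST_HANDLE(vnuma_memblk) vmemblk;
	GUEST_HANDLE(int) vdistance;
	GUEST_HANDLE(int) cpu_to_node;
	GUEST_HANDLE(int) vnode_to_pnode;
};
DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);

/* arch/x86/include/asm/xen/vnuma-xen.h then only needs: */
int xen_numa_init(void);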

> +
> +#else
> +int __init xen_numa_init(void) {}
> +#endif
> +
> +#endif /* _ASM_X86_VNUMA_XEN_H */
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 8bf93ba..3ec7855 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -20,6 +20,10 @@

#include <asm/xen/vnuma.h>

here.

>  #include "numa_internal.h"
>  
> +#ifdef CONFIG_XEN
> +#include "asm/xen/vnuma-xen.h"
> +#endif

Instead of this.

> +
>  int __initdata numa_off;
>  nodemask_t numa_nodes_parsed __initdata;
>  
> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void)
>  void __init x86_numa_init(void)
>  {
>  	if (!numa_off) {
> +#ifdef CONFIG_XEN
> +		if (xen_pv_domain() && !numa_init(xen_numa_init))
> +			return;
> +#endif

Move the test for xen_pv_domain() into xen_numa_init.

>  #ifdef CONFIG_X86_NUMAQ
>  		if (!numa_init(numaq_numa_init))
>  			return;
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 193097e..4658e9d 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -33,6 +33,7 @@
>  #include <linux/memblock.h>
>  #include <linux/edd.h>
>  
> +#include <asm/numa.h>
>  #include <xen/xen.h>
>  #include <xen/events.h>
>  #include <xen/interface/xen.h>
> @@ -69,6 +70,7 @@
>  #include <asm/mwait.h>
>  #include <asm/pci_x86.h>
>  #include <asm/pat.h>
> +#include <asm/xen/vnuma-xen.h>
>  
>  #ifdef CONFIG_ACPI
>  #include <linux/acpi.h>
> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
>  	.x2apic_available	= xen_x2apic_para_available,
>  };
>  EXPORT_SYMBOL(x86_hyper_xen_hvm);
> +
> +/* Xen PV NUMA topology initialization */
> +int __initdata xen_numa_init(void)

This function needs to more clearly separate the different parts.  This
could be by splitting it into several functions or by improving the
comments and spacing.

I'd also move it to a new file: arch/x86/xen/vnuma.c.
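
For example, something with this shape (only a sketch; the helper names
below are made up):

/* arch/x86/xen/vnuma.c */
static int __init xen_vnuma_alloc_buffers(struct vnuma_topology_info *info);
static void __init xen_vnuma_free_buffers(struct vnuma_topology_info *info);
static int __init xen_vnuma_add_memblks(const struct vnuma_memblk *vblk, int nr_nodes);
static void __init xen_vnuma_setup_cpus(const int *cpu_to_node);
static void __init xen_vnuma_set_distances(const int *vdistance, int nr_nodes);

/* xen_numa_init() then just calls the helpers above in order and keeps
 * a single cleanup path at the end. */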

> +{
> +	struct vnuma_memblk *vblk;
> +	struct vnuma_topology_info numa_topo;
> +	uint64_t size, start, end;
> +	int i, j, cpu, rc;
> +	u64 phys, physd, physc;
> +	u64 new_end_pfn;
> +	size_t mem_size;
> +
> +	int *vdistance, *cpu_to_node;
> +	unsigned long dist_size, cpu_to_node_size;
> +	numa_topo.domid = DOMID_SELF;
> +	rc = -EINVAL;
> +	/* block mem reservation for memblks */
> +	mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk);
> +	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE);
> +	if (!phys) {
> +		pr_info("vNUMA: can't allocate memory for membloks size %zu\n", mem_size);
> +		rc = -ENOMEM;
> +		goto errout;
> +	}
> +	if (memblock_reserve(phys, mem_size) < 0) {
> +		pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size);
> +		rc = -ENOMEM;
> +		goto errout;
> +	}

Does memblock_alloc() work here instead of the find_in_range/reserve
combo?  Similarly for the other allocations.
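
If it does, each find_in_range/reserve pair collapses to one call.  An
untested sketch (__memblock_alloc_base() is the variant that keeps the
error return and the max_pfn_mapped upper bound):

	phys = __memblock_alloc_base(mem_size, PAGE_SIZE,
				     PFN_PHYS(max_pfn_mapped));
	if (!phys) {
		rc = -ENOMEM;
		goto errout;
	}
	vblk = __va(phys);
	set_xen_guest_handle(numa_topo.vmemblk, vblk);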

> +	vblk = __va(phys);
> +	set_xen_guest_handle(numa_topo.vmemblk, vblk);

Please group all the memory allocations followed by grouping the
initialization of the hypercall arguments.

> +	if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)
> +		goto errout;

ret = HYPERVISOR_memory_op(...);
if (ret < 0)
     goto out;

> +	if (numa_topo.nr_nodes == 0)
> +		/* Pass to DUMMY numa */
> +		goto errout;
> +	if (numa_topo.nr_nodes > num_possible_cpus()) {
> +		pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes);
> +		goto errout;
> +	}

Is this a constraint of the Linux NUMA support?

> +	new_end_pfn = 0;
> +	/* We have sizes in pages received from hypervisor, no holes and ordered */

How does this play with the relocation of frames done for dom0 during
initial setup?

> +	for (i = 0; i < numa_topo.nr_nodes; i++) {
> +		start = vblk[i].start;
> +		end = vblk[i].end;
> +		size = end - start;
> +		pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n",
> +			i, size, start, end);
> +		if (numa_add_memblk(i, start, end)) {
> +			pr_info("vNUMA: Could not add memblk[%d]\n", i);
> +			rc = -EAGAIN;
> +			goto errout;
> +		}
> +		node_set(i, numa_nodes_parsed);
> +	}
> +	setup_nr_node_ids();
> +	/* Setting the cpu, apicid to node */
> +	for_each_cpu(cpu, cpu_possible_mask) {
> +		pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]);
> +		set_apicid_to_node(cpu, cpu_to_node[cpu]);
> +		numa_set_node(cpu, cpu_to_node[cpu]);
> +		__apicid_to_node[cpu] = cpu_to_node[cpu];
> +		cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
> +	}
> +	setup_nr_node_ids();
> +	rc = 0;
> +	for (i = 0; i < numa_topo.nr_nodes; i++)

Style: {} around this multi-line body.

> +		for (j = 0; j < numa_topo.nr_nodes; j++) {
> +			numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i));
> +			pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i));
> +		}
> +errout:

This isn't (always) an error path; name this label "out".

> +	if (phys)
> +		memblock_free(__pa(phys), mem_size);
> +	if (physd)
> +		memblock_free(__pa(physd), dist_size);
> +	if (physc)
> +		memblock_free(__pa(physc), cpu_to_node_size);
> +	if (rc)
> +		pr_debug("XEN: hypercall failed to get vNUMA topology.\n");

Remove this debug print; you already have prints for the specific failures.

> +	return rc;
> +}
> +
>  #endif

David


* Re: [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
  2013-08-27  8:53 ` [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest Elena Ufimtseva
  2013-08-27 13:35   ` David Vrabel
@ 2013-08-27 14:17   ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-27 14:17 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: xen-devel

On Tue, Aug 27, 2013 at 04:53:00AM -0400, Elena Ufimtseva wrote:
> Enables NUMA support for PV guest and uses vNUMA
> interface to discover topology provided by XEN.

I think this should depend on a hypercall to figure out
whether Xen supports this mechanism?
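
Something along the lines of (untested sketch; it just relies on an
unknown XENMEM subop returning -ENOSYS):

	static bool __init xen_vnuma_supported(void)
	{
		struct vnuma_topology_info ti = { .domid = DOMID_SELF };

		/* Handles stay NULL; we only care whether the subop exists. */
		return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &ti) != -ENOSYS;
	}
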

> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  arch/x86/xen/setup.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 056d11f..7e0c93b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
>  	WARN_ON(xen_set_default_idle());
>  	fiddle_vdso();
>  #ifdef CONFIG_NUMA
> -	numa_off = 1;
> +	numa_off = 0;
>  #endif
>  }
> -- 
> 1.7.10.4
> 


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-27  8:52 ` [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest Elena Ufimtseva
  2013-08-27 13:52   ` David Vrabel
@ 2013-08-27 14:33   ` Konrad Rzeszutek Wilk
  2013-08-28 13:41     ` Elena Ufimtseva
  2013-08-28 23:12     ` Dario Faggioli
  2013-08-28  1:27   ` Matt Wilson
  2 siblings, 2 replies; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-27 14:33 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: xen-devel

On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> Uses subop hypercall to request XEN about vnuma topology.
> Sets the memory blocks (aligned by XEN), cpus, distance table
> on boot. NUMA support should be compiled in kernel.

Needs a bit more detail (which I am sure you will add in the
next postings). Mostly:
 - How E820 and NUMA information mesh
 - What the hypercall provides
 - Which changeset in Xen provides this hypercall
 - 
> 
> NUMA support should be compiled in kernel.
> 
> Memory blocks are aligned by xen. Xen is aware of guest initial
> RAM layout.
> If the memory blocks are incorrect, call for default linux numa
> dummy init.

Unfortunately that won't happen if you boot this under dom0. On some
AMD machines it will find the AMD Northbridge, try to scan that, and
blow up.

If you look at the git commit that did the 'numa = 0' you will see
the backstory.

I think part of the enablement of 'numa = 1' is to wrap it with

	if (xen_initial_domain() && xen_supports_this_hypercall())
		numa = 1;

in the #2 patch.
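
i.e. with the actual variable from the #2 patch (sketch only;
xen_supports_this_hypercall() stands for whatever probe we settle on):

#ifdef CONFIG_NUMA
	/* Leave NUMA off unless the vNUMA hypercall is known to be there. */
	numa_off = 1;
	if (xen_initial_domain() && xen_supports_this_hypercall())
		numa_off = 0;
#endif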

> 
> Requires XEN with vNUMA support.
> 
> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376
> 
> TODO:
> Change subop from XENMEM to SYSCTL.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>  arch/x86/mm/numa.c                   |    8 +++
>  arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
>  include/xen/interface/memory.h       |    1 +
>  4 files changed, 160 insertions(+)
>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
> 
> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h
> new file mode 100644
> index 0000000..73c1bde
> --- /dev/null
> +++ b/arch/x86/include/asm/xen/vnuma-xen.h
> @@ -0,0 +1,36 @@
> +#ifndef _ASM_X86_VNUMA_XEN_H
> +#define _ASM_X86_VNUMA_XEN_H
> +
> +#ifdef CONFIG_XEN
> +int __initdata xen_numa_init(void);
> +
> +struct vnuma_memblk {
> +	          uint64_t start, end;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);
> +
> +struct vnuma_topology_info {
> +	domid_t domid;
> +	uint16_t nr_nodes;
> +	GUEST_HANDLE(vnuma_memblk) vmemblk;
> +	GUEST_HANDLE(int) vdistance;
> +	GUEST_HANDLE(int) cpu_to_node;
> +	GUEST_HANDLE(int) vnode_to_pnode;

A comment explaining what those values are meant to hold would be good.
Perhaps with a simple example.

> +};

The structure is not very 32-bit friendly. Would it be possible
to add some padding so that the size of this structure is the
same on 32-bit and 64-bit please?

Perhaps:
	domid_t domid;
	uint16_t nr_nodes;
	uint32_t _pad;
	GUEST_HANDLE(vnuma_memblk) vmemblk;
	GUEST_HANDLE(int) vdistance;
	GUEST_HANDLE(int) cpu_to_node;
	GUEST_HANDLE(int) vnode_to_pnode;

That should make the offsets be the same on both 32 and 64-bit
I think.

> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
> +
> +struct xen_domctl {
> +	uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
> +	domid_t  domain;
> +	union {
> +		struct vnuma_topology_info vnuma;
> +		} u;

Move that 'u' one tab back please.

> +};
> +typedef struct xen_domctl xen_domctl_t;

Ewww. No typedefs in the Linux code please.

> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);
> +
> +#else
> +int __init xen_numa_init(void) {}
> +#endif
> +
> +#endif /* _ASM_X86_VNUMA_XEN_H */
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 8bf93ba..3ec7855 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -20,6 +20,10 @@
>  
>  #include "numa_internal.h"
>  
> +#ifdef CONFIG_XEN
> +#include "asm/xen/vnuma-xen.h"
> +#endif
> +
>  int __initdata numa_off;
>  nodemask_t numa_nodes_parsed __initdata;
>  
> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void)
>  void __init x86_numa_init(void)
>  {
>  	if (!numa_off) {
> +#ifdef CONFIG_XEN
> +		if (xen_pv_domain() && !numa_init(xen_numa_init))

I would remove the xen_pv_domain() check and add that in enlighten.c code.

> +			return;
> +#endif
>  #ifdef CONFIG_X86_NUMAQ
>  		if (!numa_init(numaq_numa_init))
>  			return;
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 193097e..4658e9d 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -33,6 +33,7 @@
>  #include <linux/memblock.h>
>  #include <linux/edd.h>
>  
> +#include <asm/numa.h>
>  #include <xen/xen.h>
>  #include <xen/events.h>
>  #include <xen/interface/xen.h>
> @@ -69,6 +70,7 @@
>  #include <asm/mwait.h>
>  #include <asm/pci_x86.h>
>  #include <asm/pat.h>
> +#include <asm/xen/vnuma-xen.h>
>  
>  #ifdef CONFIG_ACPI
>  #include <linux/acpi.h>
> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
>  	.x2apic_available	= xen_x2apic_para_available,
>  };
>  EXPORT_SYMBOL(x86_hyper_xen_hvm);
> +
> +/* Xen PV NUMA topology initialization */
> +int __initdata xen_numa_init(void)
> +{
> +	struct vnuma_memblk *vblk;
> +	struct vnuma_topology_info numa_topo;
> +	uint64_t size, start, end;
> +	int i, j, cpu, rc;
> +	u64 phys, physd, physc;
> +	u64 new_end_pfn;
> +	size_t mem_size;
> +
> +	int *vdistance, *cpu_to_node;
> +	unsigned long dist_size, cpu_to_node_size;

You need an extra blank line here.

> +	numa_topo.domid = DOMID_SELF;
> +	rc = -EINVAL;

Both of those can be part of the declaration of variables above. Like so:
	int rc = -EINVAL;
	struct vnuma_topology_info numa_topo = {
		.domid = DOMID_SELF,
	};

> +	/* block mem reservation for memblks */
> +	mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk);
> +	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE);
> +	if (!phys) {
> +		pr_info("vNUMA: can't allocate memory for membloks size %zu\n", mem_size);
> +		rc = -ENOMEM;
> +		goto errout;
> +	}
> +	if (memblock_reserve(phys, mem_size) < 0) {
> +		pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size);
> +		rc = -ENOMEM;
> +		goto errout;
> +	}
> +	vblk = __va(phys);
> +	set_xen_guest_handle(numa_topo.vmemblk, vblk);
> +
> +	dist_size = num_possible_cpus() * num_possible_cpus() * sizeof(*numa_topo.vdistance);
> +	physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, PAGE_SIZE);
> +	if (!physd) {
> +		pr_info("vNUMA: can't allocate memory for distance size %zu\n", dist_size);
> +		rc = -ENOMEM;
> +		goto errout;
> +	}
> +	if (memblock_reserve(physd, dist_size) < 0) {
> +		pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", dist_size);
> +		rc = -ENOMEM;
> +		goto errout;
> +	}
> +	vdistance  = __va(physd);
> +	set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance);
> +
> +	/* allocate memblock for vcpu to node mask of max size */
> +	cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node);
> +	physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), cpu_to_node_size, PAGE_SIZE);
> +	if (!physc) {
> +		pr_info("vNUMA: can't allocate memory for distance size %zu\n", cpu_to_node_size);
> +		rc = -ENOMEM;
> +		goto errout;
> +	}
> +	if (memblock_reserve(physc, cpu_to_node_size) < 0) {
> +		pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", cpu_to_node_size);
> +		goto errout;
> +	}
> +	cpu_to_node  = __va(physc);
> +	set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node);
> +	if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)

I think you need to do rc = HYPERVISOR...

and then if you get rc = -ENOSYS (so not implemented) change the rc to zero.

> +		goto errout;
> +	if (numa_topo.nr_nodes == 0)
> +		/* Pass to DUMMY numa */

Also change rc to zero.

> +		goto errout;
> +	if (numa_topo.nr_nodes > num_possible_cpus()) {
> +		pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes);
> +		goto errout;
> +	}
> +	new_end_pfn = 0;
> +	/* We have sizes in pages received from hypervisor, no holes and ordered */

From toolstack you mean. It is the one setting it up. And I hope it sets it up
based on the E820 it has constructed as well? Otherwise it would be a bit
awkward if those two were different.

> +	for (i = 0; i < numa_topo.nr_nodes; i++) {
> +		start = vblk[i].start;
> +		end = vblk[i].end;
> +		size = end - start;
> +		pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n",
> +			i, size, start, end);

pr_debug perhaps?

> +		if (numa_add_memblk(i, start, end)) {
> +			pr_info("vNUMA: Could not add memblk[%d]\n", i);
> +			rc = -EAGAIN;

Can you unroll the NUMA topology if this fails? Or is that not a problem?

> +			goto errout;
> +		}
> +		node_set(i, numa_nodes_parsed);
> +	}
> +	setup_nr_node_ids();
> +	/* Setting the cpu, apicid to node */
> +	for_each_cpu(cpu, cpu_possible_mask) {
> +		pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]);

pr_debug
> +		set_apicid_to_node(cpu, cpu_to_node[cpu]);
> +		numa_set_node(cpu, cpu_to_node[cpu]);
> +		__apicid_to_node[cpu] = cpu_to_node[cpu];
> +		cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
> +	}
> +	setup_nr_node_ids();

How come this is being called twice? It should have a comment
explaining this extra call.

> +	rc = 0;
> +	for (i = 0; i < numa_topo.nr_nodes; i++)
> +		for (j = 0; j < numa_topo.nr_nodes; j++) {
> +			numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i));

You could simplify this by just doing:

			int idx = (j * numa_topo.nr_nodes) + i;

			... *(vdistance + idx));

> +			pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i));

pr_debug

> +		}
> +errout:
> +	if (phys)
> +		memblock_free(__pa(phys), mem_size);
> +	if (physd)
> +		memblock_free(__pa(physd), dist_size);
> +	if (physc)
> +		memblock_free(__pa(physc), cpu_to_node_size);
> +	if (rc)
> +		pr_debug("XEN: hypercall failed to get vNUMA topology.\n");

And include the 'rc' value in the output please.
> +	return rc;
> +}
> +
>  #endif
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 2ecfe4f..16b8d87 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -263,4 +263,5 @@ struct xen_remove_from_physmap {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>  
> +#define XENMEM_get_vnuma_info	25
>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
> -- 
> 1.7.10.4
> 


* Re: [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
  2013-08-27 13:35   ` David Vrabel
@ 2013-08-27 15:36     ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-27 15:36 UTC (permalink / raw)
  To: David Vrabel; +Cc: Elena Ufimtseva, xen-devel

On Tue, Aug 27, 2013 at 02:35:18PM +0100, David Vrabel wrote:
> On 27/08/13 09:53, Elena Ufimtseva wrote:
> > Enables NUMA support for PV guest and uses vNUMA
> > interface to discover topology provided by XEN.
> > 
> > Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> > ---
> >  arch/x86/xen/setup.c |    2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> > index 056d11f..7e0c93b 100644
> > --- a/arch/x86/xen/setup.c
> > +++ b/arch/x86/xen/setup.c
> > @@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
> >  	WARN_ON(xen_set_default_idle());
> >  	fiddle_vdso();
> >  #ifdef CONFIG_NUMA
> > -	numa_off = 1;
> > +	numa_off = 0;
> >  #endif
> >  }
> 
> Hmm. This is either safe to turn on now or it will break on hosts
> without vNUMA support as it will cause a fallback to try all the
> hardware specific NUMA initialization.
> 
> Which is it?

It will blow up when booting as dom0 on certain AMD machines.

> 
> David
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-27  8:52 ` [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest Elena Ufimtseva
  2013-08-27 13:52   ` David Vrabel
  2013-08-27 14:33   ` Konrad Rzeszutek Wilk
@ 2013-08-28  1:27   ` Matt Wilson
  2013-08-28  1:37     ` Matt Wilson
  2 siblings, 1 reply; 29+ messages in thread
From: Matt Wilson @ 2013-08-28  1:27 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: xen-devel

On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> Uses subop hypercall to request XEN about vnuma topology.
> Sets the memory blocks (aligned by XEN), cpus, distance table
> on boot. NUMA support should be compiled in kernel.

Are we *really sure* that we want to go this route for PV vNUMA?
Couldn't we build just enough(tm) of the ACPI tables to express the
NUMA topology when constructing the domain? That's what we do for the
e820 map.

--msw

> NUMA support should be compiled in kernel.
> 
> Memory blocks are aligned by xen. Xen is aware of guest initial
> RAM layout.
> If the memory blocks are incorrect, call for default linux numa
> dummy init.
> 
> Requires XEN with vNUMA support.
> 
> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28  1:27   ` Matt Wilson
@ 2013-08-28  1:37     ` Matt Wilson
  2013-08-28 13:08       ` Dario Faggioli
  2013-08-28 16:01       ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 29+ messages in thread
From: Matt Wilson @ 2013-08-28  1:37 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: xen-devel

On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > Uses subop hypercall to request XEN about vnuma topology.
> > Sets the memory blocks (aligned by XEN), cpus, distance table
> > on boot. NUMA support should be compiled in kernel.
> 
> Are we *really sure* that we want to go this route for PV vNUMA?
> Couldn't we build just enough(tm) of the ACPI tables to express the
> NUMA topology when constructing the domain? That's what we do for the
> e820 map.

Ignore me somewhat, since the e820 information is retrieved via a
hypercall similar to what you're proposing.

Still, if there's some way that we can reuse existing Linux code
rather than bolting on a completely parallel mechanism to set this up
under PV I think it'd be better.

--msw


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28  1:37     ` Matt Wilson
@ 2013-08-28 13:08       ` Dario Faggioli
  2013-08-28 13:39         ` George Dunlap
  2013-08-28 16:01       ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 29+ messages in thread
From: Dario Faggioli @ 2013-08-28 13:08 UTC (permalink / raw)
  To: Matt Wilson; +Cc: konrad.r.wilk, David Vrabel, Elena Ufimtseva, xen-devel



On mar, 2013-08-27 at 18:37 -0700, Matt Wilson wrote:
> On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > > Uses subop hypercall to request XEN about vnuma topology.
> > > Sets the memory blocks (aligned by XEN), cpus, distance table
> > > on boot. NUMA support should be compiled in kernel.
> > 
> > Are we *really sure* that we want to go this route for PV vNUMA?
> > Couldn't we build just enough(tm) of the ACPI tables to express the
> > NUMA topology when constructing the domain? That's what we do for the
> > e820 map.
> 
> Ignore me somewhat, since the e820 information is retrieved via
> hypercall similar to what you're proposing.
> 
:-)

> Still, if there's some way that we can reuse existing Linux code
> rather than bolting on a completely parallel mechanism to set this up
> under PV I think it'd be better.
> 
Well, it looks to me that Elena is reusing quite a bit of it, isn't she?
All she's providing is a new initialization function (xen_numa_init()),
as already happens for ACPI NUMA, NUMAQ, and the other NUMA
implementations.

In practice, while the ACPI-based NUMA code parses the ACPI tables in
acpi_numa_init(), PV vNUMA parses the information coming from a
hypercall in xen_numa_init(). From that point on, Linux steps in and
does everything else "as usual".
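
To make it concrete, with the patch applied the whole init chain in
x86_numa_init() looks roughly like this (condensed; the non-Xen entries
are the existing code, not part of this series):

	if (!numa_off) {
#ifdef CONFIG_XEN
		if (xen_pv_domain() && !numa_init(xen_numa_init))
			return;
#endif
#ifdef CONFIG_X86_NUMAQ
		if (!numa_init(numaq_numa_init))
			return;
#endif
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
	}
	numa_init(dummy_numa_init);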

Isn't that enough sharing?

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)




* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 13:08       ` Dario Faggioli
@ 2013-08-28 13:39         ` George Dunlap
  0 siblings, 0 replies; 29+ messages in thread
From: George Dunlap @ 2013-08-28 13:39 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: konrad.r.wilk, Elena Ufimtseva, David Vrabel, Matt Wilson, xen-devel

On Wed, Aug 28, 2013 at 2:08 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On mar, 2013-08-27 at 18:37 -0700, Matt Wilson wrote:
>> On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
>> > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
>> > > Uses subop hypercall to request XEN about vnuma topology.
>> > > Sets the memory blocks (aligned by XEN), cpus, distance table
>> > > on boot. NUMA support should be compiled in kernel.
>> >
>> > Are we *really sure* that we want to go this route for PV vNUMA?
>> > Couldn't we build just enough(tm) of the ACPI tables to express the
>> > NUMA topology when constructing the domain? That's what we do for the
>> > e820 map.
>>
>> Ignore me somewhat, since the e820 information is retrieved via
>> hypercall similar to what you're proposing.
>>
> :-)
>
>> Still, if there's some way that we can reuse existing Linux code
>> rather than bolting on a completely parallel mechanism to set this up
>> under PV I think it'd be better.
>>
> Well, it looks to me that Elena is reusing quite a bit of it, isn't she?
> All she's providing is a new initialization function ( xen_numa_init()
> ), as it is happening already for ACPI NUMA, NUMAQ, and other NUMA
> implementations.
>
> In practice, while ACPI based NUMA code parses the ACPI tables in
> acpi_numa_init(), PV vNUMA parses the information coming from an
> hypercall xen_numa_init(). From that point on, Linux that steps-in and
> do everything else "as usual".
>
> Isn't that enough sharing?

I think the only way to "share" more would be to have Xen do some
crazy ACPI table fake-up scheme, which sounds like kind of a
nightmare; and also completely pointless, since there are already nice
clean interfaces we can just hook into and pass nice clean data
structures straight from Xen.

 -George


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-27 14:33   ` Konrad Rzeszutek Wilk
@ 2013-08-28 13:41     ` Elena Ufimtseva
  2013-08-28 23:12     ` Dario Faggioli
  1 sibling, 0 replies; 29+ messages in thread
From: Elena Ufimtseva @ 2013-08-28 13:41 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel

Hi Konrad

Thank you for the comments; I will resend the fixes in a new patch.
Regarding memblock construction: the blocks are built taking the e820
map for this domain into account.
I will place some comments in the code as you mentioned.

Elena

On Tue, Aug 27, 2013 at 10:33 AM, Konrad Rzeszutek Wilk
<konrad.wilk@oracle.com> wrote:
> On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
>> Uses subop hypercall to request XEN about vnuma topology.
>> Sets the memory blocks (aligned by XEN), cpus, distance table
>> on boot. NUMA support should be compiled in kernel.
>
> Needs a bit more details (which I am sure you will add in the
> next postings). Mostly:
>  - How E820 and NUMA information mesh
>  - What the hypercall provides
>  - What changeset in the Xen provides this hypercall
>  -
>>
>> NUMA support should be compiled in kernel.
>>
>> Memory blocks are aligned by xen. Xen is aware of guest initial
>> RAM layout.
>> If the memory blocks are incorrect, call for default linux numa
>> dummy init.
>
> Unfortunatly that won't happen if you boot this under dom0. It will
> find on some AMD machines the AMD Northbridge and try to scan that.
> And blow up.
>
> If you look at the git commit that did the 'numa = 0' you will see
> the backstory.
>
> I think part of enablement of ' numa = 1' is to wrap it with
>
>         if (xen_initial_domain() && xen_supports_this_hypercall())
>                 numa = 1;
>
> in the #2 patch.
>
>>
>> Requires XEN with vNUMA support.
>>
>> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
>> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376
>>
>> TODO:
>> Change subop from XENMEM to SYSCTL.
>>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
>> ---
>>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>>  arch/x86/mm/numa.c                   |    8 +++
>>  arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
>>  include/xen/interface/memory.h       |    1 +
>>  4 files changed, 160 insertions(+)
>>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
>>
>> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h
>> new file mode 100644
>> index 0000000..73c1bde
>> --- /dev/null
>> +++ b/arch/x86/include/asm/xen/vnuma-xen.h
>> @@ -0,0 +1,36 @@
>> +#ifndef _ASM_X86_VNUMA_XEN_H
>> +#define _ASM_X86_VNUMA_XEN_H
>> +
>> +#ifdef CONFIG_XEN
>> +int __initdata xen_numa_init(void);
>> +
>> +struct vnuma_memblk {
>> +               uint64_t start, end;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);
>> +
>> +struct vnuma_topology_info {
>> +     domid_t domid;
>> +     uint16_t nr_nodes;
>> +     GUEST_HANDLE(vnuma_memblk) vmemblk;
>> +     GUEST_HANDLE(int) vdistance;
>> +     GUEST_HANDLE(int) cpu_to_node;
>> +     GUEST_HANDLE(int) vnode_to_pnode;
>
> A comment explaining what those values are meant to have would be good.
> Perhaps with a simple example.
>
>> +};
>
> The structure is not very 32-bit friendly. Would it be possible
> to add some padding so that the size of this structure is the
> same on 32-bit and 64-bit please?
>
> Perhaps:
>         domid_t domid;
>         uint16_t nr_nodes;
>         uint32_t _pad;
>         GUEST_HANDLE(vnuma_memblk) vmemblk;
>         GUEST_HANDLE(int) vdistance;
>         GUEST_HANDLE(int) cpu_to_node;
>         GUEST_HANDLE(int) vnode_to_pnode;
>
> That should make the offsets be the same on both 32 and 64-bit
> I think.
>
>> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
>> +
>> +struct xen_domctl {
>> +     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
>> +     domid_t  domain;
>> +     union {
>> +             struct vnuma_topology_info vnuma;
>> +             } u;
>
> Move that 'u' one tab back please.
>
>> +};
>> +typedef struct xen_domctl xen_domctl_t;
>
> Ewww. No typdefs in the Linux code please.
>
>> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);
>> +
>> +#else
>> +int __init xen_numa_init(void) {}
>> +#endif
>> +
>> +#endif /* _ASM_X86_VNUMA_XEN_H */
>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>> index 8bf93ba..3ec7855 100644
>> --- a/arch/x86/mm/numa.c
>> +++ b/arch/x86/mm/numa.c
>> @@ -20,6 +20,10 @@
>>
>>  #include "numa_internal.h"
>>
>> +#ifdef CONFIG_XEN
>> +#include "asm/xen/vnuma-xen.h"
>> +#endif
>> +
>>  int __initdata numa_off;
>>  nodemask_t numa_nodes_parsed __initdata;
>>
>> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void)
>>  void __init x86_numa_init(void)
>>  {
>>       if (!numa_off) {
>> +#ifdef CONFIG_XEN
>> +             if (xen_pv_domain() && !numa_init(xen_numa_init))
>
> I would remove the xen_pv_domain() check and add that in enlighten.c code.
>
>> +                     return;
>> +#endif
>>  #ifdef CONFIG_X86_NUMAQ
>>               if (!numa_init(numaq_numa_init))
>>                       return;
>> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>> index 193097e..4658e9d 100644
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>> @@ -33,6 +33,7 @@
>>  #include <linux/memblock.h>
>>  #include <linux/edd.h>
>>
>> +#include <asm/numa.h>
>>  #include <xen/xen.h>
>>  #include <xen/events.h>
>>  #include <xen/interface/xen.h>
>> @@ -69,6 +70,7 @@
>>  #include <asm/mwait.h>
>>  #include <asm/pci_x86.h>
>>  #include <asm/pat.h>
>> +#include <asm/xen/vnuma-xen.h>
>>
>>  #ifdef CONFIG_ACPI
>>  #include <linux/acpi.h>
>> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
>>       .x2apic_available       = xen_x2apic_para_available,
>>  };
>>  EXPORT_SYMBOL(x86_hyper_xen_hvm);
>> +
>> +/* Xen PV NUMA topology initialization */
>> +int __initdata xen_numa_init(void)
>> +{
>> +     struct vnuma_memblk *vblk;
>> +     struct vnuma_topology_info numa_topo;
>> +     uint64_t size, start, end;
>> +     int i, j, cpu, rc;
>> +     u64 phys, physd, physc;
>> +     u64 new_end_pfn;
>> +     size_t mem_size;
>> +
>> +     int *vdistance, *cpu_to_node;
>> +     unsigned long dist_size, cpu_to_node_size;
>
> You need an extra line here;
>
>> +     numa_topo.domid = DOMID_SELF;
>> +     rc = -EINVAL;
>
> Both of those can be part of the decleration of variables above. Like so:
>         int rc = -EINVAL;
>         struct vnuma_topology_info numa_topo = {
>                 .domid = DOMID_SELF;
>         };
>
>> +     /* block mem reservation for memblks */
>> +     mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk);
>> +     phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE);
>> +     if (!phys) {
>> +             pr_info("vNUMA: can't allocate memory for membloks size %zu\n", mem_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     if (memblock_reserve(phys, mem_size) < 0) {
>> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     vblk = __va(phys);
>> +     set_xen_guest_handle(numa_topo.vmemblk, vblk);
>> +
>> +     dist_size = num_possible_cpus() * num_possible_cpus() * sizeof(*numa_topo.vdistance);
>> +     physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, PAGE_SIZE);
>> +     if (!physd) {
>> +             pr_info("vNUMA: can't allocate memory for distance size %zu\n", dist_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     if (memblock_reserve(physd, dist_size) < 0) {
>> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", dist_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     vdistance  = __va(physd);
>> +     set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance);
>> +
>> +     /* allocate memblock for vcpu to node mask of max size */
>> +     cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node);
>> +     physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), cpu_to_node_size, PAGE_SIZE);
>> +     if (!physc) {
>> +             pr_info("vNUMA: can't allocate memory for distance size %zu\n", cpu_to_node_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     if (memblock_reserve(physc, cpu_to_node_size) < 0) {
>> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", cpu_to_node_size);
>> +             goto errout;
>> +     }
>> +     cpu_to_node  = __va(physc);
>> +     set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node);
>> +     if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)
>
> I think you need to do rc = HYPERVISOR...
>
> and then if you get rc = -ENOSYS (so not implemented) change the rc to zero.
>
>> +             goto errout;
>> +     if (numa_topo.nr_nodes == 0)
>> +             /* Pass to DUMMY numa */
>
> Also change rc to zero.
>
>> +             goto errout;
>> +     if (numa_topo.nr_nodes > num_possible_cpus()) {
>> +             pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes);
>> +             goto errout;
>> +     }
>> +     new_end_pfn = 0;
>> +     /* We have sizes in pages received from hypervisor, no holes and ordered */
>
> From toolstack you mean. It is the one setting it up. And I hope it sets it up
> based on the E820 it has constructed as well? Otherwise it would be a bit
> awkward if those two were different.
>
>> +     for (i = 0; i < numa_topo.nr_nodes; i++) {
>> +             start = vblk[i].start;
>> +             end = vblk[i].end;
>> +             size = end - start;
>> +             pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n",
>> +                     i, size, start, end);
>
> pr_debug perhaps?
>
>> +             if (numa_add_memblk(i, start, end)) {
>> +                     pr_info("vNUMA: Could not add memblk[%d]\n", i);
>> +                     rc = -EAGAIN;
>
> Can you unroll the NUMA topology if this fails? Or is that not a problem?
>
>> +                     goto errout;
>> +             }
>> +             node_set(i, numa_nodes_parsed);
>> +     }
>> +     setup_nr_node_ids();
>> +     /* Setting the cpu, apicid to node */
>> +     for_each_cpu(cpu, cpu_possible_mask) {
>> +             pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]);
>
> pr_debug
>> +             set_apicid_to_node(cpu, cpu_to_node[cpu]);
>> +             numa_set_node(cpu, cpu_to_node[cpu]);
>> +             __apicid_to_node[cpu] = cpu_to_node[cpu];
>> +             cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
>> +     }
>> +     setup_nr_node_ids();
>
> How come this is being called twice? It should have a comment
> explaining this extra call.
>
>> +     rc = 0;
>> +     for (i = 0; i < numa_topo.nr_nodes; i++)
>> +             for (j = 0; j < numa_topo.nr_nodes; j++) {
>> +                     numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i));
>
> You could simply this by just doing:
>
>                         int idx = (j * numa_topo.nr_nodes) + i;
>
>                         ... *(vdistance + idx));
>
>> +                     pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i));
>
> pr_debug
>
>> +             }
>> +errout:
>> +     if (phys)
>> +             memblock_free(__pa(phys), mem_size);
>> +     if (physd)
>> +             memblock_free(__pa(physd), dist_size);
>> +     if (physc)
>> +             memblock_free(__pa(physc), cpu_to_node_size);
>> +     if (rc)
>> +             pr_debug("XEN: hypercall failed to get vNUMA topology.\n");
>
> And include the 'rc' value in the output please.
>> +     return rc;
>> +}
>> +
>>  #endif
>> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>> index 2ecfe4f..16b8d87 100644
>> --- a/include/xen/interface/memory.h
>> +++ b/include/xen/interface/memory.h
>> @@ -263,4 +263,5 @@ struct xen_remove_from_physmap {
>>  };
>>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>>
>> +#define XENMEM_get_vnuma_info        25
>>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
>> --
>> 1.7.10.4
>>



-- 
Elena


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28  1:37     ` Matt Wilson
  2013-08-28 13:08       ` Dario Faggioli
@ 2013-08-28 16:01       ` Konrad Rzeszutek Wilk
  2013-08-28 16:38         ` Matt Wilson
  1 sibling, 1 reply; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-28 16:01 UTC (permalink / raw)
  To: Matt Wilson; +Cc: Elena Ufimtseva, xen-devel

On Tue, Aug 27, 2013 at 06:37:35PM -0700, Matt Wilson wrote:
> On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > > Uses subop hypercall to request XEN about vnuma topology.
> > > Sets the memory blocks (aligned by XEN), cpus, distance table
> > > on boot. NUMA support should be compiled in kernel.
> > 
> > Are we *really sure* that we want to go this route for PV vNUMA?
> > Couldn't we build just enough(tm) of the ACPI tables to express the
> > NUMA topology when constructing the domain? That's what we do for the
> > e820 map.
> 
> Ignore me somewhat, since the e820 information is retrieved via
> hypercall similar to what you're proposing.
> 
> Still, if there's some way that we can reuse existing Linux code
> rather than bolting on a completely parallel mechanism to set this up
> under PV I think it'd be better.

That would also parallel the work you do with ACPI right?

We could enable ACPI parsing in a PV guest and provide one table - the
SLIT (or SRAT).

But I don't know enough about SRAT to know whether it truly
represents everything we need?
> 
> --msw
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 16:01       ` Konrad Rzeszutek Wilk
@ 2013-08-28 16:38         ` Matt Wilson
  2013-08-28 20:10           ` Elena Ufimtseva
  2013-08-28 22:21           ` Dario Faggioli
  0 siblings, 2 replies; 29+ messages in thread
From: Matt Wilson @ 2013-08-28 16:38 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Elena Ufimtseva, xen-devel

On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote:
> On Tue, Aug 27, 2013 at 06:37:35PM -0700, Matt Wilson wrote:
> > On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> > > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > > > Uses subop hypercall to request XEN about vnuma topology.
> > > > Sets the memory blocks (aligned by XEN), cpus, distance table
> > > > on boot. NUMA support should be compiled in kernel.
> > > 
> > > Are we *really sure* that we want to go this route for PV vNUMA?
> > > Couldn't we build just enough(tm) of the ACPI tables to express the
> > > NUMA topology when constructing the domain? That's what we do for the
> > > e820 map.
> > 
> > Ignore me somewhat, since the e820 information is retrieved via
> > hypercall similar to what you're proposing.
> > 
> > Still, if there's some way that we can reuse existing Linux code
> > rather than bolting on a completely parallel mechanism to set this up
> > under PV I think it'd be better.
> 
> That would also parallel the work you do with ACPI right?

Yes.

> We could enable ACPI parsing in a PV guest and provide one table - the
> SLIT (or SRAT).

Right, it'd be the SRAT table for the resource affinity and a SLIT
table for the locality/distance information.

> But I don't know enough about SRAT to know whether this is something
> that represents truly everything we need?

The SRAT table has processor objects and memory objects. A processor
object maps a logical processor number to its initial APIC ID and
provides the node information. A memory object specifies the start and
length for a memory region and provides the node information.

For SLIT, the entries are a matrix of distances.

Here are the structs:

struct acpi_20_srat_processor {
    uint8_t type;
    uint8_t length;
    uint8_t domain;
    uint8_t apic_id;
    uint32_t flags;
    uint8_t sapic_id;
    uint8_t domain_hi[3];
    uint32_t reserved;
};

struct acpi_20_srat_memory {
    uint8_t type;
    uint8_t length;
    uint8_t domain;
    uint8_t domain_hi[3];      /* this is ACPI 3.0, reserved in 2.0 */
    uint16_t reserved;
    uint32_t base_address_lo;
    uint32_t base_address_hi;
    uint32_t length_lo;
    uint32_t length_hi;
    uint32_t reserved2;
    uint32_t flags;
    uint32_t reserved3[2];
};

struct acpi_20_slit {
    struct acpi_header header;
    uint64_t nr_sys_loc;
    uint8_t  entry[];
};
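
Purely as an illustration (not code from hvmloader), a two-node guest
would end up with nr_sys_loc = 2 and the entry[] matrix flattened
row-major, e.g. the conventional local distance 10 and a remote
distance of 21:

    struct acpi_20_slit *slit = ...;   /* header plus 4 entry bytes;
                                          allocation/checksum omitted */

    slit->nr_sys_loc = 2;
    slit->entry[0] = 10;   /* node 0 -> node 0 */
    slit->entry[1] = 21;   /* node 0 -> node 1 */
    slit->entry[2] = 21;   /* node 1 -> node 0 */
    slit->entry[3] = 10;   /* node 1 -> node 1 */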

--msw

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 16:38         ` Matt Wilson
@ 2013-08-28 20:10           ` Elena Ufimtseva
  2013-08-28 23:25             ` Matt Wilson
  2013-08-28 22:21           ` Dario Faggioli
  1 sibling, 1 reply; 29+ messages in thread
From: Elena Ufimtseva @ 2013-08-28 20:10 UTC (permalink / raw)
  To: Matt Wilson; +Cc: xen-devel

Hi Matt

Do you construct these tables via hypercalls?
Will be good to see the details on how you do it and if I can use it as well.
Thank you.

Elena

On Wed, Aug 28, 2013 at 12:38 PM, Matt Wilson <msw@amazon.com> wrote:
> On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote:
>> On Tue, Aug 27, 2013 at 06:37:35PM -0700, Matt Wilson wrote:
>> > On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
>> > > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
>> > > > Uses subop hypercall to request XEN about vnuma topology.
>> > > > Sets the memory blocks (aligned by XEN), cpus, distance table
>> > > > on boot. NUMA support should be compiled in kernel.
>> > >
>> > > Are we *really sure* that we want to go this route for PV vNUMA?
>> > > Couldn't we build just enough(tm) of the ACPI tables to express the
>> > > NUMA topology when constructing the domain? That's what we do for the
>> > > e820 map.
>> >
>> > Ignore me somewhat, since the e820 information is retrieved via
>> > hypercall similar to what you're proposing.
>> >
>> > Still, if there's some way that we can reuse existing Linux code
>> > rather than bolting on a completely parallel mechanism to set this up
>> > under PV I think it'd be better.
>>
>> That would also parallel the work you do with ACPI right?
>
> Yes.
>
>> We could enable ACPI parsing in a PV guest and provide one table - the
>> SLIT (or SRAT).
>
> Right, it'd be the SRAT table for the resource affinity and a SLIT
> table for the locality/distance information.
>
>> But I don't know enough about SRAT to know whether this is something
>> that represents truly everything we need?
>
> The SRAT table has processor objects and memory objects. A processor
> object maps a logical processor number to its initial APIC ID and
> provides the node information. A memory object specifies the start and
> length for a memory region and provides the node information.
>
> For SLIT, the entries are a matrix of distances.
>
> Here are the structs:
>
> struct acpi_20_srat_processor {
>     uint8_t type;
>     uint8_t length;
>     uint8_t domain;
>     uint8_t apic_id;
>     uint32_t flags;
>     uint8_t sapic_id;
>     uint8_t domain_hi[3];
>     uint32_t reserved;
> };
>
> struct acpi_20_srat_memory {
>     uint8_t type;
>     uint8_t length;
>     uint8_t domain;
>     uint8_t domain_hi[3];      /* this is ACPI 3.0, reserved in 2.0 */
>     uint16_t reserved;
>     uint32_t base_address_lo;
>     uint32_t base_address_hi;
>     uint32_t length_lo;
>     uint32_t length_hi;
>     uint32_t reserved2;
>     uint32_t flags;
>     uint32_t reserved3[2];
> };
>
> struct acpi_20_slit {
>     struct acpi_header header;
>     uint64_t nr_sys_loc;
>     uint8_t  entry[];
> };
>
> --msw



-- 
Elena

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 16:38         ` Matt Wilson
  2013-08-28 20:10           ` Elena Ufimtseva
@ 2013-08-28 22:21           ` Dario Faggioli
  2013-08-29  0:11             ` Matt Wilson
  1 sibling, 1 reply; 29+ messages in thread
From: Dario Faggioli @ 2013-08-28 22:21 UTC (permalink / raw)
  To: Matt Wilson; +Cc: George Dunlap, xen-devel, David Vrabel, Elena Ufimtseva



On mer, 2013-08-28 at 09:38 -0700, Matt Wilson wrote:
> On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote:
> > That would also parallel the work you do with ACPI right?
> 
> Yes.
> 
I see. It's hard to comment, since I have only seen some previous (off
list) versions of Elena's code (and won't have time to properly review
this one until Monday), and I haven't seen Matt's code at all...

Anyway...

> > We could enable ACPI parsing in a PV guest and provide one table - the
> > SLIT (or SRAT).
> 
> Right, it'd be the SRAT table for the resource affinity and a SLIT
> table for the locality/distance information.
> 
... I see the point in sharing code for HVM and PV. However, I'm still
not convinced this would be something valuable to do with this specific
hunk, mostly because it looks so easy and clean to hook up at the
numa_init() stage (i.e., what Elena is doing) that anything else I can
think of looks like more work. :-P

> > But I don't know enough about SRAT to know whether this is something
> > that represents truly everything we need?
> 
> The SRAT table has processor objects and memory objects. A processor
> object maps a logical processor number to its initial APIC ID and
> provides the node information. A memory object specifies the start and
> length for a memory region and provides the node information.
> 
> For SLIT, the entries are a matrix of distances.
> 
> Here are the structs:
> 
> [snip]
>
Ok, thanks for the very useful info. What would be interesting to know
is where and how Linux reads the information from ACPI and fills these
structures.

Elena's current approach is one hypercall, during early NUMA
initialization, and it is pretty self-contained (which is the thing I
like most about it).

How easy is it to look up the places where each one of the tables gets
filled, intercept the code/calls doing that, and replace it properly for
our use case? How easy is it to "xen-ify" those call sites (stuff like
'#ifdef CONFIG_XEN' and/or is_xen_domain())? How many hypercalls would
it require? Is it possible to have one doing all the work, or would we
need something like one for each table?

As I said, I can't check the details about it right now, but it sounds
like more work than Elena's xen_numa_init().

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-27 14:33   ` Konrad Rzeszutek Wilk
  2013-08-28 13:41     ` Elena Ufimtseva
@ 2013-08-28 23:12     ` Dario Faggioli
  2013-08-29 14:07       ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 29+ messages in thread
From: Dario Faggioli @ 2013-08-28 23:12 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Elena Ufimtseva, xen-devel



On mar, 2013-08-27 at 10:33 -0400, Konrad Rzeszutek Wilk wrote:
> Unfortunately that won't happen if you boot this under dom0. It will
> find on some AMD machines the AMD Northbridge and try to scan that.
> And blow up.
> 
Well, perhaps this was not evident enough in Elena's mails and patches,
but her current target is actually DomU. :-P

However, that's only because we're proceeding in steps, so yes, once
DomU is working, Dom0 will become the target, and we'll have to
start caring about not making things explode! :-O

> If you look at the git commit that did the 'numa = 0' you will see
> the backstory.
> 
That's something interesting to look at... I remember having seen that
discussion but not the details. I'll check when back from vacation.

> I think part of enablement of ' numa = 1' is to wrap it with
> 
> 	if (xen_initial_domain() && xen_supports_this_hypercall())
> 		numa = 1;
> 
> in the #2 patch.
> 
Mmm... Did you mean '!xen_initial_domain()' ? Or is it me that did not
understand your comment about Dom0 above?

In case you did mean '!', again, this would be fine for now, since the
target is DomUs. Once we start chasing Dom0, it would
be interesting to find a solution to this issue... :-/
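
Just to spell out what I have in mind for that gating (a sketch only,
and xen_vnuma_supported() is a made up name for whatever ends up
probing the hypercall):

	/* DomU only for now; revisit once Dom0 becomes a target. */
	if (xen_pv_domain() && !xen_initial_domain() &&
	    xen_vnuma_supported())
		numa = 1;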

Anyway, thanks for pointing this out.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 20:10           ` Elena Ufimtseva
@ 2013-08-28 23:25             ` Matt Wilson
  2013-08-29  9:16               ` George Dunlap
  0 siblings, 1 reply; 29+ messages in thread
From: Matt Wilson @ 2013-08-28 23:25 UTC (permalink / raw)
  To: Elena Ufimtseva; +Cc: xen-devel

On Wed, Aug 28, 2013 at 04:10:11PM -0400, Elena Ufimtseva wrote:
> Hi Matt
> 
> Do you construct these tables via hypercalls?
> Will be good to see the details on how you do it and if I can use it as well.
> Thank you.

All the ACPI tables get built in guest memory by hvmloader. The
configuration is passed from libxc into hvmloader via the hvm_info
table.

I can't take credit for all of this work. Most of it was written and
proposed by Andre Przywara in 2010.

http://lists.xen.org/archives/html/xen-devel/2010-02/msg00280.html

--msw

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 22:21           ` Dario Faggioli
@ 2013-08-29  0:11             ` Matt Wilson
  2013-08-29 13:41               ` David Vrabel
  0 siblings, 1 reply; 29+ messages in thread
From: Matt Wilson @ 2013-08-29  0:11 UTC (permalink / raw)
  To: Dario Faggioli; +Cc: George Dunlap, xen-devel, David Vrabel, Elena Ufimtseva

On Thu, Aug 29, 2013 at 12:21:40AM +0200, Dario Faggioli wrote:
> On mer, 2013-08-28 at 09:38 -0700, Matt Wilson wrote:
> > On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote:
> > > That would also parallel the work you do with ACPI right?
> > 
> > Yes.
> > 
> I see. It's hard to comment, since I have only seen some previous (off
> list) versions of Elena's code (and won't have time to properly review
> this one until Monday), and I haven't seen Matt's code at all...

See the link I gave in my other reply for the older proposed patch I based
this work on. Here's the head of the patchset:
  http://lists.xen.org/archives/html/xen-devel/2010-02/msg00279.html

> > > We could enable ACPI parsing in a PV guest and provide one table - the
> > > SLIT (or SRAT).
> > 
> > Right, it'd be the SRAT table for the resource affinity and a SLIT
> > table for the locality/distance information.
> > 
> ... I see the point in sharing code for HVM and PV. However, I'm still
> not convinced this would be something valuable to do with this specific
> hunk, mostly because it looks so easy and clean to hook up at the
> numa_init() stage (i.e., what Elena is doing) that anything else I can
> think of looks like more work. :-P

I agree.

> > > But I don't know enough about SRAT to know whether this is something
> > > that represents truly everything we need?
> > 
> > The SRAT table has processor objects and memory objects. A processor
> > object maps a logical processor number to its initial APIC ID and
> > provides the node information. A memory object specifies the start and
> > length for a memory region and provides the node information.
> > 
> > For SLIT, the entries are a matrix of distances.
> > 
> > Here are the structs:
> > 
> > [snip]
> >
> Ok, thanks for the very useful info. What would be interesting to know
> is where and how Linux reads the information from ACPI and fills these
> structures.
>
> Elena's current approach is one hypercall, during early NUMA
> initialization, and it is pretty self-contained (which is the thing I
> like most about it).
> 
> How easy is it to look up the places where each one of the tables gets
> filled, intercept the code/calls doing that, and replace it properly for
> our use case? How easy is it to "xen-ify" those call sites (stuff like
> '#ifdef CONFIG_XEN' and/or is_xen_domain())? How many hypercalls would
> it require? Is it possible to have one doing all the work, or would we
> need something like one for each table?

I think it wouldn't be too hard to construct the static ACPI tables
and provide them in acpi_os_get_root_pointer().
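
Roughly along these lines (hand-waving: xen_fake_rsdp_address() is an
invented name for whatever ends up locating or building the tables):

    acpi_physical_address __init acpi_os_get_root_pointer(void)
    {
            if (xen_pv_domain())
                    /*
                     * Point the ACPI core at a pre-built
                     * RSDP -> RSDT -> SRAT/SLIT chain.
                     */
                    return xen_fake_rsdp_address();

            /* ... existing EFI / legacy BIOS scan ... */
    }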

> As I said, I can't check the details about it right now, but it sounds
> like more work than Elena's xen_numa_init().

Yes, it's probably a bit more work.

--msw

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 23:25             ` Matt Wilson
@ 2013-08-29  9:16               ` George Dunlap
  0 siblings, 0 replies; 29+ messages in thread
From: George Dunlap @ 2013-08-29  9:16 UTC (permalink / raw)
  To: Matt Wilson; +Cc: Elena Ufimtseva, xen-devel

On Thu, Aug 29, 2013 at 12:25 AM, Matt Wilson <msw@amazon.com> wrote:
> On Wed, Aug 28, 2013 at 04:10:11PM -0400, Elena Ufimtseva wrote:
>> Hi Matt
>>
>> Do you construct these tables via hypercalls?
>> Will be good to see the details on how you do it and if I can use it as well.
>> Thank you.
>
> All the ACPI tables get built in guest memory by hvmloader.

Our end goal is to be able to have dom0 boot with NUMA, so whatever we
do will have to be constructed in Xen as well.  If making the ACPI
tables is fairly straightforward, that might be an option; but if it's
really complicated, we probably want to stick with what we have.

 -George

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-29  0:11             ` Matt Wilson
@ 2013-08-29 13:41               ` David Vrabel
  2013-08-29 14:23                 ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 29+ messages in thread
From: David Vrabel @ 2013-08-29 13:41 UTC (permalink / raw)
  To: Matt Wilson
  Cc: George Dunlap, Dario Faggioli, David Vrabel, Elena Ufimtseva, xen-devel

On 29/08/13 01:11, Matt Wilson wrote:
> 
> I think it wouldn't be too hard to construct the static ACPI tables
> and provide them in acpi_os_get_root_pointer().

Elena's approach looks fine to me from a Linux point of view.

It's still not clear how presenting the information via ACPI would be an
improvement.  I also think it is preferable for the hypercall ABI for
PV guests to be independent of architecture specifics where possible.

If you posted Xen and Linux series producing/using the ACPI SRAT and
SLIT tables for dom0 and domU PV guests we could better evaluate that
option but until then I'm inclined to consider Elena's approach as the
better one.

David

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-28 23:12     ` Dario Faggioli
@ 2013-08-29 14:07       ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-29 14:07 UTC (permalink / raw)
  To: Dario Faggioli; +Cc: Elena Ufimtseva, xen-devel

On Thu, Aug 29, 2013 at 01:12:15AM +0200, Dario Faggioli wrote:
> On mar, 2013-08-27 at 10:33 -0400, Konrad Rzeszutek Wilk wrote:
> > Unfortunately that won't happen if you boot this under dom0. It will
> > find on some AMD machines the AMD Northbridge and try to scan that.
> > And blow up.
> > 
> Well, perhaps this was not evident enough in Elena's mails and patches,
> but her current target is actually DomU. :-P
> 
> However, that's only because we're proceeding in steps, so yes, once
> DomU is working, Dom0 will become the target, and we'll have to
> start caring about not making things explode! :-O
> 
> > If you look at the git commit that did the 'numa = 0' you will see
> > the backstory.
> > 
> That's something interesting to look at... I remember having seen that
> discussion but not the details. I'll check when back from vacation.
> 
> > I think part of enablement of ' numa = 1' is to wrap it with
> > 
> > 	if (xen_initial_domain() && xen_supports_this_hypercall())
> > 		numa = 1;
> > 
> > in the #2 patch.
> > 
> Mmm... Did you mean '!xen_initial_domain()' ? Or is it me that did not
> understand your comment about Dom0 above?

For right now '!' would suffice.
> 
> In case you did mean '!', again, this would be fine for now, since the
> target is DomUs. Once we start chasing Dom0, it would
> be interesting to find a solution to this issue... :-/


> 
> Anyway, thanks for pointing this out.
> 
> Regards,
> Dario
> 
> -- 
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-29 13:41               ` David Vrabel
@ 2013-08-29 14:23                 ` Konrad Rzeszutek Wilk
  2013-08-29 14:32                   ` George Dunlap
  0 siblings, 1 reply; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-29 14:23 UTC (permalink / raw)
  To: David Vrabel
  Cc: George Dunlap, Dario Faggioli, Elena Ufimtseva, Matt Wilson, xen-devel

On Thu, Aug 29, 2013 at 02:41:23PM +0100, David Vrabel wrote:
> On 29/08/13 01:11, Matt Wilson wrote:
> > 
> > I think it wouldn't be too hard to construct the static ACPI tables
> > and provide them in acpi_os_get_root_pointer().
> 
> Elena's approach looks fine to me from a Linux point of view.

From a maintenance perspective the question is:

 - Why not re-use existing code? As in, if both PV and HVM can use
   SRAT, why not do it that way? This way you are exercising the same
   code path in both guests and it means fewer bugs to chase.

   Incidentally this also means that the mechanism to fetch the
   NUMA information from the hypervisor and construct the SRAT tables
   from can be done the same in both hvmloader and Linux kernel.
   Which is nice - b/c if you find bugs in one, there is a chance
   they are the same in the other.

   Also this means one hypercall for both PV and HVM which is neat as
   well.

 - Lastly, the question is - a new 'interface' in the generic code is
   added to deal exclusively with Xen. Why special case it when one can
   use the SRAT for example? Is there a compelling technical argument?
   (Is SRAT inferior to vNUMA?)

Of course the counter argument is:

 - It will require more work as now Linux has to use the hypercall to
   construct an ACPI SRAT table. And we need to create a "fake" SRAT
   table, as well as an RS.. something structure, so that the ACPI code
   can find it. Or perhaps this is something libxl can do? Which would
   mean the code to construct this for both PV and HVM is in the Xen code base.
   And the only thing Linux needs to do is turn NUMA on for PV guests
   if it finds an ACPI table (perhaps by scanning the E820?).

 - Will it work? As in will a PV domU be able to run with just one
   ACPI table - just the SRAT and nothing else? Or will it fall flat
   on its face?
> 
> It's still not clear how presenting the information via ACPI would be an
> improvement.  I also think it is preferable for the hypercall ABI for
> PV guests to be independent of architecture specifics where possible.

I concur on the hypercall ABI - but I think the hypercall should be
available on both PV and HVM so that it can be used in both.
> 
> If you posted Xen and Linux series producing/using the ACPI SRAT and
> SLIT tables for dom0 and domU PV guests we could better evaluate that
> option but until then I'm inclined to consider Elena's approach as the
> better one.

I think short term Elena's option is the better one as it gets it going and
we can experiment with it. Long term I think stashing the data in ACPI
SRAT/SLIT is right. But Elena might not get to it in the next couple of
months - which means somebody else will have to sign up for that.

P.S.
I did for fun enable ACPI support in the PV domU guests and it boots fine.
It does complain about missing tables, but that was it.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-29 14:23                 ` Konrad Rzeszutek Wilk
@ 2013-08-29 14:32                   ` George Dunlap
  2013-08-29 14:51                     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 29+ messages in thread
From: George Dunlap @ 2013-08-29 14:32 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Dario Faggioli, Elena Ufimtseva, David Vrabel, Matt Wilson, xen-devel

On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> On Thu, Aug 29, 2013 at 02:41:23PM +0100, David Vrabel wrote:
>> On 29/08/13 01:11, Matt Wilson wrote:
>>> I think it wouldn't be too hard to construct the static ACPI tables
>>> and provide them in acpi_os_get_root_pointer().
>> Elena's approach looks fine to me from a Linux point of view.
>  From a maintenance perspective the question is:
>
>   - Why not re-use existing code? As in, if both PV and HVM can use
>     SRAT, why not do it that way? This way you are exercising the same
>     code path in both guests and it means fewer bugs to chase.
>
>     Incidentally this also means that the mechanism to fetch the
>     NUMA information from the hypervisor and construct the SRAT tables
>     from can be done the same in both hvmloader and Linux kernel.

If the SRAT tables are built by hvmloader for HVM guests, couldn't 
hvmloader make this hypercall and construct the tables, instead of 
having the domain builder do it?  That would mean that most of the 
codepath is the same; HVM just has the extra step of encoding and 
decoding the information in SRAT form.
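
To make the "encoding" step concrete, purely as an illustration (the
per-node block "blk" and srat_next_entry() are invented names, not
existing hvmloader code), a memory affinity entry would be filled in
roughly like this, reusing the struct Matt posted:

    struct acpi_20_srat_memory *m = srat_next_entry(sizeof(*m));

    m->type   = 1;                 /* memory affinity structure */
    m->length = sizeof(*m);
    m->domain = node;              /* the vnode this range belongs to */
    m->base_address_lo = (uint32_t)blk.start;
    m->base_address_hi = (uint32_t)(blk.start >> 32);
    m->length_lo = (uint32_t)(blk.end - blk.start);
    m->length_hi = (uint32_t)((blk.end - blk.start) >> 32);
    m->flags = 1;                  /* enabled */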

>> If you posted Xen and Linux series producing/using the ACPI SRAT and
>> SLIT tables for dom0 and domU PV guests we could better evaluate that
>> option but until then I'm inclined to consider Elena's approach as the
>> better one.
> I think short term Elena's option is the better one as it gets it going and
> we can experiment with it. Long term I think stashing the data in ACPI
> SRAT/SLIT is right. But Elena might not get to it in the next couple of
> months - which means somebody else will have to sign up for that.

I definitely think that Elena needs to continue on the same path for 
now, so she can actually have closure on the project in a reasonable 
amount of time.

  -George

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-29 14:32                   ` George Dunlap
@ 2013-08-29 14:51                     ` Konrad Rzeszutek Wilk
  2013-08-29 22:29                       ` Dario Faggioli
  0 siblings, 1 reply; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-29 14:51 UTC (permalink / raw)
  To: George Dunlap
  Cc: Dario Faggioli, Elena Ufimtseva, David Vrabel, Matt Wilson, xen-devel

On Thu, Aug 29, 2013 at 03:32:13PM +0100, George Dunlap wrote:
> On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> >On Thu, Aug 29, 2013 at 02:41:23PM +0100, David Vrabel wrote:
> >>On 29/08/13 01:11, Matt Wilson wrote:
> >>>I think it wouldn't be too hard to construct the static ACPI tables
> >>>and provide them in acpi_os_get_root_pointer().
> >>Elena's approach looks fine to me from a Linux point of view.
> > From a maintenance perspective the question is:
> >
> >  - Why not re-use existing code? As in, if both PV and HVM can use
> >    SRAT, why not do it that way? This way you are exercising the same
> >    code path in both guests and it means fewer bugs to chase.
> >
> >    Incidentally this also means that the mechanism to fetch the
> >    NUMA information from the hypervisor and construct the SRAT tables
> >    from can be done the same in both hvmloader and Linux kernel.
> 
> If the SRAT tables are built by hvmloader for HVM guests, couldn't
> hvmloader make this hypercall and construct the tables, instead of
> having the domain builder do it?  That would mean that most of the
> codepath is the same; HVM just has the extra step of encoding and
> decoding the information in SRAT form.

Correct. Or the domain builder can build it for both PV and HVM.

We do have full control of the E820 for PV - so we can make an "E820_ACPI"
region and stash the ACPI tables there.
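
(Sketch of that idea only, with the address being made up - reserve a
small E820_ACPI region in the pseudo-physical map and copy pre-built
tables into it:)

	/* XEN_ACPI_PADDR is an invented placeholder address. */
	e820_add_region(XEN_ACPI_PADDR, PAGE_SIZE, E820_ACPI);
	/* ... then copy RSDP/RSDT/SRAT/SLIT into that region ... */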

Either way would work. Whichever one means more code sharing is probably
the best bang for the buck, I'd say.
> 
> >>If you posted Xen and Linux series producing/using the ACPI SRAT and
> >>SLIT tables for dom0 and domU PV guests we could better evaluate that
> >>option but until then I'm inclined to consider Elena's approach as the
> >>better one.
> >I think short term Elena's option is the better one as it gets it going and
> >we can experiment with it. Long term I think stashing the data in ACPI
> >SRAT/SLIT is right. But Elena might not get to it in the next couple of
> >months - which means somebody else will have to sign up for that.
> 
> I definitely think that Elena needs to continue on the same path for
> now, so she can actually have closure on the project in a reasonable
> amount of time.

I concur. The caveat is that it could mean that the x86 maintainers might
object to the generic path having the special case for Xen and that
particular patch part won't be upstreamed until it has been taken care of.

If Elena is OK with that possibility - then that is fine with me.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-29 14:51                     ` Konrad Rzeszutek Wilk
@ 2013-08-29 22:29                       ` Dario Faggioli
  2013-08-30 12:57                         ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 29+ messages in thread
From: Dario Faggioli @ 2013-08-29 22:29 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: George Dunlap, Elena Ufimtseva, David Vrabel, Matt Wilson, xen-devel



On gio, 2013-08-29 at 10:51 -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Aug 29, 2013 at 03:32:13PM +0100, George Dunlap wrote:
> > On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> > >  - Why not re-use existing code? As in, if both PV and HVM can use
> > >    SRAT, why not do it that way? This way you are exercising the same
> > >    code path in both guests and it means fewer bugs to chase.
> > >
> > >    Incidentally this also means that the mechanism to fetch the
> > >    NUMA information from the hypervisor and construct the SRAT tables
> > >    from can be done the same in both hvmloader and Linux kernel.
> > 
> > If the SRAT tables are built by hvmloader for HVM guests, couldn't
> > hvmloader make this hypercall and construct the tables, instead of
> > having the domain builder do it?  That would mean that most of the
> > codepath is the same; HVM just has the extra step of encoding and
> > decoding the information in SRAT form.
> 
> Correct. 
>
Indeed. Also, Matt mentioned the HVM implementation does most of the
work in libxc and hvmloader; is that the case? (Matt, I knew about the
"old" HVM-NUMA series from Andre, but I don't have the details fresh
enough right now... Will take another look ASAP.)

If yes, please, consider that, when talking about PV-vNUMA, although
right now the series addresses DomU only, we plan to make it possible
for Dom0 to have a virtual NUMA topology too. In that case, I don't
think any code from libxc and hvmloader could be shared, could it?

So, to me, it sort of looks like we risk introducing more code
duplication than what we're trying to avoid! :-P

> > >I think short term Elena's option is the better one as it gets it going and
> > >we can experiment with it. Long term I think stashing the data in ACPI
> > >SRAT/SLIT is right. But Elena might not get to it in the next couple of
> > >months - which means somebody else will have to sign up for that.
> > 
> > I definitely think that Elena needs to continue on the same path for
> > now, so she can actually have closure on the project in a reasonable
> > amount of time.
> 
> I concur. The caveat is that it could mean that the x86 maintainers might
> object to the generic path having the special case for Xen and that
> particular patch part won't be upstreamed until it has been taken care of.
> 
> If Elena is OK with that possibility - then that is fine with me.
>
Sorry, I'm not sure I understand what you mean here. When talking about
PV-NUMA, with Elena's approach, we have a Xen special case in
numa_init(). With the fake table approach, we'd have a Xen special case
in the ACPI parsing code (Matt was talking about something like
acpi_os_get_root_pointer() ), wouldn't we?

So, either way, we'll have a Xen special case, the only difference I see
is that, in the latter, we'd probably have to deal with the ACPI
maintainers rather than with the x86 ones. :-)

FWIW, x86_numa_init() already looks like this:

 621 void __init x86_numa_init(void)
 622 {
 623         if (!numa_off) {
 624 #ifdef CONFIG_X86_NUMAQ
 625                 if (!numa_init(numaq_numa_init))
 626                         return;
 627 #endif
 628 #ifdef CONFIG_ACPI_NUMA
 629                 if (!numa_init(x86_acpi_numa_init))
 630                         return;
 631 #endif
 632 #ifdef CONFIG_AMD_NUMA
 633                 if (!numa_init(amd_numa_init))
 634                         return;
 635 #endif
 636         }
 637 
 638         numa_init(dummy_numa_init);
 639 }

I.e., quite a bit of architecture/implementation special casing already!
So, perhaps, having some more `#ifdef CONFIG_XEN' and/or `if
(xen_domain())' in there won't be all that upsetting after all. :-)
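
E.g., something like the following, just to visualize it (whether to
gate on xen_domain() or xen_pv_domain() is of course up for discussion;
only a sketch):

	if (!numa_off) {
#ifdef CONFIG_XEN
		/* try the Xen-provided init first, and fall through to
		 * the existing paths if it fails */
		if (xen_domain() && !numa_init(xen_numa_init))
			return;
#endif
		...
	}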

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
  2013-08-29 22:29                       ` Dario Faggioli
@ 2013-08-30 12:57                         ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 29+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-30 12:57 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: George Dunlap, Elena Ufimtseva, David Vrabel, Matt Wilson, xen-devel

On Fri, Aug 30, 2013 at 12:29:09AM +0200, Dario Faggioli wrote:
> On gio, 2013-08-29 at 10:51 -0400, Konrad Rzeszutek Wilk wrote:
> > On Thu, Aug 29, 2013 at 03:32:13PM +0100, George Dunlap wrote:
> > > On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> > > >  - Why not re-use existing code? As in, if both PV and HVM can use
> > > >    SRAT, why not do it that way? This way you are exercising the same
> > > >    code path in both guests and it means fewer bugs to chase.
> > > >
> > > >    Incidentally this also means that the mechanism to fetch the
> > > >    NUMA information from the hypervisor and construct the SRAT tables
> > > >    from can be done the same in both hvmloader and Linux kernel.
> > > 
> > > If the SRAT tables are built by hvmloader for HVM guests, couldn't
> > > hvmloader make this hypercall and construct the tables, instead of
> > > having the domain builder do it?  That would mean that most of the
> > > codepath is the same; HVM just has the extra step of encoding and
> > > decoding the information in SRAT form.
> > 
> > Correct. 
> >
> Indeed. Also, Matt mentioned the HVM implementation does most of the
> work in libxc and hvmloader; is that the case? (Matt, I knew about the
> "old" HVM-NUMA series from Andre, but I don't have the details fresh
> enough right now... Will take another look ASAP.)
> 
> If yes, please, consider that, when talking about PV-vNUMA, although
> right now the series addresses DomU only, we plan to make it possible
> for Dom0 to have a virtual NUMA topology too. In that case, I don't
> think any code from libxc and hvmloader could be shared, could it?

But it could be stuck in the hypervisor at that point. So all of that
would reside within the Xen source base - in three places. Yuck.
> 
> So, to me, it sort of looks like we risk introducing more code
> duplication than what we're trying to avoid! :-P

Would be nice if this could be somehow stuck in a library that all
three (hvmloader, libxc and hypervisor) could be built with.

> 
> > > >I think short term Elena's option is the better one as it gets it going and
> > > >we can experiment with it. Long term I think stashing the data in ACPI
> > > >SRAT/SLIT is right. But Elena might not get to it in the next couple of
> > > >months - which means somebody else will have to sign up for that.
> > > 
> > > I definitely think that Elena needs to continue on the same path for
> > > now, so she can actually have closure on the project in a reasonable
> > > amount of time.
> > 
> > I concur. The caveat is that it could mean that the x86 maintainers might
> > object to the generic path having the special case for Xen and that
> > particular patch part won't be upstreamed until it has been taken care of.
> > 
> > If Elena is OK with that possibility - then that is fine with me.
> >
> Sorry, I'm not sure I understand what you mean here. When talking about
> PV-NUMA, with Elena's approach, we have a Xen special case in
> numa_init(). With the fake table approach, we'd have a Xen special case
> in the ACPI parsing code (Matt was talking about something like
> acpi_os_get_root_pointer() ), wouldn't we?
> 
> So, either way, we'll have a Xen special case, the only difference I see
> is that, in the latter, we'd probably have to deal with the ACPI
> maintainers rather than with the x86 ones. :-)

Correct.
> 
> FWIW, x86_numa_init() already looks like this:
> 
>  621 void __init x86_numa_init(void)
>  622 {
>  623         if (!numa_off) {
>  624 #ifdef CONFIG_X86_NUMAQ
>  625                 if (!numa_init(numaq_numa_init))
>  626                         return;
>  627 #endif
>  628 #ifdef CONFIG_ACPI_NUMA
>  629                 if (!numa_init(x86_acpi_numa_init))
>  630                         return;
>  631 #endif
>  632 #ifdef CONFIG_AMD_NUMA
>  633                 if (!numa_init(amd_numa_init))
>  634                         return;
>  635 #endif
>  636         }
>  637 
>  638         numa_init(dummy_numa_init);
>  639 }
> 
> I.e., quite a bit of architecture/implementation special casing already!
> So, perhaps, having some more `#ifdef CONFIG_XEN' and/or `if
> (xen_domain())' in there won't be all that upsetting after all. :-)

Or perhaps fix that special casing and have something similar to the
iommu_detect API. And make that 'iommu_detect' more generic now.

> 
> Thanks and Regards,
> Dario
> 
> -- 
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2013-08-30 12:57 UTC | newest]

Thread overview: 29+ messages
2013-08-27  8:52 [PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest Elena Ufimtseva
2013-08-27  8:52 ` [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest Elena Ufimtseva
2013-08-27 13:52   ` David Vrabel
2013-08-27 14:33   ` Konrad Rzeszutek Wilk
2013-08-28 13:41     ` Elena Ufimtseva
2013-08-28 23:12     ` Dario Faggioli
2013-08-29 14:07       ` Konrad Rzeszutek Wilk
2013-08-28  1:27   ` Matt Wilson
2013-08-28  1:37     ` Matt Wilson
2013-08-28 13:08       ` Dario Faggioli
2013-08-28 13:39         ` George Dunlap
2013-08-28 16:01       ` Konrad Rzeszutek Wilk
2013-08-28 16:38         ` Matt Wilson
2013-08-28 20:10           ` Elena Ufimtseva
2013-08-28 23:25             ` Matt Wilson
2013-08-29  9:16               ` George Dunlap
2013-08-28 22:21           ` Dario Faggioli
2013-08-29  0:11             ` Matt Wilson
2013-08-29 13:41               ` David Vrabel
2013-08-29 14:23                 ` Konrad Rzeszutek Wilk
2013-08-29 14:32                   ` George Dunlap
2013-08-29 14:51                     ` Konrad Rzeszutek Wilk
2013-08-29 22:29                       ` Dario Faggioli
2013-08-30 12:57                         ` Konrad Rzeszutek Wilk
2013-08-27  8:53 ` [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest Elena Ufimtseva
2013-08-27 13:35   ` David Vrabel
2013-08-27 15:36     ` Konrad Rzeszutek Wilk
2013-08-27 14:17   ` Konrad Rzeszutek Wilk
2013-08-27 13:48 ` [PATCH RFC 0/2] linux/vnuma: vnuma topology " David Vrabel
