All of lore.kernel.org
 help / color / mirror / Atom feed
* PXM/Nid/SLIT patch
@ 2004-02-17 13:53 Robert Picco
  2004-02-17 22:32 ` Jesse Barnes
                   ` (10 more replies)
  0 siblings, 11 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-17 13:53 UTC (permalink / raw)
  To: linux-ia64

This patch enables cell based HP machines to boot with a NUMA configured 2.6 Linux kernel.  The
problem is the default hardware configuration reports N-1 CPU nodes without
memory.  The Nth node has all interleaved memory.  This resulted in the NUMA kernel panicking
very early.  The patch eliminates CPU nodes with no memory and reassigns these CPUs to the
interleaved memory node. The NID space is compressed during the CPU reassignments.

Bob

--- linux-2.6.2-orig/arch/ia64/kernel/acpi.c	2004-02-16 10:14:53.000000000 -0500
+++ linux-2.6.2/arch/ia64/kernel/acpi.c	2004-02-17 06:10:20.000000000 -0500
@@ -338,11 +338,16 @@
 #undef SLIT_DEBUG
 
 #define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
+#define	PXM_MAGIC	(255)
 
 static int __initdata srat_num_cpus;			/* number of cpus */
 static u32 __initdata pxm_flag[PXM_FLAG_LEN];
+static u32 __initdata mpxm_flag[PXM_FLAG_LEN];
 #define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
+#define	pxm_bit_clear(bit)	(clear_bit(bit, (void *)pxm_flag))
 #define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
+#define	mpxm_bit_set(bit)	(set_bit(bit, (void *) mpxm_flag))
+#define	mpxm_bit_test(bit)	(test_bit(bit, (void *) mpxm_flag))
 /* maps to convert between proximity domain and logical node ID */
 int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
 int __initdata nid_to_pxm_map[MAX_NUMNODES];
@@ -424,6 +429,110 @@
 	num_memblks++;
 }
 
+static void __init
+acpi_pxm_magic_slit_fix(void)
+{
+	u8		distance, x;
+	int		i, j, nid;
+#define	SLIT_IDENTITY	10
+
+	/* Only act when PXM_MAGIC is flagged and the SLIT has fewer than PXM_MAGIC localities. */
+	if (!pxm_bit_test(PXM_MAGIC) || slit_table->localities >= PXM_MAGIC)
+		return;
+
+	nid = pxm_to_nid_map[PXM_MAGIC];
+
+	for (distance = SLIT_IDENTITY*2, i = 0; i < slit_table->localities; i++) {
+		if (!pxm_bit_test(i))
+			continue;
+		for (j = 0; j < slit_table->localities; j++) {
+			/* Compare (==), never assign: skip unflagged PXMs and the i == j diagonal. */
+			if (!pxm_bit_test(j) || (i == j))
+				continue;
+			x = (slit_table->entry[i*slit_table->localities + j] + SLIT_IDENTITY) / 2;
+			distance = min(x, distance);
+		}
+	}
+
+	/*
+	 * Fill in distances for PXM magic.
+	 */
+
+	for (i = 0; i < numnodes; i++)
+		node_distance(i, nid) = distance;
+
+	for (i = 0; i < (numnodes - 1); i++)
+		node_distance(nid, i) = distance;
+
+	node_distance(nid, nid) = SLIT_IDENTITY;
+
+
+	return;
+}
+
+static void __init
+acpi_pxm_magic_fix(void)
+{
+	struct node_memblk_s	*p;
+	int			i, nnode, nid, cpu, pxm;
+
+	/* Compress the nid space; cpus of memory-less nids are retagged PXM_MAGIC. */
+	/*
+	 * Count the distinct nids that own a memory block (tracked in mpxm_flag).
+	 */
+
+	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+		if (!mpxm_bit_test(p->nid)) {
+			mpxm_bit_set(p->nid);
+			nnode++;
+		}
+
+	/*
+	 * All nids with memory: nothing to fix.  Must compare (==), not assign.
+	 */
+
+	if (nnode == numnodes)
+		return;
+
+	/*
+	 * Change logical node id for nids without memory.
+	 * If we are removing a nid without memory, then
+	 * move that nid's cpus to nnode-1 which will become
+	 * the magic PXM's logical node id.  The node_cpu[X].nid
+	 * is the PXM but will change later to logical node
+	 * id.
+	 */
+
+	for (nid = 0, i = 0; i < numnodes; i++)
+		if (mpxm_bit_test(i)) {
+			if (i == nid) {
+				nid++;
+				continue;
+			}
+
+			for (p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+				if (p->nid == i)
+					p->nid = nid;
+
+			pxm = nid_to_pxm_map[i];
+			pxm_to_nid_map[pxm] = nid;
+			nid_to_pxm_map[nid] = pxm;
+			nid++;
+		}
+		else {
+			for (cpu = 0; cpu < srat_num_cpus; cpu++)
+				if (node_cpuid[cpu].nid == nid_to_pxm_map[i])
+					node_cpuid[cpu].nid = PXM_MAGIC;
+			/* NOTE(review): map below is indexed by nid i, not by its PXM - confirm. */
+			pxm_to_nid_map[i] = nnode - 1;
+			pxm_bit_clear(nid_to_pxm_map[i]);
+		}
+
+	numnodes = nnode;
+
+	return;
+}
+
 void __init
 acpi_numa_arch_fixup (void)
 {
@@ -451,6 +560,8 @@
 	for (i = 0; i < num_memblks; i++)
 		node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
 
+	acpi_pxm_magic_fix();
+
 	/* assign memory bank numbers for each chunk on each node */
 	for (i = 0; i < numnodes; i++) {
 		int bank;
@@ -468,8 +579,13 @@
 	printk(KERN_INFO "Number of logical nodes in system = %d\n", numnodes);
 	printk(KERN_INFO "Number of memory chunks in system = %d\n", num_memblks);
 
-	if (!slit_table) return;
+	if (!slit_table) 
+		return;
+
 	memset(numa_slit, -1, sizeof(numa_slit));
+
+	acpi_pxm_magic_slit_fix();
+
 	for (i=0; i<slit_table->localities; i++) {
 		if (!pxm_bit_test(i))
 			continue;



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
@ 2004-02-17 22:32 ` Jesse Barnes
  2004-02-18 15:33 ` Robert Picco
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-17 22:32 UTC (permalink / raw)
  To: linux-ia64

On Tue, Feb 17, 2004 at 08:53:59AM -0500, Robert Picco wrote:
> +++ linux-2.6.2/arch/ia64/kernel/acpi.c	2004-02-17 
> 06:10:20.000000000 -0500
> @@ -338,11 +338,16 @@
> #undef SLIT_DEBUG
> 
> #define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
> +#define	PXM_MAGIC	(255)

Is this a reserved value in the ACPI SLIT spec?  I don't have it handy,
so I can't check that this would be a magic Linux-only value.

Also, I can't be sure (since my .muttrc is known to be weird), but I
think the patch got wrapped somehow, and doesn't seem to conform
entirely to Documentation/CodingStyle...  Haven't really looked at it
other than that yet.

Jesse

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
  2004-02-17 22:32 ` Jesse Barnes
@ 2004-02-18 15:33 ` Robert Picco
  2004-02-18 17:08 ` Christoph Hellwig
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-18 15:33 UTC (permalink / raw)
  To: linux-ia64

Jesse Barnes wrote:

>On Tue, Feb 17, 2004 at 08:53:59AM -0500, Robert Picco wrote:
>  
>
>>+++ linux-2.6.2/arch/ia64/kernel/acpi.c	2004-02-17 
>>06:10:20.000000000 -0500
>>@@ -338,11 +338,16 @@
>>#undef SLIT_DEBUG
>>
>>#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
>>+#define	PXM_MAGIC	(255)
>>    
>>
>
>Is this a reserved value in the ACPI SLIT spec?  I don't have it handy,
>so I can't check that this would be a magic Linux-only value.
>
>Also, I can't be sure (since my .muttrc is known to be weird), but I
>think the patch got wrapped somehow, and doesn't seem to conform
>entirely to Documentation/CodingStyle...  Haven't really looked at it
>other than that yet.
>
>Jesse
>
>  
>
This PXM value (255) isn't a SLIT or PXM defined quantity.  It is really 
specific to HP cell machines.  For example, a machine configured with 
two cells will report three PXMs.  Two for the CPUs and one for the 
interleaved memory at magic PXM 255.  The firmware doesn't report SLIT 
information for PXM 255. The patch approximates the SLIT value for PXM 
255. I have attempted to arrive at code which doesn't break non-HP 
hardware configurations. I have assumed the way the initialization code 
was written that all NIDs require memory.  Otherwise 
reserve_pernode_space will fail.

My patch with modifications to CodingStyle is below.  Hopefully it's 
correct this time. Sorry for that inconvenience and my non-conformance ;-)

Bob

--- linux-2.6.2-orig/arch/ia64/kernel/acpi.c	2004-02-18 07:46:10.000000000 -0500
+++ linux-2.6.2/arch/ia64/kernel/acpi.c	2004-02-18 07:33:00.000000000 -0500
@@ -338,11 +338,16 @@
 #undef SLIT_DEBUG
 
 #define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
+#define	PXM_MAGIC	(255)
 
 static int __initdata srat_num_cpus;			/* number of cpus */
 static u32 __initdata pxm_flag[PXM_FLAG_LEN];
+static u32 __initdata mpxm_flag[PXM_FLAG_LEN];
 #define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
+#define	pxm_bit_clear(bit)	(clear_bit(bit, (void *)pxm_flag))
 #define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
+#define	mpxm_bit_set(bit)	(set_bit(bit, (void *) mpxm_flag))
+#define	mpxm_bit_test(bit)	(test_bit(bit, (void *) mpxm_flag))
 /* maps to convert between proximity domain and logical node ID */
 int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
 int __initdata nid_to_pxm_map[MAX_NUMNODES];
@@ -424,6 +429,110 @@
 	num_memblks++;
 }
 
+static void __init
+acpi_pxm_magic_slit_fix (void)
+{
+	u8 distance, x;
+	int i, j, nid;
+#define	SLIT_IDENTITY	10
+
+	/* Only act when PXM_MAGIC is flagged and the SLIT has fewer than PXM_MAGIC localities. */
+	if (!pxm_bit_test(PXM_MAGIC) || slit_table->localities >= PXM_MAGIC)
+		return;
+
+	nid = pxm_to_nid_map[PXM_MAGIC];
+
+	for (distance = SLIT_IDENTITY*2, i = 0; i < slit_table->localities; i++) {
+		if (!pxm_bit_test(i))
+			continue;
+
+		for (j = 0; j < slit_table->localities; j++) {
+			/* Compare (==), never assign: skip unflagged PXMs and the i == j diagonal. */
+			if (!pxm_bit_test(j) || (i == j))
+				continue;
+			x = (slit_table->entry[i*slit_table->localities + j] + SLIT_IDENTITY) / 2;
+			distance = min(x, distance);
+		}
+	}
+
+	/*
+	 * Fill in distances for PXM magic.
+	 */
+
+	for (i = 0; i < numnodes; i++)
+		node_distance(i, nid) = distance;
+
+	for (i = 0; i < (numnodes - 1); i++)
+		node_distance(nid, i) = distance;
+
+	node_distance(nid, nid) = SLIT_IDENTITY;
+
+
+	return;
+}
+
+static void __init
+acpi_pxm_magic_fix (void)
+{
+	struct node_memblk_s *p;
+	int i, nnode, nid, cpu, pxm;
+
+	/* Compress the nid space; cpus of memory-less nids are retagged PXM_MAGIC. */
+	/*
+	 * Count the distinct nids that own a memory block (tracked in mpxm_flag).
+	 */
+
+	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+		if (!mpxm_bit_test(p->nid)) {
+			mpxm_bit_set(p->nid);
+			nnode++;
+		}
+
+	/*
+	 * All nids with memory: nothing to fix.  Must compare (==), not assign.
+	 */
+
+	if (nnode == numnodes)
+		return;
+
+	/*
+	 * Change logical node id for nids without memory.
+	 * If we are removing a nid without memory, then
+	 * move that nid's cpus to nnode-1 which will become
+	 * the magic PXM's logical node id.  The node_cpu[X].nid
+	 * is the PXM but will change later to logical node
+	 * id.
+	 */
+
+	for (nid = 0, i = 0; i < numnodes; i++)
+		if (mpxm_bit_test(i)) {
+			if (i == nid) {
+				nid++;
+				continue;
+			}
+
+			for (p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+				if (p->nid == i)
+					p->nid = nid;
+
+			pxm = nid_to_pxm_map[i];
+			pxm_to_nid_map[pxm] = nid;
+			nid_to_pxm_map[nid] = pxm;
+			nid++;
+		} else {
+			for (cpu = 0; cpu < srat_num_cpus; cpu++)
+				if (node_cpuid[cpu].nid == nid_to_pxm_map[i])
+					node_cpuid[cpu].nid = PXM_MAGIC;
+			/* NOTE(review): map below is indexed by nid i, not by its PXM - confirm. */
+			pxm_to_nid_map[i] = nnode - 1;
+			pxm_bit_clear(nid_to_pxm_map[i]);
+		}
+
+	numnodes = nnode;
+
+	return;
+}
+
 void __init
 acpi_numa_arch_fixup (void)
 {
@@ -451,6 +560,8 @@
 	for (i = 0; i < num_memblks; i++)
 		node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
 
+	acpi_pxm_magic_fix();
+
 	/* assign memory bank numbers for each chunk on each node */
 	for (i = 0; i < numnodes; i++) {
 		int bank;
@@ -468,8 +579,13 @@
 	printk(KERN_INFO "Number of logical nodes in system = %d\n", numnodes);
 	printk(KERN_INFO "Number of memory chunks in system = %d\n", num_memblks);
 
-	if (!slit_table) return;
+	if (!slit_table) 
+		return;
+
 	memset(numa_slit, -1, sizeof(numa_slit));
+
+	acpi_pxm_magic_slit_fix();
+
 	for (i=0; i<slit_table->localities; i++) {
 		if (!pxm_bit_test(i))
 			continue;








^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
  2004-02-17 22:32 ` Jesse Barnes
  2004-02-18 15:33 ` Robert Picco
@ 2004-02-18 17:08 ` Christoph Hellwig
  2004-02-18 18:56 ` Robert Picco
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2004-02-18 17:08 UTC (permalink / raw)
  To: linux-ia64

On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
> This PXM value (255) isn't a SLIT or PXM defined quantity.  It is really 
> specific to HP cell machines.  For example, a machine configured with 
> two cells will report three PXMs.  Two for the CPUs and one for the 
> interleaved memory at magic PXM 255.  The firmware doesn't report SLIT 
> information for PXM 255. The patch approximates the SLIT value for PXM 
> 255. I have attempted to arrive at code which doesn't break non-HP 
> hardware configurations. I have assumed the way the initialization code 
> was written that all NIDs require memory.  Otherwise 
> reserve_pernode_space will fail.

I know HP basically owns the IA64 ports, but honestly can't you fix
the firmware to return sane information instead?  i.e. move the above
fix to firmware instead of letting linux fixup the reported data.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (2 preceding siblings ...)
  2004-02-18 17:08 ` Christoph Hellwig
@ 2004-02-18 18:56 ` Robert Picco
  2004-02-18 18:59 ` David Mosberger
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-18 18:56 UTC (permalink / raw)
  To: linux-ia64

Christoph Hellwig wrote:

>On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
>  
>
>>This PXM value (255) isn't a SLIT or PXM defined quantity.  It is really 
>>specific to HP cell machines.  For example, a machine configured with 
>>two cells will report three PXMs.  Two for the CPUs and one for the 
>>interleaved memory at magic PXM 255.  The firmware doesn't report SLIT 
>>information for PXM 255. The patch approximates the SLIT value for PXM 
>>255. I have attempted to arrive at code which doesn't break non-HP 
>>hardware configurations. I have assumed the way the initialization code 
>>was written that all NIDs require memory.  Otherwise 
>>reserve_pernode_space will fail.
>>    
>>
>
>I know HP basically owns the IA64 ports, but honestly can't you fix
>the firmware to return sane information instead?  i.e. move the above
>fix to firmware instead of letting linux fixup the reported data.
>
>  
>
Well some of us would like to see this too.  Some legacy requirements 
from our other supported OSes require this to be the default 
configuration.  Perhaps a different default can be made in the future or 
some IPMI tool to change the default.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (3 preceding siblings ...)
  2004-02-18 18:56 ` Robert Picco
@ 2004-02-18 18:59 ` David Mosberger
  2004-02-18 19:04 ` Jesse Barnes
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: David Mosberger @ 2004-02-18 18:59 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Wed, 18 Feb 2004 17:08:58 +0000, Christoph Hellwig <hch@infradead.org> said:

  Christoph> On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
  >> This PXM value (255) isn't a SLIT or PXM defined quantity.  It is really
  >> specific to HP cell machines.  For example, a machine configured with
  >> two cells will report three PXMs.  Two for the CPUs and one for the
  >> interleaved memory at magic PXM 255.  The firmware doesn't report SLIT
  >> information for PXM 255. The patch approximates the SLIT value for PXM
  >> 255. I have attempted to arrive at code which doesn't break non-HP
  >> hardware configurations. I have assumed the way the initialization code
  >> was written that all NIDs require memory.  Otherwise
  >> reserve_pernode_space will fail.

  Christoph> I know HP basically owns the IA64 ports

This comment concerns me.  I certainly have always tried to judge
patches based on their technical merits for Linux.  Is there anything
in particular that I did (or didn't) do that you found objectionable?
If so, please let me know.

  Christoph> but honestly can't you fix the firmware to return sane
  Christoph> information instead?  i.e. move the above fix to firmware
  Christoph> instead of letting linux fixup the reported data.

Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
working around a firmware-bug or a limitation in the Linux NUMA code.
I don't see off-hand why it should be illegal to have a memory config
with only one node with memory.  The whole PXM_MAGIC business looks
strange to me though.  Can someone explain?

	--david

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (4 preceding siblings ...)
  2004-02-18 18:59 ` David Mosberger
@ 2004-02-18 19:04 ` Jesse Barnes
  2004-02-18 19:06 ` Jesse Barnes
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-18 19:04 UTC (permalink / raw)
  To: linux-ia64

On Wed, Feb 18, 2004 at 10:59:03AM -0800, David Mosberger wrote:
>   Christoph> but honestly can't you fix the firmware to return sane
>   Christoph> information instead?  i.e. move the above fix to firmware
>   Christoph> instead of letting linux fixup the reported data.
> 
> Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
> working around a firmware-bug or a limitation in the Linux NUMA code.
> I don't see off-hand why it should be illegal to have a memory config
> with only one node with memory.  The whole PXM_MAGIC business looks
> strange to me though.  Can someone explain?

Well, it would be nice if memory layout was reported fully, with the
correct CPU/node and memory affinity information.  That would allow us
to either interleave in software (maybe a new flag that changes the way
discontig.c builds the memory maps) or just treat the machine as a
normal NUMA box.  But maybe this isn't possible with the HP cell boxes?
Robert, maybe you can describe the memory layout of these machines a
little more (sorry if I missed some discussion, I'm having mail trouble
right now and was unsubscribed from linux-ia64).

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (5 preceding siblings ...)
  2004-02-18 19:04 ` Jesse Barnes
@ 2004-02-18 19:06 ` Jesse Barnes
  2004-02-18 19:13 ` Christoph Hellwig
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-18 19:06 UTC (permalink / raw)
  To: linux-ia64

On Wed, Feb 18, 2004 at 01:56:48PM -0500, Robert Picco wrote:
> >I know HP basically owns the IA64 ports, but honestly can't you fix
> >the firmware to return sane information instead?  i.e. move the above
> >fix to firmware instead of letting linux fixup the reported data.
>
> Well some of us would like to see this too.  Some legacy requirements 
> from our other supported OSes require this to be the default 
> configuration.  Perhaps a different default can be made in the future or 
> some IPMI tool to change the default.

Ah, that explains it, it's what I expected (I've heard of other NUMA
boxes that do this too).  Hmm...

Jesse

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (6 preceding siblings ...)
  2004-02-18 19:06 ` Jesse Barnes
@ 2004-02-18 19:13 ` Christoph Hellwig
  2004-02-18 19:19 ` Robert Picco
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2004-02-18 19:13 UTC (permalink / raw)
  To: linux-ia64

On Wed, Feb 18, 2004 at 10:59:03AM -0800, David Mosberger wrote:
> This comment concerns me.  I certainly have always tried to judge
> patches based on their technical merits for Linux.  Is there anything
> in particular that I did (or didn't) do that you found objectionable?
> If so, please let me know.

Nah, this wasn't meant as an attack against you, it's just that HP seems
to do most of the work and thus everything in arch/ia64/ is a little
HP centric.  I guess it'll change by the time now that SGI woke up
a little.

> Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
> working around a firmware-bug or a limitation in the Linux NUMA code.
> I don't see off-hand why it should be illegal to have a memory config
> with only one node with memory.  The whole PXM_MAGIC business looks
> strange to me though.  Can someone explain?

There's two issues. First we should probably handle CPU-less nodes, but
that's not what this patch does.

The second issue is that the firmware reports plain wrong data to work
around the lack of NUMA support in a certain legacy OS from Redmond, and
I don't think we should do this non-standard workaround in Linux for that.

Robert's idea of a switch in the firmware to report proper tables sounds
like the best way to go, maybe together with a fix to allow cpu-less nodes
to allow boxes with old firmware to boot, even with suboptimal performance.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (7 preceding siblings ...)
  2004-02-18 19:13 ` Christoph Hellwig
@ 2004-02-18 19:19 ` Robert Picco
  2004-02-18 19:36 ` Jesse Barnes
  2004-02-18 19:43 ` David Mosberger
  10 siblings, 0 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-18 19:19 UTC (permalink / raw)
  To: linux-ia64

David Mosberger wrote:

>>>>>>On Wed, 18 Feb 2004 17:08:58 +0000, Christoph Hellwig <hch@infradead.org> said:
>>>>>>            
>>>>>>
>
>  Christoph> On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
>  >> This PXM value (255) isn't a SLIT or PXM defined quantity.  It is really
>  >> specific to HP cell machines.  For example, a machine configured with
>  >> two cells will report three PXMs.  Two for the CPUs and one for the
>  >> interleaved memory at magic PXM 255.  The firmware doesn't report SLIT
>  >> information for PXM 255. The patch approximates the SLIT value for PXM
>  >> 255. I have attempted to arrive at code which doesn't break non-HP
>  >> hardware configurations. I have assumed the way the initialization code
>  >> was written that all NIDs require memory.  Otherwise
>  >> reserve_pernode_space will fail.
>
>  Christoph> I know HP basically owns the IA64 ports
>
>This comment concerns me.  I certainly have always tried to judge
>patches based on their technical merits for Linux.  Is there anything
>in particular that I did (or didn't) do that you found objectionable?
>If so, please let me know.
>
>  Christoph> but honestly can't you fix the firmware to return sane
>  Christoph> information instead?  i.e. move the above fix to firmware
>  Christoph> instead of letting linux fixup the reported data.
>
>Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
>working around a firmware-bug or a limitation in the Linux NUMA code.
>I don't see off-hand why it should be illegal to have a memory config
>with only one node with memory.  The whole PXM_MAGIC business looks
>strange to me though.  Can someone explain?
>
>	--david
>
>  
>
Our HP default boot configuration has all memory  interleaved and 
reported in NUMA SRAT PXM 255.  The
other cell nodes (PXMs) don't have any memory.  This was totally 
unexpected by the current NUMA code. There will be N-1 nids with CPUs 
and no memory and 1 NID with all the memory.  Initialization crashes 
very early.  The current code expects each node to have local memory.  
Well this isn't the case for HP machines.  It could be configured with 
some IPMI interface for every cell to have Cell Local Memory (CLM) but 
such an interface doesn't exist for Linux.  Should such an interface 
become available, the firmware would still steal 0.5Gb of interleaved 
memory from the root cell. 

So, if we had a tool to configure CLM for all cells, there would be N-1 
nids with CPU and local memory and 1 nid with just interleaved memory.  
The current kernel code would work fine but the SLIT information would be
wrong because PXM 255 isn't reported by the firmware in the SLIT table.  
numa_slit isn't used  by non-machine dependent code for memory 
allocation policy  but could be in the future for memory  allocations 
when the current node's memory is exhausted. numa_slit would be used as 
a measure of the best locality to make the allocation from (shortest path).

Bob


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (8 preceding siblings ...)
  2004-02-18 19:19 ` Robert Picco
@ 2004-02-18 19:36 ` Jesse Barnes
  2004-02-18 19:43 ` David Mosberger
  10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-18 19:36 UTC (permalink / raw)
  To: linux-ia64

On Wed, Feb 18, 2004 at 02:19:23PM -0500, Robert Picco wrote:
> Our HP default boot configuration has all memory  interleaved and 
> reported in NUMA SRAT PXM 255.  The
> other cell nodes (PXMs) don't have any memory.  This was totally 
> unexpected by the current NUMA code. There will be N-1 nids with CPUs 
> and no memory and 1 NID with all the memory.  Initialization crashes 
> very early.  The current code expects each node to have local memory.  

Oh, right, there's that... we could fix it to fallback to other nodes
though.  In fact, we should do the bootmem initialization earlier and
use alloc_bootmem_node for things instead of allocating stuff in
find_pernode_space.  If we fixed that your machine would work pretty
well I think.

> So, if we had a tool to configure CLM for all cells, there would be N-1 
> nids with CPU and local memory and 1 nid with just interleaved memory.  
> The current kernel code would work fine but the SLIT information would be
> wrong because PXM 255 isn't reported by the firmware in the SLIT table.  
> numa_slit isn't used  by non-machine dependent code for memory 
> allocation policy  but could be in the future for memory  allocations 
> when the current node's memory is exhausted. numa_slit would be used as 
> a measure of the best locality to make the allocation from (shortest path).

No, pgdat->zonelist is used instead.  It needs to be built better
though...

Jesse

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: PXM/Nid/SLIT patch
  2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
                   ` (9 preceding siblings ...)
  2004-02-18 19:36 ` Jesse Barnes
@ 2004-02-18 19:43 ` David Mosberger
  10 siblings, 0 replies; 12+ messages in thread
From: David Mosberger @ 2004-02-18 19:43 UTC (permalink / raw)
  To: linux-ia64

Bob,

Thanks for your explanation.  I'm not very familiar with SRAT, PXM etc
(and don't see much reason at this point why I should read it,
especially considering that it's covered by one of those long
Microsoft licenses), so my preference is for this issue to be worked
out among those folks that care about NUMA (you, Jesse, etc.).  In the
unexpected event of not being able to find a solution that's
acceptable to everybody, I'm willing to try to mediate (and learn
about all the RATty stuff.. ;-), but again, I doubt that'll be
necessary.

	--david

>>>>> On Wed, 18 Feb 2004 14:19:23 -0500, Robert Picco <Robert.Picco@hp.com> said:

  Robert> Our HP default boot configuration has all memory interleaved
  Robert> and reported in NUMA SRAT PXM 255.  The other cell nodes
  Robert> (PXMs) don't have any memory.  This was totally unexpected
  Robert> by the current NUMA code. There will be N-1 nids with CPUs
  Robert> and no memory and 1 NID with all the memory.  Initialization
  Robert> crashes very early.  The current code expects each node to
  Robert> have local memory.  Well this isn't the case for HP
  Robert> machines.  It could be configured with some IPMI interface
  Robert> for every cell to have Cell Local Memory (CLM) but such an
  Robert> interface doesn't exist for Linux.  Should such an interface
  Robert> become available, the firmware would still steal 0.5Gb of
  Robert> interleaved memory from the root cell.

  Robert> So, if we had a tool to configure CLM for all cells, there
  Robert> would be N-1 nids with CPU and local memory and 1 nid with
  Robert> just interleaved memory.  The current kernel code would work
  Robert> fine but the SLIT information would be wrong because PXM 255
  Robert> isn't reported by the firmware in the SLIT table.  numa_slit
  Robert> isn't used by non-machine dependent code for memory
  Robert> allocation policy but could be in the future for memory
  Robert> allocations when the current node's memory is
  Robert> exhausted. numa_slit would be used as a measure of the best
  Robert> locality to make the allocation from (shortest path).


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2004-02-18 19:43 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
2004-02-17 22:32 ` Jesse Barnes
2004-02-18 15:33 ` Robert Picco
2004-02-18 17:08 ` Christoph Hellwig
2004-02-18 18:56 ` Robert Picco
2004-02-18 18:59 ` David Mosberger
2004-02-18 19:04 ` Jesse Barnes
2004-02-18 19:06 ` Jesse Barnes
2004-02-18 19:13 ` Christoph Hellwig
2004-02-18 19:19 ` Robert Picco
2004-02-18 19:36 ` Jesse Barnes
2004-02-18 19:43 ` David Mosberger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.