* PXM/Nid/SLIT patch
@ 2004-02-17 13:53 Robert Picco
2004-02-17 22:32 ` Jesse Barnes
` (10 more replies)
0 siblings, 11 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-17 13:53 UTC (permalink / raw)
To: linux-ia64
This patch enables cell-based HP machines to boot with a NUMA-configured 2.6 Linux kernel. The
problem is that the default hardware configuration reports N-1 CPU nodes without
memory. The Nth node has all interleaved memory. This resulted in the NUMA kernel panicking
very early. The patch eliminates CPU nodes with no memory and reassigns these CPUs to the
interleaved memory node. The NID space is compressed during the CPU reassignments.
Bob
--- linux-2.6.2-orig/arch/ia64/kernel/acpi.c 2004-02-16 10:14:53.000000000 -0500
+++ linux-2.6.2/arch/ia64/kernel/acpi.c 2004-02-17 06:10:20.000000000 -0500
@@ -338,11 +338,16 @@
#undef SLIT_DEBUG
#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
+#define PXM_MAGIC (255) /* PXM under which HP cell firmware reports interleaved memory */
static int __initdata srat_num_cpus; /* number of cpus */
static u32 __initdata pxm_flag[PXM_FLAG_LEN];
+static u32 __initdata mpxm_flag[PXM_FLAG_LEN]; /* one bit per nid that owns a memory block */
#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag))
+#define pxm_bit_clear(bit) (clear_bit(bit, (void *)pxm_flag))
#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag))
+#define mpxm_bit_set(bit) (set_bit(bit, (void *) mpxm_flag))
+#define mpxm_bit_test(bit) (test_bit(bit, (void *) mpxm_flag))
/* maps to convert between proximity domain and logical node ID */
int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
int __initdata nid_to_pxm_map[MAX_NUMNODES];
@@ -424,6 +429,110 @@
num_memblks++;
}
+/*
+ * Approximate SLIT distances for the PXM_MAGIC node.  Does nothing unless
+ * PXM_MAGIC is flagged in pxm_flag and the SLIT has fewer than PXM_MAGIC
+ * localities.  The distance used is the smallest (entry + SLIT_IDENTITY) / 2
+ * over off-diagonal pairs of flagged PXMs, capped at 2 * SLIT_IDENTITY.
+ */
+static void __init
+acpi_pxm_magic_slit_fix(void)
+{
+	u8 distance, x;
+	int i, j, nid;
+#define SLIT_IDENTITY 10
+
+	if (!pxm_bit_test(PXM_MAGIC) || slit_table->localities >= PXM_MAGIC)
+		return;
+
+	nid = pxm_to_nid_map[PXM_MAGIC];
+
+	for (distance = SLIT_IDENTITY*2, i = 0; i < slit_table->localities; i++) {
+		if (!pxm_bit_test(i))
+			continue;
+		for (j = 0; j < slit_table->localities; j++) {
+			/* Skip the diagonal: compare, do not assign to i. */
+			if (!pxm_bit_test(j) || (i == j))
+				continue;
+
+			x = (slit_table->entry[i*slit_table->localities + j] + SLIT_IDENTITY) / 2;
+			distance = min(x, distance);
+		}
+	}
+
+	/* Fill in distances for PXM magic. */
+	for (i = 0; i < numnodes; i++)
+		node_distance(i, nid) = distance;
+
+	for (i = 0; i < (numnodes - 1); i++)
+		node_distance(nid, i) = distance;
+
+	node_distance(nid, nid) = SLIT_IDENTITY;
+}
+
+/*
+ * Drop logical node ids that own no memory block and renumber the rest
+ * densely.  CPUs of a dropped node are tagged with PXM_MAGIC and the
+ * node count (numnodes) shrinks to the number of nodes with memory.
+ */
+static void __init
+acpi_pxm_magic_fix(void)
+{
+	struct node_memblk_s *p;
+	int i, nnode, nid, cpu, pxm;
+
+	/* Count the distinct nids that own at least one memory block. */
+	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+		if (!mpxm_bit_test(p->nid)) {
+			mpxm_bit_set(p->nid);
+			nnode++;
+		}
+
+	/* If every nid has memory then we are done. */
+	if (nnode == numnodes)
+		return;
+
+	/*
+	 * Change logical node id for nids without memory.
+	 * If we are removing a nid without memory, then
+	 * move that nid's cpus to nnode-1 which will become
+	 * the magic PXM's logical node id. The node_cpu[X].nid
+	 * is the PXM but will change later to logical node
+	 * id.
+	 */
+
+	for (nid = 0, i = 0; i < numnodes; i++)
+		if (mpxm_bit_test(i)) {
+			/* Already at its compacted position; nothing to move. */
+			if (i == nid) {
+				nid++;
+				continue;
+			}
+
+			/* Retarget this node's memory blocks to the new id. */
+			for (p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+				if (p->nid == i)
+					p->nid = nid;
+
+			pxm = nid_to_pxm_map[i];
+			pxm_to_nid_map[pxm] = nid;
+			nid_to_pxm_map[nid] = pxm;
+			nid++;
+		}
+		else {
+			/* Memoryless node: hand its cpus to the magic PXM. */
+			for (cpu = 0; cpu < srat_num_cpus; cpu++)
+				if (node_cpuid[cpu].nid == nid_to_pxm_map[i])
+					node_cpuid[cpu].nid = PXM_MAGIC;
+
+			/* NOTE(review): indexed by nid i, not by its PXM - confirm. */
+			pxm_to_nid_map[i] = nnode - 1;
+			pxm_bit_clear(nid_to_pxm_map[i]);
+		}
+
+	numnodes = nnode;
+}
+
void __init
acpi_numa_arch_fixup (void)
{
@@ -451,6 +560,8 @@
for (i = 0; i < num_memblks; i++)
node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+ acpi_pxm_magic_fix();
+
/* assign memory bank numbers for each chunk on each node */
for (i = 0; i < numnodes; i++) {
int bank;
@@ -468,8 +579,13 @@
printk(KERN_INFO "Number of logical nodes in system = %d\n", numnodes);
printk(KERN_INFO "Number of memory chunks in system = %d\n", num_memblks);
- if (!slit_table) return;
+ if (!slit_table)
+ return;
+
memset(numa_slit, -1, sizeof(numa_slit));
+
+ acpi_pxm_magic_slit_fix();
+
for (i=0; i<slit_table->localities; i++) {
if (!pxm_bit_test(i))
continue;
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
@ 2004-02-17 22:32 ` Jesse Barnes
2004-02-18 15:33 ` Robert Picco
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-17 22:32 UTC (permalink / raw)
To: linux-ia64
On Tue, Feb 17, 2004 at 08:53:59AM -0500, Robert Picco wrote:
> +++ linux-2.6.2/arch/ia64/kernel/acpi.c 2004-02-17
> 06:10:20.000000000 -0500
> @@ -338,11 +338,16 @@
> #undef SLIT_DEBUG
>
> #define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
> +#define PXM_MAGIC (255)
Is this a reserved value in the ACPI SLIT spec? I don't have it handy,
so I can't check that this would be a magic Linux-only value.
Also, I can't be sure (since my .muttrc is known to be weird), but I
think the patch got wrapped somehow, and doesn't seem to conform
entirely to Documentation/CodingStyle... Haven't really looked at it
other than that yet.
Jesse
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
2004-02-17 22:32 ` Jesse Barnes
@ 2004-02-18 15:33 ` Robert Picco
2004-02-18 17:08 ` Christoph Hellwig
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-18 15:33 UTC (permalink / raw)
To: linux-ia64
Jesse Barnes wrote:
>On Tue, Feb 17, 2004 at 08:53:59AM -0500, Robert Picco wrote:
>
>
>>+++ linux-2.6.2/arch/ia64/kernel/acpi.c 2004-02-17
>>06:10:20.000000000 -0500
>>@@ -338,11 +338,16 @@
>>#undef SLIT_DEBUG
>>
>>#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
>>+#define PXM_MAGIC (255)
>>
>>
>
>Is this a reserved value in the ACPI SLIT spec? I don't have it handy,
>so I can't check that this would be a magic Linux-only value.
>
>Also, I can't be sure (since my .muttrc is known to be weird), but I
>think the patch got wrapped somehow, and doesn't seem to conform
>entirely to Documentation/CodingStyle... Haven't really looked at it
>other than that yet.
>
>Jesse
>
>
>
This PXM value (255) isn't a SLIT or PXM defined quantity. It is really
specific to HP cell machines. For example, a machine configured with
two cells will report three PXMs. Two for the CPUs and one for the
interleaved memory at magic PXM 255. The firmware doesn't report SLIT
information for PXM 255. The patch approximates the SLIT value for PXM
255. I have attempted to arrive at code which doesn't break non-HP
hardware configurations. I have assumed the way the initialization code
was written that all NIDs require memory. Otherwise
reserve_pernode_space will fail.
My patch with modifications to CodingStyle is below. Hopefully it's
correct this time. Sorry for that inconvenience and my non-conformance ;-)
Bob
--- linux-2.6.2-orig/arch/ia64/kernel/acpi.c 2004-02-18 07:46:10.000000000 -0500
+++ linux-2.6.2/arch/ia64/kernel/acpi.c 2004-02-18 07:33:00.000000000 -0500
@@ -338,11 +338,16 @@
#undef SLIT_DEBUG
#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
+#define PXM_MAGIC (255) /* PXM under which HP cell firmware reports interleaved memory */
static int __initdata srat_num_cpus; /* number of cpus */
static u32 __initdata pxm_flag[PXM_FLAG_LEN];
+static u32 __initdata mpxm_flag[PXM_FLAG_LEN]; /* one bit per nid that owns a memory block */
#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag))
+#define pxm_bit_clear(bit) (clear_bit(bit, (void *)pxm_flag))
#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag))
+#define mpxm_bit_set(bit) (set_bit(bit, (void *) mpxm_flag))
+#define mpxm_bit_test(bit) (test_bit(bit, (void *) mpxm_flag))
/* maps to convert between proximity domain and logical node ID */
int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
int __initdata nid_to_pxm_map[MAX_NUMNODES];
@@ -424,6 +429,110 @@
num_memblks++;
}
+/*
+ * Approximate SLIT distances for the PXM_MAGIC node.  Does nothing unless
+ * PXM_MAGIC is flagged in pxm_flag and the SLIT has fewer than PXM_MAGIC
+ * localities.  The distance used is the smallest (entry + SLIT_IDENTITY) / 2
+ * over off-diagonal pairs of flagged PXMs, capped at 2 * SLIT_IDENTITY.
+ */
+static void __init
+acpi_pxm_magic_slit_fix (void)
+{
+	u8 distance, x;
+	int i, j, nid;
+#define SLIT_IDENTITY 10
+
+	if (!pxm_bit_test(PXM_MAGIC) || slit_table->localities >= PXM_MAGIC)
+		return;
+
+	nid = pxm_to_nid_map[PXM_MAGIC];
+
+	for (distance = SLIT_IDENTITY*2, i = 0; i < slit_table->localities; i++) {
+		if (!pxm_bit_test(i))
+			continue;
+
+		for (j = 0; j < slit_table->localities; j++) {
+			/* Skip the diagonal: compare, do not assign to i. */
+			if (!pxm_bit_test(j) || (i == j))
+				continue;
+
+			x = (slit_table->entry[i*slit_table->localities + j] + SLIT_IDENTITY) / 2;
+			distance = min(x, distance);
+		}
+	}
+
+	/* Fill in distances for PXM magic. */
+	for (i = 0; i < numnodes; i++)
+		node_distance(i, nid) = distance;
+
+	for (i = 0; i < (numnodes - 1); i++)
+		node_distance(nid, i) = distance;
+
+	node_distance(nid, nid) = SLIT_IDENTITY;
+}
+
+/*
+ * Drop logical node ids that own no memory block and renumber the rest
+ * densely.  CPUs of a dropped node are tagged with PXM_MAGIC and the
+ * node count (numnodes) shrinks to the number of nodes with memory.
+ */
+static void __init
+acpi_pxm_magic_fix (void)
+{
+	struct node_memblk_s *p;
+	int i, nnode, nid, cpu, pxm;
+
+	/* Count the distinct nids that own at least one memory block. */
+	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+		if (!mpxm_bit_test(p->nid)) {
+			mpxm_bit_set(p->nid);
+			nnode++;
+		}
+
+	/* If every nid has memory then we are done. */
+	if (nnode == numnodes)
+		return;
+
+	/*
+	 * Change logical node id for nids without memory.
+	 * If we are removing a nid without memory, then
+	 * move that nid's cpus to nnode-1 which will become
+	 * the magic PXM's logical node id. The node_cpu[X].nid
+	 * is the PXM but will change later to logical node
+	 * id.
+	 */
+
+	for (nid = 0, i = 0; i < numnodes; i++)
+		if (mpxm_bit_test(i)) {
+			/* Already at its compacted position; nothing to move. */
+			if (i == nid) {
+				nid++;
+				continue;
+			}
+
+			/* Retarget this node's memory blocks to the new id. */
+			for (p = &node_memblk[0]; p < &node_memblk[num_memblks]; p++)
+				if (p->nid == i)
+					p->nid = nid;
+
+			pxm = nid_to_pxm_map[i];
+			pxm_to_nid_map[pxm] = nid;
+			nid_to_pxm_map[nid] = pxm;
+			nid++;
+		} else {
+			/* Memoryless node: hand its cpus to the magic PXM. */
+			for (cpu = 0; cpu < srat_num_cpus; cpu++)
+				if (node_cpuid[cpu].nid == nid_to_pxm_map[i])
+					node_cpuid[cpu].nid = PXM_MAGIC;
+
+			/* NOTE(review): indexed by nid i, not by its PXM - confirm. */
+			pxm_to_nid_map[i] = nnode - 1;
+			pxm_bit_clear(nid_to_pxm_map[i]);
+		}
+
+	numnodes = nnode;
+}
+
void __init
acpi_numa_arch_fixup (void)
{
@@ -451,6 +560,8 @@
for (i = 0; i < num_memblks; i++)
node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+ acpi_pxm_magic_fix();
+
/* assign memory bank numbers for each chunk on each node */
for (i = 0; i < numnodes; i++) {
int bank;
@@ -468,8 +579,13 @@
printk(KERN_INFO "Number of logical nodes in system = %d\n", numnodes);
printk(KERN_INFO "Number of memory chunks in system = %d\n", num_memblks);
- if (!slit_table) return;
+ if (!slit_table)
+ return;
+
memset(numa_slit, -1, sizeof(numa_slit));
+
+ acpi_pxm_magic_slit_fix();
+
for (i=0; i<slit_table->localities; i++) {
if (!pxm_bit_test(i))
continue;
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
2004-02-17 22:32 ` Jesse Barnes
2004-02-18 15:33 ` Robert Picco
@ 2004-02-18 17:08 ` Christoph Hellwig
2004-02-18 18:56 ` Robert Picco
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2004-02-18 17:08 UTC (permalink / raw)
To: linux-ia64
On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
> This PXM value (255) isn't a SLIT or PXM defined quantity. It is really
> specific to HP cell machines. For example, a machine configured with
> two cells will report three PXMs. Two for the CPUs and one for the
> interleaved memory at magic PXM 255. The firmware doesn't report SLIT
> information for PXM 255. The patch approximates the SLIT value for PXM
> 255. I have attempted to arrive at code which doesn't break non-HP
> hardware configurations. I have assumed the way the initialization code
> was written that all NIDs require memory. Otherwise
> reserve_pernode_space will fail.
I know HP basically owns the IA64 ports, but honestly can't you fix
the firmware to return sane information instead? i.e. move the above
fix to firmware instead of letting linux fixup the reported data.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (2 preceding siblings ...)
2004-02-18 17:08 ` Christoph Hellwig
@ 2004-02-18 18:56 ` Robert Picco
2004-02-18 18:59 ` David Mosberger
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-18 18:56 UTC (permalink / raw)
To: linux-ia64
Christoph Hellwig wrote:
>On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
>
>
>>This PXM value (255) isn't a SLIT or PXM defined quantity. It is really
>>specific to HP cell machines. For example, a machine configured with
>>two cells will report three PXMs. Two for the CPUs and one for the
>>interleaved memory at magic PXM 255. The firmware doesn't report SLIT
>>information for PXM 255. The patch approximates the SLIT value for PXM
>>255. I have attempted to arrive at code which doesn't break non-HP
>>hardware configurations. I have assumed the way the initialization code
>>was written that all NIDs require memory. Otherwise
>>reserve_pernode_space will fail.
>>
>>
>
>I know HP basically owns the IA64 ports, but honestly can't you fix
>the firmware to return sane information instead? i.e. move the above
>fix to firmware instead of letting linux fixup the reported data.
>
>
>
Well some of us would like to see this too. Some legacy requirements
from our other supported OSes require this to be the default
configuration. Perhaps a different default can be made in the future or
some IPMI tool to change the default.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (3 preceding siblings ...)
2004-02-18 18:56 ` Robert Picco
@ 2004-02-18 18:59 ` David Mosberger
2004-02-18 19:04 ` Jesse Barnes
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: David Mosberger @ 2004-02-18 18:59 UTC (permalink / raw)
To: linux-ia64
>>>>> On Wed, 18 Feb 2004 17:08:58 +0000, Christoph Hellwig <hch@infradead.org> said:
Christoph> On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
>> This PXM value (255) isn't a SLIT or PXM defined quantity. It is really
>> specific to HP cell machines. For example, a machine configured with
>> two cells will report three PXMs. Two for the CPUs and one for the
>> interleaved memory at magic PXM 255. The firmware doesn't report SLIT
>> information for PXM 255. The patch approximates the SLIT value for PXM
>> 255. I have attempted to arrive at code which doesn't break non-HP
>> hardware configurations. I have assumed the way the initialization code
>> was written that all NIDs require memory. Otherwise
>> reserve_pernode_space will fail.
Christoph> I know HP basically owns the IA64 ports
This comment concerns me. I certainly have always tried to judge
patches based on their technical merits for Linux. Is there anything
in particular that I did (or didn't) do that you found objectionable?
If so, please let me know.
Christoph> but honestly can't you fix the firmware to return sane
Christoph> information instead? i.e. move the above fix to firmware
Christoph> instead of letting linux fixup the reported data.
Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
working around a firmware-bug or a limitation in the Linux NUMA code.
I don't see off-hand why it should be illegal to have a memory config
with only one node with memory. The whole PXM_MAGIC business looks
strange to me though. Can someone explain?
--david
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (4 preceding siblings ...)
2004-02-18 18:59 ` David Mosberger
@ 2004-02-18 19:04 ` Jesse Barnes
2004-02-18 19:06 ` Jesse Barnes
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-18 19:04 UTC (permalink / raw)
To: linux-ia64
On Wed, Feb 18, 2004 at 10:59:03AM -0800, David Mosberger wrote:
> Christoph> but honestly can't you fix the firmware to return sane
> Christoph> information instead? i.e. move the above fix to firmware
> Christoph> instead of letting linux fixup the reported data.
>
> Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
> working around a firmware-bug or a limitation in the Linux NUMA code.
> I don't see off-hand why it should be illegal to have a memory config
> with only one node with memory. The whole PXM_MAGIC business looks
> strange to me though. Can someone explain?
Well, it would be nice if memory layout was reported fully, with the
correct CPU/node and memory affinity information. That would allow us
to either interleave in software (maybe a new flag that changes the way
discontig.c builds the memory maps) or just treat the machine as a
normal NUMA box. But maybe this isn't possible with the HP cell boxes?
Robert, maybe you can describe the memory layout of these machines a
little more (sorry if I missed some discussion, I'm having mail trouble
right now and was unsubscribed from linux-ia64).
Thanks,
Jesse
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (5 preceding siblings ...)
2004-02-18 19:04 ` Jesse Barnes
@ 2004-02-18 19:06 ` Jesse Barnes
2004-02-18 19:13 ` Christoph Hellwig
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-18 19:06 UTC (permalink / raw)
To: linux-ia64
On Wed, Feb 18, 2004 at 01:56:48PM -0500, Robert Picco wrote:
> >I know HP basically owns the IA64 ports, but honestly can't you fix
> >the firmware to return sane information instead? i.e. move the above
> >fix to firmware instead of letting linux fixup the reported data.
>
> Well some of us would like to see this too. Some legacy requirements
> from our other supported OSes require this to be the default
> configuration. Perhaps a different default can be made in the future or
> some IPMI tool to change the default.
Ah, that explains it, it's what I expected (I've heard of other NUMA
boxes that do this too). Hmm...
Jesse
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (6 preceding siblings ...)
2004-02-18 19:06 ` Jesse Barnes
@ 2004-02-18 19:13 ` Christoph Hellwig
2004-02-18 19:19 ` Robert Picco
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2004-02-18 19:13 UTC (permalink / raw)
To: linux-ia64
On Wed, Feb 18, 2004 at 10:59:03AM -0800, David Mosberger wrote:
> This comment concerns me. I certainly have always tried to judge
> patches based on their technical merits for Linux. Is there anything
> in particular that I did (or didn't) do that you found objectionable?
> If so, please let me know.
Nah, this wasn't meant as an attack against you, it's just that HP seems
to do most of the work and thus everything in arch/ia64/ is a little
HP centric. I guess it'll change by the time now that SGI woke up
a little.
> Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
> working around a firmware-bug or a limitation in the Linux NUMA code.
> I don't see off-hand why it should be illegal to have a memory config
> with only one node with memory. The whole PXM_MAGIC business looks
> strange to me though. Can someone explain?
There's two issues. First we should probably handle CPU-less nodes, but
that's not what this patch does.
The second issue is that the firmware reports plain wrong data to work
around the lack of NUMA support in a certain legacy OS from Redmond, and
I don't think we should do this non-standard workaround in Linux for that.
Robert's idea of a switch in the firmware to report proper tables sounds
like the best way to go, maybe together with a fix to allow cpu-less nodes
to allow boxes with old firmware to boot, even with suboptimal performance.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (7 preceding siblings ...)
2004-02-18 19:13 ` Christoph Hellwig
@ 2004-02-18 19:19 ` Robert Picco
2004-02-18 19:36 ` Jesse Barnes
2004-02-18 19:43 ` David Mosberger
10 siblings, 0 replies; 12+ messages in thread
From: Robert Picco @ 2004-02-18 19:19 UTC (permalink / raw)
To: linux-ia64
David Mosberger wrote:
>>>>>>On Wed, 18 Feb 2004 17:08:58 +0000, Christoph Hellwig <hch@infradead.org> said:
>>>>>>
>>>>>>
>
> Christoph> On Wed, Feb 18, 2004 at 10:33:29AM -0500, Robert Picco wrote:
> >> This PXM value (255) isn't a SLIT or PXM defined quantity. It is really
> >> specific to HP cell machines. For example, a machine configured with
> >> two cells will report three PXMs. Two for the CPUs and one for the
> >> interleaved memory at magic PXM 255. The firmware doesn't report SLIT
> >> information for PXM 255. The patch approximates the SLIT value for PXM
> >> 255. I have attempted to arrive at code which doesn't break non-HP
> >> hardware configurations. I have assumed the way the initialization code
> >> was written that all NIDs require memory. Otherwise
> >> reserve_pernode_space will fail.
>
> Christoph> I know HP basically owns the IA64 ports
>
>This comment concerns me. I certainly have always tried to judge
>patches based on their technical merits for Linux. Is there anything
>in particular that I did (or didn't) do that you found objectionable?
>If so, please let me know.
>
> Christoph> but honestly can't you fix the firmware to return sane
> Christoph> information instead? i.e. move the above fix to firmware
> Christoph> instead of letting linux fixup the reported data.
>
>Hmmh, I'm no NUMA-expert and it isn't clear to me whether the patch is
>working around a firmware-bug or a limitation in the Linux NUMA code.
>I don't see off-hand why it should be illegal to have a memory config
>with only one node with memory. The whole PXM_MAGIC business looks
>strange to me though. Can someone explain?
>
> --david
>
>
>
Our HP default boot configuration has all memory interleaved and
reported in NUMA SRAT PXM 255. The
other cell nodes (PXMs) don't have any memory. This was totally
unexpected by the current NUMA code. There will be N-1 nids with CPUs
and no memory and 1 NID with all the memory. Initialization crashes
very early. The current code expects each node to have local memory.
Well this isn't the case for HP machines. It could be configured with
some IPMI interface for every cell to have Cell Local Memory (CLM) but
such an interface doesn't exist for Linux. Should such an interface
become available, the firmware would still steal 0.5Gb of interleaved
memory from the root cell.
So, if we had a tool to configure CLM for all cells, there would be N-1
nids with CPU and local memory and 1 nid with just interleaved memory.
The current kernel code would work fine but the SLIT information would be
wrong because PXM 255 isn't reported by the firmware in the SLIT table.
numa_slit isn't used by non-machine dependent code for memory
allocation policy but could be in the future for memory allocations
when the current node's memory is exhausted. numa_slit would be used as
a measure of the best locality to make the allocation from (shortest path).
Bob
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (8 preceding siblings ...)
2004-02-18 19:19 ` Robert Picco
@ 2004-02-18 19:36 ` Jesse Barnes
2004-02-18 19:43 ` David Mosberger
10 siblings, 0 replies; 12+ messages in thread
From: Jesse Barnes @ 2004-02-18 19:36 UTC (permalink / raw)
To: linux-ia64
On Wed, Feb 18, 2004 at 02:19:23PM -0500, Robert Picco wrote:
> Our HP default boot configuration has all memory interleaved and
> reported in NUMA SRAT PXM 255. The
> other cell nodes (PXMs) don't have any memory. This was totally
> unexpected by the current NUMA code. There will be N-1 nids with CPUs
> and no memory and 1 NID with all the memory. Initialization crashes
> very early. The current code expects each node to have local memory.
Oh, right, there's that... we could fix it to fallback to other nodes
though. In fact, we should do the bootmem initialization earlier and
use alloc_bootmem_node for things instead of allocating stuff in
find_pernode_space. If we fixed that your machine would work pretty
well I think.
> So, if we had a tool to configure CLM for all cells, there would be N-1
> nids with CPU and local memory and 1 nid with just interleaved memory.
> The current kernel code would work fine but the SLIT information would be
> wrong because PXM 255 isn't reported by the firmware in the SLIT table.
> numa_slit isn't used by non-machine dependent code for memory
> allocation policy but could be in the future for memory allocations
> when the current node's memory is exhausted. numa_slit would be used as
> a measure of the best locality to make the allocation from (shortest path).
No, pgdat->zonelist is used instead. It needs to be built better
though...
Jesse
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: PXM/Nid/SLIT patch
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
` (9 preceding siblings ...)
2004-02-18 19:36 ` Jesse Barnes
@ 2004-02-18 19:43 ` David Mosberger
10 siblings, 0 replies; 12+ messages in thread
From: David Mosberger @ 2004-02-18 19:43 UTC (permalink / raw)
To: linux-ia64
Bob,
Thanks for your explanation. I'm not very familiar with SRAT, PXM etc
(and don't see much reason at this point why I should read it,
especially considering that it's covered by one of those long
Microsoft licenses), so my preference is for this issue to be worked
out among those folks that care about NUMA (you, Jesse, etc.). In the
unexpected event of not being able to find a solution that's
acceptable to everybody, I'm willing to try to mediate (and learn
about all the RATty stuff.. ;-), but again, I doubt that'll be
necessary.
--david
>>>>> On Wed, 18 Feb 2004 14:19:23 -0500, Robert Picco <Robert.Picco@hp.com> said:
Robert> Our HP default boot configuration has all memory interleaved
Robert> and reported in NUMA SRAT PXM 255. The other cell nodes
Robert> (PXMs) don't have any memory. This was totally unexpected
Robert> by the current NUMA code. There will be N-1 nids with CPUs
Robert> and no memory and 1 NID with all the memory. Initialization
Robert> crashes very early. The current code expects each node to
Robert> have local memory. Well this isn't the case for HP
Robert> machines. It could be configured with some IPMI interface
Robert> for every cell to have Cell Local Memory (CLM) but such an
Robert> interface doesn't exist for Linux. Should such an interface
Robert> become available, the firmware would still steal 0.5Gb of
Robert> interleaved memory from the root cell.
Robert> So, if we had a tool to configure CLM for all cells, there
Robert> would be N-1 nids with CPU and local memory and 1 nid with
Robert> just interleaved memory. The current kernel code would work
Robert> fine but the SLIT information would be wrong because PXM 255
Robert> isn't reported by the firmware in the SLIT table. numa_slit
Robert> isn't used by non-machine dependent code for memory
Robert> allocation policy but could be in the future for memory
Robert> allocations when the current node's memory is
Robert> exhausted. numa_slit would be used as a measure of the best
Robert> locality to make the allocation from (shortest path).
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2004-02-18 19:43 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-02-17 13:53 PXM/Nid/SLIT patch Robert Picco
2004-02-17 22:32 ` Jesse Barnes
2004-02-18 15:33 ` Robert Picco
2004-02-18 17:08 ` Christoph Hellwig
2004-02-18 18:56 ` Robert Picco
2004-02-18 18:59 ` David Mosberger
2004-02-18 19:04 ` Jesse Barnes
2004-02-18 19:06 ` Jesse Barnes
2004-02-18 19:13 ` Christoph Hellwig
2004-02-18 19:19 ` Robert Picco
2004-02-18 19:36 ` Jesse Barnes
2004-02-18 19:43 ` David Mosberger
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.