linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] memory hotplug disable boot option
@ 2010-06-25  1:06 Nathan Fontenot
  2010-06-25  2:04 ` KOSAKI Motohiro
  0 siblings, 1 reply; 34+ messages in thread
From: Nathan Fontenot @ 2010-06-25  1:06 UTC (permalink / raw)
  To: linux-kernel

Proposed patch to disable memory hotplug via a boot option,
mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
prevents the creation of the memory sysfs directories for memory sections.

This patch is meant to help alleviate very long boot times on systems with
large memory (1+ TB) and many memory sections (10's of thousands).

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>

---
 drivers/base/memory.c  |   33 +++++++++++++++++++++++++++++++++
 drivers/base/node.c    |    3 +++
 include/linux/memory.h |    2 ++
 3 files changed, 38 insertions(+)

Index: linux-2.6/drivers/base/memory.c
===================================================================
--- linux-2.6.orig/drivers/base/memory.c	2010-06-23 15:07:03.000000000 -0500
+++ linux-2.6/drivers/base/memory.c	2010-06-24 18:46:52.000000000 -0500
@@ -29,6 +29,24 @@
 
 #define MEMORY_CLASS_NAME	"memory"
 
+static int mem_hp_enabled __read_mostly = 1;
+
+static int __setup_memory_hotplug(char *option)
+{
+	if (!strcmp(option, "off"))
+		mem_hp_enabled = 0;
+	else if (!strcmp(option, "on"))
+		mem_hp_enabled = 1;
+
+	return 1;
+}
+__setup("mem_hotplug=", __setup_memory_hotplug);
+
+int memory_hotplug_enabled(void)
+{
+	return mem_hp_enabled;
+}
+
 static struct sysdev_class memory_sysdev_class = {
 	.name = MEMORY_CLASS_NAME,
 };
@@ -320,6 +338,9 @@
 
 static int block_size_init(void)
 {
+	if (!memory_hotplug_enabled())
+		return 0;
+
 	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
 				&attr_block_size_bytes.attr);
 }
@@ -442,6 +463,9 @@
 	unsigned long start_pfn;
 	int ret = 0;
 
+	if (!memory_hotplug_enabled())
+		return 0;
+
 	if (!mem)
 		return -ENOMEM;
 
@@ -483,6 +507,9 @@
 	struct memory_block *mem;
 	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
 
+	if (!memory_hotplug_enabled())
+		return NULL;
+
 	/*
 	 * This only works because we know that section == sysdev->id
 	 * slightly redundant with sysdev_register()
@@ -504,6 +531,9 @@
 {
 	struct memory_block *mem;
 
+	if (!memory_hotplug_enabled())
+		return 0;
+
 	mem = find_memory_block(section);
 	unregister_mem_sect_under_nodes(mem);
 	mem_remove_simple_file(mem, phys_index);
@@ -541,6 +571,9 @@
 	int ret;
 	int err;
 
+	if (!memory_hotplug_enabled())
+		return 0;
+
 	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
 	ret = sysdev_class_register(&memory_sysdev_class);
 	if (ret)
Index: linux-2.6/drivers/base/node.c
===================================================================
--- linux-2.6.orig/drivers/base/node.c	2010-06-23 15:07:03.000000000 -0500
+++ linux-2.6/drivers/base/node.c	2010-06-24 13:05:55.000000000 -0500
@@ -411,6 +411,9 @@
 	unsigned long pfn;
 	int err = 0;
 
+	if (!memory_hotplug_enabled())
+		return 0;
+
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		unsigned long section_nr = pfn_to_section_nr(pfn);
 		struct mem_section *mem_sect;
Index: linux-2.6/include/linux/memory.h
===================================================================
--- linux-2.6.orig/include/linux/memory.h	2010-06-23 15:07:24.000000000 -0500
+++ linux-2.6/include/linux/memory.h	2010-06-24 13:06:25.000000000 -0500
@@ -128,6 +128,8 @@
 #define hotplug_memory_notifier(fn, pri) do { } while (0)
 #endif
 
+extern int memory_hotplug_enabled(void);
+
 /*
  * 'struct memory_accessor' is a generic interface to provide
  * in-kernel access to persistent memory such as i2c or SPI EEPROMs

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25  1:06 [PATCH] memory hotplug disable boot option Nathan Fontenot
@ 2010-06-25  2:04 ` KOSAKI Motohiro
  2010-06-25  9:19   ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: KOSAKI Motohiro @ 2010-06-25  2:04 UTC (permalink / raw)
  To: Nathan Fontenot; +Cc: kosaki.motohiro, linux-kernel

> Proposed patch to disable memory hotplug via a boot option,
> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
> prevents the creation of the memory sysfs directories for memory sections.
> 
> This patch is meant to help alleviate very long boot times on systems with
> large memory (1+ TB) and many memory sections (10's of thousands).

Why making simple /sys file is so slowly? Couldn't we fix such performance
problem?


> 
> Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
> 
> ---
>  drivers/base/memory.c  |   33 +++++++++++++++++++++++++++++++++
>  drivers/base/node.c    |    3 +++
>  include/linux/memory.h |    2 ++
>  3 files changed, 38 insertions(+)
> 
> Index: linux-2.6/drivers/base/memory.c
> ===================================================================
> --- linux-2.6.orig/drivers/base/memory.c	2010-06-23 15:07:03.000000000 -0500
> +++ linux-2.6/drivers/base/memory.c	2010-06-24 18:46:52.000000000 -0500
> @@ -29,6 +29,24 @@
>  
>  #define MEMORY_CLASS_NAME	"memory"
>  
> +static int mem_hp_enabled __read_mostly = 1;

please don't use 'hp' acronym. 'hotplug' is not so long word.


> +
> +static int __setup_memory_hotplug(char *option)
> +{
> +	if (!strcmp(option, "off"))
> +		mem_hp_enabled = 0;
> +	else if (!strcmp(option, "on"))
> +		mem_hp_enabled = 1;
> +
> +	return 1;
> +}
> +__setup("mem_hotplug=", __setup_memory_hotplug);

no documentation on Documentation/kernel-parameters.txt?





^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25  2:04 ` KOSAKI Motohiro
@ 2010-06-25  9:19   ` Andi Kleen
  2010-06-25 14:51     ` Nathan Fontenot
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2010-06-25  9:19 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Nathan Fontenot, linux-kernel

KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:

>> Proposed patch to disable memory hotplug via a boot option,
>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
>> prevents the creation of the memory sysfs directories for memory sections.
>> 
>> This patch is meant to help alleviate very long boot times on systems with
>> large memory (1+ TB) and many memory sections (10's of thousands).
>
> Why making simple /sys file is so slowly? Couldn't we fix such performance
> problem?

Yes I agree this really needs to be fixed properly, not hacked around
with an option. Nathan can you please post some profile logs of the long
boot times?  

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25  9:19   ` Andi Kleen
@ 2010-06-25 14:51     ` Nathan Fontenot
  2010-06-25 14:56       ` Andi Kleen
  2010-06-28  2:20       ` KOSAKI Motohiro
  0 siblings, 2 replies; 34+ messages in thread
From: Nathan Fontenot @ 2010-06-25 14:51 UTC (permalink / raw)
  To: Andi Kleen; +Cc: KOSAKI Motohiro, linux-kernel

On 06/25/2010 04:19 AM, Andi Kleen wrote:
> KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:
> 
>>> Proposed patch to disable memory hotplug via a boot option,
>>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
>>> prevents the creation of the memory sysfs directories for memory sections.
>>>
>>> This patch is meant to help alleviate very long boot times on systems with
>>> large memory (1+ TB) and many memory sections (10's of thousands).
>>
>> Why making simple /sys file is so slowly? Couldn't we fix such performance
>> problem?

The issue is the large number of sysfs memory directories that get created.
On a system with 1 TB of memory I am seeing ~63,00 directories.  The long
creation time is due to the string compare check in sysfs code to ensure
we are not creating a directory with a duplicate name.

> 
> Yes I agree this really needs to be fixed properly, not hacked around
> with an option. Nathan can you please post some profile logs of the long
> boot times?  
> 

At this point the only profiling data I have is from adding a printk before
and after the creation of the memory sysfs files in drivers/base/memory.c and
booting with printk.time=1.

With 250 GB of memory: 10 seconds
[    0.539562] Memory Start
[   10.450409] Memory End

With 1 TB of memory: 9.1 minutes
[   31.680168] Memory Start
[  584.186500] Memory End

I am hoping to get access to a machine with 2 TB of memory sometime soon and
can post data for boot times on an unpatched kernel.

I posted a patch earlier that updated sysfs to use reb-black trees to store
the sysfs dirent structs (though the patch has issues with namespaces).  Using
this patch on a 1 TB system the memory sysfs dir creation dropped to 2.2 minutes.

[    1.295874] Memory Start
[  137.293510] Memory End

On a system with 2 TB of memory, the reb-black tree patch dropped the sysfs file
creation time down to 33 minutes and total boot time to 1 hour 15 minutes. I
haven't measured an unpatched kernel, but total boot of an unpatched kernel
is just over 8 hours.

With 2 TB and patched kernel: 33 minutes
[    3.241679] Memory Start
[ 1986.973324] Memory End
 
I am open to other ideas on a solution for this.  With this many memory sections
I feel that the sysfs memory directory really isn't human readable, not with
10's or 100's of thousands of entries.  Perhaps moving to a flat file representation
and not create all the directories?

-Nathan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25 14:51     ` Nathan Fontenot
@ 2010-06-25 14:56       ` Andi Kleen
  2010-06-25 15:21         ` Nathan Fontenot
  2010-06-28  2:20       ` KOSAKI Motohiro
  1 sibling, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2010-06-25 14:56 UTC (permalink / raw)
  To: Nathan Fontenot; +Cc: Andi Kleen, KOSAKI Motohiro, linux-kernel

On Fri, Jun 25, 2010 at 09:51:37AM -0500, Nathan Fontenot wrote:
> > 
> >>> Proposed patch to disable memory hotplug via a boot option,
> >>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
> >>> prevents the creation of the memory sysfs directories for memory sections.
> >>>
> >>> This patch is meant to help alleviate very long boot times on systems with
> >>> large memory (1+ TB) and many memory sections (10's of thousands).
> >>
> >> Why making simple /sys file is so slowly? Couldn't we fix such performance
> >> problem?
> 
> The issue is the large number of sysfs memory directories that get created.
> On a system with 1 TB of memory I am seeing ~63,00 directories.  The long

Maybe you could simply make the memory blocks larger on such large systems?
I agree 6k blocks are probably not very useful, but maybe 10 or 100 are?

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25 14:56       ` Andi Kleen
@ 2010-06-25 15:21         ` Nathan Fontenot
  2010-06-25 15:28           ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Nathan Fontenot @ 2010-06-25 15:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: KOSAKI Motohiro, linux-kernel

On 06/25/2010 09:56 AM, Andi Kleen wrote:
> On Fri, Jun 25, 2010 at 09:51:37AM -0500, Nathan Fontenot wrote:
>>>
>>>>> Proposed patch to disable memory hotplug via a boot option,
>>>>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
>>>>> prevents the creation of the memory sysfs directories for memory sections.
>>>>>
>>>>> This patch is meant to help alleviate very long boot times on systems with
>>>>> large memory (1+ TB) and many memory sections (10's of thousands).
>>>>
>>>> Why making simple /sys file is so slowly? Couldn't we fix such performance
>>>> problem?
>>
>> The issue is the large number of sysfs memory directories that get created.
>> On a system with 1 TB of memory I am seeing ~63,00 directories.  The long
> 
> Maybe you could simply make the memory blocks larger on such large systems?
> I agree 6k blocks are probably not very useful, but maybe 10 or 100 are?
> 

Yes, this would work to reduce the number of memory sections created.  The reason
I have not gone this route is that increasing the memory section size would
break DLPAR memory add and remove for powerpc pseries.

-Nathan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25 15:21         ` Nathan Fontenot
@ 2010-06-25 15:28           ` Andi Kleen
  2010-06-25 16:00             ` Nathan Fontenot
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2010-06-25 15:28 UTC (permalink / raw)
  To: Nathan Fontenot; +Cc: Andi Kleen, KOSAKI Motohiro, linux-kernel

> Yes, this would work to reduce the number of memory sections created.  The reason

Note it could be done on the sysfs interface level too, e.g. have one
entry that gets a bitmap as input/output.

> I have not gone this route is that increasing the memory section size would
> break DLPAR memory add and remove for powerpc pseries.

How would it break it?

But aren't you breaking this anyways by disabling the sysfs entries?!?

-Andi


-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25 15:28           ` Andi Kleen
@ 2010-06-25 16:00             ` Nathan Fontenot
  0 siblings, 0 replies; 34+ messages in thread
From: Nathan Fontenot @ 2010-06-25 16:00 UTC (permalink / raw)
  To: Andi Kleen; +Cc: KOSAKI Motohiro, linux-kernel

On 06/25/2010 10:28 AM, Andi Kleen wrote:
>> Yes, this would work to reduce the number of memory sections created.  The reason
> 
> Note it could be done on the sysfs interface level too, e.g. have one
> entry that gets a bitmap as input/output.

This was sort of the idea I was thinking of for a flat representation in sysfs.
Instead of creating all of the directories for each memory section, have a single
file for the memory section attributes.  For example, an 'online' file.  Users
could read the file to get online memory section numbers and write to it to
online a specific memory section.

> 
>> I have not gone this route is that increasing the memory section size would
>> break DLPAR memory add and remove for powerpc pseries.
> 
> How would it break it?

For the powerpc/pseries hardware memory is assigned in LMB size chunks.  If the
memory section size is larger than the LMB size then DLPAR remove would have
to hotplug remove more memory than required to DLPAR remove the memory.  For
DLPAR memory add, we would fail because we could not hotplug add the larger
memory section since that much memory had not actually been given to the
machine.
 
> 
> But aren't you breaking this anyways by disabling the sysfs entries?!?

This is meant more for allowing people to bot systems without the long boot
times (as mentioned before 8+ hours for a 2TB system) even if it means they
will not be able to memory hotplug or memory DLPAR.

-Nathan



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-25 14:51     ` Nathan Fontenot
  2010-06-25 14:56       ` Andi Kleen
@ 2010-06-28  2:20       ` KOSAKI Motohiro
  2010-06-28  4:15         ` Eric W. Biederman
  2010-06-28 15:02         ` Greg KH
  1 sibling, 2 replies; 34+ messages in thread
From: KOSAKI Motohiro @ 2010-06-28  2:20 UTC (permalink / raw)
  To: Nathan Fontenot
  Cc: kosaki.motohiro, Andi Kleen, linux-kernel, Greg Kroah-Hartman,
	Eric W. Biederman

> On 06/25/2010 04:19 AM, Andi Kleen wrote:
> > KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:
> > 
> >>> Proposed patch to disable memory hotplug via a boot option,
> >>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
> >>> prevents the creation of the memory sysfs directories for memory sections.
> >>>
> >>> This patch is meant to help alleviate very long boot times on systems with
> >>> large memory (1+ TB) and many memory sections (10's of thousands).
> >>
> >> Why making simple /sys file is so slowly? Couldn't we fix such performance
> >> problem?
> 
> The issue is the large number of sysfs memory directories that get created.
> On a system with 1 TB of memory I am seeing ~63,00 directories.  The long
> creation time is due to the string compare check in sysfs code to ensure
> we are not creating a directory with a duplicate name.

Ah, I see. probably this is sysfs issue. So Let's cc Greg and Eric.
Greg, I have dumb question. Why sysfs call strcmp() so heavily? I mean why sysfs
don't have hash based name dupliation check?

 - kosaki


> > 
> > Yes I agree this really needs to be fixed properly, not hacked around
> > with an option. Nathan can you please post some profile logs of the long
> > boot times?  
> > 
> 
> At this point the only profiling data I have is from adding a printk before
> and after the creation of the memory sysfs files in drivers/base/memory.c and
> booting with printk.time=1.
> 
> With 250 GB of memory: 10 seconds
> [    0.539562] Memory Start
> [   10.450409] Memory End
> 
> With 1 TB of memory: 9.1 minutes
> [   31.680168] Memory Start
> [  584.186500] Memory End
> 
> I am hoping to get access to a machine with 2 TB of memory sometime soon and
> can post data for boot times on an unpatched kernel.
> 
> I posted a patch earlier that updated sysfs to use reb-black trees to store
> the sysfs dirent structs (though the patch has issues with namespaces).  Using
> this patch on a 1 TB system the memory sysfs dir creation dropped to 2.2 minutes.
> 
> [    1.295874] Memory Start
> [  137.293510] Memory End
> 
> On a system with 2 TB of memory, the reb-black tree patch dropped the sysfs file
> creation time down to 33 minutes and total boot time to 1 hour 15 minutes. I
> haven't measured an unpatched kernel, but total boot of an unpatched kernel
> is just over 8 hours.
> 
> With 2 TB and patched kernel: 33 minutes
> [    3.241679] Memory Start
> [ 1986.973324] Memory End
>  
> I am open to other ideas on a solution for this.  With this many memory sections
> I feel that the sysfs memory directory really isn't human readable, not with
> 10's or 100's of thousands of entries.  Perhaps moving to a flat file representation
> and not create all the directories?
> 
> -Nathan



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-28  2:20       ` KOSAKI Motohiro
@ 2010-06-28  4:15         ` Eric W. Biederman
  2010-06-28 14:16           ` Andi Kleen
  2010-06-28 15:02         ` Greg KH
  1 sibling, 1 reply; 34+ messages in thread
From: Eric W. Biederman @ 2010-06-28  4:15 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Nathan Fontenot, Andi Kleen, linux-kernel, Greg Kroah-Hartman

KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:

>> On 06/25/2010 04:19 AM, Andi Kleen wrote:
>> > KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:
>> > 
>> >>> Proposed patch to disable memory hotplug via a boot option,
>> >>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
>> >>> prevents the creation of the memory sysfs directories for memory sections.
>> >>>
>> >>> This patch is meant to help alleviate very long boot times on systems with
>> >>> large memory (1+ TB) and many memory sections (10's of thousands).
>> >>
>> >> Why making simple /sys file is so slowly? Couldn't we fix such performance
>> >> problem?
>> 
>> The issue is the large number of sysfs memory directories that get created.
>> On a system with 1 TB of memory I am seeing ~63,00 directories.  The long
>> creation time is due to the string compare check in sysfs code to ensure
>> we are not creating a directory with a duplicate name.
>
> Ah, I see. probably this is sysfs issue. So Let's cc Greg and Eric.
> Greg, I have dumb question. Why sysfs call strcmp() so heavily? I mean why sysfs
> don't have hash based name dupliation check?

Simplicity of the current implementation.

I have a prototype patch sitting around somewhere.  I think ultimately
it makes sense to do something like extN's htree directory structure
in sysfs.  I wanted to get the tagged sysfs support in before I worked
on scalability because that slightly changes the requirements.

Improving the scalability here is certainly worth doing, but I am slightly
concerned there is something else algorithmically wrong if this is still
going to take 33 minutes to boot with 2TB.

Eric

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-28  4:15         ` Eric W. Biederman
@ 2010-06-28 14:16           ` Andi Kleen
  2010-06-28 19:43             ` Eric W. Biederman
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2010-06-28 14:16 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: KOSAKI Motohiro, Nathan Fontenot, Andi Kleen, linux-kernel,
	Greg Kroah-Hartman

> I have a prototype patch sitting around somewhere.  I think ultimately
> it makes sense to do something like extN's htree directory structure
> in sysfs.  I wanted to get the tagged sysfs support in before I worked
> on scalability because that slightly changes the requirements.
> 
> Improving the scalability here is certainly worth doing, but I am slightly
> concerned there is something else algorithmically wrong if this is still
> going to take 33 minutes to boot with 2TB.

I'm don't think thousands of entries in sysfs is really a good idea. Even if you fix
the the insert algorithm issues a simple ls will still be very slow and there
will be likely other issues too. And nobody can claim that's a good interface.

This really needs a better a interface.
e.g. add some wildcard entry as it was earlier proposed.
and only generate the wildcard when some threshold is exceeded.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-28  2:20       ` KOSAKI Motohiro
  2010-06-28  4:15         ` Eric W. Biederman
@ 2010-06-28 15:02         ` Greg KH
  2010-06-28 15:37           ` Nathan Fontenot
  1 sibling, 1 reply; 34+ messages in thread
From: Greg KH @ 2010-06-28 15:02 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Nathan Fontenot, Andi Kleen, linux-kernel, Eric W. Biederman

On Mon, Jun 28, 2010 at 11:20:27AM +0900, KOSAKI Motohiro wrote:
> > On 06/25/2010 04:19 AM, Andi Kleen wrote:
> > > KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:
> > > 
> > >>> Proposed patch to disable memory hotplug via a boot option,
> > >>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
> > >>> prevents the creation of the memory sysfs directories for memory sections.
> > >>>
> > >>> This patch is meant to help alleviate very long boot times on systems with
> > >>> large memory (1+ TB) and many memory sections (10's of thousands).
> > >>
> > >> Why making simple /sys file is so slowly? Couldn't we fix such performance
> > >> problem?
> > 
> > The issue is the large number of sysfs memory directories that get created.
> > On a system with 1 TB of memory I am seeing ~63,00 directories.  The long
> > creation time is due to the string compare check in sysfs code to ensure
> > we are not creating a directory with a duplicate name.
> 
> Ah, I see. probably this is sysfs issue. So Let's cc Greg and Eric.
> Greg, I have dumb question. Why sysfs call strcmp() so heavily? I mean why sysfs
> don't have hash based name dupliation check?

Because we have not needed such complexity before.

You might want to take a step back and asky why you are creating 63
thousand directories in sysfs for large memory systems.  Are you really
going to use all of those directories?  What are they for?  Perhaps
someone created the wrong interface to memory and that needs to be fixed
instead?

And as always, patches are welcome :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-28 15:02         ` Greg KH
@ 2010-06-28 15:37           ` Nathan Fontenot
  2010-06-28 15:44             ` Greg KH
  0 siblings, 1 reply; 34+ messages in thread
From: Nathan Fontenot @ 2010-06-28 15:37 UTC (permalink / raw)
  To: Greg KH; +Cc: KOSAKI Motohiro, Andi Kleen, linux-kernel, Eric W. Biederman

On 06/28/2010 10:02 AM, Greg KH wrote:
> On Mon, Jun 28, 2010 at 11:20:27AM +0900, KOSAKI Motohiro wrote:
>>> On 06/25/2010 04:19 AM, Andi Kleen wrote:
>>>> KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:
>>>>
>>>>>> Proposed patch to disable memory hotplug via a boot option,
>>>>>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
>>>>>> prevents the creation of the memory sysfs directories for memory sections.
>>>>>>
>>>>>> This patch is meant to help alleviate very long boot times on systems with
>>>>>> large memory (1+ TB) and many memory sections (10's of thousands).
>>>>>
>>>>> Why making simple /sys file is so slowly? Couldn't we fix such performance
>>>>> problem?
>>>
>>> The issue is the large number of sysfs memory directories that get created.
>>> On a system with 1 TB of memory I am seeing ~63,00 directories.  The long
>>> creation time is due to the string compare check in sysfs code to ensure
>>> we are not creating a directory with a duplicate name.
>>
>> Ah, I see. probably this is sysfs issue. So Let's cc Greg and Eric.
>> Greg, I have dumb question. Why sysfs call strcmp() so heavily? I mean why sysfs
>> don't have hash based name dupliation check?
> 
> Because we have not needed such complexity before.
> 
> You might want to take a step back and asky why you are creating 63
> thousand directories in sysfs for large memory systems.  Are you really
> going to use all of those directories?  What are they for?  Perhaps
> someone created the wrong interface to memory and that needs to be fixed
> instead?
> 

The directories being created are the standard directories, one for each of the memory
sections present at boot.  I think the most used files in each of these directories
is the state and removable file used to do memory hotplug.

-Nathan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-28 15:37           ` Nathan Fontenot
@ 2010-06-28 15:44             ` Greg KH
  2010-06-29  0:04               ` Dave Hansen
  0 siblings, 1 reply; 34+ messages in thread
From: Greg KH @ 2010-06-28 15:44 UTC (permalink / raw)
  To: Nathan Fontenot
  Cc: KOSAKI Motohiro, Andi Kleen, linux-kernel, Eric W. Biederman

On Mon, Jun 28, 2010 at 10:37:12AM -0500, Nathan Fontenot wrote:
> On 06/28/2010 10:02 AM, Greg KH wrote:
> > On Mon, Jun 28, 2010 at 11:20:27AM +0900, KOSAKI Motohiro wrote:
> >>> On 06/25/2010 04:19 AM, Andi Kleen wrote:
> >>>> KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> writes:
> >>>>
> >>>>>> Proposed patch to disable memory hotplug via a boot option,
> >>>>>> mem_hotplug=[on|off].  The patch only disables memory hotplug in that it 
> >>>>>> prevents the creation of the memory sysfs directories for memory sections.
> >>>>>>
> >>>>>> This patch is meant to help alleviate very long boot times on systems with
> >>>>>> large memory (1+ TB) and many memory sections (10's of thousands).
> >>>>>
> >>>>> Why making simple /sys file is so slowly? Couldn't we fix such performance
> >>>>> problem?
> >>>
> >>> The issue is the large number of sysfs memory directories that get created.
> >>> On a system with 1 TB of memory I am seeing ~63,00 directories.  The long
> >>> creation time is due to the string compare check in sysfs code to ensure
> >>> we are not creating a directory with a duplicate name.
> >>
> >> Ah, I see. probably this is sysfs issue. So Let's cc Greg and Eric.
> >> Greg, I have dumb question. Why sysfs call strcmp() so heavily? I mean why sysfs
> >> don't have hash based name dupliation check?
> > 
> > Because we have not needed such complexity before.
> > 
> > You might want to take a step back and asky why you are creating 63
> > thousand directories in sysfs for large memory systems.  Are you really
> > going to use all of those directories?  What are they for?  Perhaps
> > someone created the wrong interface to memory and that needs to be fixed
> > instead?
> > 
> 
> The directories being created are the standard directories, one for each of the memory
> sections present at boot.  I think the most used files in each of these directories
> is the state and removable file used to do memory hotplug.

And perhaps we shouldn't really be creating so many directories?  Why
not work with the memory hotplug developers to change their interface to
not abuse sysfs in such a manner?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-28 14:16           ` Andi Kleen
@ 2010-06-28 19:43             ` Eric W. Biederman
  0 siblings, 0 replies; 34+ messages in thread
From: Eric W. Biederman @ 2010-06-28 19:43 UTC (permalink / raw)
  To: Andi Kleen
  Cc: KOSAKI Motohiro, Nathan Fontenot, linux-kernel, Greg Kroah-Hartman

Andi Kleen <andi@firstfloor.org> writes:

>> I have a prototype patch sitting around somewhere.  I think ultimately
>> it makes sense to do something like extN's htree directory structure
>> in sysfs.  I wanted to get the tagged sysfs support in before I worked
>> on scalability because that slightly changes the requirements.
>> 
>> Improving the scalability here is certainly worth doing, but I am slightly
>> concerned there is something else algorithmically wrong if this is still
>> going to take 33 minutes to boot with 2TB.
>
> I'm don't think thousands of entries in sysfs is really a good idea. Even if you fix
> the the insert algorithm issues a simple ls will still be very slow and there
> will be likely other issues too. And nobody can claim that's a good interface.

Yes.  I am much more interested in fixing lookup and stat performance,
if people only look at insert performance it is a bit of a joke.

That said there are cases of people using a lot of network virtual
network devices that are essentially sane that push sysfs.

This is only the second or third time sysfs scalability has come up
this year.  So at some level I think it is sane to fix sysfs regardless
of what we do with memory hotplug.

Ensuring that our insert times are O(logN) for insert, O(N) for readdir,
O(logN) for lookup and O(log1) for stat, seems useful if we can do it
without other complications.

Eric

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-28 15:44             ` Greg KH
@ 2010-06-29  0:04               ` Dave Hansen
  2010-06-29  2:56                 ` KOSAKI Motohiro
  0 siblings, 1 reply; 34+ messages in thread
From: Dave Hansen @ 2010-06-29  0:04 UTC (permalink / raw)
  To: Greg KH
  Cc: Nathan Fontenot, KOSAKI Motohiro, Andi Kleen, linux-kernel,
	Eric W. Biederman

On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
> > The directories being created are the standard directories, one for each of the memory
> > sections present at boot.  I think the most used files in each of these directories
> > is the state and removable file used to do memory hotplug.
> 
> And perhaps we shouldn't really be creating so many directories?  Why
> not work with the memory hotplug developers to change their interface to
> not abuse sysfs in such a manner? 

Heh, it wasn't abuse until we got this much memory.  But, I think this
one is pretty much 100% my fault.

Nathan, I think the right fix here is probably to untie sysfs from the
sections a bit.  We should be able to have sysfs dirs that represent
more than one contiguous SECTION_SIZE area of memory.

It will mean re-teaching some of the tools how things work.  They'll
have to know that you can split mem sections, and we'll have to come up
with a way to do the splitting.  

Does ppc *really* remove 16MB sections of RAM these days?  It's probably
worth checking with the firmware folks to see what the limits are in
practice.

-- Dave


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-29  0:04               ` Dave Hansen
@ 2010-06-29  2:56                 ` KOSAKI Motohiro
  2010-06-29 15:38                   ` Nathan Fontenot
  2010-06-29 16:03                   ` Dave Hansen
  0 siblings, 2 replies; 34+ messages in thread
From: KOSAKI Motohiro @ 2010-06-29  2:56 UTC (permalink / raw)
  To: Dave Hansen
  Cc: kosaki.motohiro, Greg KH, Nathan Fontenot, Andi Kleen,
	linux-kernel, Eric W. Biederman

> On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
> > > The directories being created are the standard directories, one for each of the memory
> > > sections present at boot.  I think the most used files in each of these directories
> > > is the state and removable file used to do memory hotplug.
> > 
> > And perhaps we shouldn't really be creating so many directories?  Why
> > not work with the memory hotplug developers to change their interface to
> > not abuse sysfs in such a manner? 
> 
> Heh, it wasn't abuse until we got this much memory.  But, I think this
> one is pretty much 100% my fault.
> 
> Nathan, I think the right fix here is probably to untie sysfs from the
> sections a bit.  We should be able to have sysfs dirs that represent
> more than one contiguous SECTION_SIZE area of memory.

Why do we need abi breakage? Yourself talked about we guess ppc don't
actually need 16MB section. I think IBM folks have to confirm it.
If our guessing is correct, the firmware fixing is only necessary.

Thats said, I don't 100% refuse your idea. it's interesting. but, 
In generical I hate _unncessary_ abi change.


> It will mean re-teaching some of the tools how things work.  They'll
> have to know that you can split mem sections, and we'll have to come up
> with a way to do the splitting.  
> 
> Does ppc *really* remove 16MB sections of RAM these days?  It's probably
> worth checking with the firmware folks to see what the limits are in
> practice.
> 
> -- Dave
> 




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-29  2:56                 ` KOSAKI Motohiro
@ 2010-06-29 15:38                   ` Nathan Fontenot
  2010-06-30  0:00                     ` KOSAKI Motohiro
  2010-06-29 16:03                   ` Dave Hansen
  1 sibling, 1 reply; 34+ messages in thread
From: Nathan Fontenot @ 2010-06-29 15:38 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Dave Hansen, Greg KH, Andi Kleen, linux-kernel, Eric W. Biederman

On 06/28/2010 09:56 PM, KOSAKI Motohiro wrote:
>> On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
>>>> The directories being created are the standard directories, one for each of the memory
>>>> sections present at boot.  I think the most used files in each of these directories
>>>> is the state and removable file used to do memory hotplug.
>>>
>>> And perhaps we shouldn't really be creating so many directories?  Why
>>> not work with the memory hotplug developers to change their interface to
>>> not abuse sysfs in such a manner? 
>>
>> Heh, it wasn't abuse until we got this much memory.  But, I think this
>> one is pretty much 100% my fault.
>>
>> Nathan, I think the right fix here is probably to untie sysfs from the
>> sections a bit.  We should be able to have sysfs dirs that represent
>> more than one contiguous SECTION_SIZE area of memory.
> 
> Why do we need abi breakage? Yourself talked about we guess ppc don't
> actually need 16MB section. I think IBM folks have to confirm it.
> If our guessing is correct, the firmware fixing is only necessary.

Yes, ppc still needs to support add/remove of 16MB sections.  This correlates
to the smallest lmb size on ppc that we need to support.

> 
> Thats said, I don't 100% refuse your idea. it's interesting. but, 
> In generical I hate _unncessary_ abi change.

Me too, but I'm not sure the current sysfs layout of memory scales well
for machines with huge amounts of memory.

How about providing an alternate sysfs layout for systems that have a large
number of memory sections?  Even on the machines I worked with that have
1 and 2 TB of memory, if we increase the memory sections size to equal the
lmb size we still would be creating 6k+ directories for a 1 TB machine.
This would alleviate much of the perfomrance issue but still leaves us with 
a directory of thousands (or tens of thousands for really big systems) 
of memoryXXX subdirectories, which is not really human readable.

Or some method of having a single memory XXX dir represent multiple sections,
as Dave suggested would work. Perhaps there is a way to subdivide the
memory section dirs into separate dirs based on their node.

At the point of dealing with this many memory sections would it make sense
to not create directories for each of the memory sections? Perhaps just
files to report information about the memory sections.

-Nathan


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-29  2:56                 ` KOSAKI Motohiro
  2010-06-29 15:38                   ` Nathan Fontenot
@ 2010-06-29 16:03                   ` Dave Hansen
  2010-06-29 18:04                     ` Greg KH
  1 sibling, 1 reply; 34+ messages in thread
From: Dave Hansen @ 2010-06-29 16:03 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Greg KH, Nathan Fontenot, Andi Kleen, linux-kernel, Eric W. Biederman

On Tue, 2010-06-29 at 11:56 +0900, KOSAKI Motohiro wrote:
> > On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
> > > > The directories being created are the standard directories, one for each of the memory
> > > > sections present at boot.  I think the most used files in each of these directories
> > > > is the state and removable file used to do memory hotplug.
> > > 
> > > And perhaps we shouldn't really be creating so many directories?  Why
> > > not work with the memory hotplug developers to change their interface to
> > > not abuse sysfs in such a manner? 
> > 
> > Heh, it wasn't abuse until we got this much memory.  But, I think this
> > one is pretty much 100% my fault.
> > 
> > Nathan, I think the right fix here is probably to untie sysfs from the
> > sections a bit.  We should be able to have sysfs dirs that represent
> > more than one contiguous SECTION_SIZE area of memory.
> 
> Why do we need abi breakage? Yourself talked about we guess ppc don't
> actually need 16MB section. I think IBM folks have to confirm it.
> If our guessing is correct, the firmware fixing is only necessary.

>From the mouth of the kernel dumbass who coded this up: it's not the
firmware's fault.  We shouldn't punt this to them, and the proper fix
_isn't_ in the firmware, plus they may have other more fundamental
reasons to keep the LMB sizes what they are.

-- Dave


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-29 16:03                   ` Dave Hansen
@ 2010-06-29 18:04                     ` Greg KH
  2010-06-30  0:32                       ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 34+ messages in thread
From: Greg KH @ 2010-06-29 18:04 UTC (permalink / raw)
  To: Dave Hansen
  Cc: KOSAKI Motohiro, Nathan Fontenot, Andi Kleen, linux-kernel,
	Eric W. Biederman

On Tue, Jun 29, 2010 at 09:03:04AM -0700, Dave Hansen wrote:
> On Tue, 2010-06-29 at 11:56 +0900, KOSAKI Motohiro wrote:
> > > On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
> > > > > The directories being created are the standard directories, one for each of the memory
> > > > > sections present at boot.  I think the most used files in each of these directories
> > > > > is the state and removable file used to do memory hotplug.
> > > > 
> > > > And perhaps we shouldn't really be creating so many directories?  Why
> > > > not work with the memory hotplug developers to change their interface to
> > > > not abuse sysfs in such a manner? 
> > > 
> > > Heh, it wasn't abuse until we got this much memory.  But, I think this
> > > one is pretty much 100% my fault.
> > > 
> > > Nathan, I think the right fix here is probably to untie sysfs from the
> > > sections a bit.  We should be able to have sysfs dirs that represent
> > > more than one contiguous SECTION_SIZE area of memory.
> > 
> > Why do we need abi breakage? Yourself talked about we guess ppc don't
> > actually need 16MB section. I think IBM folks have to confirm it.
> > If our guessing is correct, the firmware fixing is only necessary.
> 
> >From the mouth of the kernel dumbass who coded this up: it's not the
> firmware's fault.  We shouldn't punt this to them, and the proper fix
> _isn't_ in the firmware, plus they may have other more fundamental
> reasons to keep the LMB sizes what they are.

I agree, this should be fixed in the api to userspace, having this many
sysfs directories and/or files is just looney.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-29 15:38                   ` Nathan Fontenot
@ 2010-06-30  0:00                     ` KOSAKI Motohiro
  0 siblings, 0 replies; 34+ messages in thread
From: KOSAKI Motohiro @ 2010-06-30  0:00 UTC (permalink / raw)
  To: Nathan Fontenot
  Cc: kosaki.motohiro, Dave Hansen, Greg KH, Andi Kleen, linux-kernel,
	Eric W. Biederman

> On 06/28/2010 09:56 PM, KOSAKI Motohiro wrote:
> >> On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
> >>>> The directories being created are the standard directories, one for each of the memory
> >>>> sections present at boot.  I think the most used files in each of these directories
> >>>> is the state and removable file used to do memory hotplug.
> >>>
> >>> And perhaps we shouldn't really be creating so many directories?  Why
> >>> not work with the memory hotplug developers to change their interface to
> >>> not abuse sysfs in such a manner? 
> >>
> >> Heh, it wasn't abuse until we got this much memory.  But, I think this
> >> one is pretty much 100% my fault.
> >>
> >> Nathan, I think the right fix here is probably to untie sysfs from the
> >> sections a bit.  We should be able to have sysfs dirs that represent
> >> more than one contiguous SECTION_SIZE area of memory.
> > 
> > Why do we need abi breakage? Yourself talked about we guess ppc don't
> > actually need 16MB section. I think IBM folks have to confirm it.
> > If our guessing is correct, the firmware fixing is only necessary.
> 
> Yes, ppc still needs to support add/remove of 16MB sections.  This correlates
> to the smallest lmb size on ppc that we need to support.

okey. I'm not against the change by strong reason. fortunatelly hotplug users are still few.
If we maintain CONFIG_OLD_MEMHOTPLUG_LAYOUT a while time, I guess
nobody oppose new one.


> > Thats said, I don't 100% refuse your idea. it's interesting. but, 
> > In generical I hate _unncessary_ abi change.
> 
> Me too, but I'm not sure the current sysfs layout of memory scales well
> for machines with huge amounts of memory.
> 
> How about providing an alternate sysfs layout for systems that have a large
> number of memory sections?  Even on the machines I worked with that have
> 1 and 2 TB of memory, if we increase the memory sections size to equal the
> lmb size we still would be creating 6k+ directories for a 1 TB machine.
> This would alleviate much of the perfomrance issue but still leaves us with 
> a directory of thousands (or tens of thousands for really big systems) 
> of memoryXXX subdirectories, which is not really human readable.

?? human readable?
As far as I observed, this dir have been no human readable since it was born. but
nobody complained this one because only hotplug-script need to read this.

Am I missing something?

> Or some method of having a single memory XXX dir represent multiple sections,
> as Dave suggested would work. Perhaps there is a way to subdivide the
> memory section dirs into separate dirs based on their node.
> 
> At the point of dealing with this many memory sections would it make sense
> to not create directories for each of the memory sections? Perhaps just
> files to report information about the memory sections.

Probably, It will works. I have one question. How many dir do the patch reduce 
on your machine? Do we need to combinate Eric's hash-dir patch?




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-29 18:04                     ` Greg KH
@ 2010-06-30  0:32                       ` KAMEZAWA Hiroyuki
  2010-06-30 15:47                         ` Greg KH
  0 siblings, 1 reply; 34+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-30  0:32 UTC (permalink / raw)
  To: Greg KH
  Cc: Dave Hansen, KOSAKI Motohiro, Nathan Fontenot, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Tue, 29 Jun 2010 11:04:15 -0700
Greg KH <gregkh@suse.de> wrote:

> On Tue, Jun 29, 2010 at 09:03:04AM -0700, Dave Hansen wrote:
> > On Tue, 2010-06-29 at 11:56 +0900, KOSAKI Motohiro wrote:
> > > > On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
> > > > > > The directories being created are the standard directories, one for each of the memory
> > > > > > sections present at boot.  I think the most used files in each of these directories
> > > > > > is the state and removable file used to do memory hotplug.
> > > > > 
> > > > > And perhaps we shouldn't really be creating so many directories?  Why
> > > > > not work with the memory hotplug developers to change their interface to
> > > > > not abuse sysfs in such a manner? 
> > > > 
> > > > Heh, it wasn't abuse until we got this much memory.  But, I think this
> > > > one is pretty much 100% my fault.
> > > > 
> > > > Nathan, I think the right fix here is probably to untie sysfs from the
> > > > sections a bit.  We should be able to have sysfs dirs that represent
> > > > more than one contiguous SECTION_SIZE area of memory.
> > > 
> > > Why do we need abi breakage? Yourself talked about we guess ppc don't
> > > actually need 16MB section. I think IBM folks have to confirm it.
> > > If our guessing is correct, the firmware fixing is only necessary.
> > 
> > >From the mouth of the kernel dumbass who coded this up: it's not the
> > firmware's fault.  We shouldn't punt this to them, and the proper fix
> > _isn't_ in the firmware, plus they may have other more fundamental
> > reasons to keep the LMB sizes what they are.
> 
> I agree, this should be fixed in the api to userspace, having this many
> sysfs directories and/or files is just looney.
> 
> thanks,
> 

Hmm, adding 
 CONFIG_NEW_MEMORY_SYSFS_LAYOUT or
 memory_sysfs_layout=small boot option
and adding a scalable interface for large scale machines ?
I'd like to consider something..

Thanks,
-Kame




^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-30  0:32                       ` KAMEZAWA Hiroyuki
@ 2010-06-30 15:47                         ` Greg KH
  2010-07-01  0:31                           ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 34+ messages in thread
From: Greg KH @ 2010-06-30 15:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Hansen, KOSAKI Motohiro, Nathan Fontenot, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Wed, Jun 30, 2010 at 09:32:51AM +0900, KAMEZAWA Hiroyuki wrote:
> On Tue, 29 Jun 2010 11:04:15 -0700
> Greg KH <gregkh@suse.de> wrote:
> 
> > On Tue, Jun 29, 2010 at 09:03:04AM -0700, Dave Hansen wrote:
> > > On Tue, 2010-06-29 at 11:56 +0900, KOSAKI Motohiro wrote:
> > > > > On Mon, 2010-06-28 at 08:44 -0700, Greg KH wrote:
> > > > > > > The directories being created are the standard directories, one for each of the memory
> > > > > > > sections present at boot.  I think the most used files in each of these directories
> > > > > > > is the state and removable file used to do memory hotplug.
> > > > > > 
> > > > > > And perhaps we shouldn't really be creating so many directories?  Why
> > > > > > not work with the memory hotplug developers to change their interface to
> > > > > > not abuse sysfs in such a manner? 
> > > > > 
> > > > > Heh, it wasn't abuse until we got this much memory.  But, I think this
> > > > > one is pretty much 100% my fault.
> > > > > 
> > > > > Nathan, I think the right fix here is probably to untie sysfs from the
> > > > > sections a bit.  We should be able to have sysfs dirs that represent
> > > > > more than one contiguous SECTION_SIZE area of memory.
> > > > 
> > > > Why do we need abi breakage? Yourself talked about we guess ppc don't
> > > > actually need 16MB section. I think IBM folks have to confirm it.
> > > > If our guessing is correct, the firmware fixing is only necessary.
> > > 
> > > >From the mouth of the kernel dumbass who coded this up: it's not the
> > > firmware's fault.  We shouldn't punt this to them, and the proper fix
> > > _isn't_ in the firmware, plus they may have other more fundamental
> > > reasons to keep the LMB sizes what they are.
> > 
> > I agree, this should be fixed in the api to userspace, having this many
> > sysfs directories and/or files is just looney.
> > 
> > thanks,
> > 
> 
> Hmm, adding 
>  CONFIG_NEW_MEMORY_SYSFS_LAYOUT or

It will not be "NEW" in a year :)

>  memory_sysfs_layout=small boot option

Yes, that sounds good.

> and adding a scalable interface for large scale machines ?
> I'd like to consider something..

Dynamically changing the layout on big memory boxes makes sense to me,
how about you?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-06-30 15:47                         ` Greg KH
@ 2010-07-01  0:31                           ` KAMEZAWA Hiroyuki
  2010-07-01  3:17                             ` Nathan Fontenot
                                               ` (3 more replies)
  0 siblings, 4 replies; 34+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-07-01  0:31 UTC (permalink / raw)
  To: Greg KH
  Cc: Dave Hansen, KOSAKI Motohiro, Nathan Fontenot, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Wed, 30 Jun 2010 08:47:55 -0700
Greg KH <gregkh@suse.de> wrote:
> > and adding a scalable interface for large scale machines ?
> > I'd like to consider something..
> 
> Dynamically changing the layout on big memory boxes makes sense to me,
> how about you?
> 

like this ?
==
boot option:
memory_sysfs_layout=compact
memory_sysfs_layout=auto (default)
memory_sysfs_layout=full

Considering briefly, how about this compact layout ?

/sys/devices/system/memory/:
                            list, hide, show, memoryX...

list: // show available memory index list.
  #cat list
   0 1 2 ....10000...

show: //an interface to enable the interface.
  #echo INDEX > memory_index
  will create memoryINDEX diretory.

hide: //an interface to hide the interface.
  #echo INDEX > memory_hide
  will remove memoryINDEX sysfs directory.


In compact mode, all memoryX interface are hidden at boot.
In full mode, all memoryX interaface are shown.
The Boot option just affects status at boot. If users want, he can make
all memory sysfs in shown state.

At hot-add event (via acpi) or probe-event, newly created memory section
should be start from "shown" mode. hotplug scirpt can hide it after online.

At hot-remove, the users has to offline memory before hotplug. He'll has
to do check list and show interface.

I think this change is not very difficult technically but can this kind of
interface be allowed ?

Thanks,
-Kame





^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01  0:31                           ` KAMEZAWA Hiroyuki
@ 2010-07-01  3:17                             ` Nathan Fontenot
  2010-07-01  3:30                               ` KAMEZAWA Hiroyuki
  2010-07-01  5:15                             ` KAMEZAWA Hiroyuki
                                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 34+ messages in thread
From: Nathan Fontenot @ 2010-07-01  3:17 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Greg KH, Dave Hansen, KOSAKI Motohiro, Andi Kleen, linux-kernel,
	Eric W. Biederman

On 06/30/2010 07:31 PM, KAMEZAWA Hiroyuki wrote:
> On Wed, 30 Jun 2010 08:47:55 -0700
> Greg KH <gregkh@suse.de> wrote:
>>> and adding a scalable interface for large scale machines ?
>>> I'd like to consider something..
>>
>> Dynamically changing the layout on big memory boxes makes sense to me,
>> how about you?
>>
> 
> like this ?
> ==
> boot option:
> memory_sysfs_layout=compact
> memory_sysfs_layout=auto (default)
> memory_sysfs_layout=full
> 
> Considering briefly, how about this compact layout ?
> 
> /sys/devices/system/memory/:
>                             list, hide, show, memoryX...
> 
> list: // show available memory index list.
>   #cat list
>    0 1 2 ....10000...
> 
> show: //an interface to enable the interface.
>   #echo INDEX > memory_index
>   will create memoryINDEX diretory.
> 
> hide: //an interface to hide the interface.
>   #echo INDEX > memory_hide
>   will remove memoryINDEX sysfs directory.
> 
> 
> In compact mode, all memoryX interface are hidden at boot.
> In full mode, all memoryX interaface are shown.
> The Boot option just affects status at boot. If users want, he can make
> all memory sysfs in shown state.

Do we need to make something as complicated as dynamically adding and removing
the sysfs directories?  Why not a compact layout that just takes the files
that currently reside in the memoryXX dirs and move them up to the memory
directory. This would be state (which should probably be split into an
'online' and 'offline' file), removable, phys_index, and phys_device.

Doing a cat on each of these files would simply report the appropriate
information for all of the memory sections present.  We could even go
as far storing the online status and removable status as a bitmap instead
of putting it in the memory_block struct and use th in-kernel routines
for printing bitmaps.

Users could then do memory hotplug by echo'ing the memory section to 
online to the 'online' file, and echo'ing the section number to the
'offline' file to offline a section.

In this mode ew would skip the creation of all of the sysfs nodes and
completely remove the performance issue seen.

Since this new layout and possible mdification to the memory_block
structure would not easily allow switching between he different layouts,
perhaps this should be set via a config option.

-Nathan



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01  3:17                             ` Nathan Fontenot
@ 2010-07-01  3:30                               ` KAMEZAWA Hiroyuki
  2010-07-01 23:28                                 ` Greg KH
  0 siblings, 1 reply; 34+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-07-01  3:30 UTC (permalink / raw)
  To: Nathan Fontenot
  Cc: Greg KH, Dave Hansen, KOSAKI Motohiro, Andi Kleen, linux-kernel,
	Eric W. Biederman

On Wed, 30 Jun 2010 22:17:35 -0500
Nathan Fontenot <nfont@austin.ibm.com> wrote:

> On 06/30/2010 07:31 PM, KAMEZAWA Hiroyuki wrote:
> > On Wed, 30 Jun 2010 08:47:55 -0700
> > Greg KH <gregkh@suse.de> wrote:
> >>> and adding a scalable interface for large scale machines ?
> >>> I'd like to consider something..
> >>
> >> Dynamically changing the layout on big memory boxes makes sense to me,
> >> how about you?
> >>
> > 
> > like this ?
> > ==
> > boot option:
> > memory_sysfs_layout=compact
> > memory_sysfs_layout=auto (default)
> > memory_sysfs_layout=full
> > 
> > Considering briefly, how about this compact layout ?
> > 
> > /sys/devices/system/memory/:
> >                             list, hide, show, memoryX...
> > 
> > list: // show available memory index list.
> >   #cat list
> >    0 1 2 ....10000...
> > 
> > show: //an interface to enable the interface.
> >   #echo INDEX > memory_index
> >   will create memoryINDEX diretory.
> > 
> > hide: //an interface to hide the interface.
> >   #echo INDEX > memory_hide
> >   will remove memoryINDEX sysfs directory.
> > 
> > 
> > In compact mode, all memoryX interface are hidden at boot.
> > In full mode, all memoryX interaface are shown.
> > The Boot option just affects status at boot. If users want, he can make
> > all memory sysfs in shown state.
> 
> Do we need to make something as complicated as dynamically adding and removing
> the sysfs directories?  Why not a compact layout that just takes the files
> that currently reside in the memoryXX dirs and move them up to the memory
> directory. This would be state (which should probably be split into an
> 'online' and 'offline' file), removable, phys_index, and phys_device.
> 
I've considered several patterns.

with 4096 bytes buffer of sysfs, "printting bitmap" just covers small
amount of sections even with smart ASCII format to show contiguous range
as a chunk. That's my concern. (and 'list' file in above example
is impossible to impelement.)

If I can use rmdir/mkdir interface, it's much simple rather than above "echo".
But it seems impossible.
Because we have memory information in /proc/iomem already, I think hide/show
interface (instead of mkdir/rmdir) is not very bad.

If you can implement highly scalable interface, please.
I just stop dreaming and ack yours.

Thanks,
-Kame



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01  0:31                           ` KAMEZAWA Hiroyuki
  2010-07-01  3:17                             ` Nathan Fontenot
@ 2010-07-01  5:15                             ` KAMEZAWA Hiroyuki
  2010-07-01 13:23                             ` Dave Hansen
  2010-07-01 23:26                             ` Greg KH
  3 siblings, 0 replies; 34+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-07-01  5:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Greg KH, Dave Hansen, KOSAKI Motohiro, Nathan Fontenot,
	Andi Kleen, linux-kernel, Eric W. Biederman

On Thu, 1 Jul 2010 09:31:30 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> I think this change is not very difficult technically but can this kind of
> interface be allowed ?
> 
Here is a patch.

But I'm not a specialist of memory hotplug in these days. So, please
modify this as required even when you like this.
This patch is onto mmotm but will not hunk with mainline, I hope.
If nonsense, sorry for noise ;)

-Kame
==

Memory hotplug has interfaces for offlining memory per section.
With large memory/small section machine, we tend to have too many
sysfs directory as
  /sys/devices/system/memory/memoryXXX/ (XXX is section name)
But this interface is only necessary when we do hotplug.

This patch adds 2 new interfaces as
  /sys/devieces/system/memory/hide
  /sys/devieces/system/memory/show

'hide' will remove sysfs directroy of given number's section.
'show' will create sysfs directroy of given number's section.

  # echo 120 > /sys/devices/system/memory/hide
    ....memory120 will be removed.

This patch also adds hidememorysysfs boot option to hide
all sysfs for memory section at boot time.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/kernel-parameters.txt |    2 
 Documentation/memory-hotplug.txt    |   25 +++++
 drivers/base/memory.c               |  165 +++++++++++++++++++++++++++++++++---
 drivers/base/node.c                 |    2 
 4 files changed, 184 insertions(+), 10 deletions(-)

Index: mmotm-2.6.35-0611/drivers/base/memory.c
===================================================================
--- mmotm-2.6.35-0611.orig/drivers/base/memory.c
+++ mmotm-2.6.35-0611/drivers/base/memory.c
@@ -23,10 +23,12 @@
 #include <linux/mutex.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
+#include <linux/radix-tree.h>
 
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
+DEFINE_MUTEX(mem_sysfs_lock);
 #define MEMORY_CLASS_NAME	"memory"
 
 static struct sysdev_class memory_sysdev_class = {
@@ -324,6 +326,8 @@ static int block_size_init(void)
 				&attr_block_size_bytes.attr);
 }
 
+
+
 /*
  * Some architectures will have custom drivers to do this, and
  * will not need to do it from userspace.  The fake hot-add code
@@ -363,6 +367,7 @@ static inline int memory_probe_init(void
 }
 #endif
 
+
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Support for offlining pages of memory
@@ -505,13 +510,14 @@ int remove_memory_block(unsigned long no
 	struct memory_block *mem;
 
 	mem = find_memory_block(section);
+	if (!mem) /* already hidden ? */
+		return 0;
 	unregister_mem_sect_under_nodes(mem);
 	mem_remove_simple_file(mem, phys_index);
 	mem_remove_simple_file(mem, state);
 	mem_remove_simple_file(mem, phys_device);
 	mem_remove_simple_file(mem, removable);
 	unregister_memory(mem, section);
-
 	return 0;
 }
 
@@ -521,20 +527,142 @@ int remove_memory_block(unsigned long no
  */
 int register_new_memory(int nid, struct mem_section *section)
 {
-	return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG);
+	int ret;
+
+	mutex_lock(&mem_sysfs_lock);
+	ret = add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG);
+	mutex_unlock(&mem_sysfs_lock);
+	return ret;
 }
 
 int unregister_memory_section(struct mem_section *section)
 {
+	int ret;
+
 	if (!present_section(section))
 		return -EINVAL;
+	mutex_lock(&mem_sysfs_lock);
+	ret = remove_memory_block(0, section, 0);
+	mutex_unlock(&mem_sysfs_lock);
+	return ret;
+}
+
+/* Remember memory online/offline status for _hidden_ memory */
+
+RADIX_TREE(hidden_mems, GFP_KERNEL);
+static int record_mem_status(unsigned long section_nr, int status)
+{
+	int ret;
+	long lstat = status+1;
+	if (radix_tree_preload(GFP_KERNEL))
+		return -ENOMEM;
+	ret = radix_tree_insert(&hidden_mems, section_nr, (void*)lstat);
+	radix_tree_preload_end();
+	return ret;
+}
+
+static int lookup_mem_status(unsigned long section_nr)
+{
+	void *ptr;
+	/* we already have big mutex */
+	ptr= radix_tree_lookup(&hidden_mems, section_nr);
+	/* treate not-recorded mems'state as ONLINE */
+	if (!ptr)
+		return MEM_ONLINE;
+	return (long)ptr - 1;
+}
+
+static void forget_mem_status(unsigned long section_nr)
+{
+	radix_tree_delete(&hidden_mems, section_nr);
+}
+
+static ssize_t
+memory_show_store(struct class *class, struct class_attribute *attr,
+		   const char *buf, size_t count)
+{
+	struct mem_section *section;
+	unsigned long section_nr;
+	int nid, status;
+	ssize_t ret;
+
+	section_nr = simple_strtoull(buf, NULL, 0);
+	if (!present_section_nr(section_nr))
+		return -EINVAL;
+	section = __nr_to_section(section_nr);
+	VM_BUG_ON(!section);
+
+	mutex_lock(&mem_sysfs_lock);
+
+	if (find_memory_block(section)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	nid = pfn_to_nid(section_nr_to_pfn(section_nr));
+
+	status = lookup_mem_status(section_nr);
+	ret = add_memory_block(nid, section, status, HOTPLUG);
+	if (ret)
+		goto out;
+	forget_mem_status(section_nr);
+	ret = count;
+out:
+	mutex_unlock(&mem_sysfs_lock);
+	return ret;
+}
 
-	return remove_memory_block(0, section, 0);
+static CLASS_ATTR(show, S_IWUSR, NULL, memory_show_store);
+static inline int memory_show_init(void)
+{
+	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
+				&class_attr_show.attr);
 }
 
+static ssize_t
+memory_hide_store(struct class *class, struct class_attribute *attr,
+		   const char *buf, size_t count)
+{
+	struct mem_section *section;
+	struct memory_block *mem;
+	unsigned long section_nr;
+	ssize_t ret = -EINVAL;
+
+	section_nr = simple_strtoull(buf, NULL, 0);
+	if (!present_section_nr(section_nr))
+		return ret;
+	section = __nr_to_section(section_nr);
+	VM_BUG_ON(!section);
+
+	mutex_lock(&mem_sysfs_lock);
+	mem = find_memory_block(section);
+	if (!mem)
+		goto out;
+	record_mem_status(section_nr, mem->state);
+	unregister_mem_sect_under_nodes(mem);
+	mem_remove_simple_file(mem, phys_index);
+	mem_remove_simple_file(mem, state);
+	mem_remove_simple_file(mem, phys_device);
+	mem_remove_simple_file(mem, removable);
+	unregister_memory(mem, section);
+	mutex_unlock(&mem_sysfs_lock);
+	ret = count;
+out:
+	return ret;
+}
+
+static CLASS_ATTR(hide, S_IWUSR, NULL, memory_hide_store);
+static inline int memory_hide_init(void)
+{
+	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
+				&class_attr_hide.attr);
+}
+
+
 /*
  * Initialize the sysfs support for memory devices...
  */
+static int hide_memory_sysfs __initdata;
 int __init memory_dev_init(void)
 {
 	unsigned int i;
@@ -550,13 +678,16 @@ int __init memory_dev_init(void)
 	 * Create entries for memory sections that were found
 	 * during boot and have been initialized
 	 */
-	for (i = 0; i < NR_MEM_SECTIONS; i++) {
-		if (!present_section_nr(i))
-			continue;
-		err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE,
-				       BOOT);
-		if (!ret)
-			ret = err;
+	if (!hide_memory_sysfs){
+		for (i = 0; i < NR_MEM_SECTIONS; i++) {
+			if (!present_section_nr(i))
+				continue;
+			err = add_memory_block(0, __nr_to_section(i),
+					MEM_ONLINE,
+				        BOOT);
+			if (!ret)
+				ret = err;
+		}
 	}
 
 	err = memory_probe_init();
@@ -568,8 +699,22 @@ int __init memory_dev_init(void)
 	err = block_size_init();
 	if (!ret)
 		ret = err;
+	err = memory_show_init();
+	if (!ret)
+		ret = err;
+	err = memory_hide_init();
+	if (!ret)
+		ret = err;
 out:
 	if (ret)
 		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
 	return ret;
 }
+
+static int __init  hidememorysysfs(char *s)
+{
+	hide_memory_sysfs = 1;
+	return 1;
+}
+__setup("hidememorysysfs", hidememorysysfs);
+
Index: mmotm-2.6.35-0611/drivers/base/node.c
===================================================================
--- mmotm-2.6.35-0611.orig/drivers/base/node.c
+++ mmotm-2.6.35-0611/drivers/base/node.c
@@ -421,6 +421,8 @@ static int link_mem_sections(int nid)
 			continue;
 		mem_sect = __nr_to_section(section_nr);
 		mem_blk = find_memory_block(mem_sect);
+		if (!mem_blk) /* hidden ? */
+			continue;
 		ret = register_mem_sect_under_node(mem_blk, nid);
 		if (!err)
 			err = ret;
Index: mmotm-2.6.35-0611/Documentation/memory-hotplug.txt
===================================================================
--- mmotm-2.6.35-0611.orig/Documentation/memory-hotplug.txt
+++ mmotm-2.6.35-0611/Documentation/memory-hotplug.txt
@@ -15,6 +15,7 @@ be changed often.
   1.3. Unit of Memory online/offline operation
 2. Kernel Configuration
 3. sysfs files for memory hotplug
+  3.1 hide and show sysfs
 4. Physical memory hot-add phase
   4.1 Hardware(Firmware) Support
   4.2 Notify memory hot-add event by hand
@@ -169,6 +170,30 @@ For example:
 A backlink will also be created:
 /sys/devices/system/memory/memory9/node0 -> ../../node/node0
 
+3.1 Hide and Show for sysfs.
+
+On some big memory system, memory syfs may contain too much sysfs
+directory and consume kernel resource too much. To handle that,
+memory sysfs has hide and show interface.
+
+hide : remove memoryXXX directory on demand.
+       A user can remove sysfs entry by writing memory section number.
+
+echo 120 > /sys/devices/system/memory/hide
+...then, memory120 disappears.
+
+show : create memory XXX directroy on demand.
+       A user can add sysfs entry by writing memory section number.
+       If memory doesn't exit, this fails.
+
+echo 120 > /sys/dev/ices/system/memory/show
+...then, memory120 directory is available.
+
+And we have boot option as "hidememorysysfs". This makes all
+memoryXXX sysfs at boot to be hidden. All hot-added sections
+will be visible automatically. Users can make memoryXXX sysfs
+entries by 'show' interface.
+
 --------------------------------
 4. Physical memory hot-add phase
 --------------------------------
Index: mmotm-2.6.35-0611/Documentation/kernel-parameters.txt
===================================================================
--- mmotm-2.6.35-0611.orig/Documentation/kernel-parameters.txt
+++ mmotm-2.6.35-0611/Documentation/kernel-parameters.txt
@@ -853,6 +853,8 @@ and is between 256 and 4096 characters. 
 			corresponding firmware-first mode error processing
 			logic will be disabled.
 
+	hidememorysyfs  [KNL, BOOT] hides memoryXXX directory in sysfs at BOOT.
+
 	highmem=nn[KMG]	[KNL,BOOT] forces the highmem zone to have an exact
 			size of <nn>. This works even on boxes that have no
 			highmem otherwise. This also works to reduce highmem


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01  0:31                           ` KAMEZAWA Hiroyuki
  2010-07-01  3:17                             ` Nathan Fontenot
  2010-07-01  5:15                             ` KAMEZAWA Hiroyuki
@ 2010-07-01 13:23                             ` Dave Hansen
  2010-07-06 15:20                               ` Nathan Fontenot
  2010-07-01 23:26                             ` Greg KH
  3 siblings, 1 reply; 34+ messages in thread
From: Dave Hansen @ 2010-07-01 13:23 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Greg KH, KOSAKI Motohiro, Nathan Fontenot, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Thu, 2010-07-01 at 09:31 +0900, KAMEZAWA Hiroyuki wrote:
> 
> Considering briefly, how about this compact layout ?
> 
> /sys/devices/system/memory/:
>                             list, hide, show, memoryX...
> 
> list: // show available memory index list.
>   #cat list
>    0 1 2 ....10000...
> 
> show: //an interface to enable the interface.
>   #echo INDEX > memory_index
>   will create memoryINDEX diretory.
> 
> hide: //an interface to hide the interface.
>   #echo INDEX > memory_hide
>   will remove memoryINDEX sysfs directory.

I was thinking more along the lines of just taking adjacent sections and
merging them.  We'll need a new "end address" or size file.  Maybe
"end_phys_index" or something similar.

Such a beast would not fix all of the pathological cases, like where
only every other 16MB section is populated with RAM, but I don't think
those are very common at all, especially in cases where there's a lot of
RAM.  But, it also has a chance of being relatively backward-compatible.
In most cases, we may even be able to calculate a new phys_block_size
where everything fits evenly and be fully backward-compatible with the
old ABI.

-- Dave


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01  0:31                           ` KAMEZAWA Hiroyuki
                                               ` (2 preceding siblings ...)
  2010-07-01 13:23                             ` Dave Hansen
@ 2010-07-01 23:26                             ` Greg KH
  2010-07-02  5:50                               ` KAMEZAWA Hiroyuki
  3 siblings, 1 reply; 34+ messages in thread
From: Greg KH @ 2010-07-01 23:26 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Hansen, KOSAKI Motohiro, Nathan Fontenot, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Thu, Jul 01, 2010 at 09:31:30AM +0900, KAMEZAWA Hiroyuki wrote:
> On Wed, 30 Jun 2010 08:47:55 -0700
> Greg KH <gregkh@suse.de> wrote:
> > > and adding a scalable interface for large scale machines ?
> > > I'd like to consider something..
> > 
> > Dynamically changing the layout on big memory boxes makes sense to me,
> > how about you?
> > 
> 
> like this ?
> ==
> boot option:
> memory_sysfs_layout=compact
> memory_sysfs_layout=auto (default)
> memory_sysfs_layout=full
> 
> Considering briefly, how about this compact layout ?
> 
> /sys/devices/system/memory/:
>                             list, hide, show, memoryX...
> 
> list: // show available memory index list.
>   #cat list
>    0 1 2 ....10000...
> 
> show: //an interface to enable the interface.
>   #echo INDEX > memory_index
>   will create memoryINDEX diretory.
> 
> hide: //an interface to hide the interface.
>   #echo INDEX > memory_hide
>   will remove memoryINDEX sysfs directory.

Ick, that can get confusing very quickly, and not really solve any of
your root problems, right?

> In compact mode, all memoryX interface are hidden at boot.
> In full mode, all memoryX interaface are shown.
> The Boot option just affects status at boot. If users want, he can make
> all memory sysfs in shown state.
> 
> At hot-add event (via acpi) or probe-event, newly created memory section
> should be start from "shown" mode. hotplug scirpt can hide it after online.
> 
> At hot-remove, the users has to offline memory before hotplug. He'll has
> to do check list and show interface.
> 
> I think this change is not very difficult technically but can this kind of
> interface be allowed ?

Not really, I don't like it.

Why not just simplify what you currently have to not use so many
directories and files?

And maybe, this doesn't belong in sysfs at all...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01  3:30                               ` KAMEZAWA Hiroyuki
@ 2010-07-01 23:28                                 ` Greg KH
  0 siblings, 0 replies; 34+ messages in thread
From: Greg KH @ 2010-07-01 23:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nathan Fontenot, Dave Hansen, KOSAKI Motohiro, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Thu, Jul 01, 2010 at 12:30:57PM +0900, KAMEZAWA Hiroyuki wrote:
> On Wed, 30 Jun 2010 22:17:35 -0500
> Nathan Fontenot <nfont@austin.ibm.com> wrote:
> 
> > On 06/30/2010 07:31 PM, KAMEZAWA Hiroyuki wrote:
> > > On Wed, 30 Jun 2010 08:47:55 -0700
> > > Greg KH <gregkh@suse.de> wrote:
> > >>> and adding a scalable interface for large scale machines ?
> > >>> I'd like to consider something..
> > >>
> > >> Dynamically changing the layout on big memory boxes makes sense to me,
> > >> how about you?
> > >>
> > > 
> > > like this ?
> > > ==
> > > boot option:
> > > memory_sysfs_layout=compact
> > > memory_sysfs_layout=auto (default)
> > > memory_sysfs_layout=full
> > > 
> > > Considering briefly, how about this compact layout ?
> > > 
> > > /sys/devices/system/memory/:
> > >                             list, hide, show, memoryX...
> > > 
> > > list: // show available memory index list.
> > >   #cat list
> > >    0 1 2 ....10000...
> > > 
> > > show: //an interface to enable the interface.
> > >   #echo INDEX > memory_index
> > >   will create memoryINDEX diretory.
> > > 
> > > hide: //an interface to hide the interface.
> > >   #echo INDEX > memory_hide
> > >   will remove memoryINDEX sysfs directory.
> > > 
> > > 
> > > In compact mode, all memoryX interface are hidden at boot.
> > > In full mode, all memoryX interaface are shown.
> > > The Boot option just affects status at boot. If users want, he can make
> > > all memory sysfs in shown state.
> > 
> > Do we need to make something as complicated as dynamically adding and removing
> > the sysfs directories?  Why not a compact layout that just takes the files
> > that currently reside in the memoryXX dirs and move them up to the memory
> > directory. This would be state (which should probably be split into an
> > 'online' and 'offline' file), removable, phys_index, and phys_device.
> > 
> I've considered several patterns.
> 
> with 4096 bytes buffer of sysfs, "printting bitmap" just covers small
> amount of sections even with smart ASCII format to show contiguous range
> as a chunk. That's my concern. (and 'list' file in above example
> is impossible to impelement.)

Yes, that's my concern as well.  sysfs is "one value per file" and if
you are worried about the size of the sysfs buffer, something is wrong
with your interface.

> If I can use rmdir/mkdir interface, it's much simple rather than above "echo".
> But it seems impossible.

That's what configfs is for :)

> Because we have memory information in /proc/iomem already, I think hide/show
> interface (instead of mkdir/rmdir) is not very bad.
> 
> If you can implement highly scalable interface, please.
> I just stop dreaming and ack yours.

Why not just create your own filesystem for this, "memoryfs", where you
can do what you want and you don't have to worry about sysfs apis and
buffer sizes.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01 23:26                             ` Greg KH
@ 2010-07-02  5:50                               ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 34+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-07-02  5:50 UTC (permalink / raw)
  To: Greg KH
  Cc: Dave Hansen, KOSAKI Motohiro, Nathan Fontenot, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Thu, 1 Jul 2010 16:26:34 -0700
Greg KH <gregkh@suse.de> wrote:

> On Thu, Jul 01, 2010 at 09:31:30AM +0900, KAMEZAWA Hiroyuki wrote:
> > On Wed, 30 Jun 2010 08:47:55 -0700
> > Greg KH <gregkh@suse.de> wrote:
> > > > and adding a scalable interface for large scale machines ?
> > > > I'd like to consider something..
> > > 
> > > Dynamically changing the layout on big memory boxes makes sense to me,
> > > how about you?
> > > 
> > 
> > like this ?
> > ==
> > boot option:
> > memory_sysfs_layout=compact
> > memory_sysfs_layout=auto (default)
> > memory_sysfs_layout=full
> > 
> > Considering briefly, how about this compact layout ?
> > 
> > /sys/devices/system/memory/:
> >                             list, hide, show, memoryX...
> > 
> > list: // show available memory index list.
> >   #cat list
> >    0 1 2 ....10000...
> > 
> > show: //an interface to enable the interface.
> >   #echo INDEX > memory_index
> >   will create memoryINDEX diretory.
> > 
> > hide: //an interface to hide the interface.
> >   #echo INDEX > memory_hide
> >   will remove memoryINDEX sysfs directory.
> 
> Ick, that can get confusing very quickly, and not really solve any of
> your root problems, right?
> 
Why not ?

I'm sorry if I misunderstand the problem . It is tooo many entries under
a directory, which makes the routine slow. No ?
By this interface, the sysfs entries pops up on demand if the user hides
it at boot time (by boot options.)

IIUC, the most problematic IBM's 16MB-sections system has no notification
from firmware and their system management middleware controls all
memory hotplug information. IOW, all memory-hotplug actions are from
user-land, not from firmware. They should know which mem_section should be
visible under /memory before starting hotplug.

In other platforms, using ACPI, hot-added memory sections can be automatically
visible. At memory hot-remove, management software should now which phys_address
memory should be removed before it starts action. It can make visible
section directory.

For many many users, who never wants memory hotplug, they can avoid creating
sysfs for memory hotplug and will be happy.



> > In compact mode, all memoryX interface are hidden at boot.
> > In full mode, all memoryX interaface are shown.
> > The Boot option just affects status at boot. If users want, he can make
> > all memory sysfs in shown state.
> > 
> > At hot-add event (via acpi) or probe-event, newly created memory section
> > should be start from "shown" mode. hotplug scirpt can hide it after online.
> > 
> > At hot-remove, the users has to offline memory before hotplug. He'll has
> > to do check list and show interface.
> > 
> > I think this change is not very difficult technically but can this kind of
> > interface be allowed ?
> 
> Not really, I don't like it.
> 
> Why not just simplify what you currently have to not use so many
> directories and files?
> 
> And maybe, this doesn't belong in sysfs at all...
> 

Because the system has topology and sysfs's /sys/devices/system/ shows
system's topology by directories and files, we use it. Especially, "node"
information is important in some system and we'd made use of it.

If you recommend guys to remake all NUMA-cpu-memory topology information
in other fs, someone will do.

Fujitsu will not complain about that and welcome it because we have a long
period until RHEL7 ages.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-01 13:23                             ` Dave Hansen
@ 2010-07-06 15:20                               ` Nathan Fontenot
  2010-07-06 15:33                                 ` Dave Hansen
  0 siblings, 1 reply; 34+ messages in thread
From: Nathan Fontenot @ 2010-07-06 15:20 UTC (permalink / raw)
  To: Dave Hansen
  Cc: KAMEZAWA Hiroyuki, Greg KH, KOSAKI Motohiro, Andi Kleen,
	linux-kernel, Eric W. Biederman

On 07/01/2010 08:23 AM, Dave Hansen wrote:
> 
> I was thinking more along the lines of just taking adjacent sections and
> merging them.  We'll need a new "end address" or size file.  Maybe
> "end_phys_index" or something similar.
> 
> Such a beast would not fix all of the pathological cases, like where
> only every other 16MB section is populated with RAM, but I don't think
> those are very common at all, especially in cases where there's a lot of
> RAM.  But, it also has a chance of being relatively backward-compatible.
> In most cases, we may even be able to calculate a new phys_block_size
> where everything fits evenly and be fully backward-compatible with the
> old ABI.
> 

Under this scenario were you thinking that all of the memory sections that
reside under this memory block would then be acted upon as a whole.  For
example would we allow users to hotplug individual memory sections included
in th block, or would the memory block be acted upon as a whole?

-Nathan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-06 15:20                               ` Nathan Fontenot
@ 2010-07-06 15:33                                 ` Dave Hansen
  2010-07-06 15:47                                   ` Nathan Fontenot
  0 siblings, 1 reply; 34+ messages in thread
From: Dave Hansen @ 2010-07-06 15:33 UTC (permalink / raw)
  To: Nathan Fontenot
  Cc: KAMEZAWA Hiroyuki, Greg KH, KOSAKI Motohiro, Andi Kleen,
	linux-kernel, Eric W. Biederman

On Tue, 2010-07-06 at 10:20 -0500, Nathan Fontenot wrote:
> On 07/01/2010 08:23 AM, Dave Hansen wrote:
> > 
> > I was thinking more along the lines of just taking adjacent sections and
> > merging them.  We'll need a new "end address" or size file.  Maybe
> > "end_phys_index" or something similar.
> > 
> > Such a beast would not fix all of the pathological cases, like where
> > only every other 16MB section is populated with RAM, but I don't think
> > those are very common at all, especially in cases where there's a lot of
> > RAM.  But, it also has a chance of being relatively backward-compatible.
> > In most cases, we may even be able to calculate a new phys_block_size
> > where everything fits evenly and be fully backward-compatible with the
> > old ABI.
> 
> Under this scenario were you thinking that all of the memory sections that
> reside under this memory block would then be acted upon as a whole.  For
> example would we allow users to hotplug individual memory sections included
> in th block, or would the memory block be acted upon as a whole?

I think we would need a mechanism that allowed the sysfs directories to
be broken down somehow.  If the merging is very successful, it could
lead to a case where no existing sysfs dir is a reasonable size to
remove.  That's what we'd have to avoid, so I think we'd _need_
splitting of some kind.


-- Dave


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] memory hotplug disable boot option
  2010-07-06 15:33                                 ` Dave Hansen
@ 2010-07-06 15:47                                   ` Nathan Fontenot
  0 siblings, 0 replies; 34+ messages in thread
From: Nathan Fontenot @ 2010-07-06 15:47 UTC (permalink / raw)
  To: Dave Hansen
  Cc: KAMEZAWA Hiroyuki, Greg KH, KOSAKI Motohiro, Andi Kleen,
	linux-kernel, Eric W. Biederman

On 07/06/2010 10:33 AM, Dave Hansen wrote:
> On Tue, 2010-07-06 at 10:20 -0500, Nathan Fontenot wrote:
>> On 07/01/2010 08:23 AM, Dave Hansen wrote:
>>>
>>> I was thinking more along the lines of just taking adjacent sections and
>>> merging them.  We'll need a new "end address" or size file.  Maybe
>>> "end_phys_index" or something similar.
>>>
>>> Such a beast would not fix all of the pathological cases, like where
>>> only every other 16MB section is populated with RAM, but I don't think
>>> those are very common at all, especially in cases where there's a lot of
>>> RAM.  But, it also has a chance of being relatively backward-compatible.
>>> In most cases, we may even be able to calculate a new phys_block_size
>>> where everything fits evenly and be fully backward-compatible with the
>>> old ABI.
>>
>> Under this scenario were you thinking that all of the memory sections that
>> reside under this memory block would then be acted upon as a whole.  For
>> example would we allow users to hotplug individual memory sections included
>> in th block, or would the memory block be acted upon as a whole?
> 
> I think we would need a mechanism that allowed the sysfs directories to
> be broken down somehow.  If the merging is very successful, it could
> lead to a case where no existing sysfs dir is a reasonable size to
> remove.  That's what we'd have to avoid, so I think we'd _need_
> splitting of some kind.
> 

Agreed.

Perhaps something along the following lines.  We create a sysfs directory
for each memory block, where each memory block can contain multiple memory
sections.  The default being one memory section per memory block.  All of
the existing files that are currently under each memoryXXX directory in sysfs
still appear along with a new file as Dave suggested to indicate the
number of memory sections contained in that memory block.  Additionally,
a new file named 'split' would allow users to split the memory block in
half, thus creating a new directory for the split-off half of the block.

A slight change in the state file behavior would also need to occur, in that
writing 'online' or 'offline' to the file would perform the appropriate
operation on all of the memory sections contained in the memory block.

Thoughts?  I can start working on this if no one has any objections.

-Nathan


^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2010-07-06 15:48 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-25  1:06 [PATCH] memory hotplug disable boot option Nathan Fontenot
2010-06-25  2:04 ` KOSAKI Motohiro
2010-06-25  9:19   ` Andi Kleen
2010-06-25 14:51     ` Nathan Fontenot
2010-06-25 14:56       ` Andi Kleen
2010-06-25 15:21         ` Nathan Fontenot
2010-06-25 15:28           ` Andi Kleen
2010-06-25 16:00             ` Nathan Fontenot
2010-06-28  2:20       ` KOSAKI Motohiro
2010-06-28  4:15         ` Eric W. Biederman
2010-06-28 14:16           ` Andi Kleen
2010-06-28 19:43             ` Eric W. Biederman
2010-06-28 15:02         ` Greg KH
2010-06-28 15:37           ` Nathan Fontenot
2010-06-28 15:44             ` Greg KH
2010-06-29  0:04               ` Dave Hansen
2010-06-29  2:56                 ` KOSAKI Motohiro
2010-06-29 15:38                   ` Nathan Fontenot
2010-06-30  0:00                     ` KOSAKI Motohiro
2010-06-29 16:03                   ` Dave Hansen
2010-06-29 18:04                     ` Greg KH
2010-06-30  0:32                       ` KAMEZAWA Hiroyuki
2010-06-30 15:47                         ` Greg KH
2010-07-01  0:31                           ` KAMEZAWA Hiroyuki
2010-07-01  3:17                             ` Nathan Fontenot
2010-07-01  3:30                               ` KAMEZAWA Hiroyuki
2010-07-01 23:28                                 ` Greg KH
2010-07-01  5:15                             ` KAMEZAWA Hiroyuki
2010-07-01 13:23                             ` Dave Hansen
2010-07-06 15:20                               ` Nathan Fontenot
2010-07-06 15:33                                 ` Dave Hansen
2010-07-06 15:47                                   ` Nathan Fontenot
2010-07-01 23:26                             ` Greg KH
2010-07-02  5:50                               ` KAMEZAWA Hiroyuki

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).