linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] memory cgroup: update root memory cgroup when node is onlined
@ 2012-09-13  7:14 Wen Congyang
  2012-09-13 10:06 ` Kamezawa Hiroyuki
  2012-09-13 20:59 ` Johannes Weiner
  0 siblings, 2 replies; 21+ messages in thread
From: Wen Congyang @ 2012-09-13  7:14 UTC (permalink / raw)
  To: linux-kernel, cgroups, linux-mm
  Cc: Jiang Liu, hannes, mhocko, bsingharora, KAMEZAWA Hiroyuki,
	Andrew Morton, hughd, paul.gortmaker

root_mem_cgroup->info.nodeinfo is initialized when the system boots.
But NODE_DATA(nid) is null if the node is not onlined, so
root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
an invalid pointer. If we use numactl to bind a program to the node
after onlining the node and its memory, it will cause a kernel
panic:

[   63.413436] BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
[   63.414161] IP: [<ffffffff811870b9>] __mod_zone_page_state+0x9/0x60
[   63.414161] PGD 0 
[   63.414161] Oops: 0000 [#1] SMP 
[   63.414161] Modules linked in: acpi_memhotplug binfmt_misc dm_mirror dm_region_hash dm_log dm_mod ppdev sg microcode pcspkr virtio_console virtio_balloon snd_intel8x0 snd_ac9
7_codec ac97_bus snd_seq snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc e1000 i2c_piix4 i2c_core floppy parport_pc parport sr_mod cdrom virtio_blk pata_acpi ata_g
eneric ata_piix libata scsi_mod
[   63.414161] CPU 2 
[   63.414161] Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
...
[   63.414161] Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
[   63.414161] Stack:
[   63.414161]  ffff880039abdaf8 ffffffff8117390f ffff880039abdaf8 000000008167c601
[   63.414161]  ffffffff81174162 ffff88003a480f00 0000000000000001 ffff8800395e0000
[   63.414161]  ffff88003dbd0e80 0000000000000282 ffff880039abdb48 ffffffff81174181
[   63.414161] Call Trace:
[   63.414161]  [<ffffffff8117390f>] __pagevec_lru_add_fn+0xdf/0x140
[   63.414161]  [<ffffffff81174162>] ? pagevec_lru_move_fn+0x92/0x100
[   63.414161]  [<ffffffff81174181>] pagevec_lru_move_fn+0xb1/0x100
[   63.414161]  [<ffffffff81173830>] ? lru_add_page_tail+0x1b0/0x1b0
[   63.414161]  [<ffffffff811dff71>] ? exec_mmap+0x121/0x230
[   63.414161]  [<ffffffff811741ec>] __pagevec_lru_add+0x1c/0x30
[   63.414161]  [<ffffffff81174383>] lru_add_drain_cpu+0xa3/0x130
[   63.414161]  [<ffffffff8117443f>] lru_add_drain+0x2f/0x40
[   63.414161]  [<ffffffff81196da9>] exit_mmap+0x69/0x160
[   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
[   63.414161]  [<ffffffff81069c37>] mmput+0x77/0x100
[   63.414161]  [<ffffffff811dffc0>] exec_mmap+0x170/0x230
[   63.414161]  [<ffffffff811e0152>] flush_old_exec+0xd2/0x140
[   63.414161]  [<ffffffff812343ea>] load_elf_binary+0x32a/0xe70
[   63.414161]  [<ffffffff810d700d>] ? trace_hardirqs_off+0xd/0x10
[   63.414161]  [<ffffffff810b231f>] ? local_clock+0x6f/0x80
[   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
[   63.414161]  [<ffffffff810dd813>] ? __lock_release+0x133/0x1a0
[   63.414161]  [<ffffffff811e22e7>] ? search_binary_handler+0x1a7/0x4a0
[   63.414161]  [<ffffffff811e22f3>] search_binary_handler+0x1b3/0x4a0
[   63.414161]  [<ffffffff811e2194>] ? search_binary_handler+0x54/0x4a0
[   63.414161]  [<ffffffff812340c0>] ? set_brk+0xe0/0xe0
[   63.414161]  [<ffffffff811e284f>] do_execve_common+0x26f/0x320
[   63.414161]  [<ffffffff811bde33>] ? kmem_cache_alloc+0x113/0x220
[   63.414161]  [<ffffffff811e298a>] do_execve+0x3a/0x40
[   63.414161]  [<ffffffff8102061a>] sys_execve+0x4a/0x80
[   63.414161]  [<ffffffff81686c6c>] stub_execve+0x6c/0xc0
[   63.414161] Code: ff 03 00 00 48 c1 e7 0b 48 c1 e2 07 48 29 d7 48 03 3c c5 c0 27 d2 81 e8 a6 fe ff ff c9 c3 0f 1f 40 00 55 48 89 e5 0f 1f 44 00 00 <48> 8b 4f 60 89 f6 48 8d 44 31 40 65 44 8a 40 02 45 0f be c0 41 

The reason is that we don't update root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone
when onlining the node, and we try to access it.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 include/linux/memcontrol.h |    7 +++++++
 mm/memcontrol.c            |   14 ++++++++++++++
 mm/memory_hotplug.c        |    2 ++
 3 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8d9489f..87d8b77 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -182,6 +182,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						unsigned long *total_scanned);
 
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
+
+void update_root_mem_cgroup(int nid);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
@@ -374,6 +377,10 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
 				struct page *newpage)
 {
 }
+
+static inline void update_root_mem_cgroup(int nid)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 795e525..c997a46 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3427,6 +3427,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
 	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
 }
 
+/* NODE_DATA(nid) is changed */
+void update_root_mem_cgroup(int nid)
+{
+	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup_per_zone *mz;
+	int zone;
+
+	pn = root_mem_cgroup->info.nodeinfo[nid];
+	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+		mz = &pn->zoneinfo[zone];
+		lruvec_init(&mz->lruvec, &NODE_DATA(nid)->node_zones[zone]);
+	}
+}
+
 #ifdef CONFIG_DEBUG_VM
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3ad25f9..bf03b02 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -555,6 +555,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 
 	/* we can use NODE_DATA(nid) from here */
 
+	update_root_mem_cgroup(nid);
+
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-13  7:14 [PATCH] memory cgroup: update root memory cgroup when node is onlined Wen Congyang
@ 2012-09-13 10:06 ` Kamezawa Hiroyuki
  2012-09-13 10:18   ` Wen Congyang
  2012-09-13 20:59 ` Johannes Weiner
  1 sibling, 1 reply; 21+ messages in thread
From: Kamezawa Hiroyuki @ 2012-09-13 10:06 UTC (permalink / raw)
  To: Wen Congyang
  Cc: linux-kernel, cgroups, linux-mm, Jiang Liu, hannes, mhocko,
	bsingharora, Andrew Morton, hughd, paul.gortmaker

(2012/09/13 16:14), Wen Congyang wrote:
> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
> But NODE_DATA(nid) is null if the node is not onlined, so
> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
> an invalid pointer. If we use numactl to bind a program to the node
> after onlining the node and its memory, it will cause the kernel
> panicked:
>
> [   63.413436] BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
> [   63.414161] IP: [<ffffffff811870b9>] __mod_zone_page_state+0x9/0x60
> [   63.414161] PGD 0
> [   63.414161] Oops: 0000 [#1] SMP
> [   63.414161] Modules linked in: acpi_memhotplug binfmt_misc dm_mirror dm_region_hash dm_log dm_mod ppdev sg microcode pcspkr virtio_console virtio_balloon snd_intel8x0 snd_ac9
> 7_codec ac97_bus snd_seq snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc e1000 i2c_piix4 i2c_core floppy parport_pc parport sr_mod cdrom virtio_blk pata_acpi ata_g
> eneric ata_piix libata scsi_mod
> [   63.414161] CPU 2
> [   63.414161] Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
> ...
> [   63.414161] Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
> [   63.414161] Stack:
> [   63.414161]  ffff880039abdaf8 ffffffff8117390f ffff880039abdaf8 000000008167c601
> [   63.414161]  ffffffff81174162 ffff88003a480f00 0000000000000001 ffff8800395e0000
> [   63.414161]  ffff88003dbd0e80 0000000000000282 ffff880039abdb48 ffffffff81174181
> [   63.414161] Call Trace:
> [   63.414161]  [<ffffffff8117390f>] __pagevec_lru_add_fn+0xdf/0x140
> [   63.414161]  [<ffffffff81174162>] ? pagevec_lru_move_fn+0x92/0x100
> [   63.414161]  [<ffffffff81174181>] pagevec_lru_move_fn+0xb1/0x100
> [   63.414161]  [<ffffffff81173830>] ? lru_add_page_tail+0x1b0/0x1b0
> [   63.414161]  [<ffffffff811dff71>] ? exec_mmap+0x121/0x230
> [   63.414161]  [<ffffffff811741ec>] __pagevec_lru_add+0x1c/0x30
> [   63.414161]  [<ffffffff81174383>] lru_add_drain_cpu+0xa3/0x130
> [   63.414161]  [<ffffffff8117443f>] lru_add_drain+0x2f/0x40
> [   63.414161]  [<ffffffff81196da9>] exit_mmap+0x69/0x160
> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
> [   63.414161]  [<ffffffff81069c37>] mmput+0x77/0x100
> [   63.414161]  [<ffffffff811dffc0>] exec_mmap+0x170/0x230
> [   63.414161]  [<ffffffff811e0152>] flush_old_exec+0xd2/0x140
> [   63.414161]  [<ffffffff812343ea>] load_elf_binary+0x32a/0xe70
> [   63.414161]  [<ffffffff810d700d>] ? trace_hardirqs_off+0xd/0x10
> [   63.414161]  [<ffffffff810b231f>] ? local_clock+0x6f/0x80
> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
> [   63.414161]  [<ffffffff810dd813>] ? __lock_release+0x133/0x1a0
> [   63.414161]  [<ffffffff811e22e7>] ? search_binary_handler+0x1a7/0x4a0
> [   63.414161]  [<ffffffff811e22f3>] search_binary_handler+0x1b3/0x4a0
> [   63.414161]  [<ffffffff811e2194>] ? search_binary_handler+0x54/0x4a0
> [   63.414161]  [<ffffffff812340c0>] ? set_brk+0xe0/0xe0
> [   63.414161]  [<ffffffff811e284f>] do_execve_common+0x26f/0x320
> [   63.414161]  [<ffffffff811bde33>] ? kmem_cache_alloc+0x113/0x220
> [   63.414161]  [<ffffffff811e298a>] do_execve+0x3a/0x40
> [   63.414161]  [<ffffffff8102061a>] sys_execve+0x4a/0x80
> [   63.414161]  [<ffffffff81686c6c>] stub_execve+0x6c/0xc0
> [   63.414161] Code: ff 03 00 00 48 c1 e7 0b 48 c1 e2 07 48 29 d7 48 03 3c c5 c0 27 d2 81 e8 a6 fe ff ff c9 c3 0f 1f 40 00 55 48 89 e5 0f 1f 44 00 00 <48> 8b 4f 60 89 f6 48 8d 44 31 40 65 44 8a 40 02 45 0f be c0 41
>
> The reason is that we don't update root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone
> when onlining the node, and we try to access it.
>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>

Thank you !!!

But, I think, all memcg's lruvec should be updated.
I guess you'll see panic again if you put tasks under memcg and
allocates memory on a new node.
  
Could you dig more ?

Thanks,
-Kame

> ---
>   include/linux/memcontrol.h |    7 +++++++
>   mm/memcontrol.c            |   14 ++++++++++++++
>   mm/memory_hotplug.c        |    2 ++
>   3 files changed, 23 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 8d9489f..87d8b77 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -182,6 +182,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>   						unsigned long *total_scanned);
>
>   void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
> +
> +void update_root_mem_cgroup(int nid);
> +
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>   void mem_cgroup_split_huge_fixup(struct page *head);
>   #endif
> @@ -374,6 +377,10 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
>   				struct page *newpage)
>   {
>   }
> +
> +static inline void update_root_mem_cgroup(int nid)
> +{
> +}
>   #endif /* CONFIG_MEMCG */
>
>   #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 795e525..c997a46 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3427,6 +3427,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
>   	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
>   }
>
> +/* NODE_DATA(nid) is changed */
> +void update_root_mem_cgroup(int nid)
> +{
> +	struct mem_cgroup_per_node *pn;
> +	struct mem_cgroup_per_zone *mz;
> +	int zone;
> +
> +	pn = root_mem_cgroup->info.nodeinfo[nid];
> +	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> +		mz = &pn->zoneinfo[zone];
> +		lruvec_init(&mz->lruvec, &NODE_DATA(nid)->node_zones[zone]);
> +	}
> +}
> +
>   #ifdef CONFIG_DEBUG_VM
>   static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
>   {
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 3ad25f9..bf03b02 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -555,6 +555,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
>
>   	/* we can use NODE_DATA(nid) from here */
>
> +	update_root_mem_cgroup(nid);
> +
>   	/* init node's zones as empty zones, we don't have any present pages.*/
>   	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
>
>



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-13 10:06 ` Kamezawa Hiroyuki
@ 2012-09-13 10:18   ` Wen Congyang
  0 siblings, 0 replies; 21+ messages in thread
From: Wen Congyang @ 2012-09-13 10:18 UTC (permalink / raw)
  To: Kamezawa Hiroyuki
  Cc: linux-kernel, cgroups, linux-mm, Jiang Liu, hannes, mhocko,
	bsingharora, Andrew Morton, hughd, paul.gortmaker

At 09/13/2012 06:06 PM, Kamezawa Hiroyuki Wrote:
> (2012/09/13 16:14), Wen Congyang wrote:
>> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
>> But NODE_DATA(nid) is null if the node is not onlined, so
>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
>> an invalid pointer. If we use numactl to bind a program to the node
>> after onlining the node and its memory, it will cause the kernel
>> panicked:
>>
>> [   63.413436] BUG: unable to handle kernel NULL pointer dereference
>> at 0000000000000f60
>> [   63.414161] IP: [<ffffffff811870b9>] __mod_zone_page_state+0x9/0x60
>> [   63.414161] PGD 0
>> [   63.414161] Oops: 0000 [#1] SMP
>> [   63.414161] Modules linked in: acpi_memhotplug binfmt_misc
>> dm_mirror dm_region_hash dm_log dm_mod ppdev sg microcode pcspkr
>> virtio_console virtio_balloon snd_intel8x0 snd_ac9
>> 7_codec ac97_bus snd_seq snd_seq_device snd_pcm snd_timer snd
>> soundcore snd_page_alloc e1000 i2c_piix4 i2c_core floppy parport_pc
>> parport sr_mod cdrom virtio_blk pata_acpi ata_g
>> eneric ata_piix libata scsi_mod
>> [   63.414161] CPU 2
>> [   63.414161] Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180
>> Bochs Bochs
>> ...
>> [   63.414161] Process numactl (pid: 1219, threadinfo
>> ffff880039abc000, task ffff8800383c4ce0)
>> [   63.414161] Stack:
>> [   63.414161]  ffff880039abdaf8 ffffffff8117390f ffff880039abdaf8
>> 000000008167c601
>> [   63.414161]  ffffffff81174162 ffff88003a480f00 0000000000000001
>> ffff8800395e0000
>> [   63.414161]  ffff88003dbd0e80 0000000000000282 ffff880039abdb48
>> ffffffff81174181
>> [   63.414161] Call Trace:
>> [   63.414161]  [<ffffffff8117390f>] __pagevec_lru_add_fn+0xdf/0x140
>> [   63.414161]  [<ffffffff81174162>] ? pagevec_lru_move_fn+0x92/0x100
>> [   63.414161]  [<ffffffff81174181>] pagevec_lru_move_fn+0xb1/0x100
>> [   63.414161]  [<ffffffff81173830>] ? lru_add_page_tail+0x1b0/0x1b0
>> [   63.414161]  [<ffffffff811dff71>] ? exec_mmap+0x121/0x230
>> [   63.414161]  [<ffffffff811741ec>] __pagevec_lru_add+0x1c/0x30
>> [   63.414161]  [<ffffffff81174383>] lru_add_drain_cpu+0xa3/0x130
>> [   63.414161]  [<ffffffff8117443f>] lru_add_drain+0x2f/0x40
>> [   63.414161]  [<ffffffff81196da9>] exit_mmap+0x69/0x160
>> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
>> [   63.414161]  [<ffffffff81069c37>] mmput+0x77/0x100
>> [   63.414161]  [<ffffffff811dffc0>] exec_mmap+0x170/0x230
>> [   63.414161]  [<ffffffff811e0152>] flush_old_exec+0xd2/0x140
>> [   63.414161]  [<ffffffff812343ea>] load_elf_binary+0x32a/0xe70
>> [   63.414161]  [<ffffffff810d700d>] ? trace_hardirqs_off+0xd/0x10
>> [   63.414161]  [<ffffffff810b231f>] ? local_clock+0x6f/0x80
>> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
>> [   63.414161]  [<ffffffff810dd813>] ? __lock_release+0x133/0x1a0
>> [   63.414161]  [<ffffffff811e22e7>] ? search_binary_handler+0x1a7/0x4a0
>> [   63.414161]  [<ffffffff811e22f3>] search_binary_handler+0x1b3/0x4a0
>> [   63.414161]  [<ffffffff811e2194>] ? search_binary_handler+0x54/0x4a0
>> [   63.414161]  [<ffffffff812340c0>] ? set_brk+0xe0/0xe0
>> [   63.414161]  [<ffffffff811e284f>] do_execve_common+0x26f/0x320
>> [   63.414161]  [<ffffffff811bde33>] ? kmem_cache_alloc+0x113/0x220
>> [   63.414161]  [<ffffffff811e298a>] do_execve+0x3a/0x40
>> [   63.414161]  [<ffffffff8102061a>] sys_execve+0x4a/0x80
>> [   63.414161]  [<ffffffff81686c6c>] stub_execve+0x6c/0xc0
>> [   63.414161] Code: ff 03 00 00 48 c1 e7 0b 48 c1 e2 07 48 29 d7 48
>> 03 3c c5 c0 27 d2 81 e8 a6 fe ff ff c9 c3 0f 1f 40 00 55 48 89 e5 0f
>> 1f 44 00 00 <48> 8b 4f 60 89 f6 48 8d 44 31 40 65 44 8a 40 02 45 0f be
>> c0 41
>>
>> The reason is that we don't update
>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone
>> when onlining the node, and we try to access it.
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> 
> Thank you !!!
> 
> But, I think, all memcg's lruvec should be updated.
> I guess you'll see panic again if you put tasks under memcg and
> allocates memory on a new node.
>  
> Could you dig more ?

OK, I will do it.

Thanks
Wen Congyang

> 
> Thanks,
> -Kame
> 
>> ---
>>   include/linux/memcontrol.h |    7 +++++++
>>   mm/memcontrol.c            |   14 ++++++++++++++
>>   mm/memory_hotplug.c        |    2 ++
>>   3 files changed, 23 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 8d9489f..87d8b77 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -182,6 +182,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct
>> zone *zone, int order,
>>                           unsigned long *total_scanned);
>>
>>   void mem_cgroup_count_vm_event(struct mm_struct *mm, enum
>> vm_event_item idx);
>> +
>> +void update_root_mem_cgroup(int nid);
>> +
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>   void mem_cgroup_split_huge_fixup(struct page *head);
>>   #endif
>> @@ -374,6 +377,10 @@ static inline void
>> mem_cgroup_replace_page_cache(struct page *oldpage,
>>                   struct page *newpage)
>>   {
>>   }
>> +
>> +static inline void update_root_mem_cgroup(int nid)
>> +{
>> +}
>>   #endif /* CONFIG_MEMCG */
>>
>>   #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 795e525..c997a46 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3427,6 +3427,20 @@ void mem_cgroup_replace_page_cache(struct page
>> *oldpage,
>>       __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
>>   }
>>
>> +/* NODE_DATA(nid) is changed */
>> +void update_root_mem_cgroup(int nid)
>> +{
>> +    struct mem_cgroup_per_node *pn;
>> +    struct mem_cgroup_per_zone *mz;
>> +    int zone;
>> +
>> +    pn = root_mem_cgroup->info.nodeinfo[nid];
>> +    for (zone = 0; zone < MAX_NR_ZONES; zone++) {
>> +        mz = &pn->zoneinfo[zone];
>> +        lruvec_init(&mz->lruvec, &NODE_DATA(nid)->node_zones[zone]);
>> +    }
>> +}
>> +
>>   #ifdef CONFIG_DEBUG_VM
>>   static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
>>   {
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 3ad25f9..bf03b02 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -555,6 +555,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid,
>> u64 start)
>>
>>       /* we can use NODE_DATA(nid) from here */
>>
>> +    update_root_mem_cgroup(nid);
>> +
>>       /* init node's zones as empty zones, we don't have any present
>> pages.*/
>>       free_area_init_node(nid, zones_size, start_pfn, zholes_size);
>>
>>
> 
> 
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-13  7:14 [PATCH] memory cgroup: update root memory cgroup when node is onlined Wen Congyang
  2012-09-13 10:06 ` Kamezawa Hiroyuki
@ 2012-09-13 20:59 ` Johannes Weiner
  2012-09-14  1:36   ` Hugh Dickins
  2012-09-14  1:46   ` Wen Congyang
  1 sibling, 2 replies; 21+ messages in thread
From: Johannes Weiner @ 2012-09-13 20:59 UTC (permalink / raw)
  To: Wen Congyang
  Cc: linux-kernel, cgroups, linux-mm, Jiang Liu, mhocko, bsingharora,
	KAMEZAWA Hiroyuki, Andrew Morton, hughd, paul.gortmaker

Hi,

On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
> But NODE_DATA(nid) is null if the node is not onlined, so
> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
> an invalid pointer. If we use numactl to bind a program to the node
> after onlining the node and its memory, it will cause the kernel
> panicked:

Is there any chance we could get rid of the zone backpointer in lruvec
again instead?  Adding new nodes is a rare event and so updating every
single memcg in the system might be just borderline crazy.  But can't
we just go back to passing the zone along with the lruvec down
vmscan.c paths?  I agree it's ugly to pass both, given their
relationship.  But I don't think the backpointer is any cleaner but in
addition less robust.

That being said, the crashing code in particular makes me wonder:

static __always_inline void add_page_to_lru_list(struct page *page,
				struct lruvec *lruvec, enum lru_list lru)
{
	int nr_pages = hpage_nr_pages(page);
	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
	list_add(&page->lru, &lruvec->lists[lru]);
	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
}

Why did we ever pass zone in here and then felt the need to replace it
with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
A page does not roam between zones, its zone is a static property that
can be retrieved with page_zone().

Johannes

> [   63.413436] BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
> [   63.414161] IP: [<ffffffff811870b9>] __mod_zone_page_state+0x9/0x60
> [   63.414161] PGD 0 
> [   63.414161] Oops: 0000 [#1] SMP 
> [   63.414161] Modules linked in: acpi_memhotplug binfmt_misc dm_mirror dm_region_hash dm_log dm_mod ppdev sg microcode pcspkr virtio_console virtio_balloon snd_intel8x0 snd_ac9
> 7_codec ac97_bus snd_seq snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc e1000 i2c_piix4 i2c_core floppy parport_pc parport sr_mod cdrom virtio_blk pata_acpi ata_g
> eneric ata_piix libata scsi_mod
> [   63.414161] CPU 2 
> [   63.414161] Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
> ...
> [   63.414161] Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
> [   63.414161] Stack:
> [   63.414161]  ffff880039abdaf8 ffffffff8117390f ffff880039abdaf8 000000008167c601
> [   63.414161]  ffffffff81174162 ffff88003a480f00 0000000000000001 ffff8800395e0000
> [   63.414161]  ffff88003dbd0e80 0000000000000282 ffff880039abdb48 ffffffff81174181
> [   63.414161] Call Trace:
> [   63.414161]  [<ffffffff8117390f>] __pagevec_lru_add_fn+0xdf/0x140
> [   63.414161]  [<ffffffff81174162>] ? pagevec_lru_move_fn+0x92/0x100
> [   63.414161]  [<ffffffff81174181>] pagevec_lru_move_fn+0xb1/0x100
> [   63.414161]  [<ffffffff81173830>] ? lru_add_page_tail+0x1b0/0x1b0
> [   63.414161]  [<ffffffff811dff71>] ? exec_mmap+0x121/0x230
> [   63.414161]  [<ffffffff811741ec>] __pagevec_lru_add+0x1c/0x30
> [   63.414161]  [<ffffffff81174383>] lru_add_drain_cpu+0xa3/0x130
> [   63.414161]  [<ffffffff8117443f>] lru_add_drain+0x2f/0x40
> [   63.414161]  [<ffffffff81196da9>] exit_mmap+0x69/0x160
> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
> [   63.414161]  [<ffffffff81069c37>] mmput+0x77/0x100
> [   63.414161]  [<ffffffff811dffc0>] exec_mmap+0x170/0x230
> [   63.414161]  [<ffffffff811e0152>] flush_old_exec+0xd2/0x140
> [   63.414161]  [<ffffffff812343ea>] load_elf_binary+0x32a/0xe70
> [   63.414161]  [<ffffffff810d700d>] ? trace_hardirqs_off+0xd/0x10
> [   63.414161]  [<ffffffff810b231f>] ? local_clock+0x6f/0x80
> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
> [   63.414161]  [<ffffffff810dd813>] ? __lock_release+0x133/0x1a0
> [   63.414161]  [<ffffffff811e22e7>] ? search_binary_handler+0x1a7/0x4a0
> [   63.414161]  [<ffffffff811e22f3>] search_binary_handler+0x1b3/0x4a0
> [   63.414161]  [<ffffffff811e2194>] ? search_binary_handler+0x54/0x4a0
> [   63.414161]  [<ffffffff812340c0>] ? set_brk+0xe0/0xe0
> [   63.414161]  [<ffffffff811e284f>] do_execve_common+0x26f/0x320
> [   63.414161]  [<ffffffff811bde33>] ? kmem_cache_alloc+0x113/0x220
> [   63.414161]  [<ffffffff811e298a>] do_execve+0x3a/0x40
> [   63.414161]  [<ffffffff8102061a>] sys_execve+0x4a/0x80
> [   63.414161]  [<ffffffff81686c6c>] stub_execve+0x6c/0xc0
> [   63.414161] Code: ff 03 00 00 48 c1 e7 0b 48 c1 e2 07 48 29 d7 48 03 3c c5 c0 27 d2 81 e8 a6 fe ff ff c9 c3 0f 1f 40 00 55 48 89 e5 0f 1f 44 00 00 <48> 8b 4f 60 89 f6 48 8d 44 31 40 65 44 8a 40 02 45 0f be c0 41 
> 
> The reason is that we don't update root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone
> when onlining the node, and we try to access it.
> 
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> ---
>  include/linux/memcontrol.h |    7 +++++++
>  mm/memcontrol.c            |   14 ++++++++++++++
>  mm/memory_hotplug.c        |    2 ++
>  3 files changed, 23 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 8d9489f..87d8b77 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -182,6 +182,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>  						unsigned long *total_scanned);
>  
>  void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
> +
> +void update_root_mem_cgroup(int nid);
> +
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  void mem_cgroup_split_huge_fixup(struct page *head);
>  #endif
> @@ -374,6 +377,10 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
>  				struct page *newpage)
>  {
>  }
> +
> +static inline void update_root_mem_cgroup(int nid)
> +{
> +}
>  #endif /* CONFIG_MEMCG */
>  
>  #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 795e525..c997a46 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3427,6 +3427,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
>  	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
>  }
>  
> +/* NODE_DATA(nid) is changed */
> +void update_root_mem_cgroup(int nid)
> +{
> +	struct mem_cgroup_per_node *pn;
> +	struct mem_cgroup_per_zone *mz;
> +	int zone;
> +
> +	pn = root_mem_cgroup->info.nodeinfo[nid];
> +	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> +		mz = &pn->zoneinfo[zone];
> +		lruvec_init(&mz->lruvec, &NODE_DATA(nid)->node_zones[zone]);
> +	}
> +}
> +
>  #ifdef CONFIG_DEBUG_VM
>  static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
>  {
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 3ad25f9..bf03b02 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -555,6 +555,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
>  
>  	/* we can use NODE_DATA(nid) from here */
>  
> +	update_root_mem_cgroup(nid);
> +
>  	/* init node's zones as empty zones, we don't have any present pages.*/
>  	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
>  
> -- 
> 1.7.1

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-13 20:59 ` Johannes Weiner
@ 2012-09-14  1:36   ` Hugh Dickins
  2012-09-14  1:53     ` Wen Congyang
                       ` (5 more replies)
  2012-09-14  1:46   ` Wen Congyang
  1 sibling, 6 replies; 21+ messages in thread
From: Hugh Dickins @ 2012-09-14  1:36 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Wen Congyang, linux-kernel, cgroups, linux-mm, Jiang Liu, mhocko,
	bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

On Thu, 13 Sep 2012, Johannes Weiner wrote:
> On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
> > root_mem_cgroup->info.nodeinfo is initialized when the system boots.
> > But NODE_DATA(nid) is null if the node is not onlined, so
> > root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
> > an invalid pointer. If we use numactl to bind a program to the node
> > after onlining the node and its memory, it will cause the kernel
> > panicked:
> 
> Is there any chance we could get rid of the zone backpointer in lruvec
> again instead?

It could be done, but it would make me sad :(

> Adding new nodes is a rare event and so updating every
> single memcg in the system might be just borderline crazy.

Not horribly crazy, but rather ugly, yes.

> But can't
> we just go back to passing the zone along with the lruvec down
> vmscan.c paths?  I agree it's ugly to pass both, given their
> relationship.  But I don't think the backpointer is any cleaner but in
> addition less robust.

It's like how we use vma->mm: we could change everywhere to pass mm with
vma, but it looks cleaner and cuts down on long arglists to have mm in vma.
From past experience, one of the things I worried about was adding extra
args to the reclaim stack.

> 
> That being said, the crashing code in particular makes me wonder:
> 
> static __always_inline void add_page_to_lru_list(struct page *page,
> 				struct lruvec *lruvec, enum lru_list lru)
> {
> 	int nr_pages = hpage_nr_pages(page);
> 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
> 	list_add(&page->lru, &lruvec->lists[lru]);
> 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
> }
> 
> Why did we ever pass zone in here and then felt the need to replace it
> with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
> A page does not roam between zones, its zone is a static property that
> can be retrieved with page_zone().

Just as in vmscan.c, we have the lruvec to hand, and that's what we
mainly want to operate upon, but there is also some need for zone.

(Both Konstantin and I were looking towards the day when we move the
lru_lock into the lruvec, removing more dependence on "zone".  Pretty
much the only reason that hasn't happened yet, is that we have not found
time to make a performance case convincingly - but that's another topic.)

Yes, page_zone(page) is a static property of the page, but it's not
necessarily cheap to evaluate: depends on how complex the memory model
and the spare page flags space, doesn't it?  We both preferred to
derive zone from lruvec where convenient.

How do you feel about this patch, and does it work for you guys?

You'd be right if you guessed that I started out without the
mem_cgroup_zone_lruvec part of it, but oops in get_scan_count
told me that's needed too.

Description to be filled in later: would it be needed for -stable,
or is onlining already broken in other ways that you're now fixing up?

Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
---

 include/linux/mmzone.h |    2 -
 mm/memcontrol.c        |   40 ++++++++++++++++++++++++++++++++-------
 mm/mmzone.c            |    6 -----
 mm/page_alloc.c        |    2 -
 4 files changed, 36 insertions(+), 14 deletions(-)

--- 3.6-rc5/include/linux/mmzone.h	2012-08-03 08:31:26.892842267 -0700
+++ linux/include/linux/mmzone.h	2012-09-13 17:07:51.893772372 -0700
@@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
 				     unsigned long size,
 				     enum memmap_context context);
 
-extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+extern void lruvec_init(struct lruvec *lruvec);
 
 static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 {
--- 3.6-rc5/mm/memcontrol.c	2012-08-03 08:31:27.060842270 -0700
+++ linux/mm/memcontrol.c	2012-09-13 17:46:36.870804625 -0700
@@ -1061,12 +1061,25 @@ struct lruvec *mem_cgroup_zone_lruvec(st
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here.
+	 */
+	if (unlikely(lruvec->zone != zone)) {
+		VM_BUG_ON(lruvec->zone);
+		lruvec->zone = zone;
+	}
+	return lruvec;
 }
 
 /*
@@ -1093,9 +1106,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
@@ -1113,7 +1129,17 @@ struct lruvec *mem_cgroup_page_lruvec(st
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(memcg, page);
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here.
+	 */
+	if (unlikely(lruvec->zone != zone)) {
+		VM_BUG_ON(lruvec->zone);
+		lruvec->zone = zone;
+	}
+	return lruvec;
 }
 
 /**
@@ -4742,7 +4768,7 @@ static int alloc_mem_cgroup_per_zone_inf
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+		lruvec_init(&mz->lruvec);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
--- 3.6-rc5/mm/mmzone.c	2012-08-03 08:31:27.064842271 -0700
+++ linux/mm/mmzone.c	2012-09-13 17:06:28.921766001 -0700
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
 
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
 	enum lru_list lru;
 
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
-
-#ifdef CONFIG_MEMCG
-	lruvec->zone = zone;
-#endif
 }
--- 3.6-rc5/mm/page_alloc.c	2012-08-22 14:25:39.508279046 -0700
+++ linux/mm/page_alloc.c	2012-09-13 17:06:08.265763526 -0700
@@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		lruvec_init(&zone->lruvec, zone);
+		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-13 20:59 ` Johannes Weiner
  2012-09-14  1:36   ` Hugh Dickins
@ 2012-09-14  1:46   ` Wen Congyang
  2012-09-17  6:40     ` Hugh Dickins
  1 sibling, 1 reply; 21+ messages in thread
From: Wen Congyang @ 2012-09-14  1:46 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-kernel, cgroups, linux-mm, Jiang Liu, mhocko, bsingharora,
	KAMEZAWA Hiroyuki, Andrew Morton, hughd, paul.gortmaker

At 09/14/2012 04:59 AM, Johannes Weiner Wrote:
> Hi,
> 
> On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
>> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
>> But NODE_DATA(nid) is null if the node is not onlined, so
>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
>> an invalid pointer. If we use numactl to bind a program to the node
>> after onlining the node and its memory, it will cause the kernel
>> panicked:
> 
> Is there any chance we could get rid of the zone backpointer in lruvec
> again instead?  Adding new nodes is a rare event and so updating every
> single memcg in the system might be just borderline crazy.  But can't
> we just go back to passing the zone along with the lruvec down
> vmscan.c paths?  I agree it's ugly to pass both, given their
> relationship.  But I don't think the backpointer is any cleaner but in
> addition less robust.
> 
> That being said, the crashing code in particular makes me wonder:
> 
> static __always_inline void add_page_to_lru_list(struct page *page,
> 				struct lruvec *lruvec, enum lru_list lru)
> {
> 	int nr_pages = hpage_nr_pages(page);
> 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
> 	list_add(&page->lru, &lruvec->lists[lru]);
> 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
> }
> 
> Why did we ever pass zone in here and then felt the need to replace it
> with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
> A page does not roam between zones, its zone is a static property that
> can be retrieved with page_zone().

Hmm, if we don't update lruvec_zone(lruvec) when a node is onlined,
lruvec_zone(lruvec) contains an invalid pointer, so we would have to
check every place that accesses lruvec_zone(lruvec). We store zone in
lruvec but we can't access it, and we can't avoid accessing it in the
future. So I think it is better to update it when the node is
onlined.

Thanks
Wen Congyang

> 
> Johannes
> 
>> [   63.413436] BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
>> [   63.414161] IP: [<ffffffff811870b9>] __mod_zone_page_state+0x9/0x60
>> [   63.414161] PGD 0 
>> [   63.414161] Oops: 0000 [#1] SMP 
>> [   63.414161] Modules linked in: acpi_memhotplug binfmt_misc dm_mirror dm_region_hash dm_log dm_mod ppdev sg microcode pcspkr virtio_console virtio_balloon snd_intel8x0 snd_ac9
>> 7_codec ac97_bus snd_seq snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc e1000 i2c_piix4 i2c_core floppy parport_pc parport sr_mod cdrom virtio_blk pata_acpi ata_g
>> eneric ata_piix libata scsi_mod
>> [   63.414161] CPU 2 
>> [   63.414161] Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
>> ...
>> [   63.414161] Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
>> [   63.414161] Stack:
>> [   63.414161]  ffff880039abdaf8 ffffffff8117390f ffff880039abdaf8 000000008167c601
>> [   63.414161]  ffffffff81174162 ffff88003a480f00 0000000000000001 ffff8800395e0000
>> [   63.414161]  ffff88003dbd0e80 0000000000000282 ffff880039abdb48 ffffffff81174181
>> [   63.414161] Call Trace:
>> [   63.414161]  [<ffffffff8117390f>] __pagevec_lru_add_fn+0xdf/0x140
>> [   63.414161]  [<ffffffff81174162>] ? pagevec_lru_move_fn+0x92/0x100
>> [   63.414161]  [<ffffffff81174181>] pagevec_lru_move_fn+0xb1/0x100
>> [   63.414161]  [<ffffffff81173830>] ? lru_add_page_tail+0x1b0/0x1b0
>> [   63.414161]  [<ffffffff811dff71>] ? exec_mmap+0x121/0x230
>> [   63.414161]  [<ffffffff811741ec>] __pagevec_lru_add+0x1c/0x30
>> [   63.414161]  [<ffffffff81174383>] lru_add_drain_cpu+0xa3/0x130
>> [   63.414161]  [<ffffffff8117443f>] lru_add_drain+0x2f/0x40
>> [   63.414161]  [<ffffffff81196da9>] exit_mmap+0x69/0x160
>> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
>> [   63.414161]  [<ffffffff81069c37>] mmput+0x77/0x100
>> [   63.414161]  [<ffffffff811dffc0>] exec_mmap+0x170/0x230
>> [   63.414161]  [<ffffffff811e0152>] flush_old_exec+0xd2/0x140
>> [   63.414161]  [<ffffffff812343ea>] load_elf_binary+0x32a/0xe70
>> [   63.414161]  [<ffffffff810d700d>] ? trace_hardirqs_off+0xd/0x10
>> [   63.414161]  [<ffffffff810b231f>] ? local_clock+0x6f/0x80
>> [   63.414161]  [<ffffffff810db115>] ? lock_release_holdtime+0x35/0x1a0
>> [   63.414161]  [<ffffffff810dd813>] ? __lock_release+0x133/0x1a0
>> [   63.414161]  [<ffffffff811e22e7>] ? search_binary_handler+0x1a7/0x4a0
>> [   63.414161]  [<ffffffff811e22f3>] search_binary_handler+0x1b3/0x4a0
>> [   63.414161]  [<ffffffff811e2194>] ? search_binary_handler+0x54/0x4a0
>> [   63.414161]  [<ffffffff812340c0>] ? set_brk+0xe0/0xe0
>> [   63.414161]  [<ffffffff811e284f>] do_execve_common+0x26f/0x320
>> [   63.414161]  [<ffffffff811bde33>] ? kmem_cache_alloc+0x113/0x220
>> [   63.414161]  [<ffffffff811e298a>] do_execve+0x3a/0x40
>> [   63.414161]  [<ffffffff8102061a>] sys_execve+0x4a/0x80
>> [   63.414161]  [<ffffffff81686c6c>] stub_execve+0x6c/0xc0
>> [   63.414161] Code: ff 03 00 00 48 c1 e7 0b 48 c1 e2 07 48 29 d7 48 03 3c c5 c0 27 d2 81 e8 a6 fe ff ff c9 c3 0f 1f 40 00 55 48 89 e5 0f 1f 44 00 00 <48> 8b 4f 60 89 f6 48 8d 44 31 40 65 44 8a 40 02 45 0f be c0 41 
>>
>> The reason is that we don't update root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone
>> when onlining the node, and we try to access it.
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
>> ---
>>  include/linux/memcontrol.h |    7 +++++++
>>  mm/memcontrol.c            |   14 ++++++++++++++
>>  mm/memory_hotplug.c        |    2 ++
>>  3 files changed, 23 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 8d9489f..87d8b77 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -182,6 +182,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>  						unsigned long *total_scanned);
>>  
>>  void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
>> +
>> +void update_root_mem_cgroup(int nid);
>> +
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>  void mem_cgroup_split_huge_fixup(struct page *head);
>>  #endif
>> @@ -374,6 +377,10 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
>>  				struct page *newpage)
>>  {
>>  }
>> +
>> +static inline void update_root_mem_cgroup(int nid)
>> +{
>> +}
>>  #endif /* CONFIG_MEMCG */
>>  
>>  #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 795e525..c997a46 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3427,6 +3427,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
>>  	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
>>  }
>>  
>> +/* NODE_DATA(nid) is changed */
>> +void update_root_mem_cgroup(int nid)
>> +{
>> +	struct mem_cgroup_per_node *pn;
>> +	struct mem_cgroup_per_zone *mz;
>> +	int zone;
>> +
>> +	pn = root_mem_cgroup->info.nodeinfo[nid];
>> +	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
>> +		mz = &pn->zoneinfo[zone];
>> +		lruvec_init(&mz->lruvec, &NODE_DATA(nid)->node_zones[zone]);
>> +	}
>> +}
>> +
>>  #ifdef CONFIG_DEBUG_VM
>>  static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
>>  {
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 3ad25f9..bf03b02 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -555,6 +555,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
>>  
>>  	/* we can use NODE_DATA(nid) from here */
>>  
>> +	update_root_mem_cgroup(nid);
>> +
>>  	/* init node's zones as empty zones, we don't have any present pages.*/
>>  	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
>>  
>> -- 
>> 1.7.1
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-14  1:36   ` Hugh Dickins
@ 2012-09-14  1:53     ` Wen Congyang
  2012-09-14 15:46     ` Johannes Weiner
                       ` (4 subsequent siblings)
  5 siblings, 0 replies; 21+ messages in thread
From: Wen Congyang @ 2012-09-14  1:53 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Johannes Weiner, linux-kernel, cgroups, linux-mm, Jiang Liu,
	mhocko, bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

At 09/14/2012 09:36 AM, Hugh Dickins Wrote:
> On Thu, 13 Sep 2012, Johannes Weiner wrote:
>> On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
>>> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
>>> But NODE_DATA(nid) is null if the node is not onlined, so
>>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
>>> an invalid pointer. If we use numactl to bind a program to the node
>>> after onlining the node and its memory, it will cause the kernel
>>> panicked:
>>
>> Is there any chance we could get rid of the zone backpointer in lruvec
>> again instead?
> 
> It could be done, but it would make me sad :(
> 
>> Adding new nodes is a rare event and so updating every
>> single memcg in the system might be just borderline crazy.
> 
> Not horribly crazy, but rather ugly, yes.
> 
>> But can't
>> we just go back to passing the zone along with the lruvec down
>> vmscan.c paths?  I agree it's ugly to pass both, given their
>> relationship.  But I don't think the backpointer is any cleaner but in
>> addition less robust.
> 
> It's like how we use vma->mm: we could change everywhere to pass mm with
> vma, but it looks cleaner and cuts down on long arglists to have mm in vma.
>>From past experience, one of the things I worried about was adding extra
> args to the reclaim stack.
> 
>>
>> That being said, the crashing code in particular makes me wonder:
>>
>> static __always_inline void add_page_to_lru_list(struct page *page,
>> 				struct lruvec *lruvec, enum lru_list lru)
>> {
>> 	int nr_pages = hpage_nr_pages(page);
>> 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
>> 	list_add(&page->lru, &lruvec->lists[lru]);
>> 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
>> }
>>
>> Why did we ever pass zone in here and then felt the need to replace it
>> with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
>> A page does not roam between zones, its zone is a static property that
>> can be retrieved with page_zone().
> 
> Just as in vmscan.c, we have the lruvec to hand, and that's what we
> mainly want to operate upon, but there is also some need for zone.
> 
> (Both Konstantin and I were looking towards the day when we move the
> lru_lock into the lruvec, removing more dependence on "zone".  Pretty
> much the only reason that hasn't happened yet, is that we have not found
> time to make a performance case convincingly - but that's another topic.)
> 
> Yes, page_zone(page) is a static property of the page, but it's not
> necessarily cheap to evaluate: depends on how complex the memory model
> and the spare page flags space, doesn't it?  We both preferred to
> derive zone from lruvec where convenient.
> 
> How do you feel about this patch, and does it work for you guys?
> 
> You'd be right if you guessed that I started out without the
> mem_cgroup_zone_lruvec part of it, but oops in get_scan_count
> told me that's needed too.
> 
> Description to be filled in later: would it be needed for -stable,
> or is onlining already broken in other ways that you're now fixing up?

We will test your patch; please give us some time.

> 
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> Signed-off-by: Hugh Dickins <hughd@google.com>
> ---
> 
>  include/linux/mmzone.h |    2 -
>  mm/memcontrol.c        |   40 ++++++++++++++++++++++++++++++++-------
>  mm/mmzone.c            |    6 -----
>  mm/page_alloc.c        |    2 -
>  4 files changed, 36 insertions(+), 14 deletions(-)
> 
> --- 3.6-rc5/include/linux/mmzone.h	2012-08-03 08:31:26.892842267 -0700
> +++ linux/include/linux/mmzone.h	2012-09-13 17:07:51.893772372 -0700
> @@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
>  				     unsigned long size,
>  				     enum memmap_context context);
>  
> -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
> +extern void lruvec_init(struct lruvec *lruvec);
>  
>  static inline struct zone *lruvec_zone(struct lruvec *lruvec)
>  {
> --- 3.6-rc5/mm/memcontrol.c	2012-08-03 08:31:27.060842270 -0700
> +++ linux/mm/memcontrol.c	2012-09-13 17:46:36.870804625 -0700
> @@ -1061,12 +1061,25 @@ struct lruvec *mem_cgroup_zone_lruvec(st
>  				      struct mem_cgroup *memcg)
>  {
>  	struct mem_cgroup_per_zone *mz;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);
> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>  }
>  
>  /*
> @@ -1093,9 +1106,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  	struct mem_cgroup_per_zone *mz;
>  	struct mem_cgroup *memcg;
>  	struct page_cgroup *pc;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	pc = lookup_page_cgroup(page);
>  	memcg = pc->mem_cgroup;
> @@ -1113,7 +1129,17 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  		pc->mem_cgroup = memcg = root_mem_cgroup;
>  
>  	mz = page_cgroup_zoneinfo(memcg, page);
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);

According to your comment, lruvec->zone != zone if a node is onlined.
So it is not a bug, and VM_BUG_ON() should not be used here.

Thanks
Wen Congyang

> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>  }
>  
>  /**
> @@ -4742,7 +4768,7 @@ static int alloc_mem_cgroup_per_zone_inf
>  
>  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
>  		mz = &pn->zoneinfo[zone];
> -		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
> +		lruvec_init(&mz->lruvec);
>  		mz->usage_in_excess = 0;
>  		mz->on_tree = false;
>  		mz->memcg = memcg;
> --- 3.6-rc5/mm/mmzone.c	2012-08-03 08:31:27.064842271 -0700
> +++ linux/mm/mmzone.c	2012-09-13 17:06:28.921766001 -0700
> @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
>  }
>  #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
>  
> -void lruvec_init(struct lruvec *lruvec, struct zone *zone)
> +void lruvec_init(struct lruvec *lruvec)
>  {
>  	enum lru_list lru;
>  
> @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
>  
>  	for_each_lru(lru)
>  		INIT_LIST_HEAD(&lruvec->lists[lru]);
> -
> -#ifdef CONFIG_MEMCG
> -	lruvec->zone = zone;
> -#endif
>  }
> --- 3.6-rc5/mm/page_alloc.c	2012-08-22 14:25:39.508279046 -0700
> +++ linux/mm/page_alloc.c	2012-09-13 17:06:08.265763526 -0700
> @@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
>  		zone->zone_pgdat = pgdat;
>  
>  		zone_pcp_init(zone);
> -		lruvec_init(&zone->lruvec, zone);
> +		lruvec_init(&zone->lruvec);
>  		if (!size)
>  			continue;
>  
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-14  1:36   ` Hugh Dickins
  2012-09-14  1:53     ` Wen Congyang
@ 2012-09-14 15:46     ` Johannes Weiner
  2012-09-15 10:56     ` Konstantin Khlebnikov
                       ` (3 subsequent siblings)
  5 siblings, 0 replies; 21+ messages in thread
From: Johannes Weiner @ 2012-09-14 15:46 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Wen Congyang, linux-kernel, cgroups, linux-mm, Jiang Liu, mhocko,
	bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

On Thu, Sep 13, 2012 at 06:36:34PM -0700, Hugh Dickins wrote:
> On Thu, 13 Sep 2012, Johannes Weiner wrote:
> > On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
> > > root_mem_cgroup->info.nodeinfo is initialized when the system boots.
> > > But NODE_DATA(nid) is null if the node is not onlined, so
> > > root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
> > > an invalid pointer. If we use numactl to bind a program to the node
> > > after onlining the node and its memory, it will cause the kernel
> > > panicked:
> > 
> > Is there any chance we could get rid of the zone backpointer in lruvec
> > again instead?
> 
> It could be done, but it would make me sad :(

We would not want that!

> > But can't
> > we just go back to passing the zone along with the lruvec down
> > vmscan.c paths?  I agree it's ugly to pass both, given their
> > relationship.  But I don't think the backpointer is any cleaner but in
> > addition less robust.
> 
> It's like how we use vma->mm: we could change everywhere to pass mm with
> vma, but it looks cleaner and cuts down on long arglists to have mm in vma.
> >From past experience, one of the things I worried about was adding extra
> args to the reclaim stack.

Ok, you certainly have a point.

> > That being said, the crashing code in particular makes me wonder:
> > 
> > static __always_inline void add_page_to_lru_list(struct page *page,
> > 				struct lruvec *lruvec, enum lru_list lru)
> > {
> > 	int nr_pages = hpage_nr_pages(page);
> > 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
> > 	list_add(&page->lru, &lruvec->lists[lru]);
> > 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
> > }
> > 
> > Why did we ever pass zone in here and then felt the need to replace it
> > with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
> > A page does not roam between zones, its zone is a static property that
> > can be retrieved with page_zone().
> 
> Just as in vmscan.c, we have the lruvec to hand, and that's what we
> mainly want to operate upon, but there is also some need for zone.
> 
> (Both Konstantin and I were looking towards the day when we move the
> lru_lock into the lruvec, removing more dependence on "zone".  Pretty
> much the only reason that hasn't happened yet, is that we have not found
> time to make a performance case convincingly - but that's another topic.)
> 
> Yes, page_zone(page) is a static property of the page, but it's not
> necessarily cheap to evaluate: depends on how complex the memory model
> and the spare page flags space, doesn't it?  We both preferred to
> derive zone from lruvec where convenient.
> 
> How do you feel about this patch, and does it work for you guys?
> 
> You'd be right if you guessed that I started out without the
> mem_cgroup_zone_lruvec part of it, but oops in get_scan_count
> told me that's needed too.
> 
> Description to be filled in later: would it be needed for -stable,
> or is onlining already broken in other ways that you're now fixing up?
> 
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> Signed-off-by: Hugh Dickins <hughd@google.com>

This looks good to me, thanks.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-14  1:36   ` Hugh Dickins
  2012-09-14  1:53     ` Wen Congyang
  2012-09-14 15:46     ` Johannes Weiner
@ 2012-09-15 10:56     ` Konstantin Khlebnikov
  2012-09-17  5:50     ` Wen Congyang
                       ` (2 subsequent siblings)
  5 siblings, 0 replies; 21+ messages in thread
From: Konstantin Khlebnikov @ 2012-09-15 10:56 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Johannes Weiner, Wen Congyang, linux-kernel, cgroups, linux-mm,
	Jiang Liu, mhocko, bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	paul.gortmaker

Hugh Dickins wrote:
> On Thu, 13 Sep 2012, Johannes Weiner wrote:
>> On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
>>> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
>>> But NODE_DATA(nid) is null if the node is not onlined, so
>>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
>>> an invalid pointer. If we use numactl to bind a program to the node
>>> after onlining the node and its memory, it will cause the kernel
>>> panicked:
>>
>> Is there any chance we could get rid of the zone backpointer in lruvec
>> again instead?
>
> It could be done, but it would make me sad :(

Me too

>
>> Adding new nodes is a rare event and so updating every
>> single memcg in the system might be just borderline crazy.
>
> Not horribly crazy, but rather ugly, yes.
>
>> But can't
>> we just go back to passing the zone along with the lruvec down
>> vmscan.c paths?  I agree it's ugly to pass both, given their
>> relationship.  But I don't think the backpointer is any cleaner but in
>> addition less robust.
>
> It's like how we use vma->mm: we could change everywhere to pass mm with
> vma, but it looks cleaner and cuts down on long arglists to have mm in vma.
>  From past experience, one of the things I worried about was adding extra
> args to the reclaim stack.
>
>>
>> That being said, the crashing code in particular makes me wonder:
>>
>> static __always_inline void add_page_to_lru_list(struct page *page,
>> 				struct lruvec *lruvec, enum lru_list lru)
>> {
>> 	int nr_pages = hpage_nr_pages(page);
>> 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
>> 	list_add(&page->lru,&lruvec->lists[lru]);
>> 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
>> }
>>
>> Why did we ever pass zone in here and then felt the need to replace it
>> with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
>> A page does not roam between zones, its zone is a static property that
>> can be retrieved with page_zone().
>
> Just as in vmscan.c, we have the lruvec to hand, and that's what we
> mainly want to operate upon, but there is also some need for zone.
>
> (Both Konstantin and I were looking towards the day when we move the
> lru_lock into the lruvec, removing more dependence on "zone".  Pretty
> much the only reason that hasn't happened yet, is that we have not found
> time to make a performance case convincingly - but that's another topic.)
>
> Yes, page_zone(page) is a static property of the page, but it's not
> necessarily cheap to evaluate: depends on how complex the memory model
> and the spare page flags space, doesn't it?  We both preferred to
> derive zone from lruvec where convenient.
>
> How do you feel about this patch, and does it work for you guys?
>
> You'd be right if you guessed that I started out without the
> mem_cgroup_zone_lruvec part of it, but oops in get_scan_count
> told me that's needed too.
>
> Description to be filled in later: would it be needed for -stable,
> or is onlining already broken in other ways that you're now fixing up?
>
> Reported-by: Tang Chen<tangchen@cn.fujitsu.com>
> Signed-off-by: Hugh Dickins<hughd@google.com>
> ---
>
>   include/linux/mmzone.h |    2 -
>   mm/memcontrol.c        |   40 ++++++++++++++++++++++++++++++++-------
>   mm/mmzone.c            |    6 -----
>   mm/page_alloc.c        |    2 -
>   4 files changed, 36 insertions(+), 14 deletions(-)
>
> --- 3.6-rc5/include/linux/mmzone.h	2012-08-03 08:31:26.892842267 -0700
> +++ linux/include/linux/mmzone.h	2012-09-13 17:07:51.893772372 -0700
> @@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
>   				     unsigned long size,
>   				     enum memmap_context context);
>
> -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
> +extern void lruvec_init(struct lruvec *lruvec);
>
>   static inline struct zone *lruvec_zone(struct lruvec *lruvec)
>   {
> --- 3.6-rc5/mm/memcontrol.c	2012-08-03 08:31:27.060842270 -0700
> +++ linux/mm/memcontrol.c	2012-09-13 17:46:36.870804625 -0700
> @@ -1061,12 +1061,25 @@ struct lruvec *mem_cgroup_zone_lruvec(st
>   				      struct mem_cgroup *memcg)
>   {
>   	struct mem_cgroup_per_zone *mz;
> +	struct lruvec *lruvec;
>
> -	if (mem_cgroup_disabled())
> -		return&zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec =&zone->lruvec;
> +		goto out;
> +	}
>
>   	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
> -	return&mz->lruvec;
> +	lruvec =&mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);
> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>   }

Whoaaa, this makes me dizzy. I would prefer a hook in register_one_node().
BTW, it would be nice to add notifier chains to memory hotplug.

>
>   /*
> @@ -1093,9 +1106,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
>   	struct mem_cgroup_per_zone *mz;
>   	struct mem_cgroup *memcg;
>   	struct page_cgroup *pc;
> +	struct lruvec *lruvec;
>
> -	if (mem_cgroup_disabled())
> -		return&zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec =&zone->lruvec;
> +		goto out;
> +	}
>
>   	pc = lookup_page_cgroup(page);
>   	memcg = pc->mem_cgroup;
> @@ -1113,7 +1129,17 @@ struct lruvec *mem_cgroup_page_lruvec(st
>   		pc->mem_cgroup = memcg = root_mem_cgroup;
>
>   	mz = page_cgroup_zoneinfo(memcg, page);
> -	return&mz->lruvec;
> +	lruvec =&mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);
> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>   }
>
>   /**
> @@ -4742,7 +4768,7 @@ static int alloc_mem_cgroup_per_zone_inf
>
>   	for (zone = 0; zone<  MAX_NR_ZONES; zone++) {
>   		mz =&pn->zoneinfo[zone];
> -		lruvec_init(&mz->lruvec,&NODE_DATA(node)->node_zones[zone]);
> +		lruvec_init(&mz->lruvec);
>   		mz->usage_in_excess = 0;
>   		mz->on_tree = false;
>   		mz->memcg = memcg;
> --- 3.6-rc5/mm/mmzone.c	2012-08-03 08:31:27.064842271 -0700
> +++ linux/mm/mmzone.c	2012-09-13 17:06:28.921766001 -0700
> @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
>   }
>   #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
>
> -void lruvec_init(struct lruvec *lruvec, struct zone *zone)
> +void lruvec_init(struct lruvec *lruvec)
>   {
>   	enum lru_list lru;
>
> @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
>
>   	for_each_lru(lru)
>   		INIT_LIST_HEAD(&lruvec->lists[lru]);
> -
> -#ifdef CONFIG_MEMCG
> -	lruvec->zone = zone;
> -#endif
>   }
> --- 3.6-rc5/mm/page_alloc.c	2012-08-22 14:25:39.508279046 -0700
> +++ linux/mm/page_alloc.c	2012-09-13 17:06:08.265763526 -0700
> @@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
>   		zone->zone_pgdat = pgdat;
>
>   		zone_pcp_init(zone);
> -		lruvec_init(&zone->lruvec, zone);
> +		lruvec_init(&zone->lruvec);
>   		if (!size)
>   			continue;
>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-14  1:36   ` Hugh Dickins
                       ` (2 preceding siblings ...)
  2012-09-15 10:56     ` Konstantin Khlebnikov
@ 2012-09-17  5:50     ` Wen Congyang
  2012-10-16  5:58     ` Wen Congyang
  2012-10-16  7:21     ` [PATCH] memory cgroup: update root memory cgroup when node is onlined Kamezawa Hiroyuki
  5 siblings, 0 replies; 21+ messages in thread
From: Wen Congyang @ 2012-09-17  5:50 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Johannes Weiner, linux-kernel, cgroups, linux-mm, Jiang Liu,
	mhocko, bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

At 09/14/2012 09:36 AM, Hugh Dickins Wrote:
> On Thu, 13 Sep 2012, Johannes Weiner wrote:
>> On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
>>> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
>>> But NODE_DATA(nid) is null if the node is not onlined, so
>>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
>>> an invalid pointer. If we use numactl to bind a program to the node
>>> after onlining the node and its memory, it will cause the kernel
>>> panicked:
>>
>> Is there any chance we could get rid of the zone backpointer in lruvec
>> again instead?
> 
> It could be done, but it would make me sad :(
> 
>> Adding new nodes is a rare event and so updating every
>> single memcg in the system might be just borderline crazy.
> 
> Not horribly crazy, but rather ugly, yes.
> 
>> But can't
>> we just go back to passing the zone along with the lruvec down
>> vmscan.c paths?  I agree it's ugly to pass both, given their
>> relationship.  But I don't think the backpointer is any cleaner but in
>> addition less robust.
> 
> It's like how we use vma->mm: we could change everywhere to pass mm with
> vma, but it looks cleaner and cuts down on long arglists to have mm in vma.
>>From past experience, one of the things I worried about was adding extra
> args to the reclaim stack.
> 
>>
>> That being said, the crashing code in particular makes me wonder:
>>
>> static __always_inline void add_page_to_lru_list(struct page *page,
>> 				struct lruvec *lruvec, enum lru_list lru)
>> {
>> 	int nr_pages = hpage_nr_pages(page);
>> 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
>> 	list_add(&page->lru, &lruvec->lists[lru]);
>> 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
>> }
>>
>> Why did we ever pass zone in here and then felt the need to replace it
>> with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
>> A page does not roam between zones, its zone is a static property that
>> can be retrieved with page_zone().
> 
> Just as in vmscan.c, we have the lruvec to hand, and that's what we
> mainly want to operate upon, but there is also some need for zone.
> 
> (Both Konstantin and I were looking towards the day when we move the
> lru_lock into the lruvec, removing more dependence on "zone".  Pretty
> much the only reason that hasn't happened yet, is that we have not found
> time to make a performance case convincingly - but that's another topic.)
> 
> Yes, page_zone(page) is a static property of the page, but it's not
> necessarily cheap to evaluate: depends on how complex the memory model
> and the spare page flags space, doesn't it?  We both preferred to
> derive zone from lruvec where convenient.
> 
> How do you feel about this patch, and does it work for you guys?
> 
> You'd be right if you guessed that I started out without the
> mem_cgroup_zone_lruvec part of it, but oops in get_scan_count
> told me that's needed too.
> 
> Description to be filled in later: would it be needed for -stable,
> or is onlining already broken in other ways that you're now fixing up?
> 
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> Signed-off-by: Hugh Dickins <hughd@google.com>
> ---
> 
>  include/linux/mmzone.h |    2 -
>  mm/memcontrol.c        |   40 ++++++++++++++++++++++++++++++++-------
>  mm/mmzone.c            |    6 -----
>  mm/page_alloc.c        |    2 -
>  4 files changed, 36 insertions(+), 14 deletions(-)
> 
> --- 3.6-rc5/include/linux/mmzone.h	2012-08-03 08:31:26.892842267 -0700
> +++ linux/include/linux/mmzone.h	2012-09-13 17:07:51.893772372 -0700
> @@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
>  				     unsigned long size,
>  				     enum memmap_context context);
>  
> -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
> +extern void lruvec_init(struct lruvec *lruvec);
>  
>  static inline struct zone *lruvec_zone(struct lruvec *lruvec)
>  {
> --- 3.6-rc5/mm/memcontrol.c	2012-08-03 08:31:27.060842270 -0700
> +++ linux/mm/memcontrol.c	2012-09-13 17:46:36.870804625 -0700
> @@ -1061,12 +1061,25 @@ struct lruvec *mem_cgroup_zone_lruvec(st
>  				      struct mem_cgroup *memcg)
>  {
>  	struct mem_cgroup_per_zone *mz;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);

If a node is offlined and onlined again, lruvec->zone is not NULL and not
equal to zone, so this line will cause a kernel panic.

> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>  }
>  
>  /*
> @@ -1093,9 +1106,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  	struct mem_cgroup_per_zone *mz;
>  	struct mem_cgroup *memcg;
>  	struct page_cgroup *pc;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	pc = lookup_page_cgroup(page);
>  	memcg = pc->mem_cgroup;
> @@ -1113,7 +1129,17 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  		pc->mem_cgroup = memcg = root_mem_cgroup;
>  
>  	mz = page_cgroup_zoneinfo(memcg, page);
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);

I applied your patch and removed the VM_BUG_ON(). I don't find any error in my
test now.


Thanks
Wen Congyang

> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>  }
>  
>  /**
> @@ -4742,7 +4768,7 @@ static int alloc_mem_cgroup_per_zone_inf
>  
>  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
>  		mz = &pn->zoneinfo[zone];
> -		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
> +		lruvec_init(&mz->lruvec);
>  		mz->usage_in_excess = 0;
>  		mz->on_tree = false;
>  		mz->memcg = memcg;
> --- 3.6-rc5/mm/mmzone.c	2012-08-03 08:31:27.064842271 -0700
> +++ linux/mm/mmzone.c	2012-09-13 17:06:28.921766001 -0700
> @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
>  }
>  #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
>  
> -void lruvec_init(struct lruvec *lruvec, struct zone *zone)
> +void lruvec_init(struct lruvec *lruvec)
>  {
>  	enum lru_list lru;
>  
> @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
>  
>  	for_each_lru(lru)
>  		INIT_LIST_HEAD(&lruvec->lists[lru]);
> -
> -#ifdef CONFIG_MEMCG
> -	lruvec->zone = zone;
> -#endif
>  }
> --- 3.6-rc5/mm/page_alloc.c	2012-08-22 14:25:39.508279046 -0700
> +++ linux/mm/page_alloc.c	2012-09-13 17:06:08.265763526 -0700
> @@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
>  		zone->zone_pgdat = pgdat;
>  
>  		zone_pcp_init(zone);
> -		lruvec_init(&zone->lruvec, zone);
> +		lruvec_init(&zone->lruvec);
>  		if (!size)
>  			continue;
>  
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-14  1:46   ` Wen Congyang
@ 2012-09-17  6:40     ` Hugh Dickins
  0 siblings, 0 replies; 21+ messages in thread
From: Hugh Dickins @ 2012-09-17  6:40 UTC (permalink / raw)
  To: Wen Congyang
  Cc: Johannes Weiner, Konstantin Khlebnikov, linux-kernel, cgroups,
	linux-mm, Jiang Liu, mhocko, bsingharora, KAMEZAWA Hiroyuki,
	Andrew Morton, paul.gortmaker

On Fri, 14 Sep 2012, Wen Congyang wrote:
> 
> Hmm, if we don't update lruvec_zone(lruvec) when a node is onlined,
> lruver_zone(lruvec) contains a invalid pointer, so we should check
> all place that accesses lruvec_zone(lruvec). We store zone in lruvec
> but we can't access it. And we can't avoid to access it in the
> furture. So I think it is better to update it when the node is
> onlined.

Thanks for your feedback (-), and Johannes (+), and Konstantin (-):
though your remarks above appear to be in reply to Johannes, I took
them as applying to my patch.

I think it's okay to update lruvec->zone in mem_cgroup_zone_lruvec()
and mem_cgroup_page_lruvec(), rather than in each place we call
lruvec_zone(lruvec), because the lruvec addressed is not pulled out
of nowhere: it has been supplied by either mem_cgroup_zone_lruvec()
or mem_cgroup_page_lruvec().

But your comment did prompt me to go back and check on that, and
I did find one other place which makes its own determination of the
lruvec: mem_cgroup_force_empty_list().  Now, that happens to be safe
at present because of its list_empty() check, but it's easy to imagine
a future patch which would accidentally make it go wrong: so in the
updated version of my lruvec->zone patch below, I've changed the
assignments at the head of mem_cgroup_force_empty_list() to use
the standard mem_cgroup_zone_lruvec() to determine lruvec.

But I've made another change (in both places) too - though if you've
already been testing the earlier patch, this will not be any worse:
I've removed the VM_BUG_ONs, after realizing that hotremove of
memory node followed by hotadd of memory node would need to
update lruvec->zone even though it's currently non-zero -
unless hotremove goes through updating all memcgs in the
same way as you propose that hotadd does.

I'm not sure how I feel about my patch: really, I think I'd
like to see the "update all memcgs when memory node is onlined and
offlined" patch that you and Konstantin prefer (but I'd prefer one
of you to write), before we make up our minds which way to go.

If it's not too complicated, yes, I may well prefer it myself too;
but I worry that it may turn out to be hard to get the serialization
against mem cgroup creation and destruction right.

Konstantin remarks that it would be nice to have notifier chains
in memory hotplug: I believe that's already there, and, for example,
KSM is one user of hotplug_memory_notifier().

I've seen you comment elsewhere that memory hotremove is not working
with memcg, and I wonder if you've found it far away from working,
or nearly there.  That might determine which patch to choose: if
it's nearly working, then you may need to update all memcgs anyway
to get it fully working, in which case it would probably be right
to handle lruvec->zone that way too.  But if it's hard to get working,
but hotadd nearly right, then my patch may be the right choice for now.

I've still not written the patch comment: it was when I started to
write that, that I realized the hotadd after hotremove issue.

Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
---

 include/linux/mmzone.h |    2 -
 mm/memcontrol.c        |   46 +++++++++++++++++++++++++++++----------
 mm/mmzone.c            |    6 -----
 mm/page_alloc.c        |    2 -
 4 files changed, 38 insertions(+), 18 deletions(-)

--- 3.6-rc6/include/linux/mmzone.h	2012-08-03 08:31:26.892842267 -0700
+++ linux/include/linux/mmzone.h	2012-09-13 17:07:51.893772372 -0700
@@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
 				     unsigned long size,
 				     enum memmap_context context);
 
-extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+extern void lruvec_init(struct lruvec *lruvec);
 
 static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 {
--- 3.6-rc6/mm/memcontrol.c	2012-08-03 08:31:27.060842270 -0700
+++ linux/mm/memcontrol.c	2012-09-16 21:49:28.583284601 -0700
@@ -1061,12 +1061,24 @@ struct lruvec *mem_cgroup_zone_lruvec(st
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /*
@@ -1093,9 +1105,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
@@ -1113,7 +1128,16 @@ struct lruvec *mem_cgroup_page_lruvec(st
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(memcg, page);
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /**
@@ -3694,17 +3718,17 @@ unsigned long mem_cgroup_soft_limit_recl
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
-	list = &mz->lruvec.lists[lru];
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	list = &lruvec->lists[lru];
 
-	loop = mz->lru_size[lru];
+	loop = mem_cgroup_get_lru_size(lruvec, lru);
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
@@ -4742,7 +4766,7 @@ static int alloc_mem_cgroup_per_zone_inf
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+		lruvec_init(&mz->lruvec);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
--- 3.6-rc6/mm/mmzone.c	2012-08-03 08:31:27.064842271 -0700
+++ linux/mm/mmzone.c	2012-09-13 17:06:28.921766001 -0700
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
 
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
 	enum lru_list lru;
 
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
-
-#ifdef CONFIG_MEMCG
-	lruvec->zone = zone;
-#endif
 }
--- 3.6-rc6/mm/page_alloc.c	2012-08-22 14:25:39.508279046 -0700
+++ linux/mm/page_alloc.c	2012-09-13 17:06:08.265763526 -0700
@@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		lruvec_init(&zone->lruvec, zone);
+		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-14  1:36   ` Hugh Dickins
                       ` (3 preceding siblings ...)
  2012-09-17  5:50     ` Wen Congyang
@ 2012-10-16  5:58     ` Wen Congyang
  2012-10-18 19:03       ` Hugh Dickins
  2012-10-16  7:21     ` [PATCH] memory cgroup: update root memory cgroup when node is onlined Kamezawa Hiroyuki
  5 siblings, 1 reply; 21+ messages in thread
From: Wen Congyang @ 2012-10-16  5:58 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Johannes Weiner, linux-kernel, cgroups, linux-mm, Jiang Liu,
	mhocko, bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

At 09/14/2012 09:36 AM, Hugh Dickins Wrote:
> On Thu, 13 Sep 2012, Johannes Weiner wrote:
>> On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
>>> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
>>> But NODE_DATA(nid) is null if the node is not onlined, so
>>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
>>> an invalid pointer. If we use numactl to bind a program to the node
>>> after onlining the node and its memory, it will cause the kernel
>>> panicked:
>>
>> Is there any chance we could get rid of the zone backpointer in lruvec
>> again instead?
> 
> It could be done, but it would make me sad :(
> 
>> Adding new nodes is a rare event and so updating every
>> single memcg in the system might be just borderline crazy.
> 
> Not horribly crazy, but rather ugly, yes.
> 
>> But can't
>> we just go back to passing the zone along with the lruvec down
>> vmscan.c paths?  I agree it's ugly to pass both, given their
>> relationship.  But I don't think the backpointer is any cleaner but in
>> addition less robust.
> 
> It's like how we use vma->mm: we could change everywhere to pass mm with
> vma, but it looks cleaner and cuts down on long arglists to have mm in vma.
>>From past experience, one of the things I worried about was adding extra
> args to the reclaim stack.
> 
>>
>> That being said, the crashing code in particular makes me wonder:
>>
>> static __always_inline void add_page_to_lru_list(struct page *page,
>> 				struct lruvec *lruvec, enum lru_list lru)
>> {
>> 	int nr_pages = hpage_nr_pages(page);
>> 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
>> 	list_add(&page->lru, &lruvec->lists[lru]);
>> 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
>> }
>>
>> Why did we ever pass zone in here and then felt the need to replace it
>> with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
>> A page does not roam between zones, its zone is a static property that
>> can be retrieved with page_zone().
> 
> Just as in vmscan.c, we have the lruvec to hand, and that's what we
> mainly want to operate upon, but there is also some need for zone.
> 
> (Both Konstantin and I were looking towards the day when we move the
> lru_lock into the lruvec, removing more dependence on "zone".  Pretty
> much the only reason that hasn't happened yet, is that we have not found
> time to make a performance case convincingly - but that's another topic.)
> 
> Yes, page_zone(page) is a static property of the page, but it's not
> necessarily cheap to evaluate: depends on how complex the memory model
> and the spare page flags space, doesn't it?  We both preferred to
> derive zone from lruvec where convenient.
> 
> How do you feel about this patch, and does it work for you guys?
> 
> You'd be right if you guessed that I started out without the
> mem_cgroup_zone_lruvec part of it, but oops in get_scan_count
> told me that's needed too.
> 
> Description to be filled in later: would it be needed for -stable,
> or is onlining already broken in other ways that you're now fixing up?
> 
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> Signed-off-by: Hugh Dickins <hughd@google.com>

Hi, all:

What about the status of this patch?

Thanks
Wen Congyang

> ---
> 
>  include/linux/mmzone.h |    2 -
>  mm/memcontrol.c        |   40 ++++++++++++++++++++++++++++++++-------
>  mm/mmzone.c            |    6 -----
>  mm/page_alloc.c        |    2 -
>  4 files changed, 36 insertions(+), 14 deletions(-)
> 
> --- 3.6-rc5/include/linux/mmzone.h	2012-08-03 08:31:26.892842267 -0700
> +++ linux/include/linux/mmzone.h	2012-09-13 17:07:51.893772372 -0700
> @@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
>  				     unsigned long size,
>  				     enum memmap_context context);
>  
> -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
> +extern void lruvec_init(struct lruvec *lruvec);
>  
>  static inline struct zone *lruvec_zone(struct lruvec *lruvec)
>  {
> --- 3.6-rc5/mm/memcontrol.c	2012-08-03 08:31:27.060842270 -0700
> +++ linux/mm/memcontrol.c	2012-09-13 17:46:36.870804625 -0700
> @@ -1061,12 +1061,25 @@ struct lruvec *mem_cgroup_zone_lruvec(st
>  				      struct mem_cgroup *memcg)
>  {
>  	struct mem_cgroup_per_zone *mz;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);
> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>  }
>  
>  /*
> @@ -1093,9 +1106,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  	struct mem_cgroup_per_zone *mz;
>  	struct mem_cgroup *memcg;
>  	struct page_cgroup *pc;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	pc = lookup_page_cgroup(page);
>  	memcg = pc->mem_cgroup;
> @@ -1113,7 +1129,17 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  		pc->mem_cgroup = memcg = root_mem_cgroup;
>  
>  	mz = page_cgroup_zoneinfo(memcg, page);
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here.
> +	 */
> +	if (unlikely(lruvec->zone != zone)) {
> +		VM_BUG_ON(lruvec->zone);
> +		lruvec->zone = zone;
> +	}
> +	return lruvec;
>  }
>  
>  /**
> @@ -4742,7 +4768,7 @@ static int alloc_mem_cgroup_per_zone_inf
>  
>  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
>  		mz = &pn->zoneinfo[zone];
> -		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
> +		lruvec_init(&mz->lruvec);
>  		mz->usage_in_excess = 0;
>  		mz->on_tree = false;
>  		mz->memcg = memcg;
> --- 3.6-rc5/mm/mmzone.c	2012-08-03 08:31:27.064842271 -0700
> +++ linux/mm/mmzone.c	2012-09-13 17:06:28.921766001 -0700
> @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
>  }
>  #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
>  
> -void lruvec_init(struct lruvec *lruvec, struct zone *zone)
> +void lruvec_init(struct lruvec *lruvec)
>  {
>  	enum lru_list lru;
>  
> @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
>  
>  	for_each_lru(lru)
>  		INIT_LIST_HEAD(&lruvec->lists[lru]);
> -
> -#ifdef CONFIG_MEMCG
> -	lruvec->zone = zone;
> -#endif
>  }
> --- 3.6-rc5/mm/page_alloc.c	2012-08-22 14:25:39.508279046 -0700
> +++ linux/mm/page_alloc.c	2012-09-13 17:06:08.265763526 -0700
> @@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
>  		zone->zone_pgdat = pgdat;
>  
>  		zone_pcp_init(zone);
> -		lruvec_init(&zone->lruvec, zone);
> +		lruvec_init(&zone->lruvec);
>  		if (!size)
>  			continue;
>  
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-09-14  1:36   ` Hugh Dickins
                       ` (4 preceding siblings ...)
  2012-10-16  5:58     ` Wen Congyang
@ 2012-10-16  7:21     ` Kamezawa Hiroyuki
  5 siblings, 0 replies; 21+ messages in thread
From: Kamezawa Hiroyuki @ 2012-10-16  7:21 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Johannes Weiner, Wen Congyang, linux-kernel, cgroups, linux-mm,
	Jiang Liu, mhocko, bsingharora, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

(2012/09/14 10:36), Hugh Dickins wrote:
> On Thu, 13 Sep 2012, Johannes Weiner wrote:
>> On Thu, Sep 13, 2012 at 03:14:28PM +0800, Wen Congyang wrote:
>>> root_mem_cgroup->info.nodeinfo is initialized when the system boots.
>>> But NODE_DATA(nid) is null if the node is not onlined, so
>>> root_mem_cgroup->info.nodeinfo[nid]->zoneinfo[zone].lruvec.zone contains
>>> an invalid pointer. If we use numactl to bind a program to the node
>>> after onlining the node and its memory, it will cause the kernel
>>> panicked:
>>
>> Is there any chance we could get rid of the zone backpointer in lruvec
>> again instead?
>
> It could be done, but it would make me sad :(
>
>> Adding new nodes is a rare event and so updating every
>> single memcg in the system might be just borderline crazy.
>
> Not horribly crazy, but rather ugly, yes.
>
>> But can't
>> we just go back to passing the zone along with the lruvec down
>> vmscan.c paths?  I agree it's ugly to pass both, given their
>> relationship.  But I don't think the backpointer is any cleaner but in
>> addition less robust.
>
> It's like how we use vma->mm: we could change everywhere to pass mm with
> vma, but it looks cleaner and cuts down on long arglists to have mm in vma.
>  From past experience, one of the things I worried about was adding extra
> args to the reclaim stack.
>
>>
>> That being said, the crashing code in particular makes me wonder:
>>
>> static __always_inline void add_page_to_lru_list(struct page *page,
>> 				struct lruvec *lruvec, enum lru_list lru)
>> {
>> 	int nr_pages = hpage_nr_pages(page);
>> 	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
>> 	list_add(&page->lru, &lruvec->lists[lru]);
>> 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
>> }
>>
>> Why did we ever pass zone in here and then felt the need to replace it
>> with lruvec->zone in fa9add6 "mm/memcg: apply add/del_page to lruvec"?
>> A page does not roam between zones, its zone is a static property that
>> can be retrieved with page_zone().
>
> Just as in vmscan.c, we have the lruvec to hand, and that's what we
> mainly want to operate upon, but there is also some need for zone.
>
> (Both Konstantin and I were looking towards the day when we move the
> lru_lock into the lruvec, removing more dependence on "zone".  Pretty
> much the only reason that hasn't happened yet, is that we have not found
> time to make a performance case convincingly - but that's another topic.)
>
> Yes, page_zone(page) is a static property of the page, but it's not
> necessarily cheap to evaluate: depends on how complex the memory model
> and the spare page flags space, doesn't it?  We both preferred to
> derive zone from lruvec where convenient.
>
> How do you feel about this patch, and does it work for you guys?
>
> You'd be right if you guessed that I started out without the
> mem_cgroup_zone_lruvec part of it, but oops in get_scan_count
> told me that's needed too.
>
> Description to be filled in later: would it be needed for -stable,
> or is onlining already broken in other ways that you're now fixing up?
>
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> Signed-off-by: Hugh Dickins <hughd@google.com>

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-10-16  5:58     ` Wen Congyang
@ 2012-10-18 19:03       ` Hugh Dickins
  2012-10-18 22:03         ` Johannes Weiner
  0 siblings, 1 reply; 21+ messages in thread
From: Hugh Dickins @ 2012-10-18 19:03 UTC (permalink / raw)
  To: Wen Congyang
  Cc: Johannes Weiner, linux-kernel, cgroups, linux-mm, Jiang Liu,
	mhocko, bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

On Tue, 16 Oct 2012, Wen Congyang wrote:
> At 09/14/2012 09:36 AM, Hugh Dickins Wrote:
> > 
> > Description to be filled in later: would it be needed for -stable,
> > or is onlining already broken in other ways that you're now fixing up?
> > 
> > Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> > Signed-off-by: Hugh Dickins <hughd@google.com>
> 
> Hi, all:
> 
> What about the status of this patch?

Sorry I'm being so unresponsive at the moment (or, as usual).

When I sent the fixed version afterwards (minus mistaken VM_BUG_ON,
plus safer mem_cgroup_force_empty_list), I expected you or Konstantin
to respond with a patch to fix it as you preferred (at offline/online);
so this was on hold until we could compare and decide between them.

In the meantime, I assume, we've all come to feel that this way is
simple, and probably the best way for now; or at least good enough,
and we all have better things to do than play with alternatives.

I'll write up the description of the fixed version, and post it for
3.7, including the Acks from Hannes and KAMEZAWA (assuming they carry
forward to the second version) - but probably not today or tomorrow.

But please help me: I still don't know if it's needed for -stable.
We introduced the bug in 3.5, but I see lots of memory hotplug fixes
coming by from you and others, so I do not know if this lruvec->zone
fix is useful by itself in 3.5 and 3.6, or not - please tell me.

Thanks,
Hugh

> 
> Thanks
> Wen Congyang
> 
> > ---
> > 
> >  include/linux/mmzone.h |    2 -
> >  mm/memcontrol.c        |   40 ++++++++++++++++++++++++++++++++-------
> >  mm/mmzone.c            |    6 -----
> >  mm/page_alloc.c        |    2 -
> >  4 files changed, 36 insertions(+), 14 deletions(-)
> > 
> > --- 3.6-rc5/include/linux/mmzone.h	2012-08-03 08:31:26.892842267 -0700
> > +++ linux/include/linux/mmzone.h	2012-09-13 17:07:51.893772372 -0700
> > @@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
> >  				     unsigned long size,
> >  				     enum memmap_context context);
> >  
> > -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
> > +extern void lruvec_init(struct lruvec *lruvec);
> >  
> >  static inline struct zone *lruvec_zone(struct lruvec *lruvec)
> >  {
> > --- 3.6-rc5/mm/memcontrol.c	2012-08-03 08:31:27.060842270 -0700
> > +++ linux/mm/memcontrol.c	2012-09-13 17:46:36.870804625 -0700
> > @@ -1061,12 +1061,25 @@ struct lruvec *mem_cgroup_zone_lruvec(st
> >  				      struct mem_cgroup *memcg)
> >  {
> >  	struct mem_cgroup_per_zone *mz;
> > +	struct lruvec *lruvec;
> >  
> > -	if (mem_cgroup_disabled())
> > -		return &zone->lruvec;
> > +	if (mem_cgroup_disabled()) {
> > +		lruvec = &zone->lruvec;
> > +		goto out;
> > +	}
> >  
> >  	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
> > -	return &mz->lruvec;
> > +	lruvec = &mz->lruvec;
> > +out:
> > +	/*
> > +	 * Since a node can be onlined after the mem_cgroup was created,
> > +	 * we have to be prepared to initialize lruvec->zone here.
> > +	 */
> > +	if (unlikely(lruvec->zone != zone)) {
> > +		VM_BUG_ON(lruvec->zone);
> > +		lruvec->zone = zone;
> > +	}
> > +	return lruvec;
> >  }
> >  
> >  /*
> > @@ -1093,9 +1106,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
> >  	struct mem_cgroup_per_zone *mz;
> >  	struct mem_cgroup *memcg;
> >  	struct page_cgroup *pc;
> > +	struct lruvec *lruvec;
> >  
> > -	if (mem_cgroup_disabled())
> > -		return &zone->lruvec;
> > +	if (mem_cgroup_disabled()) {
> > +		lruvec = &zone->lruvec;
> > +		goto out;
> > +	}
> >  
> >  	pc = lookup_page_cgroup(page);
> >  	memcg = pc->mem_cgroup;
> > @@ -1113,7 +1129,17 @@ struct lruvec *mem_cgroup_page_lruvec(st
> >  		pc->mem_cgroup = memcg = root_mem_cgroup;
> >  
> >  	mz = page_cgroup_zoneinfo(memcg, page);
> > -	return &mz->lruvec;
> > +	lruvec = &mz->lruvec;
> > +out:
> > +	/*
> > +	 * Since a node can be onlined after the mem_cgroup was created,
> > +	 * we have to be prepared to initialize lruvec->zone here.
> > +	 */
> > +	if (unlikely(lruvec->zone != zone)) {
> > +		VM_BUG_ON(lruvec->zone);
> > +		lruvec->zone = zone;
> > +	}
> > +	return lruvec;
> >  }
> >  
> >  /**
> > @@ -4742,7 +4768,7 @@ static int alloc_mem_cgroup_per_zone_inf
> >  
> >  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> >  		mz = &pn->zoneinfo[zone];
> > -		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
> > +		lruvec_init(&mz->lruvec);
> >  		mz->usage_in_excess = 0;
> >  		mz->on_tree = false;
> >  		mz->memcg = memcg;
> > --- 3.6-rc5/mm/mmzone.c	2012-08-03 08:31:27.064842271 -0700
> > +++ linux/mm/mmzone.c	2012-09-13 17:06:28.921766001 -0700
> > @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
> >  }
> >  #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
> >  
> > -void lruvec_init(struct lruvec *lruvec, struct zone *zone)
> > +void lruvec_init(struct lruvec *lruvec)
> >  {
> >  	enum lru_list lru;
> >  
> > @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
> >  
> >  	for_each_lru(lru)
> >  		INIT_LIST_HEAD(&lruvec->lists[lru]);
> > -
> > -#ifdef CONFIG_MEMCG
> > -	lruvec->zone = zone;
> > -#endif
> >  }
> > --- 3.6-rc5/mm/page_alloc.c	2012-08-22 14:25:39.508279046 -0700
> > +++ linux/mm/page_alloc.c	2012-09-13 17:06:08.265763526 -0700
> > @@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
> >  		zone->zone_pgdat = pgdat;
> >  
> >  		zone_pcp_init(zone);
> > -		lruvec_init(&zone->lruvec, zone);
> > +		lruvec_init(&zone->lruvec);
> >  		if (!size)
> >  			continue;
> >  
> > 
> 
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memory cgroup: update root memory cgroup when node is onlined
  2012-10-18 19:03       ` Hugh Dickins
@ 2012-10-18 22:03         ` Johannes Weiner
  2012-11-02  1:28           ` [PATCH] memcg: fix hotplugged memory zone oops Hugh Dickins
  0 siblings, 1 reply; 21+ messages in thread
From: Johannes Weiner @ 2012-10-18 22:03 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Wen Congyang, linux-kernel, cgroups, linux-mm, Jiang Liu, mhocko,
	bsingharora, KAMEZAWA Hiroyuki, Andrew Morton,
	Konstantin Khlebnikov, paul.gortmaker

On Thu, Oct 18, 2012 at 12:03:44PM -0700, Hugh Dickins wrote:
> On Tue, 16 Oct 2012, Wen Congyang wrote:
> > At 09/14/2012 09:36 AM, Hugh Dickins Wrote:
> > > 
> > > Description to be filled in later: would it be needed for -stable,
> > > or is onlining already broken in other ways that you're now fixing up?
> > > 
> > > Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> > > Signed-off-by: Hugh Dickins <hughd@google.com>
> > 
> > Hi, all:
> > 
> > What about the status of this patch?
> 
> Sorry I'm being so unresponsive at the moment (or, as usual).
> 
> When I sent the fixed version afterwards (minus mistaken VM_BUG_ON,
> plus safer mem_cgroup_force_empty_list), I expected you or Konstantin
> to respond with a patch to fix it as you preferred (at offline/online);
> so this was on hold until we could compare and decide between them.
> 
> In the meantime, I assume, we've all come to feel that this way is
> simple, and probably the best way for now; or at least good enough,
> and we all have better things to do than play with alternatives.
> 
> I'll write up the description of the fixed version, and post it for
> 3.7, including the Acks from Hannes and KAMEZAWA (assuming they carry
> forward to the second version) - but probably not today or tomorrow.

Mine does, thanks for asking :-)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] memcg: fix hotplugged memory zone oops
  2012-10-18 22:03         ` Johannes Weiner
@ 2012-11-02  1:28           ` Hugh Dickins
  2012-11-02 10:21             ` Michal Hocko
  0 siblings, 1 reply; 21+ messages in thread
From: Hugh Dickins @ 2012-11-02  1:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Wen Congyang, linux-kernel, cgroups, linux-mm,
	Jiang Liu, mhocko, bsingharora, KAMEZAWA Hiroyuki,
	Konstantin Khlebnikov, paul.gortmaker, Tang Chen

When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec.
(On many configurations, calculating zone from page is slower.)

But we have no code to update all the lruvecs (per zone, per memcg)
when a memory node is hotadded.  Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
IP: [<ffffffff811870b9>] __mod_zone_page_state+0x9/0x60
PGD 0 
Oops: 0000 [#1] SMP 
CPU 2 
Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
Stack:
 ffff880039abdaf8 ffffffff8117390f ffff880039abdaf8 000000008167c601
 ffffffff81174162 ffff88003a480f00 0000000000000001 ffff8800395e0000
 ffff88003dbd0e80 0000000000000282 ffff880039abdb48 ffffffff81174181
Call Trace:
 [<ffffffff8117390f>] __pagevec_lru_add_fn+0xdf/0x140
 [<ffffffff81174181>] pagevec_lru_move_fn+0xb1/0x100
 [<ffffffff811741ec>] __pagevec_lru_add+0x1c/0x30
 [<ffffffff81174383>] lru_add_drain_cpu+0xa3/0x130
 [<ffffffff8117443f>] lru_add_drain+0x2f/0x40
 ...

The natural solution might be to use a memcg callback whenever memory
is hotadded; but that solution has not been scoped out, and it happens
that we do have an easy location at which to update lruvec->zone.  The
lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.

So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called
before NODE_DATA(node) has been allocated in such cases.

Ah, there was one exception.  For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too.  In fact it was already safe against
such an oops (the lru lists in danger could only be empty),
but we're better proofed against future changes this way.

Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: stable@vger.kernel.org
---
I've marked this for stable (3.6) since we introduced the problem
in 3.5 (now closed to stable); but I have no idea if this is the
only fix needed to get memory hotadd working with memcg in 3.6,
and received no answer when I enquired twice before.

 include/linux/mmzone.h |    2 -
 mm/memcontrol.c        |   46 +++++++++++++++++++++++++++++----------
 mm/mmzone.c            |    6 -----
 mm/page_alloc.c        |    2 -
 4 files changed, 38 insertions(+), 18 deletions(-)

--- 3.7-rc3/include/linux/mmzone.h	2012-10-14 16:16:57.665308933 -0700
+++ linux/include/linux/mmzone.h	2012-11-01 14:31:04.284185741 -0700
@@ -752,7 +752,7 @@ extern int init_currently_empty_zone(str
 				     unsigned long size,
 				     enum memmap_context context);
 
-extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+extern void lruvec_init(struct lruvec *lruvec);
 
 static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 {
--- 3.7-rc3/mm/memcontrol.c	2012-10-14 16:16:58.341309118 -0700
+++ linux/mm/memcontrol.c	2012-11-01 14:31:04.284185741 -0700
@@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(st
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /*
@@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(st
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(memcg, page);
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /**
@@ -3688,17 +3712,17 @@ unsigned long mem_cgroup_soft_limit_recl
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
-	list = &mz->lruvec.lists[lru];
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	list = &lruvec->lists[lru];
 
-	loop = mz->lru_size[lru];
+	loop = mem_cgroup_get_lru_size(lruvec, lru);
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
@@ -4736,7 +4760,7 @@ static int alloc_mem_cgroup_per_zone_inf
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+		lruvec_init(&mz->lruvec);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
--- 3.7-rc3/mm/mmzone.c	2012-09-30 16:47:46.000000000 -0700
+++ linux/mm/mmzone.c	2012-11-01 14:31:04.284185741 -0700
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
 
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
 	enum lru_list lru;
 
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
-
-#ifdef CONFIG_MEMCG
-	lruvec->zone = zone;
-#endif
 }
--- 3.7-rc3/mm/page_alloc.c	2012-10-28 13:48:00.021774166 -0700
+++ linux/mm/page_alloc.c	2012-11-01 14:31:04.284185741 -0700
@@ -4505,7 +4505,7 @@ static void __paginginit free_area_init_
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		lruvec_init(&zone->lruvec, zone);
+		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memcg: fix hotplugged memory zone oops
  2012-11-02  1:28           ` [PATCH] memcg: fix hotplugged memory zone oops Hugh Dickins
@ 2012-11-02 10:21             ` Michal Hocko
  2012-11-02 10:54               ` Michal Hocko
  2012-11-02 23:31               ` Hugh Dickins
  0 siblings, 2 replies; 21+ messages in thread
From: Michal Hocko @ 2012-11-02 10:21 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, Johannes Weiner, Wen Congyang, linux-kernel,
	cgroups, linux-mm, Jiang Liu, bsingharora, KAMEZAWA Hiroyuki,
	Konstantin Khlebnikov, paul.gortmaker, Tang Chen

On Thu 01-11-12 18:28:02, Hugh Dickins wrote:
> When MEMCG is configured on (even when it's disabled by boot option),
> when adding or removing a page to/from its lru list, the zone pointer
> used for stats updates is nowadays taken from the struct lruvec.
> (On many configurations, calculating zone from page is slower.)
> 
> But we have no code to update all the lruvecs (per zone, per memcg)
> when a memory node is hotadded.  Here's an extract from the oops which
> results when running numactl to bind a program to a newly onlined node:
> 
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
> IP: [<ffffffff811870b9>] __mod_zone_page_state+0x9/0x60
> PGD 0 
> Oops: 0000 [#1] SMP 
> CPU 2 
> Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
> Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
> Stack:
>  ffff880039abdaf8 ffffffff8117390f ffff880039abdaf8 000000008167c601
>  ffffffff81174162 ffff88003a480f00 0000000000000001 ffff8800395e0000
>  ffff88003dbd0e80 0000000000000282 ffff880039abdb48 ffffffff81174181
> Call Trace:
>  [<ffffffff8117390f>] __pagevec_lru_add_fn+0xdf/0x140
>  [<ffffffff81174181>] pagevec_lru_move_fn+0xb1/0x100
>  [<ffffffff811741ec>] __pagevec_lru_add+0x1c/0x30
>  [<ffffffff81174383>] lru_add_drain_cpu+0xa3/0x130
>  [<ffffffff8117443f>] lru_add_drain+0x2f/0x40
>  ...
> 
> The natural solution might be to use a memcg callback whenever memory
> is hotadded; but that solution has not been scoped out, and it happens
> that we do have an easy location at which to update lruvec->zone.  The
> lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by
> mem_cgroup_page_lruvec(), and both of those do know the right zone.
> 
> So check and set lruvec->zone in those; and remove the inadequate
> attempt to set lruvec->zone from lruvec_init(), which is called
> before NODE_DATA(node) has been allocated in such cases.
> 
> Ah, there was one exception.  For no particularly good reason,
> mem_cgroup_force_empty_list() has its own code for deciding lruvec.
> Change it to use the standard mem_cgroup_zone_lruvec() and
> mem_cgroup_get_lru_size() too.  In fact it was already safe against
> such an oops (the lru lists in danger could only be empty),
> but we're better proofed against future changes this way.
> 
> Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
> Signed-off-by: Hugh Dickins <hughd@google.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
> Cc: Wen Congyang <wency@cn.fujitsu.com>
> Cc: stable@vger.kernel.org

OK, it adds "an overhead" also when there is no hotadd going on but this
is just one additional mem access&cmp&je so it shouldn't be noticeable
(lruvec->zone is used most of the time later so it is not a pointless
load).
It is also easier to backport for stable. But is there any reason to fix
it later properly in the hotadd hook?

Anyway
Acked-by: Michal Hocko <mhocko@suse.cz>

Thanks!
> ---
> I've marked this for stable (3.6) since we introduced the problem
> in 3.5 (now closed to stable); but I have no idea if this is the
> only fix needed to get memory hotadd working with memcg in 3.6,
> and received no answer when I enquired twice before.
> 
>  include/linux/mmzone.h |    2 -
>  mm/memcontrol.c        |   46 +++++++++++++++++++++++++++++----------
>  mm/mmzone.c            |    6 -----
>  mm/page_alloc.c        |    2 -
>  4 files changed, 38 insertions(+), 18 deletions(-)
> 
> --- 3.7-rc3/include/linux/mmzone.h	2012-10-14 16:16:57.665308933 -0700
> +++ linux/include/linux/mmzone.h	2012-11-01 14:31:04.284185741 -0700
> @@ -752,7 +752,7 @@ extern int init_currently_empty_zone(str
>  				     unsigned long size,
>  				     enum memmap_context context);
>  
> -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
> +extern void lruvec_init(struct lruvec *lruvec);
>  
>  static inline struct zone *lruvec_zone(struct lruvec *lruvec)
>  {
> --- 3.7-rc3/mm/memcontrol.c	2012-10-14 16:16:58.341309118 -0700
> +++ linux/mm/memcontrol.c	2012-11-01 14:31:04.284185741 -0700
> @@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(st
>  				      struct mem_cgroup *memcg)
>  {
>  	struct mem_cgroup_per_zone *mz;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here;
> +	 * and if offlined then reonlined, we need to reinitialize it.
> +	 */
> +	if (unlikely(lruvec->zone != zone))
> +		lruvec->zone = zone;
> +	return lruvec;
>  }
>  
>  /*
> @@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  	struct mem_cgroup_per_zone *mz;
>  	struct mem_cgroup *memcg;
>  	struct page_cgroup *pc;
> +	struct lruvec *lruvec;
>  
> -	if (mem_cgroup_disabled())
> -		return &zone->lruvec;
> +	if (mem_cgroup_disabled()) {
> +		lruvec = &zone->lruvec;
> +		goto out;
> +	}
>  
>  	pc = lookup_page_cgroup(page);
>  	memcg = pc->mem_cgroup;
> @@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(st
>  		pc->mem_cgroup = memcg = root_mem_cgroup;
>  
>  	mz = page_cgroup_zoneinfo(memcg, page);
> -	return &mz->lruvec;
> +	lruvec = &mz->lruvec;
> +out:
> +	/*
> +	 * Since a node can be onlined after the mem_cgroup was created,
> +	 * we have to be prepared to initialize lruvec->zone here;
> +	 * and if offlined then reonlined, we need to reinitialize it.
> +	 */
> +	if (unlikely(lruvec->zone != zone))
> +		lruvec->zone = zone;
> +	return lruvec;
>  }
>  
>  /**
> @@ -3688,17 +3712,17 @@ unsigned long mem_cgroup_soft_limit_recl
>  static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
>  				int node, int zid, enum lru_list lru)
>  {
> -	struct mem_cgroup_per_zone *mz;
> +	struct lruvec *lruvec;
>  	unsigned long flags, loop;
>  	struct list_head *list;
>  	struct page *busy;
>  	struct zone *zone;
>  
>  	zone = &NODE_DATA(node)->node_zones[zid];
> -	mz = mem_cgroup_zoneinfo(memcg, node, zid);
> -	list = &mz->lruvec.lists[lru];
> +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +	list = &lruvec->lists[lru];
>  
> -	loop = mz->lru_size[lru];
> +	loop = mem_cgroup_get_lru_size(lruvec, lru);
>  	/* give some margin against EBUSY etc...*/
>  	loop += 256;
>  	busy = NULL;
> @@ -4736,7 +4760,7 @@ static int alloc_mem_cgroup_per_zone_inf
>  
>  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
>  		mz = &pn->zoneinfo[zone];
> -		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
> +		lruvec_init(&mz->lruvec);
>  		mz->usage_in_excess = 0;
>  		mz->on_tree = false;
>  		mz->memcg = memcg;
> --- 3.7-rc3/mm/mmzone.c	2012-09-30 16:47:46.000000000 -0700
> +++ linux/mm/mmzone.c	2012-11-01 14:31:04.284185741 -0700
> @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
>  }
>  #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
>  
> -void lruvec_init(struct lruvec *lruvec, struct zone *zone)
> +void lruvec_init(struct lruvec *lruvec)
>  {
>  	enum lru_list lru;
>  
> @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
>  
>  	for_each_lru(lru)
>  		INIT_LIST_HEAD(&lruvec->lists[lru]);
> -
> -#ifdef CONFIG_MEMCG
> -	lruvec->zone = zone;
> -#endif
>  }
> --- 3.7-rc3/mm/page_alloc.c	2012-10-28 13:48:00.021774166 -0700
> +++ linux/mm/page_alloc.c	2012-11-01 14:31:04.284185741 -0700
> @@ -4505,7 +4505,7 @@ static void __paginginit free_area_init_
>  		zone->zone_pgdat = pgdat;
>  
>  		zone_pcp_init(zone);
> -		lruvec_init(&zone->lruvec, zone);
> +		lruvec_init(&zone->lruvec);
>  		if (!size)
>  			continue;
>  

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memcg: fix hotplugged memory zone oops
  2012-11-02 10:21             ` Michal Hocko
@ 2012-11-02 10:54               ` Michal Hocko
  2012-11-02 23:37                 ` Hugh Dickins
  2012-11-02 23:31               ` Hugh Dickins
  1 sibling, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2012-11-02 10:54 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, Johannes Weiner, Wen Congyang, linux-kernel,
	cgroups, linux-mm, Jiang Liu, bsingharora, KAMEZAWA Hiroyuki,
	Konstantin Khlebnikov, paul.gortmaker, Tang Chen

On Fri 02-11-12 11:21:59, Michal Hocko wrote:
> On Thu 01-11-12 18:28:02, Hugh Dickins wrote:
[...]

And I forgot to mention that the following hunk will clash with
"memcg: Simplify mem_cgroup_force_empty_list error handling" which is in
linux-next already (via Tejun's tree). 
Would it be easier to split the patch into the real fix and the hunk
below? That one doesn't have to go into stable anyway and we would save
some merging conflicts. The updated fix on top of -mm tree is below for
your convenience.

> >  /**
> > @@ -3688,17 +3712,17 @@ unsigned long mem_cgroup_soft_limit_recl
> >  static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
> >  				int node, int zid, enum lru_list lru)
> >  {
> > -	struct mem_cgroup_per_zone *mz;
> > +	struct lruvec *lruvec;
> >  	unsigned long flags, loop;
> >  	struct list_head *list;
> >  	struct page *busy;
> >  	struct zone *zone;
> >  
> >  	zone = &NODE_DATA(node)->node_zones[zid];
> > -	mz = mem_cgroup_zoneinfo(memcg, node, zid);
> > -	list = &mz->lruvec.lists[lru];
> > +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> > +	list = &lruvec->lists[lru];
> >  
> > -	loop = mz->lru_size[lru];
> > +	loop = mem_cgroup_get_lru_size(lruvec, lru);
> >  	/* give some margin against EBUSY etc...*/
> >  	loop += 256;
> >  	busy = NULL;

---
>From 0e55c305a050502a4b2f5167efed58bb6585520b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 2 Nov 2012 11:47:48 +0100
Subject: [PATCH] memcg: use mem_cgroup_zone_lruvec in
 mem_cgroup_force_empty_list

For no particularly good reason, mem_cgroup_force_empty_list() has
its own code for deciding lruvec. Change it to use the standard
mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too.
In fact it was already safe against oops after memory hotadd (see
"memcg: fix hotplugged memory zone oops" for more details) when lruvec
is not initialized yet (the lru lists in danger could only be empty),
but we're better proofed against future changes this way.

Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
---
 mm/memcontrol.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c72d65..32f0ecf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3710,15 +3710,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 	unsigned long flags;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
-	list = &mz->lruvec.lists[lru];
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	list = &lruvec->lists[lru];
 
 	busy = NULL;
 	do {
-- 
1.7.10.4

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] memcg: fix hotplugged memory zone oops
  2012-11-02 10:21             ` Michal Hocko
  2012-11-02 10:54               ` Michal Hocko
@ 2012-11-02 23:31               ` Hugh Dickins
  1 sibling, 0 replies; 21+ messages in thread
From: Hugh Dickins @ 2012-11-02 23:31 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Johannes Weiner, Wen Congyang, linux-kernel,
	cgroups, linux-mm, Jiang Liu, bsingharora, KAMEZAWA Hiroyuki,
	Konstantin Khlebnikov, paul.gortmaker, Tang Chen

On Fri, 2 Nov 2012, Michal Hocko wrote:
> 
> OK, it adds "an overhead" also when there is no hotadd going on but this
> is just one additional mem access&cmp&je so it shouldn't be noticeable
> (lruvec->zone is used most of the time later so it is not a pointless
> load).

I think so too.

> It is also easier to backport for stable.

Yes.

> But is there any reason to fix it later properly in the hotadd hook?

I don't know.  Not everybody liked it fixed this way: it's not
unreasonable to see this as a quick hack rather than the right fix.

I was expecting objectors to post alternative hotadd hook patches,
then we could compare and decide.  That didn't happen; but we can
certainly look to taking out these lines later if something we
agree is better comes along.  Not high on anyone's agenda, I think.

> 
> Anyway
> Acked-by: Michal Hocko <mhocko@suse.cz>

Thanks,
Hugh

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memcg: fix hotplugged memory zone oops
  2012-11-02 10:54               ` Michal Hocko
@ 2012-11-02 23:37                 ` Hugh Dickins
  2012-11-03  7:00                   ` Michal Hocko
  0 siblings, 1 reply; 21+ messages in thread
From: Hugh Dickins @ 2012-11-02 23:37 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Johannes Weiner, Wen Congyang, linux-kernel,
	cgroups, linux-mm, Jiang Liu, bsingharora, KAMEZAWA Hiroyuki,
	Konstantin Khlebnikov, paul.gortmaker, Tang Chen

On Fri, 2 Nov 2012, Michal Hocko wrote:
> On Fri 02-11-12 11:21:59, Michal Hocko wrote:
> > On Thu 01-11-12 18:28:02, Hugh Dickins wrote:
> [...]
> 
> And I forgot to mention that the following hunk will clash with
> "memcg: Simplify mem_cgroup_force_empty_list error handling" which is in
> linux-next already (via Tejun's tree). 

Oh, via Tejun's tree.  Right, when I checked mmotm there was no problem.

> Would it be easier to split the patch into the real fix and the hunk
> below? That one doesn't have to go into stable anyway and we would save
> some merging conflicts. The updated fix on top of -mm tree is below for
> your convenience.

I'd prefer to leave it as one patch, so even the "future proof" part
of the fix goes into 3.7 and stable.  But your point is that you have
already seen the future, and it forks in a slightly different direction!

Well, I don't want to be obstructive, but it doesn't look difficult
to resolve.  Perhaps if I hold off on splitting them, and see if akpm
barks at me or not :)

Hugh

> > >  /**
> > > @@ -3688,17 +3712,17 @@ unsigned long mem_cgroup_soft_limit_recl
> > >  static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
> > >  				int node, int zid, enum lru_list lru)
> > >  {
> > > -	struct mem_cgroup_per_zone *mz;
> > > +	struct lruvec *lruvec;
> > >  	unsigned long flags, loop;
> > >  	struct list_head *list;
> > >  	struct page *busy;
> > >  	struct zone *zone;
> > >  
> > >  	zone = &NODE_DATA(node)->node_zones[zid];
> > > -	mz = mem_cgroup_zoneinfo(memcg, node, zid);
> > > -	list = &mz->lruvec.lists[lru];
> > > +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> > > +	list = &lruvec->lists[lru];
> > >  
> > > -	loop = mz->lru_size[lru];
> > > +	loop = mem_cgroup_get_lru_size(lruvec, lru);
> > >  	/* give some margin against EBUSY etc...*/
> > >  	loop += 256;
> > >  	busy = NULL;

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] memcg: fix hotplugged memory zone oops
  2012-11-02 23:37                 ` Hugh Dickins
@ 2012-11-03  7:00                   ` Michal Hocko
  0 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2012-11-03  7:00 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, Johannes Weiner, Wen Congyang, linux-kernel,
	cgroups, linux-mm, Jiang Liu, bsingharora, KAMEZAWA Hiroyuki,
	Konstantin Khlebnikov, paul.gortmaker, Tang Chen

On Fri 02-11-12 16:37:37, Hugh Dickins wrote:
> On Fri, 2 Nov 2012, Michal Hocko wrote:
> > On Fri 02-11-12 11:21:59, Michal Hocko wrote:
> > > On Thu 01-11-12 18:28:02, Hugh Dickins wrote:
> > [...]
> > 
> > And I forgot to mention that the following hunk will clash with
> > "memcg: Simplify mem_cgroup_force_empty_list error handling" which is in
> > linux-next already (via Tejun's tree). 
> 
> Oh, via Tejun's tree.  Right, when I checked mmotm there was no problem.

Yeah, whole that thing goes through Tejun's tree because there are many
follow up clean ups depending on that change.

> > Would it be easier to split the patch into the real fix and the hunk
> > below? That one doesn't have to go into stable anyway and we would save
> > some merging conflicts. The updated fix on top of -mm tree is below for
> > your convenience.
> 
> I'd prefer to leave it as one patch, so even the "future proof" part
> of the fix goes into 3.7 and stable.  But your point is that you have
> already seen the future, and it forks in a slightly different direction!
> 
> Well, I don't want to be obstructive, but it doesn't look difficult
> to resolve.  

True.

> Perhaps if I hold off on splitting them, and see if akpm barks at me
> or not :)
> 
> Hugh
> 
> > > >  /**
> > > > @@ -3688,17 +3712,17 @@ unsigned long mem_cgroup_soft_limit_recl
> > > >  static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
> > > >  				int node, int zid, enum lru_list lru)
> > > >  {
> > > > -	struct mem_cgroup_per_zone *mz;
> > > > +	struct lruvec *lruvec;
> > > >  	unsigned long flags, loop;
> > > >  	struct list_head *list;
> > > >  	struct page *busy;
> > > >  	struct zone *zone;
> > > >  
> > > >  	zone = &NODE_DATA(node)->node_zones[zid];
> > > > -	mz = mem_cgroup_zoneinfo(memcg, node, zid);
> > > > -	list = &mz->lruvec.lists[lru];
> > > > +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> > > > +	list = &lruvec->lists[lru];
> > > >  
> > > > -	loop = mz->lru_size[lru];
> > > > +	loop = mem_cgroup_get_lru_size(lruvec, lru);
> > > >  	/* give some margin against EBUSY etc...*/
> > > >  	loop += 256;
> > > >  	busy = NULL;

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2012-11-03  7:00 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-13  7:14 [PATCH] memory cgroup: update root memory cgroup when node is onlined Wen Congyang
2012-09-13 10:06 ` Kamezawa Hiroyuki
2012-09-13 10:18   ` Wen Congyang
2012-09-13 20:59 ` Johannes Weiner
2012-09-14  1:36   ` Hugh Dickins
2012-09-14  1:53     ` Wen Congyang
2012-09-14 15:46     ` Johannes Weiner
2012-09-15 10:56     ` Konstantin Khlebnikov
2012-09-17  5:50     ` Wen Congyang
2012-10-16  5:58     ` Wen Congyang
2012-10-18 19:03       ` Hugh Dickins
2012-10-18 22:03         ` Johannes Weiner
2012-11-02  1:28           ` [PATCH] memcg: fix hotplugged memory zone oops Hugh Dickins
2012-11-02 10:21             ` Michal Hocko
2012-11-02 10:54               ` Michal Hocko
2012-11-02 23:37                 ` Hugh Dickins
2012-11-03  7:00                   ` Michal Hocko
2012-11-02 23:31               ` Hugh Dickins
2012-10-16  7:21     ` [PATCH] memory cgroup: update root memory cgroup when node is onlined Kamezawa Hiroyuki
2012-09-14  1:46   ` Wen Congyang
2012-09-17  6:40     ` Hugh Dickins

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).