mm: memcontrol: Expose THP events on a per-memcg basis
diff mbox series

Message ID 20190129205852.GA7310@chrisdown.name
State In Next
Headers show
Series
  • mm: memcontrol: Expose THP events on a per-memcg basis
Related show

Commit Message

Chris Down Jan. 29, 2019, 8:58 p.m. UTC
Currently THP allocation events data is fairly opaque, since you can
only get it system-wide. This patch makes it easier to reason about
transparent hugepage behaviour on a per-memcg basis.

For anonymous THP-backed pages, we already have MEMCG_RSS_HUGE in v1,
which is used for v1's rss_huge [sic]. This is reused here as it's
fairly involved to untangle NR_ANON_THPS right now to make it
per-memcg, since right now some of this is delegated to rmap before we
have any memcg actually assigned to the page. It's a good idea to rework
that, but let's leave untangling THP allocation for a future patch.

Signed-off-by: Chris Down <chris@chrisdown.name>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: linux-kernel@vger.kernel.org
Cc: cgroups@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: kernel-team@fb.com
---
 Documentation/admin-guide/cgroup-v2.rst | 14 ++++++++++++++
 mm/huge_memory.c                        |  2 ++
 mm/khugepaged.c                         |  2 ++
 mm/memcontrol.c                         | 13 +++++++++++++
 4 files changed, 31 insertions(+)

Comments

Johannes Weiner Jan. 29, 2019, 10:15 p.m. UTC | #1
On Tue, Jan 29, 2019 at 03:58:52PM -0500, Chris Down wrote:
> Currently THP allocation events data is fairly opaque, since you can
> only get it system-wide. This patch makes it easier to reason about
> transparent hugepage behaviour on a per-memcg basis.
> 
> For anonymous THP-backed pages, we already have MEMCG_RSS_HUGE in v1,
> which is used for v1's rss_huge [sic]. This is reused here as it's
> fairly involved to untangle NR_ANON_THPS right now to make it
> per-memcg, since right now some of this is delegated to rmap before we
> have any memcg actually assigned to the page. It's a good idea to rework
> that, but let's leave untangling THP allocation for a future patch.
>
> Signed-off-by: Chris Down <chris@chrisdown.name>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Roman Gushchin <guro@fb.com>
> Cc: linux-kernel@vger.kernel.org
> Cc: cgroups@vger.kernel.org
> Cc: linux-mm@kvack.org
> Cc: kernel-team@fb.com

Looks good to me. It's useful to know if a cgroup is getting the THP
coverage and allocation policy it's asking for.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

The fallback numbers could be useful as well, but they're tricky to
obtain as there isn't an obvious memcg context. We can do them later.

kernel test robot Feb. 1, 2019, 1:39 a.m. UTC | #2
Hi Chris,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.0-rc4]
[cannot apply to next-20190131]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Chris-Down/mm-memcontrol-Expose-THP-events-on-a-per-memcg-basis/20190201-022143
config: x86_64-randconfig-j1-01290405 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   mm/memcontrol.c: In function 'memory_stat_show':
>> mm/memcontrol.c:5625:52: error: 'THP_FAULT_ALLOC' undeclared (first use in this function)
     seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
                                                       ^
   mm/memcontrol.c:5625:52: note: each undeclared identifier is reported only once for each function it appears in
   mm/memcontrol.c:5627:17: error: 'THP_COLLAPSE_ALLOC' undeclared (first use in this function)
         acc.events[THP_COLLAPSE_ALLOC]);
                    ^

vim +/THP_FAULT_ALLOC +5625 mm/memcontrol.c

  5541	
  5542	static int memory_stat_show(struct seq_file *m, void *v)
  5543	{
  5544		struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
  5545		struct accumulated_stats acc;
  5546		int i;
  5547	
  5548		/*
  5549		 * Provide statistics on the state of the memory subsystem as
  5550		 * well as cumulative event counters that show past behavior.
  5551		 *
  5552		 * This list is ordered following a combination of these gradients:
  5553		 * 1) generic big picture -> specifics and details
  5554		 * 2) reflecting userspace activity -> reflecting kernel heuristics
  5555		 *
  5556		 * Current memory state:
  5557		 */
  5558	
  5559		memset(&acc, 0, sizeof(acc));
  5560		acc.stats_size = MEMCG_NR_STAT;
  5561		acc.events_size = NR_VM_EVENT_ITEMS;
  5562		accumulate_memcg_tree(memcg, &acc);
  5563	
  5564		seq_printf(m, "anon %llu\n",
  5565			   (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
  5566		seq_printf(m, "file %llu\n",
  5567			   (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
  5568		seq_printf(m, "kernel_stack %llu\n",
  5569			   (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
  5570		seq_printf(m, "slab %llu\n",
  5571			   (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
  5572				 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
  5573		seq_printf(m, "sock %llu\n",
  5574			   (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
  5575	
  5576		seq_printf(m, "shmem %llu\n",
  5577			   (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
  5578		seq_printf(m, "file_mapped %llu\n",
  5579			   (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
  5580		seq_printf(m, "file_dirty %llu\n",
  5581			   (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
  5582		seq_printf(m, "file_writeback %llu\n",
  5583			   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
  5584	
  5585		/*
  5586		 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
  5587		 * with the NR_ANON_THP vm counter, but right now it's a pain in the
  5588		 * arse because it requires migrating the work out of rmap to a place
  5589		 * where the page->mem_cgroup is set up and stable.
  5590		 */
  5591		seq_printf(m, "anon_thp %llu\n",
  5592			   (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
  5593	
  5594		for (i = 0; i < NR_LRU_LISTS; i++)
  5595			seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
  5596				   (u64)acc.lru_pages[i] * PAGE_SIZE);
  5597	
  5598		seq_printf(m, "slab_reclaimable %llu\n",
  5599			   (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
  5600		seq_printf(m, "slab_unreclaimable %llu\n",
  5601			   (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
  5602	
  5603		/* Accumulated memory events */
  5604	
  5605		seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
  5606		seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
  5607	
  5608		seq_printf(m, "workingset_refault %lu\n",
  5609			   acc.stat[WORKINGSET_REFAULT]);
  5610		seq_printf(m, "workingset_activate %lu\n",
  5611			   acc.stat[WORKINGSET_ACTIVATE]);
  5612		seq_printf(m, "workingset_nodereclaim %lu\n",
  5613			   acc.stat[WORKINGSET_NODERECLAIM]);
  5614	
  5615		seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
  5616		seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
  5617			   acc.events[PGSCAN_DIRECT]);
  5618		seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
  5619			   acc.events[PGSTEAL_DIRECT]);
  5620		seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
  5621		seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
  5622		seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
  5623		seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
  5624	
> 5625		seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
  5626		seq_printf(m, "thp_collapse_alloc %lu\n",
  5627			   acc.events[THP_COLLAPSE_ALLOC]);
  5628	
  5629		return 0;
  5630	}
  5631	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

kernel test robot Feb. 1, 2019, 2:57 a.m. UTC | #3
Hi Chris,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.0-rc4]
[cannot apply to next-20190131]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Chris-Down/mm-memcontrol-Expose-THP-events-on-a-per-memcg-basis/20190201-022143
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gnu-gcc (Debian 8.2.0-11) 8.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=8.2.0 make.cross ARCH=sh 

All errors (new ones prefixed by >>):

   mm/memcontrol.c: In function 'memory_stat_show':
>> mm/memcontrol.c:5625:52: error: 'THP_FAULT_ALLOC' undeclared (first use in this function); did you mean 'THP_FILE_ALLOC'?
     seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
                                                       ^~~~~~~~~~~~~~~
                                                       THP_FILE_ALLOC
   mm/memcontrol.c:5625:52: note: each undeclared identifier is reported only once for each function it appears in
>> mm/memcontrol.c:5627:17: error: 'THP_COLLAPSE_ALLOC' undeclared (first use in this function); did you mean 'THP_FILE_ALLOC'?
         acc.events[THP_COLLAPSE_ALLOC]);
                    ^~~~~~~~~~~~~~~~~~
                    THP_FILE_ALLOC

vim +5625 mm/memcontrol.c

  5541	
  5542	static int memory_stat_show(struct seq_file *m, void *v)
  5543	{
  5544		struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
  5545		struct accumulated_stats acc;
  5546		int i;
  5547	
  5548		/*
  5549		 * Provide statistics on the state of the memory subsystem as
  5550		 * well as cumulative event counters that show past behavior.
  5551		 *
  5552		 * This list is ordered following a combination of these gradients:
  5553		 * 1) generic big picture -> specifics and details
  5554		 * 2) reflecting userspace activity -> reflecting kernel heuristics
  5555		 *
  5556		 * Current memory state:
  5557		 */
  5558	
  5559		memset(&acc, 0, sizeof(acc));
  5560		acc.stats_size = MEMCG_NR_STAT;
  5561		acc.events_size = NR_VM_EVENT_ITEMS;
  5562		accumulate_memcg_tree(memcg, &acc);
  5563	
  5564		seq_printf(m, "anon %llu\n",
  5565			   (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
  5566		seq_printf(m, "file %llu\n",
  5567			   (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
  5568		seq_printf(m, "kernel_stack %llu\n",
  5569			   (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
  5570		seq_printf(m, "slab %llu\n",
  5571			   (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
  5572				 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
  5573		seq_printf(m, "sock %llu\n",
  5574			   (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
  5575	
  5576		seq_printf(m, "shmem %llu\n",
  5577			   (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
  5578		seq_printf(m, "file_mapped %llu\n",
  5579			   (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
  5580		seq_printf(m, "file_dirty %llu\n",
  5581			   (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
  5582		seq_printf(m, "file_writeback %llu\n",
  5583			   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
  5584	
  5585		/*
  5586		 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
  5587		 * with the NR_ANON_THP vm counter, but right now it's a pain in the
  5588		 * arse because it requires migrating the work out of rmap to a place
  5589		 * where the page->mem_cgroup is set up and stable.
  5590		 */
  5591		seq_printf(m, "anon_thp %llu\n",
  5592			   (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
  5593	
  5594		for (i = 0; i < NR_LRU_LISTS; i++)
  5595			seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
  5596				   (u64)acc.lru_pages[i] * PAGE_SIZE);
  5597	
  5598		seq_printf(m, "slab_reclaimable %llu\n",
  5599			   (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
  5600		seq_printf(m, "slab_unreclaimable %llu\n",
  5601			   (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
  5602	
  5603		/* Accumulated memory events */
  5604	
  5605		seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
  5606		seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
  5607	
  5608		seq_printf(m, "workingset_refault %lu\n",
  5609			   acc.stat[WORKINGSET_REFAULT]);
  5610		seq_printf(m, "workingset_activate %lu\n",
  5611			   acc.stat[WORKINGSET_ACTIVATE]);
  5612		seq_printf(m, "workingset_nodereclaim %lu\n",
  5613			   acc.stat[WORKINGSET_NODERECLAIM]);
  5614	
  5615		seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
  5616		seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
  5617			   acc.events[PGSCAN_DIRECT]);
  5618		seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
  5619			   acc.events[PGSTEAL_DIRECT]);
  5620		seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
  5621		seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
  5622		seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
  5623		seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
  5624	
> 5625		seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
  5626		seq_printf(m, "thp_collapse_alloc %lu\n",
> 5627			   acc.events[THP_COLLAPSE_ALLOC]);
  5628	
  5629		return 0;
  5630	}
  5631	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

Chris Down Feb. 1, 2019, 2:58 a.m. UTC | #4
kbuild test robot writes:
>Thank you for the patch! Yet something to improve:
>
>[auto build test ERROR on linus/master]
>[also build test ERROR on v5.0-rc4]

This was already fixed and is now in linux-next.

Patch
diff mbox series

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7bf3f129c68b..b6989b39ed8e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1189,6 +1189,10 @@  PAGE_SIZE multiple when read back.
 		Amount of cached filesystem data that was modified and
 		is currently being written back to disk
 
+	  anon_thp
+		Amount of memory used in anonymous mappings backed by
+		transparent hugepages
+
 	  inactive_anon, active_anon, inactive_file, active_file, unevictable
 		Amount of memory, swap-backed and filesystem-backed,
 		on the internal memory management lists used by the
@@ -1248,6 +1252,16 @@  PAGE_SIZE multiple when read back.
 
 		Amount of reclaimed lazyfree pages
 
+	  thp_fault_alloc
+
+		Number of transparent hugepages which were allocated to satisfy
+		a page fault, including COW faults
+
+	  thp_collapse_alloc
+
+		Number of transparent hugepages which were allocated to
+		allow collapsing an existing range of pages
+
   memory.swap.current
 	A read-only single value file which exists on non-root
 	cgroups.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f5f1d4324fe2..6cb7a748aa33 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -617,6 +617,7 @@  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
+		count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
 	}
 
 	return 0;
@@ -1339,6 +1340,7 @@  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 	}
 
 	count_vm_event(THP_FAULT_ALLOC);
+	count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
 
 	if (!page)
 		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ceb242ca6ef6..54f3d33f897a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1075,6 +1075,7 @@  static void collapse_huge_page(struct mm_struct *mm,
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address, true);
 	mem_cgroup_commit_charge(new_page, memcg, false, true);
+	count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
 	lru_cache_add_active_or_unevictable(new_page, vma);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
@@ -1503,6 +1504,7 @@  static void collapse_shmem(struct mm_struct *mm,
 		page_ref_add(new_page, HPAGE_PMD_NR - 1);
 		set_page_dirty(new_page);
 		mem_cgroup_commit_charge(new_page, memcg, false, true);
+		count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
 		lru_cache_add_anon(new_page);
 
 		/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 18f4aefbe0bf..2f4fe2fb9046 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5603,6 +5603,15 @@  static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "file_writeback %llu\n",
 		   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
 
+	/*
+	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
+	 * arse because it requires migrating the work out of rmap to a place
+	 * where the page->mem_cgroup is set up and stable.
+	 */
+	seq_printf(m, "anon_thp %llu\n",
+		   (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
+
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
 			   (u64)acc.lru_pages[i] * PAGE_SIZE);
@@ -5634,6 +5643,10 @@  static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
 	seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
 
+	seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
+	seq_printf(m, "thp_collapse_alloc %lu\n",
+		   acc.events[THP_COLLAPSE_ALLOC]);
+
 	return 0;
 }