* [patch 1/3] mm: embed the memcg pointer directly into struct page
@ 2014-11-02  3:15 Johannes Weiner
  2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
                   ` (9 more replies)
  0 siblings, 10 replies; 38+ messages in thread
From: Johannes Weiner @ 2014-11-02  3:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

Memory cgroups used to have 5 per-page pointers.  To allow users to
disable that amount of overhead during runtime, those pointers were
allocated in a separate array, with a translation layer between them
and struct page.

There is now only one page pointer remaining: the memcg pointer, that
indicates which cgroup the page is associated with when charged.  The
complexity of runtime allocation and the runtime translation overhead
is no longer justified to save that *potential* 0.19% of memory.  With
CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
after the page->private member and doesn't even increase struct page,
and then this patch actually saves space.  Remaining users that care
can still compile their kernels without CONFIG_MEMCG.
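
To illustrate (a minimal sketch distilled from the hunks below, not part
of the patch itself), resolving the charged memcg changes from a lookup
through the separate page_cgroup array to a plain field access:

	/* before: translate the page to a slot in the external page_cgroup array */
	struct page_cgroup *pc = lookup_page_cgroup(page);
	struct mem_cgroup *memcg = pc->mem_cgroup;

	/* after: the pointer lives in struct page itself */
	struct mem_cgroup *memcg = page->mem_cgroup;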

   text    data     bss     dec     hex     filename
8828345 1725264  983040 11536649 b00909  vmlinux.old
8827425 1725264  966656 11519345 afc571  vmlinux.new

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h  |   6 +-
 include/linux/mm_types.h    |   5 +
 include/linux/mmzone.h      |  12 --
 include/linux/page_cgroup.h |  53 --------
 init/main.c                 |   7 -
 mm/memcontrol.c             | 124 +++++------------
 mm/page_alloc.c             |   2 -
 mm/page_cgroup.c            | 319 --------------------------------------------
 8 files changed, 41 insertions(+), 487 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d4575a1d6e99..dafba59b31b4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,7 +25,6 @@
 #include <linux/jump_label.h>
 
 struct mem_cgroup;
-struct page_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
@@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
  * memcg_kmem_uncharge_pages: uncharge pages from memcg
  * @page: pointer to struct page being freed
  * @order: allocation order.
- *
- * there is no need to specify memcg here, since it is embedded in page_cgroup
  */
 static inline void
 memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
  *
  * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
  * failure of the allocation. if @page is NULL, this function will revert the
- * charges. Otherwise, it will commit the memcg given by @memcg to the
- * corresponding page_cgroup.
+ * charges. Otherwise, it will commit @page to @memcg.
  */
 static inline void
 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index de183328abb0..57e47dffbdda 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -23,6 +23,7 @@
 #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
 
 struct address_space;
+struct mem_cgroup;
 
 #define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
 #define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
@@ -168,6 +169,10 @@ struct page {
 		struct page *first_page;	/* Compound tail pages */
 	};
 
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *mem_cgroup;
+#endif
+
 	/*
 	 * On machines where all RAM is mapped into kernel address space,
 	 * we can simply calculate the virtual address. On machines with
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 48bf12ef6620..de32d9344446 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -713,9 +713,6 @@ typedef struct pglist_data {
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
-#ifdef CONFIG_MEMCG
-	struct page_cgroup *node_page_cgroup;
-#endif
 #endif
 #ifndef CONFIG_NO_BOOTMEM
 	struct bootmem_data *bdata;
@@ -1069,7 +1066,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
 
 struct page;
-struct page_cgroup;
 struct mem_section {
 	/*
 	 * This is, logically, a pointer to an array of struct
@@ -1087,14 +1083,6 @@ struct mem_section {
 
 	/* See declaration of similar field in struct zone */
 	unsigned long *pageblock_flags;
-#ifdef CONFIG_MEMCG
-	/*
-	 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
-	 * section. (see memcontrol.h/page_cgroup.h about this.)
-	 */
-	struct page_cgroup *page_cgroup;
-	unsigned long pad;
-#endif
 	/*
 	 * WARNING: mem_section must be a power-of-2 in size for the
 	 * calculation and use of SECTION_ROOT_MASK to make sense.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 1289be6b436c..65be35785c86 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -1,59 +1,6 @@
 #ifndef __LINUX_PAGE_CGROUP_H
 #define __LINUX_PAGE_CGROUP_H
 
-struct pglist_data;
-
-#ifdef CONFIG_MEMCG
-struct mem_cgroup;
-
-/*
- * Page Cgroup can be considered as an extended mem_map.
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- * All page cgroups are allocated at boot or memory hotplug event,
- * then the page cgroup for pfn always exists.
- */
-struct page_cgroup {
-	struct mem_cgroup *mem_cgroup;
-};
-
-extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
-
-#ifdef CONFIG_SPARSEMEM
-static inline void page_cgroup_init_flatmem(void)
-{
-}
-extern void page_cgroup_init(void);
-#else
-extern void page_cgroup_init_flatmem(void);
-static inline void page_cgroup_init(void)
-{
-}
-#endif
-
-struct page_cgroup *lookup_page_cgroup(struct page *page);
-
-#else /* !CONFIG_MEMCG */
-struct page_cgroup;
-
-static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
-}
-
-static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
-	return NULL;
-}
-
-static inline void page_cgroup_init(void)
-{
-}
-
-static inline void page_cgroup_init_flatmem(void)
-{
-}
-#endif /* CONFIG_MEMCG */
-
 #include <linux/swap.h>
 
 #ifdef CONFIG_MEMCG_SWAP
diff --git a/init/main.c b/init/main.c
index c4912d9abaee..a091bbcadd33 100644
--- a/init/main.c
+++ b/init/main.c
@@ -51,7 +51,6 @@
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 #include <linux/buffer_head.h>
-#include <linux/page_cgroup.h>
 #include <linux/debug_locks.h>
 #include <linux/debugobjects.h>
 #include <linux/lockdep.h>
@@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
  */
 static void __init mm_init(void)
 {
-	/*
-	 * page_cgroup requires contiguous pages,
-	 * bigger than MAX_ORDER unless SPARSEMEM.
-	 */
-	page_cgroup_init_flatmem();
 	mem_init();
 	kmem_cache_init();
 	percpu_init_late();
@@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	page_cgroup_init();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19bac2ce827c..dc5e0abb18cb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1274,7 +1274,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 {
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
 	struct lruvec *lruvec;
 
 	if (mem_cgroup_disabled()) {
@@ -1282,8 +1281,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 		goto out;
 	}
 
-	pc = lookup_page_cgroup(page);
-	memcg = pc->mem_cgroup;
+	memcg = page->mem_cgroup;
 	/*
 	 * Swapcache readahead pages are added to the LRU - and
 	 * possibly migrated - before they are charged.
@@ -2020,16 +2018,13 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
 					      unsigned long *flags)
 {
 	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
 
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
 		return NULL;
-
-	pc = lookup_page_cgroup(page);
 again:
-	memcg = pc->mem_cgroup;
+	memcg = page->mem_cgroup;
 	if (unlikely(!memcg))
 		return NULL;
 
@@ -2038,7 +2033,7 @@ again:
 		return memcg;
 
 	spin_lock_irqsave(&memcg->move_lock, *flags);
-	if (memcg != pc->mem_cgroup) {
+	if (memcg != page->mem_cgroup) {
 		spin_unlock_irqrestore(&memcg->move_lock, *flags);
 		goto again;
 	}
@@ -2405,15 +2400,12 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
 	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
 	unsigned short id;
 	swp_entry_t ent;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-	pc = lookup_page_cgroup(page);
-	memcg = pc->mem_cgroup;
-
+	memcg = page->mem_cgroup;
 	if (memcg) {
 		if (!css_tryget_online(&memcg->css))
 			memcg = NULL;
@@ -2463,10 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 			  bool lrucare)
 {
-	struct page_cgroup *pc = lookup_page_cgroup(page);
 	int isolated;
 
-	VM_BUG_ON_PAGE(pc->mem_cgroup, page);
+	VM_BUG_ON_PAGE(page->mem_cgroup, page);
 
 	/*
 	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
@@ -2477,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 
 	/*
 	 * Nobody should be changing or seriously looking at
-	 * pc->mem_cgroup at this point:
+	 * page->mem_cgroup at this point:
 	 *
 	 * - the page is uncharged
 	 *
@@ -2489,7 +2480,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 	 * - a page cache insertion, a swapin fault, or a migration
 	 *   have the page locked
 	 */
-	pc->mem_cgroup = memcg;
+	page->mem_cgroup = memcg;
 
 	if (lrucare)
 		unlock_page_lru(page, isolated);
@@ -2972,8 +2963,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
 			      int order)
 {
-	struct page_cgroup *pc;
-
 	VM_BUG_ON(mem_cgroup_is_root(memcg));
 
 	/* The page allocation failed. Revert */
@@ -2981,14 +2970,12 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
 		memcg_uncharge_kmem(memcg, 1 << order);
 		return;
 	}
-	pc = lookup_page_cgroup(page);
-	pc->mem_cgroup = memcg;
+	page->mem_cgroup = memcg;
 }
 
 void __memcg_kmem_uncharge_pages(struct page *page, int order)
 {
-	struct page_cgroup *pc = lookup_page_cgroup(page);
-	struct mem_cgroup *memcg = pc->mem_cgroup;
+	struct mem_cgroup *memcg = page->mem_cgroup;
 
 	if (!memcg)
 		return;
@@ -2996,7 +2983,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
 
 	memcg_uncharge_kmem(memcg, 1 << order);
-	pc->mem_cgroup = NULL;
+	page->mem_cgroup = NULL;
 }
 #else
 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
@@ -3014,16 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
-	struct page_cgroup *pc = lookup_page_cgroup(head);
 	int i;
 
 	if (mem_cgroup_disabled())
 		return;
 
 	for (i = 1; i < HPAGE_PMD_NR; i++)
-		pc[i].mem_cgroup = pc[0].mem_cgroup;
+		head[i].mem_cgroup = head->mem_cgroup;
 
-	__this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 		       HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3032,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
  * mem_cgroup_move_account - move account of the page
  * @page: the page
  * @nr_pages: number of regular pages (>1 for huge pages)
- * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
  *
@@ -3045,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
  */
 static int mem_cgroup_move_account(struct page *page,
 				   unsigned int nr_pages,
-				   struct page_cgroup *pc,
 				   struct mem_cgroup *from,
 				   struct mem_cgroup *to)
 {
@@ -3065,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
 		goto out;
 
 	/*
-	 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+	 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
 	 * of its source page while we change it: page migration takes
 	 * both pages off the LRU, but page cache replacement doesn't.
 	 */
@@ -3073,7 +3057,7 @@ static int mem_cgroup_move_account(struct page *page,
 		goto out;
 
 	ret = -EINVAL;
-	if (pc->mem_cgroup != from)
+	if (page->mem_cgroup != from)
 		goto out_unlock;
 
 	spin_lock_irqsave(&from->move_lock, flags);
@@ -3093,13 +3077,13 @@ static int mem_cgroup_move_account(struct page *page,
 	}
 
 	/*
-	 * It is safe to change pc->mem_cgroup here because the page
+	 * It is safe to change page->mem_cgroup here because the page
 	 * is referenced, charged, and isolated - we can't race with
 	 * uncharging, charging, migration, or LRU putback.
 	 */
 
 	/* caller should have done css_get */
-	pc->mem_cgroup = to;
+	page->mem_cgroup = to;
 	spin_unlock_irqrestore(&from->move_lock, flags);
 
 	ret = 0;
@@ -3174,36 +3158,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 #endif
 
 #ifdef CONFIG_DEBUG_VM
-static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
-{
-	struct page_cgroup *pc;
-
-	pc = lookup_page_cgroup(page);
-	/*
-	 * Can be NULL while feeding pages into the page allocator for
-	 * the first time, i.e. during boot or memory hotplug;
-	 * or when mem_cgroup_disabled().
-	 */
-	if (likely(pc) && pc->mem_cgroup)
-		return pc;
-	return NULL;
-}
-
 bool mem_cgroup_bad_page_check(struct page *page)
 {
 	if (mem_cgroup_disabled())
 		return false;
 
-	return lookup_page_cgroup_used(page) != NULL;
+	return page->mem_cgroup != NULL;
 }
 
 void mem_cgroup_print_bad_page(struct page *page)
 {
-	struct page_cgroup *pc;
-
-	pc = lookup_page_cgroup_used(page);
-	if (pc)
-		pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup);
+	pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
 }
 #endif
 
@@ -5123,7 +5088,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
 	struct page *page = NULL;
-	struct page_cgroup *pc;
 	enum mc_target_type ret = MC_TARGET_NONE;
 	swp_entry_t ent = { .val = 0 };
 
@@ -5137,13 +5101,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 	if (!page && !ent.val)
 		return ret;
 	if (page) {
-		pc = lookup_page_cgroup(page);
 		/*
 		 * Do only loose check w/o serialization.
-		 * mem_cgroup_move_account() checks the pc is valid or
+		 * mem_cgroup_move_account() checks the page is valid or
 		 * not under LRU exclusion.
 		 */
-		if (pc->mem_cgroup == mc.from) {
+		if (page->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
 			if (target)
 				target->page = page;
@@ -5171,15 +5134,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t pmd, union mc_target *target)
 {
 	struct page *page = NULL;
-	struct page_cgroup *pc;
 	enum mc_target_type ret = MC_TARGET_NONE;
 
 	page = pmd_page(pmd);
 	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
 	if (!move_anon())
 		return ret;
-	pc = lookup_page_cgroup(page);
-	if (pc->mem_cgroup == mc.from) {
+	if (page->mem_cgroup == mc.from) {
 		ret = MC_TARGET_PAGE;
 		if (target) {
 			get_page(page);
@@ -5378,7 +5339,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	enum mc_target_type target_type;
 	union mc_target target;
 	struct page *page;
-	struct page_cgroup *pc;
 
 	/*
 	 * We don't take compound_lock() here but no race with splitting thp
@@ -5399,9 +5359,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 		if (target_type == MC_TARGET_PAGE) {
 			page = target.page;
 			if (!isolate_lru_page(page)) {
-				pc = lookup_page_cgroup(page);
 				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
-							pc, mc.from, mc.to)) {
+							     mc.from, mc.to)) {
 					mc.precharge -= HPAGE_PMD_NR;
 					mc.moved_charge += HPAGE_PMD_NR;
 				}
@@ -5429,9 +5388,7 @@ retry:
 			page = target.page;
 			if (isolate_lru_page(page))
 				goto put;
-			pc = lookup_page_cgroup(page);
-			if (!mem_cgroup_move_account(page, 1, pc,
-						     mc.from, mc.to)) {
+			if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
@@ -5623,7 +5580,6 @@ static void __init enable_swap_cgroup(void)
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
 	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
 	unsigned short oldid;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5632,8 +5588,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	if (!do_swap_account)
 		return;
 
-	pc = lookup_page_cgroup(page);
-	memcg = pc->mem_cgroup;
+	memcg = page->mem_cgroup;
 
 	/* Readahead page, never charged */
 	if (!memcg)
@@ -5643,7 +5598,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	VM_BUG_ON_PAGE(oldid, page);
 	mem_cgroup_swap_statistics(memcg, true);
 
-	pc->mem_cgroup = NULL;
+	page->mem_cgroup = NULL;
 
 	if (!mem_cgroup_is_root(memcg))
 		page_counter_uncharge(&memcg->memory, 1);
@@ -5710,7 +5665,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		goto out;
 
 	if (PageSwapCache(page)) {
-		struct page_cgroup *pc = lookup_page_cgroup(page);
 		/*
 		 * Every swap fault against a single page tries to charge the
 		 * page, bail as early as possible.  shmem_unuse() encounters
@@ -5718,7 +5672,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		 * the page lock, which serializes swap cache removal, which
 		 * in turn serializes uncharging.
 		 */
-		if (pc->mem_cgroup)
+		if (page->mem_cgroup)
 			goto out;
 	}
 
@@ -5871,7 +5825,6 @@ static void uncharge_list(struct list_head *page_list)
 	next = page_list->next;
 	do {
 		unsigned int nr_pages = 1;
-		struct page_cgroup *pc;
 
 		page = list_entry(next, struct page, lru);
 		next = page->lru.next;
@@ -5879,23 +5832,22 @@ static void uncharge_list(struct list_head *page_list)
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		VM_BUG_ON_PAGE(page_count(page), page);
 
-		pc = lookup_page_cgroup(page);
-		if (!pc->mem_cgroup)
+		if (!page->mem_cgroup)
 			continue;
 
 		/*
 		 * Nobody should be changing or seriously looking at
-		 * pc->mem_cgroup at this point, we have fully
+		 * page->mem_cgroup at this point, we have fully
 		 * exclusive access to the page.
 		 */
 
-		if (memcg != pc->mem_cgroup) {
+		if (memcg != page->mem_cgroup) {
 			if (memcg) {
 				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
 					       nr_huge, page);
 				pgpgout = nr_anon = nr_file = nr_huge = 0;
 			}
-			memcg = pc->mem_cgroup;
+			memcg = page->mem_cgroup;
 		}
 
 		if (PageTransHuge(page)) {
@@ -5909,7 +5861,7 @@ static void uncharge_list(struct list_head *page_list)
 		else
 			nr_file += nr_pages;
 
-		pc->mem_cgroup = NULL;
+		page->mem_cgroup = NULL;
 
 		pgpgout++;
 	} while (next != page_list);
@@ -5928,14 +5880,11 @@ static void uncharge_list(struct list_head *page_list)
  */
 void mem_cgroup_uncharge(struct page *page)
 {
-	struct page_cgroup *pc;
-
 	if (mem_cgroup_disabled())
 		return;
 
 	/* Don't touch page->lru of any random page, pre-check: */
-	pc = lookup_page_cgroup(page);
-	if (!pc->mem_cgroup)
+	if (!page->mem_cgroup)
 		return;
 
 	INIT_LIST_HEAD(&page->lru);
@@ -5972,7 +5921,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
 			bool lrucare)
 {
 	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
 	int isolated;
 
 	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -5987,8 +5935,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
 		return;
 
 	/* Page cache replacement: new page already charged? */
-	pc = lookup_page_cgroup(newpage);
-	if (pc->mem_cgroup)
+	if (newpage->mem_cgroup)
 		return;
 
 	/*
@@ -5997,15 +5944,14 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
 	 * uncharged page when the PFN walker finds a page that
 	 * reclaim just put back on the LRU but has not released yet.
 	 */
-	pc = lookup_page_cgroup(oldpage);
-	memcg = pc->mem_cgroup;
+	memcg = oldpage->mem_cgroup;
 	if (!memcg)
 		return;
 
 	if (lrucare)
 		lock_page_lru(oldpage, &isolated);
 
-	pc->mem_cgroup = NULL;
+	oldpage->mem_cgroup = NULL;
 
 	if (lrucare)
 		unlock_page_lru(oldpage, isolated);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fa9426389c6c..6a952237a677 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,6 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
-#include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
@@ -4899,7 +4898,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 #endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
-	pgdat_page_cgroup_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5331c2bd85a2..f0f31c1d4d0c 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -1,326 +1,7 @@
 #include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/bootmem.h>
-#include <linux/bit_spinlock.h>
 #include <linux/page_cgroup.h>
-#include <linux/hash.h>
-#include <linux/slab.h>
-#include <linux/memory.h>
 #include <linux/vmalloc.h>
-#include <linux/cgroup.h>
 #include <linux/swapops.h>
-#include <linux/kmemleak.h>
-
-static unsigned long total_usage;
-
-#if !defined(CONFIG_SPARSEMEM)
-
-
-void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
-	pgdat->node_page_cgroup = NULL;
-}
-
-struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
-	unsigned long pfn = page_to_pfn(page);
-	unsigned long offset;
-	struct page_cgroup *base;
-
-	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * The sanity checks the page allocator does upon freeing a
-	 * page can reach here before the page_cgroup arrays are
-	 * allocated when feeding a range of pages to the allocator
-	 * for the first time during bootup or memory hotplug.
-	 */
-	if (unlikely(!base))
-		return NULL;
-#endif
-	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
-	return base + offset;
-}
-
-static int __init alloc_node_page_cgroup(int nid)
-{
-	struct page_cgroup *base;
-	unsigned long table_size;
-	unsigned long nr_pages;
-
-	nr_pages = NODE_DATA(nid)->node_spanned_pages;
-	if (!nr_pages)
-		return 0;
-
-	table_size = sizeof(struct page_cgroup) * nr_pages;
-
-	base = memblock_virt_alloc_try_nid_nopanic(
-			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
-			BOOTMEM_ALLOC_ACCESSIBLE, nid);
-	if (!base)
-		return -ENOMEM;
-	NODE_DATA(nid)->node_page_cgroup = base;
-	total_usage += table_size;
-	return 0;
-}
-
-void __init page_cgroup_init_flatmem(void)
-{
-
-	int nid, fail;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	for_each_online_node(nid)  {
-		fail = alloc_node_page_cgroup(nid);
-		if (fail)
-			goto fail;
-	}
-	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
-	" don't want memory cgroups\n");
-	return;
-fail:
-	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
-	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
-	panic("Out of memory");
-}
-
-#else /* CONFIG_FLAT_NODE_MEM_MAP */
-
-struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
-	unsigned long pfn = page_to_pfn(page);
-	struct mem_section *section = __pfn_to_section(pfn);
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * The sanity checks the page allocator does upon freeing a
-	 * page can reach here before the page_cgroup arrays are
-	 * allocated when feeding a range of pages to the allocator
-	 * for the first time during bootup or memory hotplug.
-	 */
-	if (!section->page_cgroup)
-		return NULL;
-#endif
-	return section->page_cgroup + pfn;
-}
-
-static void *__meminit alloc_page_cgroup(size_t size, int nid)
-{
-	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
-	void *addr = NULL;
-
-	addr = alloc_pages_exact_nid(nid, size, flags);
-	if (addr) {
-		kmemleak_alloc(addr, size, 1, flags);
-		return addr;
-	}
-
-	if (node_state(nid, N_HIGH_MEMORY))
-		addr = vzalloc_node(size, nid);
-	else
-		addr = vzalloc(size);
-
-	return addr;
-}
-
-static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
-{
-	struct mem_section *section;
-	struct page_cgroup *base;
-	unsigned long table_size;
-
-	section = __pfn_to_section(pfn);
-
-	if (section->page_cgroup)
-		return 0;
-
-	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-	base = alloc_page_cgroup(table_size, nid);
-
-	/*
-	 * The value stored in section->page_cgroup is (base - pfn)
-	 * and it does not point to the memory block allocated above,
-	 * causing kmemleak false positives.
-	 */
-	kmemleak_not_leak(base);
-
-	if (!base) {
-		printk(KERN_ERR "page cgroup allocation failure\n");
-		return -ENOMEM;
-	}
-
-	/*
-	 * The passed "pfn" may not be aligned to SECTION.  For the calculation
-	 * we need to apply a mask.
-	 */
-	pfn &= PAGE_SECTION_MASK;
-	section->page_cgroup = base - pfn;
-	total_usage += table_size;
-	return 0;
-}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void free_page_cgroup(void *addr)
-{
-	if (is_vmalloc_addr(addr)) {
-		vfree(addr);
-	} else {
-		struct page *page = virt_to_page(addr);
-		size_t table_size =
-			sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-
-		BUG_ON(PageReserved(page));
-		kmemleak_free(addr);
-		free_pages_exact(addr, table_size);
-	}
-}
-
-static void __free_page_cgroup(unsigned long pfn)
-{
-	struct mem_section *ms;
-	struct page_cgroup *base;
-
-	ms = __pfn_to_section(pfn);
-	if (!ms || !ms->page_cgroup)
-		return;
-	base = ms->page_cgroup + pfn;
-	free_page_cgroup(base);
-	ms->page_cgroup = NULL;
-}
-
-static int __meminit online_page_cgroup(unsigned long start_pfn,
-				unsigned long nr_pages,
-				int nid)
-{
-	unsigned long start, end, pfn;
-	int fail = 0;
-
-	start = SECTION_ALIGN_DOWN(start_pfn);
-	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
-
-	if (nid == -1) {
-		/*
-		 * In this case, "nid" already exists and contains valid memory.
-		 * "start_pfn" passed to us is a pfn which is an arg for
-		 * online__pages(), and start_pfn should exist.
-		 */
-		nid = pfn_to_nid(start_pfn);
-		VM_BUG_ON(!node_state(nid, N_ONLINE));
-	}
-
-	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn, nid);
-	}
-	if (!fail)
-		return 0;
-
-	/* rollback */
-	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
-		__free_page_cgroup(pfn);
-
-	return -ENOMEM;
-}
-
-static int __meminit offline_page_cgroup(unsigned long start_pfn,
-				unsigned long nr_pages, int nid)
-{
-	unsigned long start, end, pfn;
-
-	start = SECTION_ALIGN_DOWN(start_pfn);
-	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
-
-	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
-		__free_page_cgroup(pfn);
-	return 0;
-
-}
-
-static int __meminit page_cgroup_callback(struct notifier_block *self,
-			       unsigned long action, void *arg)
-{
-	struct memory_notify *mn = arg;
-	int ret = 0;
-	switch (action) {
-	case MEM_GOING_ONLINE:
-		ret = online_page_cgroup(mn->start_pfn,
-				   mn->nr_pages, mn->status_change_nid);
-		break;
-	case MEM_OFFLINE:
-		offline_page_cgroup(mn->start_pfn,
-				mn->nr_pages, mn->status_change_nid);
-		break;
-	case MEM_CANCEL_ONLINE:
-		offline_page_cgroup(mn->start_pfn,
-				mn->nr_pages, mn->status_change_nid);
-		break;
-	case MEM_GOING_OFFLINE:
-		break;
-	case MEM_ONLINE:
-	case MEM_CANCEL_OFFLINE:
-		break;
-	}
-
-	return notifier_from_errno(ret);
-}
-
-#endif
-
-void __init page_cgroup_init(void)
-{
-	unsigned long pfn;
-	int nid;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	for_each_node_state(nid, N_MEMORY) {
-		unsigned long start_pfn, end_pfn;
-
-		start_pfn = node_start_pfn(nid);
-		end_pfn = node_end_pfn(nid);
-		/*
-		 * start_pfn and end_pfn may not be aligned to SECTION and the
-		 * page->flags of out of node pages are not initialized.  So we
-		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
-		 */
-		for (pfn = start_pfn;
-		     pfn < end_pfn;
-                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
-
-			if (!pfn_valid(pfn))
-				continue;
-			/*
-			 * Nodes's pfns can be overlapping.
-			 * We know some arch can have a nodes layout such as
-			 * -------------pfn-------------->
-			 * N0 | N1 | N2 | N0 | N1 | N2|....
-			 */
-			if (pfn_to_nid(pfn) != nid)
-				continue;
-			if (init_section_page_cgroup(pfn, nid))
-				goto oom;
-		}
-	}
-	hotplug_memory_notifier(page_cgroup_callback, 0);
-	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
-			 "don't want memory cgroups\n");
-	return;
-oom:
-	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-	panic("Out of memory");
-}
-
-void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
-	return;
-}
-
-#endif
-
 
 #ifdef CONFIG_MEMCG_SWAP
 
-- 
2.1.3



* [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
@ 2014-11-02  3:15 ` Johannes Weiner
  2014-11-03  4:22   ` David Miller
                     ` (3 more replies)
  2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
                   ` (8 subsequent siblings)
  9 siblings, 4 replies; 38+ messages in thread
From: Johannes Weiner @ 2014-11-02  3:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

Now that the external page_cgroup data structure and its lookup is
gone, the only code remaining in there is swap slot accounting.

Rename it and move the conditional compilation into mm/Makefile.
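
For existing callers the rename is mechanical; as a sketch of what a user
of the swap accounting API sees afterwards (assuming a swp_entry_t "entry"
already in scope, nothing beyond the hunks below):

	#include <linux/swap_cgroup.h>	/* was <linux/page_cgroup.h> */

	/* the functions themselves are unchanged, e.g.: */
	unsigned short id = lookup_swap_cgroup_id(entry);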

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 MAINTAINERS                 |   2 +-
 include/linux/page_cgroup.h |  40 ---------
 include/linux/swap_cgroup.h |  42 +++++++++
 mm/Makefile                 |   3 +-
 mm/memcontrol.c             |   2 +-
 mm/page_cgroup.c            | 211 --------------------------------------------
 mm/swap_cgroup.c            | 208 +++++++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c             |   1 -
 mm/swapfile.c               |   2 +-
 9 files changed, 255 insertions(+), 256 deletions(-)
 delete mode 100644 include/linux/page_cgroup.h
 create mode 100644 include/linux/swap_cgroup.h
 delete mode 100644 mm/page_cgroup.c
 create mode 100644 mm/swap_cgroup.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 7e31be07197e..3a60389d3a13 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2583,7 +2583,7 @@ L:	cgroups@vger.kernel.org
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/memcontrol.c
-F:	mm/page_cgroup.c
+F:	mm/swap_cgroup.c
 
 CORETEMP HARDWARE MONITORING DRIVER
 M:	Fenghua Yu <fenghua.yu@intel.com>
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
deleted file mode 100644
index 65be35785c86..000000000000
--- a/include/linux/page_cgroup.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __LINUX_PAGE_CGROUP_H
-#define __LINUX_PAGE_CGROUP_H
-
-#include <linux/swap.h>
-
-#ifdef CONFIG_MEMCG_SWAP
-extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-					unsigned short old, unsigned short new);
-extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
-extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
-extern int swap_cgroup_swapon(int type, unsigned long max_pages);
-extern void swap_cgroup_swapoff(int type);
-#else
-
-static inline
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
-{
-	return 0;
-}
-
-static inline
-unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
-{
-	return 0;
-}
-
-static inline int
-swap_cgroup_swapon(int type, unsigned long max_pages)
-{
-	return 0;
-}
-
-static inline void swap_cgroup_swapoff(int type)
-{
-	return;
-}
-
-#endif /* CONFIG_MEMCG_SWAP */
-
-#endif /* __LINUX_PAGE_CGROUP_H */
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
new file mode 100644
index 000000000000..145306bdc92f
--- /dev/null
+++ b/include/linux/swap_cgroup.h
@@ -0,0 +1,42 @@
+#ifndef __LINUX_SWAP_CGROUP_H
+#define __LINUX_SWAP_CGROUP_H
+
+#include <linux/swap.h>
+
+#ifdef CONFIG_MEMCG_SWAP
+
+extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+					unsigned short old, unsigned short new);
+extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
+extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
+extern int swap_cgroup_swapon(int type, unsigned long max_pages);
+extern void swap_cgroup_swapoff(int type);
+
+#else
+
+static inline
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+{
+	return 0;
+}
+
+static inline
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
+{
+	return 0;
+}
+
+static inline int
+swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	return 0;
+}
+
+static inline void swap_cgroup_swapoff(int type)
+{
+	return;
+}
+
+#endif /* CONFIG_MEMCG_SWAP */
+
+#endif /* __LINUX_SWAP_CGROUP_H */
diff --git a/mm/Makefile b/mm/Makefile
index 27ddb80403a9..d9d579484f15 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -56,7 +56,8 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
+obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dc5e0abb18cb..fbb41a170eae 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,7 +51,7 @@
 #include <linux/seq_file.h>
 #include <linux/vmpressure.h>
 #include <linux/mm_inline.h>
-#include <linux/page_cgroup.h>
+#include <linux/swap_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include <linux/lockdep.h>
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
deleted file mode 100644
index f0f31c1d4d0c..000000000000
--- a/mm/page_cgroup.c
+++ /dev/null
@@ -1,211 +0,0 @@
-#include <linux/mm.h>
-#include <linux/page_cgroup.h>
-#include <linux/vmalloc.h>
-#include <linux/swapops.h>
-
-#ifdef CONFIG_MEMCG_SWAP
-
-static DEFINE_MUTEX(swap_cgroup_mutex);
-struct swap_cgroup_ctrl {
-	struct page **map;
-	unsigned long length;
-	spinlock_t	lock;
-};
-
-static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
-
-struct swap_cgroup {
-	unsigned short		id;
-};
-#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
-
-/*
- * SwapCgroup implements "lookup" and "exchange" operations.
- * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
- * against SwapCache. At swap_free(), this is accessed directly from swap.
- *
- * This means,
- *  - we have no race in "exchange" when we're accessed via SwapCache because
- *    SwapCache(and its swp_entry) is under lock.
- *  - When called via swap_free(), there is no user of this entry and no race.
- * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
- */
-
-/*
- * allocate buffer for swap_cgroup.
- */
-static int swap_cgroup_prepare(int type)
-{
-	struct page *page;
-	struct swap_cgroup_ctrl *ctrl;
-	unsigned long idx, max;
-
-	ctrl = &swap_cgroup_ctrl[type];
-
-	for (idx = 0; idx < ctrl->length; idx++) {
-		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-		if (!page)
-			goto not_enough_page;
-		ctrl->map[idx] = page;
-	}
-	return 0;
-not_enough_page:
-	max = idx;
-	for (idx = 0; idx < max; idx++)
-		__free_page(ctrl->map[idx]);
-
-	return -ENOMEM;
-}
-
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
-					struct swap_cgroup_ctrl **ctrlp)
-{
-	pgoff_t offset = swp_offset(ent);
-	struct swap_cgroup_ctrl *ctrl;
-	struct page *mappage;
-	struct swap_cgroup *sc;
-
-	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
-	if (ctrlp)
-		*ctrlp = ctrl;
-
-	mappage = ctrl->map[offset / SC_PER_PAGE];
-	sc = page_address(mappage);
-	return sc + offset % SC_PER_PAGE;
-}
-
-/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
- *
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
- */
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-					unsigned short old, unsigned short new)
-{
-	struct swap_cgroup_ctrl *ctrl;
-	struct swap_cgroup *sc;
-	unsigned long flags;
-	unsigned short retval;
-
-	sc = lookup_swap_cgroup(ent, &ctrl);
-
-	spin_lock_irqsave(&ctrl->lock, flags);
-	retval = sc->id;
-	if (retval == old)
-		sc->id = new;
-	else
-		retval = 0;
-	spin_unlock_irqrestore(&ctrl->lock, flags);
-	return retval;
-}
-
-/**
- * swap_cgroup_record - record mem_cgroup for this swp_entry.
- * @ent: swap entry to be recorded into
- * @id: mem_cgroup to be recorded
- *
- * Returns old value at success, 0 at failure.
- * (Of course, old value can be 0.)
- */
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
-{
-	struct swap_cgroup_ctrl *ctrl;
-	struct swap_cgroup *sc;
-	unsigned short old;
-	unsigned long flags;
-
-	sc = lookup_swap_cgroup(ent, &ctrl);
-
-	spin_lock_irqsave(&ctrl->lock, flags);
-	old = sc->id;
-	sc->id = id;
-	spin_unlock_irqrestore(&ctrl->lock, flags);
-
-	return old;
-}
-
-/**
- * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
- * @ent: swap entry to be looked up.
- *
- * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
- */
-unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
-{
-	return lookup_swap_cgroup(ent, NULL)->id;
-}
-
-int swap_cgroup_swapon(int type, unsigned long max_pages)
-{
-	void *array;
-	unsigned long array_size;
-	unsigned long length;
-	struct swap_cgroup_ctrl *ctrl;
-
-	if (!do_swap_account)
-		return 0;
-
-	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
-	array_size = length * sizeof(void *);
-
-	array = vzalloc(array_size);
-	if (!array)
-		goto nomem;
-
-	ctrl = &swap_cgroup_ctrl[type];
-	mutex_lock(&swap_cgroup_mutex);
-	ctrl->length = length;
-	ctrl->map = array;
-	spin_lock_init(&ctrl->lock);
-	if (swap_cgroup_prepare(type)) {
-		/* memory shortage */
-		ctrl->map = NULL;
-		ctrl->length = 0;
-		mutex_unlock(&swap_cgroup_mutex);
-		vfree(array);
-		goto nomem;
-	}
-	mutex_unlock(&swap_cgroup_mutex);
-
-	return 0;
-nomem:
-	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
-	printk(KERN_INFO
-		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
-	return -ENOMEM;
-}
-
-void swap_cgroup_swapoff(int type)
-{
-	struct page **map;
-	unsigned long i, length;
-	struct swap_cgroup_ctrl *ctrl;
-
-	if (!do_swap_account)
-		return;
-
-	mutex_lock(&swap_cgroup_mutex);
-	ctrl = &swap_cgroup_ctrl[type];
-	map = ctrl->map;
-	length = ctrl->length;
-	ctrl->map = NULL;
-	ctrl->length = 0;
-	mutex_unlock(&swap_cgroup_mutex);
-
-	if (map) {
-		for (i = 0; i < length; i++) {
-			struct page *page = map[i];
-			if (page)
-				__free_page(page);
-		}
-		vfree(map);
-	}
-}
-
-#endif
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
new file mode 100644
index 000000000000..b5f7f24b8dd1
--- /dev/null
+++ b/mm/swap_cgroup.c
@@ -0,0 +1,208 @@
+#include <linux/swap_cgroup.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+#include <linux/swapops.h> /* depends on mm.h include */
+
+static DEFINE_MUTEX(swap_cgroup_mutex);
+struct swap_cgroup_ctrl {
+	struct page **map;
+	unsigned long length;
+	spinlock_t	lock;
+};
+
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+
+struct swap_cgroup {
+	unsigned short		id;
+};
+#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
+
+/*
+ * SwapCgroup implements "lookup" and "exchange" operations.
+ * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
+ * against SwapCache. At swap_free(), this is accessed directly from swap.
+ *
+ * This means,
+ *  - we have no race in "exchange" when we're accessed via SwapCache because
+ *    SwapCache(and its swp_entry) is under lock.
+ *  - When called via swap_free(), there is no user of this entry and no race.
+ * Then, we don't need lock around "exchange".
+ *
+ * TODO: we can push these buffers out to HIGHMEM.
+ */
+
+/*
+ * allocate buffer for swap_cgroup.
+ */
+static int swap_cgroup_prepare(int type)
+{
+	struct page *page;
+	struct swap_cgroup_ctrl *ctrl;
+	unsigned long idx, max;
+
+	ctrl = &swap_cgroup_ctrl[type];
+
+	for (idx = 0; idx < ctrl->length; idx++) {
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page)
+			goto not_enough_page;
+		ctrl->map[idx] = page;
+	}
+	return 0;
+not_enough_page:
+	max = idx;
+	for (idx = 0; idx < max; idx++)
+		__free_page(ctrl->map[idx]);
+
+	return -ENOMEM;
+}
+
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+					struct swap_cgroup_ctrl **ctrlp)
+{
+	pgoff_t offset = swp_offset(ent);
+	struct swap_cgroup_ctrl *ctrl;
+	struct page *mappage;
+	struct swap_cgroup *sc;
+
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	if (ctrlp)
+		*ctrlp = ctrl;
+
+	mappage = ctrl->map[offset / SC_PER_PAGE];
+	sc = page_address(mappage);
+	return sc + offset % SC_PER_PAGE;
+}
+
+/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns old id at success, 0 at failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+					unsigned short old, unsigned short new)
+{
+	struct swap_cgroup_ctrl *ctrl;
+	struct swap_cgroup *sc;
+	unsigned long flags;
+	unsigned short retval;
+
+	sc = lookup_swap_cgroup(ent, &ctrl);
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	retval = sc->id;
+	if (retval == old)
+		sc->id = new;
+	else
+		retval = 0;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+	return retval;
+}
+
+/**
+ * swap_cgroup_record - record mem_cgroup for this swp_entry.
+ * @ent: swap entry to be recorded into
+ * @id: mem_cgroup to be recorded
+ *
+ * Returns old value at success, 0 at failure.
+ * (Of course, old value can be 0.)
+ */
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+{
+	struct swap_cgroup_ctrl *ctrl;
+	struct swap_cgroup *sc;
+	unsigned short old;
+	unsigned long flags;
+
+	sc = lookup_swap_cgroup(ent, &ctrl);
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	old = sc->id;
+	sc->id = id;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	return old;
+}
+
+/**
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
+ * @ent: swap entry to be looked up.
+ *
+ * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ */
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
+{
+	return lookup_swap_cgroup(ent, NULL)->id;
+}
+
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	void *array;
+	unsigned long array_size;
+	unsigned long length;
+	struct swap_cgroup_ctrl *ctrl;
+
+	if (!do_swap_account)
+		return 0;
+
+	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
+	array_size = length * sizeof(void *);
+
+	array = vzalloc(array_size);
+	if (!array)
+		goto nomem;
+
+	ctrl = &swap_cgroup_ctrl[type];
+	mutex_lock(&swap_cgroup_mutex);
+	ctrl->length = length;
+	ctrl->map = array;
+	spin_lock_init(&ctrl->lock);
+	if (swap_cgroup_prepare(type)) {
+		/* memory shortage */
+		ctrl->map = NULL;
+		ctrl->length = 0;
+		mutex_unlock(&swap_cgroup_mutex);
+		vfree(array);
+		goto nomem;
+	}
+	mutex_unlock(&swap_cgroup_mutex);
+
+	return 0;
+nomem:
+	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
+	printk(KERN_INFO
+		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
+	return -ENOMEM;
+}
+
+void swap_cgroup_swapoff(int type)
+{
+	struct page **map;
+	unsigned long i, length;
+	struct swap_cgroup_ctrl *ctrl;
+
+	if (!do_swap_account)
+		return;
+
+	mutex_lock(&swap_cgroup_mutex);
+	ctrl = &swap_cgroup_ctrl[type];
+	map = ctrl->map;
+	length = ctrl->length;
+	ctrl->map = NULL;
+	ctrl->length = 0;
+	mutex_unlock(&swap_cgroup_mutex);
+
+	if (map) {
+		for (i = 0; i < length; i++) {
+			struct page *page = map[i];
+			if (page)
+				__free_page(page);
+		}
+		vfree(map);
+	}
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 154444918685..9711342987a0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,7 +17,6 @@
 #include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
-#include <linux/page_cgroup.h>
 
 #include <asm/pgtable.h>
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8798b2e0ac59..63f55ccb9b26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,7 +38,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
-#include <linux/page_cgroup.h>
+#include <linux/swap_cgroup.h>
 
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 				 unsigned char);
-- 
2.1.3



* [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
  2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
@ 2014-11-02  3:15 ` Johannes Weiner
  2014-11-03  4:23   ` David Miller
                     ` (4 more replies)
  2014-11-03  4:22 ` [patch 1/3] mm: embed the memcg pointer directly into struct page David Miller
                   ` (7 subsequent siblings)
  9 siblings, 5 replies; 38+ messages in thread
From: Johannes Weiner @ 2014-11-02  3:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

Now that the external page_cgroup data structure and its lookup is
gone, let the generic bad_page() check for page->mem_cgroup sanity.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h |  4 ----
 mm/debug.c                 |  5 ++++-
 mm/memcontrol.c            | 15 ---------------
 mm/page_alloc.c            | 12 ++++++++----
 4 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dafba59b31b4..e789551d4db0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -173,10 +173,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
 
-#ifdef CONFIG_DEBUG_VM
-bool mem_cgroup_bad_page_check(struct page *page);
-void mem_cgroup_print_bad_page(struct page *page);
-#endif
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
diff --git a/mm/debug.c b/mm/debug.c
index 5ce45c9a29b5..0e58f3211f89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
 		dump_flags(page->flags & badflags,
 				pageflag_names, ARRAY_SIZE(pageflag_names));
 	}
-	mem_cgroup_print_bad_page(page);
+#ifdef CONFIG_MEMCG
+	if (page->mem_cgroup)
+		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
+#endif
 }
 
 void dump_page(struct page *page, const char *reason)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fbb41a170eae..3645641513a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3157,21 +3157,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 }
 #endif
 
-#ifdef CONFIG_DEBUG_VM
-bool mem_cgroup_bad_page_check(struct page *page)
-{
-	if (mem_cgroup_disabled())
-		return false;
-
-	return page->mem_cgroup != NULL;
-}
-
-void mem_cgroup_print_bad_page(struct page *page)
-{
-	pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
-}
-#endif
-
 static DEFINE_MUTEX(memcg_limit_mutex);
 
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a952237a677..161da09fcda2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -653,8 +653,10 @@ static inline int free_pages_check(struct page *page)
 		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
 	}
-	if (unlikely(mem_cgroup_bad_page_check(page)))
-		bad_reason = "cgroup check failed";
+#ifdef CONFIG_MEMCG
+	if (unlikely(page->mem_cgroup))
+		bad_reason = "page still charged to cgroup";
+#endif
 	if (unlikely(bad_reason)) {
 		bad_page(page, bad_reason, bad_flags);
 		return 1;
@@ -920,8 +922,10 @@ static inline int check_new_page(struct page *page)
 		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
 	}
-	if (unlikely(mem_cgroup_bad_page_check(page)))
-		bad_reason = "cgroup check failed";
+#ifdef CONFIG_MEMCG
+	if (unlikely(page->mem_cgroup))
+		bad_reason = "page still charged to cgroup";
+#endif
 	if (unlikely(bad_reason)) {
 		bad_page(page, bad_reason, bad_flags);
 		return 1;
-- 
2.1.3



* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
  2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
  2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
@ 2014-11-03  4:22 ` David Miller
  2014-11-03  8:02 ` Joonsoo Kim
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 38+ messages in thread
From: David Miller @ 2014-11-03  4:22 UTC (permalink / raw)
  To: hannes; +Cc: akpm, mhocko, vdavydov, tj, linux-mm, cgroups, linux-kernel

From: Johannes Weiner <hannes@cmpxchg.org>
Date: Sat,  1 Nov 2014 23:15:54 -0400

> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
> 
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged.  The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.  With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space.  Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
> 
>    text    data     bss     dec     hex     filename
> 8828345 1725264  983040 11536649 b00909  vmlinux.old
> 8827425 1725264  966656 11519345 afc571  vmlinux.new
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Looks great:

Acked-by: David S. Miller <davem@davemloft.net>


* Re: [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c
  2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
@ 2014-11-03  4:22   ` David Miller
  2014-11-03 16:57   ` Michal Hocko
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 38+ messages in thread
From: David Miller @ 2014-11-03  4:22 UTC (permalink / raw)
  To: hannes; +Cc: akpm, mhocko, vdavydov, tj, linux-mm, cgroups, linux-kernel

From: Johannes Weiner <hannes@cmpxchg.org>
Date: Sat,  1 Nov 2014 23:15:55 -0400

> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
> 
> Rename it and move the conditional compilation into mm/Makefile.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: David S. Miller <davem@davemloft.net>


* Re: [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code
  2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
@ 2014-11-03  4:23   ` David Miller
  2014-11-03 16:58   ` Michal Hocko
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 38+ messages in thread
From: David Miller @ 2014-11-03  4:23 UTC (permalink / raw)
  To: hannes; +Cc: akpm, mhocko, vdavydov, tj, linux-mm, cgroups, linux-kernel

From: Johannes Weiner <hannes@cmpxchg.org>
Date: Sat,  1 Nov 2014 23:15:56 -0400

> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: David S. Miller <davem@davemloft.net>


* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
                   ` (2 preceding siblings ...)
  2014-11-03  4:22 ` [patch 1/3] mm: embed the memcg pointer directly into struct page David Miller
@ 2014-11-03  8:02 ` Joonsoo Kim
  2014-11-03 15:09   ` Johannes Weiner
  2014-11-03 16:51 ` Michal Hocko
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 38+ messages in thread
From: Joonsoo Kim @ 2014-11-03  8:02 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.

Hello, Johannes.

I'd like to keep this translation layer.
Could you just disable that code with #ifdef until the next user comes?

In our company, we use PAGE_OWNER from the mm tree, which is the feature
that records who allocated each page. Using PAGE_OWNER requires modifying
struct page and then recompiling, and that recompile makes it difficult
for us to use the feature. So we decided to implement a run-time configurable
PAGE_OWNER through page_cgroup's translation layer code. Moreover, with
this infrastructure, I plan to implement some other debugging features.

Because of my laziness, it hasn't been submitted to LKML yet, but I will
submit it as soon as possible. If the code is removed, I could
copy-and-paste it back, but that would lose the history on
that code. So if possible, I'd like to keep that code for now.

Thanks.


* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03  8:02 ` Joonsoo Kim
@ 2014-11-03 15:09   ` Johannes Weiner
  2014-11-03 16:42     ` David Miller
                       ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Johannes Weiner @ 2014-11-03 15:09 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

Hi Joonsoo,

On Mon, Nov 03, 2014 at 05:02:08PM +0900, Joonsoo Kim wrote:
> On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > Memory cgroups used to have 5 per-page pointers.  To allow users to
> > disable that amount of overhead during runtime, those pointers were
> > allocated in a separate array, with a translation layer between them
> > and struct page.
> 
> Hello, Johannes.
> 
> I'd like to keep this translation layer.
> Could you just disable that code with #ifdef until the next user comes?
> 
> In our company, we use PAGE_OWNER from the mm tree, which is the feature
> that records who allocated each page. Using PAGE_OWNER requires modifying
> struct page and then recompiling, and that recompile makes it difficult
> for us to use the feature. So we decided to implement a run-time configurable
> PAGE_OWNER through page_cgroup's translation layer code. Moreover, with
> this infrastructure, I plan to implement some other debugging features.
> 
> Because of my laziness, it hasn't been submitted to LKML yet, but I will
> submit it as soon as possible. If the code is removed, I could
> copy-and-paste it back, but that would lose the history on
> that code. So if possible, I'd like to keep that code for now.

Please re-introduce this code when your new usecase is ready to be
upstreamed.  There is little reason to burden an unrelated feature
with a sizable chunk of dead code for a vague future user.

Thanks

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 15:09   ` Johannes Weiner
@ 2014-11-03 16:42     ` David Miller
  2014-11-03 17:02     ` Michal Hocko
  2014-11-04  0:40     ` Joonsoo Kim
  2 siblings, 0 replies; 38+ messages in thread
From: David Miller @ 2014-11-03 16:42 UTC (permalink / raw)
  To: hannes
  Cc: iamjoonsoo.kim, akpm, mhocko, vdavydov, tj, linux-mm, cgroups,
	linux-kernel

From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 3 Nov 2014 10:09:42 -0500

> Please re-introduce this code when your new usecase is ready to be
> upstreamed.  There is little reason to burden an unrelated feature
> with a sizable chunk of dead code for a vague future user.

+1

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
                   ` (3 preceding siblings ...)
  2014-11-03  8:02 ` Joonsoo Kim
@ 2014-11-03 16:51 ` Michal Hocko
  2014-11-03 17:17 ` Michal Hocko
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2014-11-03 16:51 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

On Sat 01-11-14 23:15:54, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
> 
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged.  The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.  With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space.  Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.

CONFIG_SLAB should be OK as well, because there should still be one
pointer-sized slot left when no special debugging is enabled.
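
As a quick sanity check (a sketch only; the 64-byte figure is just an
assumption for a common 64-bit configuration, not taken from the patch),
one could assert at build time that the new field does not grow struct
page:

static void __init check_struct_page_size(void)
{
	/* 64 bytes is the typical struct page size on 64-bit, adjust as needed */
	BUILD_BUG_ON(sizeof(struct page) > 64);
}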

>    text    data     bss     dec     hex     filename
> 8828345 1725264  983040 11536649 b00909  vmlinux.old
> 8827425 1725264  966656 11519345 afc571  vmlinux.new
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Very nice!

Acked-by: Michal Hocko <mhocko@suse.cz>

Thanks!

> ---
>  include/linux/memcontrol.h  |   6 +-
>  include/linux/mm_types.h    |   5 +
>  include/linux/mmzone.h      |  12 --
>  include/linux/page_cgroup.h |  53 --------
>  init/main.c                 |   7 -
>  mm/memcontrol.c             | 124 +++++------------
>  mm/page_alloc.c             |   2 -
>  mm/page_cgroup.c            | 319 --------------------------------------------
>  8 files changed, 41 insertions(+), 487 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d4575a1d6e99..dafba59b31b4 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -25,7 +25,6 @@
>  #include <linux/jump_label.h>
>  
>  struct mem_cgroup;
> -struct page_cgroup;
>  struct page;
>  struct mm_struct;
>  struct kmem_cache;
> @@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
>   * memcg_kmem_uncharge_pages: uncharge pages from memcg
>   * @page: pointer to struct page being freed
>   * @order: allocation order.
> - *
> - * there is no need to specify memcg here, since it is embedded in page_cgroup
>   */
>  static inline void
>  memcg_kmem_uncharge_pages(struct page *page, int order)
> @@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
>   *
>   * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
>   * failure of the allocation. if @page is NULL, this function will revert the
> - * charges. Otherwise, it will commit the memcg given by @memcg to the
> - * corresponding page_cgroup.
> + * charges. Otherwise, it will commit @page to @memcg.
>   */
>  static inline void
>  memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index de183328abb0..57e47dffbdda 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -23,6 +23,7 @@
>  #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
>  
>  struct address_space;
> +struct mem_cgroup;
>  
>  #define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
>  #define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
> @@ -168,6 +169,10 @@ struct page {
>  		struct page *first_page;	/* Compound tail pages */
>  	};
>  
> +#ifdef CONFIG_MEMCG
> +	struct mem_cgroup *mem_cgroup;
> +#endif
> +
>  	/*
>  	 * On machines where all RAM is mapped into kernel address space,
>  	 * we can simply calculate the virtual address. On machines with
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 48bf12ef6620..de32d9344446 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -713,9 +713,6 @@ typedef struct pglist_data {
>  	int nr_zones;
>  #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
>  	struct page *node_mem_map;
> -#ifdef CONFIG_MEMCG
> -	struct page_cgroup *node_page_cgroup;
> -#endif
>  #endif
>  #ifndef CONFIG_NO_BOOTMEM
>  	struct bootmem_data *bdata;
> @@ -1069,7 +1066,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
>  #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
>  
>  struct page;
> -struct page_cgroup;
>  struct mem_section {
>  	/*
>  	 * This is, logically, a pointer to an array of struct
> @@ -1087,14 +1083,6 @@ struct mem_section {
>  
>  	/* See declaration of similar field in struct zone */
>  	unsigned long *pageblock_flags;
> -#ifdef CONFIG_MEMCG
> -	/*
> -	 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
> -	 * section. (see memcontrol.h/page_cgroup.h about this.)
> -	 */
> -	struct page_cgroup *page_cgroup;
> -	unsigned long pad;
> -#endif
>  	/*
>  	 * WARNING: mem_section must be a power-of-2 in size for the
>  	 * calculation and use of SECTION_ROOT_MASK to make sense.
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 1289be6b436c..65be35785c86 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,59 +1,6 @@
>  #ifndef __LINUX_PAGE_CGROUP_H
>  #define __LINUX_PAGE_CGROUP_H
>  
> -struct pglist_data;
> -
> -#ifdef CONFIG_MEMCG
> -struct mem_cgroup;
> -
> -/*
> - * Page Cgroup can be considered as an extended mem_map.
> - * A page_cgroup page is associated with every page descriptor. The
> - * page_cgroup helps us identify information about the cgroup
> - * All page cgroups are allocated at boot or memory hotplug event,
> - * then the page cgroup for pfn always exists.
> - */
> -struct page_cgroup {
> -	struct mem_cgroup *mem_cgroup;
> -};
> -
> -extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
> -
> -#ifdef CONFIG_SPARSEMEM
> -static inline void page_cgroup_init_flatmem(void)
> -{
> -}
> -extern void page_cgroup_init(void);
> -#else
> -extern void page_cgroup_init_flatmem(void);
> -static inline void page_cgroup_init(void)
> -{
> -}
> -#endif
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page);
> -
> -#else /* !CONFIG_MEMCG */
> -struct page_cgroup;
> -
> -static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> -}
> -
> -static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> -	return NULL;
> -}
> -
> -static inline void page_cgroup_init(void)
> -{
> -}
> -
> -static inline void page_cgroup_init_flatmem(void)
> -{
> -}
> -#endif /* CONFIG_MEMCG */
> -
>  #include <linux/swap.h>
>  
>  #ifdef CONFIG_MEMCG_SWAP
> diff --git a/init/main.c b/init/main.c
> index c4912d9abaee..a091bbcadd33 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -51,7 +51,6 @@
>  #include <linux/mempolicy.h>
>  #include <linux/key.h>
>  #include <linux/buffer_head.h>
> -#include <linux/page_cgroup.h>
>  #include <linux/debug_locks.h>
>  #include <linux/debugobjects.h>
>  #include <linux/lockdep.h>
> @@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
>   */
>  static void __init mm_init(void)
>  {
> -	/*
> -	 * page_cgroup requires contiguous pages,
> -	 * bigger than MAX_ORDER unless SPARSEMEM.
> -	 */
> -	page_cgroup_init_flatmem();
>  	mem_init();
>  	kmem_cache_init();
>  	percpu_init_late();
> @@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
>  		initrd_start = 0;
>  	}
>  #endif
> -	page_cgroup_init();
>  	debug_objects_mem_init();
>  	kmemleak_init();
>  	setup_per_cpu_pageset();
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 19bac2ce827c..dc5e0abb18cb 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1274,7 +1274,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
>  {
>  	struct mem_cgroup_per_zone *mz;
>  	struct mem_cgroup *memcg;
> -	struct page_cgroup *pc;
>  	struct lruvec *lruvec;
>  
>  	if (mem_cgroup_disabled()) {
> @@ -1282,8 +1281,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
>  		goto out;
>  	}
>  
> -	pc = lookup_page_cgroup(page);
> -	memcg = pc->mem_cgroup;
> +	memcg = page->mem_cgroup;
>  	/*
>  	 * Swapcache readahead pages are added to the LRU - and
>  	 * possibly migrated - before they are charged.
> @@ -2020,16 +2018,13 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
>  					      unsigned long *flags)
>  {
>  	struct mem_cgroup *memcg;
> -	struct page_cgroup *pc;
>  
>  	rcu_read_lock();
>  
>  	if (mem_cgroup_disabled())
>  		return NULL;
> -
> -	pc = lookup_page_cgroup(page);
>  again:
> -	memcg = pc->mem_cgroup;
> +	memcg = page->mem_cgroup;
>  	if (unlikely(!memcg))
>  		return NULL;
>  
> @@ -2038,7 +2033,7 @@ again:
>  		return memcg;
>  
>  	spin_lock_irqsave(&memcg->move_lock, *flags);
> -	if (memcg != pc->mem_cgroup) {
> +	if (memcg != page->mem_cgroup) {
>  		spin_unlock_irqrestore(&memcg->move_lock, *flags);
>  		goto again;
>  	}
> @@ -2405,15 +2400,12 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
>  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  {
>  	struct mem_cgroup *memcg;
> -	struct page_cgroup *pc;
>  	unsigned short id;
>  	swp_entry_t ent;
>  
>  	VM_BUG_ON_PAGE(!PageLocked(page), page);
>  
> -	pc = lookup_page_cgroup(page);
> -	memcg = pc->mem_cgroup;
> -
> +	memcg = page->mem_cgroup;
>  	if (memcg) {
>  		if (!css_tryget_online(&memcg->css))
>  			memcg = NULL;
> @@ -2463,10 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
>  static void commit_charge(struct page *page, struct mem_cgroup *memcg,
>  			  bool lrucare)
>  {
> -	struct page_cgroup *pc = lookup_page_cgroup(page);
>  	int isolated;
>  
> -	VM_BUG_ON_PAGE(pc->mem_cgroup, page);
> +	VM_BUG_ON_PAGE(page->mem_cgroup, page);
>  
>  	/*
>  	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
> @@ -2477,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
>  
>  	/*
>  	 * Nobody should be changing or seriously looking at
> -	 * pc->mem_cgroup at this point:
> +	 * page->mem_cgroup at this point:
>  	 *
>  	 * - the page is uncharged
>  	 *
> @@ -2489,7 +2480,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
>  	 * - a page cache insertion, a swapin fault, or a migration
>  	 *   have the page locked
>  	 */
> -	pc->mem_cgroup = memcg;
> +	page->mem_cgroup = memcg;
>  
>  	if (lrucare)
>  		unlock_page_lru(page, isolated);
> @@ -2972,8 +2963,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
>  void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
>  			      int order)
>  {
> -	struct page_cgroup *pc;
> -
>  	VM_BUG_ON(mem_cgroup_is_root(memcg));
>  
>  	/* The page allocation failed. Revert */
> @@ -2981,14 +2970,12 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
>  		memcg_uncharge_kmem(memcg, 1 << order);
>  		return;
>  	}
> -	pc = lookup_page_cgroup(page);
> -	pc->mem_cgroup = memcg;
> +	page->mem_cgroup = memcg;
>  }
>  
>  void __memcg_kmem_uncharge_pages(struct page *page, int order)
>  {
> -	struct page_cgroup *pc = lookup_page_cgroup(page);
> -	struct mem_cgroup *memcg = pc->mem_cgroup;
> +	struct mem_cgroup *memcg = page->mem_cgroup;
>  
>  	if (!memcg)
>  		return;
> @@ -2996,7 +2983,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
>  	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
>  
>  	memcg_uncharge_kmem(memcg, 1 << order);
> -	pc->mem_cgroup = NULL;
> +	page->mem_cgroup = NULL;
>  }
>  #else
>  static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
> @@ -3014,16 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
>   */
>  void mem_cgroup_split_huge_fixup(struct page *head)
>  {
> -	struct page_cgroup *pc = lookup_page_cgroup(head);
>  	int i;
>  
>  	if (mem_cgroup_disabled())
>  		return;
>  
>  	for (i = 1; i < HPAGE_PMD_NR; i++)
> -		pc[i].mem_cgroup = pc[0].mem_cgroup;
> +		head[i].mem_cgroup = head->mem_cgroup;
>  
> -	__this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> +	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
>  		       HPAGE_PMD_NR);
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> @@ -3032,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
>   * mem_cgroup_move_account - move account of the page
>   * @page: the page
>   * @nr_pages: number of regular pages (>1 for huge pages)
> - * @pc:	page_cgroup of the page.
>   * @from: mem_cgroup which the page is moved from.
>   * @to:	mem_cgroup which the page is moved to. @from != @to.
>   *
> @@ -3045,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
>   */
>  static int mem_cgroup_move_account(struct page *page,
>  				   unsigned int nr_pages,
> -				   struct page_cgroup *pc,
>  				   struct mem_cgroup *from,
>  				   struct mem_cgroup *to)
>  {
> @@ -3065,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
>  		goto out;
>  
>  	/*
> -	 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
> +	 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
>  	 * of its source page while we change it: page migration takes
>  	 * both pages off the LRU, but page cache replacement doesn't.
>  	 */
> @@ -3073,7 +3057,7 @@ static int mem_cgroup_move_account(struct page *page,
>  		goto out;
>  
>  	ret = -EINVAL;
> -	if (pc->mem_cgroup != from)
> +	if (page->mem_cgroup != from)
>  		goto out_unlock;
>  
>  	spin_lock_irqsave(&from->move_lock, flags);
> @@ -3093,13 +3077,13 @@ static int mem_cgroup_move_account(struct page *page,
>  	}
>  
>  	/*
> -	 * It is safe to change pc->mem_cgroup here because the page
> +	 * It is safe to change page->mem_cgroup here because the page
>  	 * is referenced, charged, and isolated - we can't race with
>  	 * uncharging, charging, migration, or LRU putback.
>  	 */
>  
>  	/* caller should have done css_get */
> -	pc->mem_cgroup = to;
> +	page->mem_cgroup = to;
>  	spin_unlock_irqrestore(&from->move_lock, flags);
>  
>  	ret = 0;
> @@ -3174,36 +3158,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
>  #endif
>  
>  #ifdef CONFIG_DEBUG_VM
> -static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
> -{
> -	struct page_cgroup *pc;
> -
> -	pc = lookup_page_cgroup(page);
> -	/*
> -	 * Can be NULL while feeding pages into the page allocator for
> -	 * the first time, i.e. during boot or memory hotplug;
> -	 * or when mem_cgroup_disabled().
> -	 */
> -	if (likely(pc) && pc->mem_cgroup)
> -		return pc;
> -	return NULL;
> -}
> -
>  bool mem_cgroup_bad_page_check(struct page *page)
>  {
>  	if (mem_cgroup_disabled())
>  		return false;
>  
> -	return lookup_page_cgroup_used(page) != NULL;
> +	return page->mem_cgroup != NULL;
>  }
>  
>  void mem_cgroup_print_bad_page(struct page *page)
>  {
> -	struct page_cgroup *pc;
> -
> -	pc = lookup_page_cgroup_used(page);
> -	if (pc)
> -		pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup);
> +	pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
>  }
>  #endif
>  
> @@ -5123,7 +5088,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>  		unsigned long addr, pte_t ptent, union mc_target *target)
>  {
>  	struct page *page = NULL;
> -	struct page_cgroup *pc;
>  	enum mc_target_type ret = MC_TARGET_NONE;
>  	swp_entry_t ent = { .val = 0 };
>  
> @@ -5137,13 +5101,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>  	if (!page && !ent.val)
>  		return ret;
>  	if (page) {
> -		pc = lookup_page_cgroup(page);
>  		/*
>  		 * Do only loose check w/o serialization.
> -		 * mem_cgroup_move_account() checks the pc is valid or
> +		 * mem_cgroup_move_account() checks the page is valid or
>  		 * not under LRU exclusion.
>  		 */
> -		if (pc->mem_cgroup == mc.from) {
> +		if (page->mem_cgroup == mc.from) {
>  			ret = MC_TARGET_PAGE;
>  			if (target)
>  				target->page = page;
> @@ -5171,15 +5134,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
>  		unsigned long addr, pmd_t pmd, union mc_target *target)
>  {
>  	struct page *page = NULL;
> -	struct page_cgroup *pc;
>  	enum mc_target_type ret = MC_TARGET_NONE;
>  
>  	page = pmd_page(pmd);
>  	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
>  	if (!move_anon())
>  		return ret;
> -	pc = lookup_page_cgroup(page);
> -	if (pc->mem_cgroup == mc.from) {
> +	if (page->mem_cgroup == mc.from) {
>  		ret = MC_TARGET_PAGE;
>  		if (target) {
>  			get_page(page);
> @@ -5378,7 +5339,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>  	enum mc_target_type target_type;
>  	union mc_target target;
>  	struct page *page;
> -	struct page_cgroup *pc;
>  
>  	/*
>  	 * We don't take compound_lock() here but no race with splitting thp
> @@ -5399,9 +5359,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>  		if (target_type == MC_TARGET_PAGE) {
>  			page = target.page;
>  			if (!isolate_lru_page(page)) {
> -				pc = lookup_page_cgroup(page);
>  				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
> -							pc, mc.from, mc.to)) {
> +							     mc.from, mc.to)) {
>  					mc.precharge -= HPAGE_PMD_NR;
>  					mc.moved_charge += HPAGE_PMD_NR;
>  				}
> @@ -5429,9 +5388,7 @@ retry:
>  			page = target.page;
>  			if (isolate_lru_page(page))
>  				goto put;
> -			pc = lookup_page_cgroup(page);
> -			if (!mem_cgroup_move_account(page, 1, pc,
> -						     mc.from, mc.to)) {
> +			if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
>  				mc.precharge--;
>  				/* we uncharge from mc.from later. */
>  				mc.moved_charge++;
> @@ -5623,7 +5580,6 @@ static void __init enable_swap_cgroup(void)
>  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
>  {
>  	struct mem_cgroup *memcg;
> -	struct page_cgroup *pc;
>  	unsigned short oldid;
>  
>  	VM_BUG_ON_PAGE(PageLRU(page), page);
> @@ -5632,8 +5588,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
>  	if (!do_swap_account)
>  		return;
>  
> -	pc = lookup_page_cgroup(page);
> -	memcg = pc->mem_cgroup;
> +	memcg = page->mem_cgroup;
>  
>  	/* Readahead page, never charged */
>  	if (!memcg)
> @@ -5643,7 +5598,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
>  	VM_BUG_ON_PAGE(oldid, page);
>  	mem_cgroup_swap_statistics(memcg, true);
>  
> -	pc->mem_cgroup = NULL;
> +	page->mem_cgroup = NULL;
>  
>  	if (!mem_cgroup_is_root(memcg))
>  		page_counter_uncharge(&memcg->memory, 1);
> @@ -5710,7 +5665,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
>  		goto out;
>  
>  	if (PageSwapCache(page)) {
> -		struct page_cgroup *pc = lookup_page_cgroup(page);
>  		/*
>  		 * Every swap fault against a single page tries to charge the
>  		 * page, bail as early as possible.  shmem_unuse() encounters
> @@ -5718,7 +5672,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
>  		 * the page lock, which serializes swap cache removal, which
>  		 * in turn serializes uncharging.
>  		 */
> -		if (pc->mem_cgroup)
> +		if (page->mem_cgroup)
>  			goto out;
>  	}
>  
> @@ -5871,7 +5825,6 @@ static void uncharge_list(struct list_head *page_list)
>  	next = page_list->next;
>  	do {
>  		unsigned int nr_pages = 1;
> -		struct page_cgroup *pc;
>  
>  		page = list_entry(next, struct page, lru);
>  		next = page->lru.next;
> @@ -5879,23 +5832,22 @@ static void uncharge_list(struct list_head *page_list)
>  		VM_BUG_ON_PAGE(PageLRU(page), page);
>  		VM_BUG_ON_PAGE(page_count(page), page);
>  
> -		pc = lookup_page_cgroup(page);
> -		if (!pc->mem_cgroup)
> +		if (!page->mem_cgroup)
>  			continue;
>  
>  		/*
>  		 * Nobody should be changing or seriously looking at
> -		 * pc->mem_cgroup at this point, we have fully
> +		 * page->mem_cgroup at this point, we have fully
>  		 * exclusive access to the page.
>  		 */
>  
> -		if (memcg != pc->mem_cgroup) {
> +		if (memcg != page->mem_cgroup) {
>  			if (memcg) {
>  				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
>  					       nr_huge, page);
>  				pgpgout = nr_anon = nr_file = nr_huge = 0;
>  			}
> -			memcg = pc->mem_cgroup;
> +			memcg = page->mem_cgroup;
>  		}
>  
>  		if (PageTransHuge(page)) {
> @@ -5909,7 +5861,7 @@ static void uncharge_list(struct list_head *page_list)
>  		else
>  			nr_file += nr_pages;
>  
> -		pc->mem_cgroup = NULL;
> +		page->mem_cgroup = NULL;
>  
>  		pgpgout++;
>  	} while (next != page_list);
> @@ -5928,14 +5880,11 @@ static void uncharge_list(struct list_head *page_list)
>   */
>  void mem_cgroup_uncharge(struct page *page)
>  {
> -	struct page_cgroup *pc;
> -
>  	if (mem_cgroup_disabled())
>  		return;
>  
>  	/* Don't touch page->lru of any random page, pre-check: */
> -	pc = lookup_page_cgroup(page);
> -	if (!pc->mem_cgroup)
> +	if (!page->mem_cgroup)
>  		return;
>  
>  	INIT_LIST_HEAD(&page->lru);
> @@ -5972,7 +5921,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
>  			bool lrucare)
>  {
>  	struct mem_cgroup *memcg;
> -	struct page_cgroup *pc;
>  	int isolated;
>  
>  	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
> @@ -5987,8 +5935,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
>  		return;
>  
>  	/* Page cache replacement: new page already charged? */
> -	pc = lookup_page_cgroup(newpage);
> -	if (pc->mem_cgroup)
> +	if (newpage->mem_cgroup)
>  		return;
>  
>  	/*
> @@ -5997,15 +5944,14 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
>  	 * uncharged page when the PFN walker finds a page that
>  	 * reclaim just put back on the LRU but has not released yet.
>  	 */
> -	pc = lookup_page_cgroup(oldpage);
> -	memcg = pc->mem_cgroup;
> +	memcg = oldpage->mem_cgroup;
>  	if (!memcg)
>  		return;
>  
>  	if (lrucare)
>  		lock_page_lru(oldpage, &isolated);
>  
> -	pc->mem_cgroup = NULL;
> +	oldpage->mem_cgroup = NULL;
>  
>  	if (lrucare)
>  		unlock_page_lru(oldpage, isolated);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fa9426389c6c..6a952237a677 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -48,7 +48,6 @@
>  #include <linux/backing-dev.h>
>  #include <linux/fault-inject.h>
>  #include <linux/page-isolation.h>
> -#include <linux/page_cgroup.h>
>  #include <linux/debugobjects.h>
>  #include <linux/kmemleak.h>
>  #include <linux/compaction.h>
> @@ -4899,7 +4898,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>  #endif
>  	init_waitqueue_head(&pgdat->kswapd_wait);
>  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
> -	pgdat_page_cgroup_init(pgdat);
>  
>  	for (j = 0; j < MAX_NR_ZONES; j++) {
>  		struct zone *zone = pgdat->node_zones + j;
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 5331c2bd85a2..f0f31c1d4d0c 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -1,326 +1,7 @@
>  #include <linux/mm.h>
> -#include <linux/mmzone.h>
> -#include <linux/bootmem.h>
> -#include <linux/bit_spinlock.h>
>  #include <linux/page_cgroup.h>
> -#include <linux/hash.h>
> -#include <linux/slab.h>
> -#include <linux/memory.h>
>  #include <linux/vmalloc.h>
> -#include <linux/cgroup.h>
>  #include <linux/swapops.h>
> -#include <linux/kmemleak.h>
> -
> -static unsigned long total_usage;
> -
> -#if !defined(CONFIG_SPARSEMEM)
> -
> -
> -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> -	pgdat->node_page_cgroup = NULL;
> -}
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> -	unsigned long pfn = page_to_pfn(page);
> -	unsigned long offset;
> -	struct page_cgroup *base;
> -
> -	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
> -#ifdef CONFIG_DEBUG_VM
> -	/*
> -	 * The sanity checks the page allocator does upon freeing a
> -	 * page can reach here before the page_cgroup arrays are
> -	 * allocated when feeding a range of pages to the allocator
> -	 * for the first time during bootup or memory hotplug.
> -	 */
> -	if (unlikely(!base))
> -		return NULL;
> -#endif
> -	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
> -	return base + offset;
> -}
> -
> -static int __init alloc_node_page_cgroup(int nid)
> -{
> -	struct page_cgroup *base;
> -	unsigned long table_size;
> -	unsigned long nr_pages;
> -
> -	nr_pages = NODE_DATA(nid)->node_spanned_pages;
> -	if (!nr_pages)
> -		return 0;
> -
> -	table_size = sizeof(struct page_cgroup) * nr_pages;
> -
> -	base = memblock_virt_alloc_try_nid_nopanic(
> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
> -			BOOTMEM_ALLOC_ACCESSIBLE, nid);
> -	if (!base)
> -		return -ENOMEM;
> -	NODE_DATA(nid)->node_page_cgroup = base;
> -	total_usage += table_size;
> -	return 0;
> -}
> -
> -void __init page_cgroup_init_flatmem(void)
> -{
> -
> -	int nid, fail;
> -
> -	if (mem_cgroup_disabled())
> -		return;
> -
> -	for_each_online_node(nid)  {
> -		fail = alloc_node_page_cgroup(nid);
> -		if (fail)
> -			goto fail;
> -	}
> -	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
> -	" don't want memory cgroups\n");
> -	return;
> -fail:
> -	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
> -	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
> -	panic("Out of memory");
> -}
> -
> -#else /* CONFIG_FLAT_NODE_MEM_MAP */
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> -	unsigned long pfn = page_to_pfn(page);
> -	struct mem_section *section = __pfn_to_section(pfn);
> -#ifdef CONFIG_DEBUG_VM
> -	/*
> -	 * The sanity checks the page allocator does upon freeing a
> -	 * page can reach here before the page_cgroup arrays are
> -	 * allocated when feeding a range of pages to the allocator
> -	 * for the first time during bootup or memory hotplug.
> -	 */
> -	if (!section->page_cgroup)
> -		return NULL;
> -#endif
> -	return section->page_cgroup + pfn;
> -}
> -
> -static void *__meminit alloc_page_cgroup(size_t size, int nid)
> -{
> -	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
> -	void *addr = NULL;
> -
> -	addr = alloc_pages_exact_nid(nid, size, flags);
> -	if (addr) {
> -		kmemleak_alloc(addr, size, 1, flags);
> -		return addr;
> -	}
> -
> -	if (node_state(nid, N_HIGH_MEMORY))
> -		addr = vzalloc_node(size, nid);
> -	else
> -		addr = vzalloc(size);
> -
> -	return addr;
> -}
> -
> -static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
> -{
> -	struct mem_section *section;
> -	struct page_cgroup *base;
> -	unsigned long table_size;
> -
> -	section = __pfn_to_section(pfn);
> -
> -	if (section->page_cgroup)
> -		return 0;
> -
> -	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> -	base = alloc_page_cgroup(table_size, nid);
> -
> -	/*
> -	 * The value stored in section->page_cgroup is (base - pfn)
> -	 * and it does not point to the memory block allocated above,
> -	 * causing kmemleak false positives.
> -	 */
> -	kmemleak_not_leak(base);
> -
> -	if (!base) {
> -		printk(KERN_ERR "page cgroup allocation failure\n");
> -		return -ENOMEM;
> -	}
> -
> -	/*
> -	 * The passed "pfn" may not be aligned to SECTION.  For the calculation
> -	 * we need to apply a mask.
> -	 */
> -	pfn &= PAGE_SECTION_MASK;
> -	section->page_cgroup = base - pfn;
> -	total_usage += table_size;
> -	return 0;
> -}
> -#ifdef CONFIG_MEMORY_HOTPLUG
> -static void free_page_cgroup(void *addr)
> -{
> -	if (is_vmalloc_addr(addr)) {
> -		vfree(addr);
> -	} else {
> -		struct page *page = virt_to_page(addr);
> -		size_t table_size =
> -			sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> -
> -		BUG_ON(PageReserved(page));
> -		kmemleak_free(addr);
> -		free_pages_exact(addr, table_size);
> -	}
> -}
> -
> -static void __free_page_cgroup(unsigned long pfn)
> -{
> -	struct mem_section *ms;
> -	struct page_cgroup *base;
> -
> -	ms = __pfn_to_section(pfn);
> -	if (!ms || !ms->page_cgroup)
> -		return;
> -	base = ms->page_cgroup + pfn;
> -	free_page_cgroup(base);
> -	ms->page_cgroup = NULL;
> -}
> -
> -static int __meminit online_page_cgroup(unsigned long start_pfn,
> -				unsigned long nr_pages,
> -				int nid)
> -{
> -	unsigned long start, end, pfn;
> -	int fail = 0;
> -
> -	start = SECTION_ALIGN_DOWN(start_pfn);
> -	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
> -
> -	if (nid == -1) {
> -		/*
> -		 * In this case, "nid" already exists and contains valid memory.
> -		 * "start_pfn" passed to us is a pfn which is an arg for
> -		 * online__pages(), and start_pfn should exist.
> -		 */
> -		nid = pfn_to_nid(start_pfn);
> -		VM_BUG_ON(!node_state(nid, N_ONLINE));
> -	}
> -
> -	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
> -		if (!pfn_present(pfn))
> -			continue;
> -		fail = init_section_page_cgroup(pfn, nid);
> -	}
> -	if (!fail)
> -		return 0;
> -
> -	/* rollback */
> -	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
> -		__free_page_cgroup(pfn);
> -
> -	return -ENOMEM;
> -}
> -
> -static int __meminit offline_page_cgroup(unsigned long start_pfn,
> -				unsigned long nr_pages, int nid)
> -{
> -	unsigned long start, end, pfn;
> -
> -	start = SECTION_ALIGN_DOWN(start_pfn);
> -	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
> -
> -	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
> -		__free_page_cgroup(pfn);
> -	return 0;
> -
> -}
> -
> -static int __meminit page_cgroup_callback(struct notifier_block *self,
> -			       unsigned long action, void *arg)
> -{
> -	struct memory_notify *mn = arg;
> -	int ret = 0;
> -	switch (action) {
> -	case MEM_GOING_ONLINE:
> -		ret = online_page_cgroup(mn->start_pfn,
> -				   mn->nr_pages, mn->status_change_nid);
> -		break;
> -	case MEM_OFFLINE:
> -		offline_page_cgroup(mn->start_pfn,
> -				mn->nr_pages, mn->status_change_nid);
> -		break;
> -	case MEM_CANCEL_ONLINE:
> -		offline_page_cgroup(mn->start_pfn,
> -				mn->nr_pages, mn->status_change_nid);
> -		break;
> -	case MEM_GOING_OFFLINE:
> -		break;
> -	case MEM_ONLINE:
> -	case MEM_CANCEL_OFFLINE:
> -		break;
> -	}
> -
> -	return notifier_from_errno(ret);
> -}
> -
> -#endif
> -
> -void __init page_cgroup_init(void)
> -{
> -	unsigned long pfn;
> -	int nid;
> -
> -	if (mem_cgroup_disabled())
> -		return;
> -
> -	for_each_node_state(nid, N_MEMORY) {
> -		unsigned long start_pfn, end_pfn;
> -
> -		start_pfn = node_start_pfn(nid);
> -		end_pfn = node_end_pfn(nid);
> -		/*
> -		 * start_pfn and end_pfn may not be aligned to SECTION and the
> -		 * page->flags of out of node pages are not initialized.  So we
> -		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
> -		 */
> -		for (pfn = start_pfn;
> -		     pfn < end_pfn;
> -                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
> -
> -			if (!pfn_valid(pfn))
> -				continue;
> -			/*
> -			 * Nodes's pfns can be overlapping.
> -			 * We know some arch can have a nodes layout such as
> -			 * -------------pfn-------------->
> -			 * N0 | N1 | N2 | N0 | N1 | N2|....
> -			 */
> -			if (pfn_to_nid(pfn) != nid)
> -				continue;
> -			if (init_section_page_cgroup(pfn, nid))
> -				goto oom;
> -		}
> -	}
> -	hotplug_memory_notifier(page_cgroup_callback, 0);
> -	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
> -			 "don't want memory cgroups\n");
> -	return;
> -oom:
> -	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
> -	panic("Out of memory");
> -}
> -
> -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> -	return;
> -}
> -
> -#endif
> -
>  
>  #ifdef CONFIG_MEMCG_SWAP
>  
> -- 
> 2.1.3
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c
  2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
  2014-11-03  4:22   ` David Miller
@ 2014-11-03 16:57   ` Michal Hocko
  2014-11-03 17:30   ` Vladimir Davydov
  2014-11-04  8:37   ` Kamezawa Hiroyuki
  3 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2014-11-03 16:57 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

On Sat 01-11-14 23:15:55, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
> 
> Rename it and move the conditional compilation into mm/Makefile.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Michal Hocko <mhocko@suse.cz>

> ---
>  MAINTAINERS                 |   2 +-
>  include/linux/page_cgroup.h |  40 ---------
>  include/linux/swap_cgroup.h |  42 +++++++++
>  mm/Makefile                 |   3 +-
>  mm/memcontrol.c             |   2 +-
>  mm/page_cgroup.c            | 211 --------------------------------------------
>  mm/swap_cgroup.c            | 208 +++++++++++++++++++++++++++++++++++++++++++
>  mm/swap_state.c             |   1 -
>  mm/swapfile.c               |   2 +-
>  9 files changed, 255 insertions(+), 256 deletions(-)
>  delete mode 100644 include/linux/page_cgroup.h
>  create mode 100644 include/linux/swap_cgroup.h
>  delete mode 100644 mm/page_cgroup.c
>  create mode 100644 mm/swap_cgroup.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7e31be07197e..3a60389d3a13 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2583,7 +2583,7 @@ L:	cgroups@vger.kernel.org
>  L:	linux-mm@kvack.org
>  S:	Maintained
>  F:	mm/memcontrol.c
> -F:	mm/page_cgroup.c
> +F:	mm/swap_cgroup.c
>  
>  CORETEMP HARDWARE MONITORING DRIVER
>  M:	Fenghua Yu <fenghua.yu@intel.com>
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> deleted file mode 100644
> index 65be35785c86..000000000000
> --- a/include/linux/page_cgroup.h
> +++ /dev/null
> @@ -1,40 +0,0 @@
> -#ifndef __LINUX_PAGE_CGROUP_H
> -#define __LINUX_PAGE_CGROUP_H
> -
> -#include <linux/swap.h>
> -
> -#ifdef CONFIG_MEMCG_SWAP
> -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> -					unsigned short old, unsigned short new);
> -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
> -extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
> -extern int swap_cgroup_swapon(int type, unsigned long max_pages);
> -extern void swap_cgroup_swapoff(int type);
> -#else
> -
> -static inline
> -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> -{
> -	return 0;
> -}
> -
> -static inline
> -unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> -{
> -	return 0;
> -}
> -
> -static inline int
> -swap_cgroup_swapon(int type, unsigned long max_pages)
> -{
> -	return 0;
> -}
> -
> -static inline void swap_cgroup_swapoff(int type)
> -{
> -	return;
> -}
> -
> -#endif /* CONFIG_MEMCG_SWAP */
> -
> -#endif /* __LINUX_PAGE_CGROUP_H */
> diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
> new file mode 100644
> index 000000000000..145306bdc92f
> --- /dev/null
> +++ b/include/linux/swap_cgroup.h
> @@ -0,0 +1,42 @@
> +#ifndef __LINUX_SWAP_CGROUP_H
> +#define __LINUX_SWAP_CGROUP_H
> +
> +#include <linux/swap.h>
> +
> +#ifdef CONFIG_MEMCG_SWAP
> +
> +extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> +					unsigned short old, unsigned short new);
> +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
> +extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
> +extern int swap_cgroup_swapon(int type, unsigned long max_pages);
> +extern void swap_cgroup_swapoff(int type);
> +
> +#else
> +
> +static inline
> +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> +{
> +	return 0;
> +}
> +
> +static inline
> +unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> +{
> +	return 0;
> +}
> +
> +static inline int
> +swap_cgroup_swapon(int type, unsigned long max_pages)
> +{
> +	return 0;
> +}
> +
> +static inline void swap_cgroup_swapoff(int type)
> +{
> +	return;
> +}
> +
> +#endif /* CONFIG_MEMCG_SWAP */
> +
> +#endif /* __LINUX_SWAP_CGROUP_H */
> diff --git a/mm/Makefile b/mm/Makefile
> index 27ddb80403a9..d9d579484f15 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -56,7 +56,8 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>  obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
> -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
> +obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
> +obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
>  obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
>  obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>  obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index dc5e0abb18cb..fbb41a170eae 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -51,7 +51,7 @@
>  #include <linux/seq_file.h>
>  #include <linux/vmpressure.h>
>  #include <linux/mm_inline.h>
> -#include <linux/page_cgroup.h>
> +#include <linux/swap_cgroup.h>
>  #include <linux/cpu.h>
>  #include <linux/oom.h>
>  #include <linux/lockdep.h>
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> deleted file mode 100644
> index f0f31c1d4d0c..000000000000
> --- a/mm/page_cgroup.c
> +++ /dev/null
> @@ -1,211 +0,0 @@
> -#include <linux/mm.h>
> -#include <linux/page_cgroup.h>
> -#include <linux/vmalloc.h>
> -#include <linux/swapops.h>
> -
> -#ifdef CONFIG_MEMCG_SWAP
> -
> -static DEFINE_MUTEX(swap_cgroup_mutex);
> -struct swap_cgroup_ctrl {
> -	struct page **map;
> -	unsigned long length;
> -	spinlock_t	lock;
> -};
> -
> -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
> -
> -struct swap_cgroup {
> -	unsigned short		id;
> -};
> -#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
> -
> -/*
> - * SwapCgroup implements "lookup" and "exchange" operations.
> - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
> - * against SwapCache. At swap_free(), this is accessed directly from swap.
> - *
> - * This means,
> - *  - we have no race in "exchange" when we're accessed via SwapCache because
> - *    SwapCache(and its swp_entry) is under lock.
> - *  - When called via swap_free(), there is no user of this entry and no race.
> - * Then, we don't need lock around "exchange".
> - *
> - * TODO: we can push these buffers out to HIGHMEM.
> - */
> -
> -/*
> - * allocate buffer for swap_cgroup.
> - */
> -static int swap_cgroup_prepare(int type)
> -{
> -	struct page *page;
> -	struct swap_cgroup_ctrl *ctrl;
> -	unsigned long idx, max;
> -
> -	ctrl = &swap_cgroup_ctrl[type];
> -
> -	for (idx = 0; idx < ctrl->length; idx++) {
> -		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -		if (!page)
> -			goto not_enough_page;
> -		ctrl->map[idx] = page;
> -	}
> -	return 0;
> -not_enough_page:
> -	max = idx;
> -	for (idx = 0; idx < max; idx++)
> -		__free_page(ctrl->map[idx]);
> -
> -	return -ENOMEM;
> -}
> -
> -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
> -					struct swap_cgroup_ctrl **ctrlp)
> -{
> -	pgoff_t offset = swp_offset(ent);
> -	struct swap_cgroup_ctrl *ctrl;
> -	struct page *mappage;
> -	struct swap_cgroup *sc;
> -
> -	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
> -	if (ctrlp)
> -		*ctrlp = ctrl;
> -
> -	mappage = ctrl->map[offset / SC_PER_PAGE];
> -	sc = page_address(mappage);
> -	return sc + offset % SC_PER_PAGE;
> -}
> -
> -/**
> - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
> - * @ent: swap entry to be cmpxchged
> - * @old: old id
> - * @new: new id
> - *
> - * Returns old id at success, 0 at failure.
> - * (There is no mem_cgroup using 0 as its id)
> - */
> -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> -					unsigned short old, unsigned short new)
> -{
> -	struct swap_cgroup_ctrl *ctrl;
> -	struct swap_cgroup *sc;
> -	unsigned long flags;
> -	unsigned short retval;
> -
> -	sc = lookup_swap_cgroup(ent, &ctrl);
> -
> -	spin_lock_irqsave(&ctrl->lock, flags);
> -	retval = sc->id;
> -	if (retval == old)
> -		sc->id = new;
> -	else
> -		retval = 0;
> -	spin_unlock_irqrestore(&ctrl->lock, flags);
> -	return retval;
> -}
> -
> -/**
> - * swap_cgroup_record - record mem_cgroup for this swp_entry.
> - * @ent: swap entry to be recorded into
> - * @id: mem_cgroup to be recorded
> - *
> - * Returns old value at success, 0 at failure.
> - * (Of course, old value can be 0.)
> - */
> -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> -{
> -	struct swap_cgroup_ctrl *ctrl;
> -	struct swap_cgroup *sc;
> -	unsigned short old;
> -	unsigned long flags;
> -
> -	sc = lookup_swap_cgroup(ent, &ctrl);
> -
> -	spin_lock_irqsave(&ctrl->lock, flags);
> -	old = sc->id;
> -	sc->id = id;
> -	spin_unlock_irqrestore(&ctrl->lock, flags);
> -
> -	return old;
> -}
> -
> -/**
> - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
> - * @ent: swap entry to be looked up.
> - *
> - * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
> - */
> -unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> -{
> -	return lookup_swap_cgroup(ent, NULL)->id;
> -}
> -
> -int swap_cgroup_swapon(int type, unsigned long max_pages)
> -{
> -	void *array;
> -	unsigned long array_size;
> -	unsigned long length;
> -	struct swap_cgroup_ctrl *ctrl;
> -
> -	if (!do_swap_account)
> -		return 0;
> -
> -	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
> -	array_size = length * sizeof(void *);
> -
> -	array = vzalloc(array_size);
> -	if (!array)
> -		goto nomem;
> -
> -	ctrl = &swap_cgroup_ctrl[type];
> -	mutex_lock(&swap_cgroup_mutex);
> -	ctrl->length = length;
> -	ctrl->map = array;
> -	spin_lock_init(&ctrl->lock);
> -	if (swap_cgroup_prepare(type)) {
> -		/* memory shortage */
> -		ctrl->map = NULL;
> -		ctrl->length = 0;
> -		mutex_unlock(&swap_cgroup_mutex);
> -		vfree(array);
> -		goto nomem;
> -	}
> -	mutex_unlock(&swap_cgroup_mutex);
> -
> -	return 0;
> -nomem:
> -	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
> -	printk(KERN_INFO
> -		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
> -	return -ENOMEM;
> -}
> -
> -void swap_cgroup_swapoff(int type)
> -{
> -	struct page **map;
> -	unsigned long i, length;
> -	struct swap_cgroup_ctrl *ctrl;
> -
> -	if (!do_swap_account)
> -		return;
> -
> -	mutex_lock(&swap_cgroup_mutex);
> -	ctrl = &swap_cgroup_ctrl[type];
> -	map = ctrl->map;
> -	length = ctrl->length;
> -	ctrl->map = NULL;
> -	ctrl->length = 0;
> -	mutex_unlock(&swap_cgroup_mutex);
> -
> -	if (map) {
> -		for (i = 0; i < length; i++) {
> -			struct page *page = map[i];
> -			if (page)
> -				__free_page(page);
> -		}
> -		vfree(map);
> -	}
> -}
> -
> -#endif
> diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
> new file mode 100644
> index 000000000000..b5f7f24b8dd1
> --- /dev/null
> +++ b/mm/swap_cgroup.c
> @@ -0,0 +1,208 @@
> +#include <linux/swap_cgroup.h>
> +#include <linux/vmalloc.h>
> +#include <linux/mm.h>
> +
> +#include <linux/swapops.h> /* depends on mm.h include */
> +
> +static DEFINE_MUTEX(swap_cgroup_mutex);
> +struct swap_cgroup_ctrl {
> +	struct page **map;
> +	unsigned long length;
> +	spinlock_t	lock;
> +};
> +
> +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
> +
> +struct swap_cgroup {
> +	unsigned short		id;
> +};
> +#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
> +
> +/*
> + * SwapCgroup implements "lookup" and "exchange" operations.
> + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
> + * against SwapCache. At swap_free(), this is accessed directly from swap.
> + *
> + * This means,
> + *  - we have no race in "exchange" when we're accessed via SwapCache because
> + *    SwapCache(and its swp_entry) is under lock.
> + *  - When called via swap_free(), there is no user of this entry and no race.
> + * Then, we don't need lock around "exchange".
> + *
> + * TODO: we can push these buffers out to HIGHMEM.
> + */
> +
> +/*
> + * allocate buffer for swap_cgroup.
> + */
> +static int swap_cgroup_prepare(int type)
> +{
> +	struct page *page;
> +	struct swap_cgroup_ctrl *ctrl;
> +	unsigned long idx, max;
> +
> +	ctrl = &swap_cgroup_ctrl[type];
> +
> +	for (idx = 0; idx < ctrl->length; idx++) {
> +		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +		if (!page)
> +			goto not_enough_page;
> +		ctrl->map[idx] = page;
> +	}
> +	return 0;
> +not_enough_page:
> +	max = idx;
> +	for (idx = 0; idx < max; idx++)
> +		__free_page(ctrl->map[idx]);
> +
> +	return -ENOMEM;
> +}
> +
> +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
> +					struct swap_cgroup_ctrl **ctrlp)
> +{
> +	pgoff_t offset = swp_offset(ent);
> +	struct swap_cgroup_ctrl *ctrl;
> +	struct page *mappage;
> +	struct swap_cgroup *sc;
> +
> +	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
> +	if (ctrlp)
> +		*ctrlp = ctrl;
> +
> +	mappage = ctrl->map[offset / SC_PER_PAGE];
> +	sc = page_address(mappage);
> +	return sc + offset % SC_PER_PAGE;
> +}
> +
> +/**
> + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
> + * @ent: swap entry to be cmpxchged
> + * @old: old id
> + * @new: new id
> + *
> + * Returns old id at success, 0 at failure.
> + * (There is no mem_cgroup using 0 as its id)
> + */
> +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> +					unsigned short old, unsigned short new)
> +{
> +	struct swap_cgroup_ctrl *ctrl;
> +	struct swap_cgroup *sc;
> +	unsigned long flags;
> +	unsigned short retval;
> +
> +	sc = lookup_swap_cgroup(ent, &ctrl);
> +
> +	spin_lock_irqsave(&ctrl->lock, flags);
> +	retval = sc->id;
> +	if (retval == old)
> +		sc->id = new;
> +	else
> +		retval = 0;
> +	spin_unlock_irqrestore(&ctrl->lock, flags);
> +	return retval;
> +}
> +
> +/**
> + * swap_cgroup_record - record mem_cgroup for this swp_entry.
> + * @ent: swap entry to be recorded into
> + * @id: mem_cgroup to be recorded
> + *
> + * Returns old value at success, 0 at failure.
> + * (Of course, old value can be 0.)
> + */
> +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> +{
> +	struct swap_cgroup_ctrl *ctrl;
> +	struct swap_cgroup *sc;
> +	unsigned short old;
> +	unsigned long flags;
> +
> +	sc = lookup_swap_cgroup(ent, &ctrl);
> +
> +	spin_lock_irqsave(&ctrl->lock, flags);
> +	old = sc->id;
> +	sc->id = id;
> +	spin_unlock_irqrestore(&ctrl->lock, flags);
> +
> +	return old;
> +}
> +
> +/**
> + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
> + * @ent: swap entry to be looked up.
> + *
> + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
> + */
> +unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> +{
> +	return lookup_swap_cgroup(ent, NULL)->id;
> +}
> +
> +int swap_cgroup_swapon(int type, unsigned long max_pages)
> +{
> +	void *array;
> +	unsigned long array_size;
> +	unsigned long length;
> +	struct swap_cgroup_ctrl *ctrl;
> +
> +	if (!do_swap_account)
> +		return 0;
> +
> +	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
> +	array_size = length * sizeof(void *);
> +
> +	array = vzalloc(array_size);
> +	if (!array)
> +		goto nomem;
> +
> +	ctrl = &swap_cgroup_ctrl[type];
> +	mutex_lock(&swap_cgroup_mutex);
> +	ctrl->length = length;
> +	ctrl->map = array;
> +	spin_lock_init(&ctrl->lock);
> +	if (swap_cgroup_prepare(type)) {
> +		/* memory shortage */
> +		ctrl->map = NULL;
> +		ctrl->length = 0;
> +		mutex_unlock(&swap_cgroup_mutex);
> +		vfree(array);
> +		goto nomem;
> +	}
> +	mutex_unlock(&swap_cgroup_mutex);
> +
> +	return 0;
> +nomem:
> +	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
> +	printk(KERN_INFO
> +		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
> +	return -ENOMEM;
> +}
> +
> +void swap_cgroup_swapoff(int type)
> +{
> +	struct page **map;
> +	unsigned long i, length;
> +	struct swap_cgroup_ctrl *ctrl;
> +
> +	if (!do_swap_account)
> +		return;
> +
> +	mutex_lock(&swap_cgroup_mutex);
> +	ctrl = &swap_cgroup_ctrl[type];
> +	map = ctrl->map;
> +	length = ctrl->length;
> +	ctrl->map = NULL;
> +	ctrl->length = 0;
> +	mutex_unlock(&swap_cgroup_mutex);
> +
> +	if (map) {
> +		for (i = 0; i < length; i++) {
> +			struct page *page = map[i];
> +			if (page)
> +				__free_page(page);
> +		}
> +		vfree(map);
> +	}
> +}
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 154444918685..9711342987a0 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -17,7 +17,6 @@
>  #include <linux/blkdev.h>
>  #include <linux/pagevec.h>
>  #include <linux/migrate.h>
> -#include <linux/page_cgroup.h>
>  
>  #include <asm/pgtable.h>
>  
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 8798b2e0ac59..63f55ccb9b26 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -38,7 +38,7 @@
>  #include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
>  #include <linux/swapops.h>
> -#include <linux/page_cgroup.h>
> +#include <linux/swap_cgroup.h>
>  
>  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
>  				 unsigned char);
> -- 
> 2.1.3
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code
  2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
  2014-11-03  4:23   ` David Miller
@ 2014-11-03 16:58   ` Michal Hocko
  2014-11-03 17:32   ` Vladimir Davydov
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2014-11-03 16:58 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

On Sat 01-11-14 23:15:56, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Michal Hocko <mhocko@suse.cz>

> ---
>  include/linux/memcontrol.h |  4 ----
>  mm/debug.c                 |  5 ++++-
>  mm/memcontrol.c            | 15 ---------------
>  mm/page_alloc.c            | 12 ++++++++----
>  4 files changed, 12 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index dafba59b31b4..e789551d4db0 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -173,10 +173,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
>  void mem_cgroup_split_huge_fixup(struct page *head);
>  #endif
>  
> -#ifdef CONFIG_DEBUG_VM
> -bool mem_cgroup_bad_page_check(struct page *page);
> -void mem_cgroup_print_bad_page(struct page *page);
> -#endif
>  #else /* CONFIG_MEMCG */
>  struct mem_cgroup;
>  
> diff --git a/mm/debug.c b/mm/debug.c
> index 5ce45c9a29b5..0e58f3211f89 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
>  		dump_flags(page->flags & badflags,
>  				pageflag_names, ARRAY_SIZE(pageflag_names));
>  	}
> -	mem_cgroup_print_bad_page(page);
> +#ifdef CONFIG_MEMCG
> +	if (page->mem_cgroup)
> +		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
> +#endif
>  }
>  
>  void dump_page(struct page *page, const char *reason)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index fbb41a170eae..3645641513a1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3157,21 +3157,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
>  }
>  #endif
>  
> -#ifdef CONFIG_DEBUG_VM
> -bool mem_cgroup_bad_page_check(struct page *page)
> -{
> -	if (mem_cgroup_disabled())
> -		return false;
> -
> -	return page->mem_cgroup != NULL;
> -}
> -
> -void mem_cgroup_print_bad_page(struct page *page)
> -{
> -	pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
> -}
> -#endif
> -
>  static DEFINE_MUTEX(memcg_limit_mutex);
>  
>  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6a952237a677..161da09fcda2 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -653,8 +653,10 @@ static inline int free_pages_check(struct page *page)
>  		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
>  		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
>  	}
> -	if (unlikely(mem_cgroup_bad_page_check(page)))
> -		bad_reason = "cgroup check failed";
> +#ifdef CONFIG_MEMCG
> +	if (unlikely(page->mem_cgroup))
> +		bad_reason = "page still charged to cgroup";
> +#endif
>  	if (unlikely(bad_reason)) {
>  		bad_page(page, bad_reason, bad_flags);
>  		return 1;
> @@ -920,8 +922,10 @@ static inline int check_new_page(struct page *page)
>  		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
>  		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
>  	}
> -	if (unlikely(mem_cgroup_bad_page_check(page)))
> -		bad_reason = "cgroup check failed";
> +#ifdef CONFIG_MEMCG
> +	if (unlikely(page->mem_cgroup))
> +		bad_reason = "page still charged to cgroup";
> +#endif
>  	if (unlikely(bad_reason)) {
>  		bad_page(page, bad_reason, bad_flags);
>  		return 1;
> -- 
> 2.1.3
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 15:09   ` Johannes Weiner
  2014-11-03 16:42     ` David Miller
@ 2014-11-03 17:02     ` Michal Hocko
  2014-11-04  0:40     ` Joonsoo Kim
  2 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2014-11-03 17:02 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Joonsoo Kim, Andrew Morton, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Mon 03-11-14 10:09:42, Johannes Weiner wrote:
> Hi Joonsoo,
> 
> On Mon, Nov 03, 2014 at 05:02:08PM +0900, Joonsoo Kim wrote:
> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > > Memory cgroups used to have 5 per-page pointers.  To allow users to
> > > disable that amount of overhead during runtime, those pointers were
> > > allocated in a separate array, with a translation layer between them
> > > and struct page.
> > 
> > Hello, Johannes.
> > 
> > I'd like to keep this translation layer.
> > Could you just disable that code with #ifdef until the next user comes?
> > 
> > In our company, we use PAGE_OWNER from the mm tree, which is a feature
> > that records who allocated each page. Using PAGE_OWNER requires modifying
> > struct page and thus recompiling the kernel, and this recompile makes it
> > difficult for us to use the feature. So we decided to implement a
> > run-time configurable PAGE_OWNER through page_cgroup's translation layer
> > code. Moreover, with this infrastructure, I plan to implement some other
> > debugging features.
> > 
> > Because of my laziness, it hasn't been submitted to LKML yet, but I will
> > submit it as soon as possible. If the code is removed, I could
> > copy-and-paste it back, but that would lose the history on that code.
> > So, if possible, I'd like to keep that code for now.
> 
> Please re-introduce this code when your new usecase is ready to be
> upstreamed.  There is little reason to burden an unrelated feature
> with a sizable chunk of dead code for a vague future user.

Completely agreed! I would hate to have some random feature piggyback
on memcg-internal structures. For that to be acceptable, page_cgroup
would have to be abstracted into a more generic structure, which wouldn't
be under CONFIG_MEMCG anyway.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
                   ` (4 preceding siblings ...)
  2014-11-03 16:51 ` Michal Hocko
@ 2014-11-03 17:17 ` Michal Hocko
  2014-11-03 17:30 ` Vladimir Davydov
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2014-11-03 17:17 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

Documentation/cgroups/memory.txt is outdated even more hopelessly than
before. It deserves a complete rewrite, but I guess something like the
following should be added in the meantime to warn potential readers
about the trap.
---
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 67613ff0270c..46b2b5080317 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,5 +1,10 @@
 Memory Resource Controller
 
+NOTE: This document is hopelessly outdated and needs a complete
+      rewrite. It still contains useful information, so we are keeping it
+      here, but make sure to check the current code if you need a deeper
+      understanding.
+
 NOTE: The Memory Resource Controller has generically been referred to as the
       memory controller in this document. Do not confuse memory controller
       used here with the memory controller that is used in hardware.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
                   ` (5 preceding siblings ...)
  2014-11-03 17:17 ` Michal Hocko
@ 2014-11-03 17:30 ` Vladimir Davydov
  2014-11-03 21:06 ` Kirill A. Shutemov
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 38+ messages in thread
From: Vladimir Davydov @ 2014-11-03 17:30 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Tejun Heo, David Miller, linux-mm,
	cgroups, linux-kernel

On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
> 
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged.  The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.  With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space.  Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
> 
>    text    data     bss     dec     hex     filename
> 8828345 1725264  983040 11536649 b00909  vmlinux.old
> 8827425 1725264  966656 11519345 afc571  vmlinux.new
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Vladimir Davydov <vdavydov@parallels.com>

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c
  2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
  2014-11-03  4:22   ` David Miller
  2014-11-03 16:57   ` Michal Hocko
@ 2014-11-03 17:30   ` Vladimir Davydov
  2014-11-04  8:37   ` Kamezawa Hiroyuki
  3 siblings, 0 replies; 38+ messages in thread
From: Vladimir Davydov @ 2014-11-03 17:30 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Tejun Heo, David Miller, linux-mm,
	cgroups, linux-kernel

On Sat, Nov 01, 2014 at 11:15:55PM -0400, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
> 
> Rename it and move the conditional compilation into mm/Makefile.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Vladimir Davydov <vdavydov@parallels.com>

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code
  2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
  2014-11-03  4:23   ` David Miller
  2014-11-03 16:58   ` Michal Hocko
@ 2014-11-03 17:32   ` Vladimir Davydov
  2014-11-03 18:27   ` [patch] mm: move page->mem_cgroup bad page handling into generic code fix Johannes Weiner
  2014-11-04  8:39   ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Kamezawa Hiroyuki
  4 siblings, 0 replies; 38+ messages in thread
From: Vladimir Davydov @ 2014-11-03 17:32 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Tejun Heo, David Miller, linux-mm,
	cgroups, linux-kernel

On Sat, Nov 01, 2014 at 11:15:56PM -0400, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Vladimir Davydov <vdavydov@parallels.com>

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [patch] mm: move page->mem_cgroup bad page handling into generic code fix
  2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
                     ` (2 preceding siblings ...)
  2014-11-03 17:32   ` Vladimir Davydov
@ 2014-11-03 18:27   ` Johannes Weiner
  2014-11-03 20:10     ` David Miller
  2014-11-04  8:39   ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Kamezawa Hiroyuki
  4 siblings, 1 reply; 38+ messages in thread
From: Johannes Weiner @ 2014-11-03 18:27 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

Remove unneeded !CONFIG_MEMCG memcg bad_page() dummies as well.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e789551d4db0..41ae72b02b55 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -342,19 +342,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 }
 #endif /* CONFIG_MEMCG */
 
-#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
-static inline bool
-mem_cgroup_bad_page_check(struct page *page)
-{
-	return false;
-}
-
-static inline void
-mem_cgroup_print_bad_page(struct page *page)
-{
-}
-#endif
-
 enum {
 	UNDER_LIMIT,
 	SOFT_LIMIT,
-- 
2.1.3

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
  2014-11-03 18:27   ` [patch] mm: move page->mem_cgroup bad page handling into generic code fix Johannes Weiner
@ 2014-11-03 20:10     ` David Miller
  0 siblings, 0 replies; 38+ messages in thread
From: David Miller @ 2014-11-03 20:10 UTC (permalink / raw)
  To: hannes; +Cc: akpm, mhocko, vdavydov, tj, linux-mm, cgroups, linux-kernel

From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 3 Nov 2014 13:27:40 -0500

> Remove unneeded !CONFIG_MEMCG memcg bad_page() dummies as well.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
                   ` (6 preceding siblings ...)
  2014-11-03 17:30 ` Vladimir Davydov
@ 2014-11-03 21:06 ` Kirill A. Shutemov
  2014-11-03 21:36   ` Johannes Weiner
  2014-11-04  8:36 ` Kamezawa Hiroyuki
  2014-11-06 18:55 ` Konstantin Khlebnikov
  9 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2014-11-03 21:06 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
> 
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged.  The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.

How much do you win by the change?

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 21:06 ` Kirill A. Shutemov
@ 2014-11-03 21:36   ` Johannes Weiner
  2014-11-03 21:52     ` Kirill A. Shutemov
  0 siblings, 1 reply; 38+ messages in thread
From: Johannes Weiner @ 2014-11-03 21:36 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
> On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > Memory cgroups used to have 5 per-page pointers.  To allow users to
> > disable that amount of overhead during runtime, those pointers were
> > allocated in a separate array, with a translation layer between them
> > and struct page.
> > 
> > There is now only one page pointer remaining: the memcg pointer, that
> > indicates which cgroup the page is associated with when charged.  The
> > complexity of runtime allocation and the runtime translation overhead
> > is no longer justified to save that *potential* 0.19% of memory.
> 
> How much do you win by the change?

Heh, that would have followed right after where you cut the quote:
with CONFIG_SLUB, that pointer actually sits in already existing
struct page padding, which means that I'm saving one pointer per page
(8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
padding in each memory section.  I also save the (minor) translation
overhead going from page to page_cgroup and the maintenance burden
that stems from having these auxiliary arrays (see deleted code).
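
A minimal userspace sketch of that arithmetic (not part of the patch; it
just assumes 8-byte pointers and 4K pages for 64-bit, and 4-byte pointers
for 32-bit):

#include <stdio.h>

static void overhead(const char *label, double ptr_bytes, double page_bytes)
{
	/* one memcg pointer per page, expressed as a share of total memory */
	printf("%s: %.3f%% of memory\n", label, 100.0 * ptr_bytes / page_bytes);
}

int main(void)
{
	overhead("64-bit, 8B pointer / 4K page", 8.0, 4096.0);	/* ~0.195% */
	overhead("32-bit, 4B pointer / 4K page", 4.0, 4096.0);	/* ~0.098% */
	return 0;
}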

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 21:36   ` Johannes Weiner
@ 2014-11-03 21:52     ` Kirill A. Shutemov
  2014-11-03 21:58       ` David Miller
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2014-11-03 21:52 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Mon, Nov 03, 2014 at 04:36:28PM -0500, Johannes Weiner wrote:
> On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > > Memory cgroups used to have 5 per-page pointers.  To allow users to
> > > disable that amount of overhead during runtime, those pointers were
> > > allocated in a separate array, with a translation layer between them
> > > and struct page.
> > > 
> > > There is now only one page pointer remaining: the memcg pointer, that
> > > indicates which cgroup the page is associated with when charged.  The
> > > complexity of runtime allocation and the runtime translation overhead
> > > is no longer justified to save that *potential* 0.19% of memory.
> > 
> > How much do you win by the change?
> 
> Heh, that would have followed right after where you cut the quote:
> with CONFIG_SLUB, that pointer actually sits in already existing
> struct page padding, which means that I'm saving one pointer per page
> (8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
> padding in each memory section.  I also save the (minor) translation
> overhead going from page to page_cgroup and the maintenance burden
> that stems from having these auxiliary arrays (see deleted code).

I read the description. I want to know if runtime win (any benchmark data?)
from moving mem_cgroup back to the struct page is measurable.

If the win is not significant, I would prefer to not occupy the padding:
I'm sure we will be able to find a better use for the space in struct page
in the future.

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 21:52     ` Kirill A. Shutemov
@ 2014-11-03 21:58       ` David Miller
  2014-11-03 22:36         ` Johannes Weiner
  0 siblings, 1 reply; 38+ messages in thread
From: David Miller @ 2014-11-03 21:58 UTC (permalink / raw)
  To: kirill
  Cc: hannes, akpm, mhocko, vdavydov, tj, linux-mm, cgroups, linux-kernel

From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Mon, 3 Nov 2014 23:52:06 +0200

> On Mon, Nov 03, 2014 at 04:36:28PM -0500, Johannes Weiner wrote:
>> On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
>> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
>> > > Memory cgroups used to have 5 per-page pointers.  To allow users to
>> > > disable that amount of overhead during runtime, those pointers were
>> > > allocated in a separate array, with a translation layer between them
>> > > and struct page.
>> > > 
>> > > There is now only one page pointer remaining: the memcg pointer, that
>> > > indicates which cgroup the page is associated with when charged.  The
>> > > complexity of runtime allocation and the runtime translation overhead
>> > > is no longer justified to save that *potential* 0.19% of memory.
>> > 
>> > How much do you win by the change?
>> 
>> Heh, that would have followed right after where you cut the quote:
>> with CONFIG_SLUB, that pointer actually sits in already existing
>> struct page padding, which means that I'm saving one pointer per page
>> (8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
>> padding in each memory section.  I also save the (minor) translation
>> overhead going from page to page_cgroup and the maintenance burden
>> that stems from having these auxiliary arrays (see deleted code).
> 
> I read the description. I want to know if runtime win (any benchmark data?)
> from moving mem_cgroup back to the struct page is measurable.
> 
> If the win is not significant, I would prefer to not occupy the padding:
> I'm sure we will be able to find a better use for the space in struct page
> in the future.

I think the simplification benefits completely trump any performance
metric.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 21:58       ` David Miller
@ 2014-11-03 22:36         ` Johannes Weiner
  2014-11-04 13:06           ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Johannes Weiner @ 2014-11-03 22:36 UTC (permalink / raw)
  To: David Miller
  Cc: kirill, akpm, mhocko, vdavydov, tj, linux-mm, cgroups, linux-kernel

On Mon, Nov 03, 2014 at 04:58:07PM -0500, David Miller wrote:
> From: "Kirill A. Shutemov" <kirill@shutemov.name>
> Date: Mon, 3 Nov 2014 23:52:06 +0200
> 
> > On Mon, Nov 03, 2014 at 04:36:28PM -0500, Johannes Weiner wrote:
> >> On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
> >> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> >> > > Memory cgroups used to have 5 per-page pointers.  To allow users to
> >> > > disable that amount of overhead during runtime, those pointers were
> >> > > allocated in a separate array, with a translation layer between them
> >> > > and struct page.
> >> > > 
> >> > > There is now only one page pointer remaining: the memcg pointer, that
> >> > > indicates which cgroup the page is associated with when charged.  The
> >> > > complexity of runtime allocation and the runtime translation overhead
> >> > > is no longer justified to save that *potential* 0.19% of memory.
> >> > 
> >> > How much do you win by the change?
> >> 
> >> Heh, that would have followed right after where you cut the quote:
> >> with CONFIG_SLUB, that pointer actually sits in already existing
> >> struct page padding, which means that I'm saving one pointer per page
> >> (8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
> >> padding in each memory section.  I also save the (minor) translation
> >> overhead going from page to page_cgroup and the maintenance burden
> >> that stems from having these auxiliary arrays (see deleted code).
> > 
> > I read the description. I want to know if runtime win (any benchmark data?)
> > from moving mem_cgroup back to the struct page is measurable.
> > 
> > If the win is not significant, I would prefer to not occupy the padding:
> > I'm sure we will be able to find a better use for the space in struct page
> > in the future.
> 
> I think the simplification benefits completely trump any performance
> metric.

I agree.

Also, nobody is using that space currently, and I can save memory by
moving the pointer in there.  Should we later add another pointer to
struct page we are only back to the status quo - with the difference
that booting with cgroup_disable=memory will no longer save the extra
pointer per page, but again, if you care that much, you can disable
memory cgroups at compile-time.

So don't look at it as occupying the padding; rather, it is taking away
the ability to allocate that single memcg pointer at runtime, while at
the same time saving a bit of memory for common configurations until
somebody else needs the struct page padding.
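
For readers following the padding argument, a rough sketch of the layout
idea; the field list is abbreviated and the placement is illustrative,
not the literal mm_types.h hunk:

/*
 * Rough sketch, not the literal hunk.  SLUB configurations set
 * CONFIG_HAVE_ALIGNED_STRUCT_PAGE (on architectures that support it) so
 * that cmpxchg_double() can operate on struct page; the resulting
 * two-word alignment already leaves a word of padding on 64-bit, and
 * the new pointer slots into it.
 */
struct page {
	unsigned long flags;
	/* ... mapping, index, counters, lru, private, ... */
#ifdef CONFIG_MEMCG
	struct mem_cgroup *mem_cgroup;	/* fits into the existing padding
					 * on 64-bit SLUB configurations */
#endif
}
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
	__aligned(2 * sizeof(unsigned long))
#endif
;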

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 15:09   ` Johannes Weiner
  2014-11-03 16:42     ` David Miller
  2014-11-03 17:02     ` Michal Hocko
@ 2014-11-04  0:40     ` Joonsoo Kim
  2 siblings, 0 replies; 38+ messages in thread
From: Joonsoo Kim @ 2014-11-04  0:40 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Mon, Nov 03, 2014 at 10:09:42AM -0500, Johannes Weiner wrote:
> Hi Joonsoo,
> 
> On Mon, Nov 03, 2014 at 05:02:08PM +0900, Joonsoo Kim wrote:
> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > > Memory cgroups used to have 5 per-page pointers.  To allow users to
> > > disable that amount of overhead during runtime, those pointers were
> > > allocated in a separate array, with a translation layer between them
> > > and struct page.
> > 
> > Hello, Johannes.
> > 
> > I'd like to keep this translation layer.
> > Could you just disable that code with #ifdef until the next user comes?
> > 
> > In our company, we use PAGE_OWNER from the mm tree, which is the feature
> > that records who allocated each page. Using PAGE_OWNER requires modifying
> > struct page and then recompiling, and this recompile makes it difficult
> > for us to use the feature. So we decided to implement a run-time
> > configurable PAGE_OWNER through page_cgroup's translation layer code.
> > Moreover, with this infrastructure, I plan to implement some other
> > debugging features.
> > 
> > Because of my laziness, it hasn't been submitted to LKML yet, but I will
> > submit it as soon as possible. If the code is removed, I would
> > copy-and-paste it, but that would lose the history of that code. So if
> > possible, I'd like to keep that code for now.
> 
> Please re-introduce this code when your new usecase is ready to be
> upstreamed.  There is little reason to burden an unrelated feature
> with a sizable chunk of dead code for a vague future user.

Okay.

Thanks.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
                   ` (7 preceding siblings ...)
  2014-11-03 21:06 ` Kirill A. Shutemov
@ 2014-11-04  8:36 ` Kamezawa Hiroyuki
  2014-11-04 13:27   ` Johannes Weiner
  2014-11-06 18:55 ` Konstantin Khlebnikov
  9 siblings, 1 reply; 38+ messages in thread
From: Kamezawa Hiroyuki @ 2014-11-04  8:36 UTC (permalink / raw)
  To: Johannes Weiner, Andrew Morton
  Cc: Michal Hocko, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

(2014/11/02 12:15), Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
> 
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged.  The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.  With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space.  Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
> 
>     text    data     bss     dec     hex     filename
> 8828345 1725264  983040 11536649 b00909  vmlinux.old
> 8827425 1725264  966656 11519345 afc571  vmlinux.new
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>   include/linux/memcontrol.h  |   6 +-
>   include/linux/mm_types.h    |   5 +
>   include/linux/mmzone.h      |  12 --
>   include/linux/page_cgroup.h |  53 --------
>   init/main.c                 |   7 -
>   mm/memcontrol.c             | 124 +++++------------
>   mm/page_alloc.c             |   2 -
>   mm/page_cgroup.c            | 319 --------------------------------------------
>   8 files changed, 41 insertions(+), 487 deletions(-)
> 

Great! 
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

BTW, shouldn't the init/Kconfig comments be updated?
(I'm sorry if they have already been updated since your latest fix.)





^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c
  2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
                     ` (2 preceding siblings ...)
  2014-11-03 17:30   ` Vladimir Davydov
@ 2014-11-04  8:37   ` Kamezawa Hiroyuki
  3 siblings, 0 replies; 38+ messages in thread
From: Kamezawa Hiroyuki @ 2014-11-04  8:37 UTC (permalink / raw)
  To: Johannes Weiner, Andrew Morton
  Cc: Michal Hocko, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

(2014/11/02 12:15), Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
> 
> Rename it and move the conditional compilation into mm/Makefile.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code
  2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
                     ` (3 preceding siblings ...)
  2014-11-03 18:27   ` [patch] mm: move page->mem_cgroup bad page handling into generic code fix Johannes Weiner
@ 2014-11-04  8:39   ` Kamezawa Hiroyuki
  4 siblings, 0 replies; 38+ messages in thread
From: Kamezawa Hiroyuki @ 2014-11-04  8:39 UTC (permalink / raw)
  To: Johannes Weiner, Andrew Morton
  Cc: Michal Hocko, Vladimir Davydov, Tejun Heo, David Miller,
	linux-mm, cgroups, linux-kernel

(2014/11/02 12:15), Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-03 22:36         ` Johannes Weiner
@ 2014-11-04 13:06           ` Michal Hocko
  2014-11-04 13:48             ` Johannes Weiner
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2014-11-04 13:06 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: David Miller, kirill, akpm, vdavydov, tj, linux-mm, cgroups,
	linux-kernel

On Mon 03-11-14 17:36:26, Johannes Weiner wrote:
[...]
> Also, nobody is using that space currently, and I can save memory by
> moving the pointer in there.  Should we later add another pointer to
> struct page we are only back to the status quo - with the difference
> that booting with cgroup_disable=memory will no longer save the extra
> pointer per page, but again, if you care that much, you can disable
> memory cgroups at compile-time.

There would be a slight inconvenience for 32b machines with distribution
kernels which cannot simply drop CONFIG_MEMCG from the config.
Especially those 32b machines with a lot of memory.

I have checked the configuration used for the OpenSUSE PAE kernel. Both
struct page and the code size grow. struct page gains an additional 4B
with SLAB, and 8B with SLUB because of the alignment of struct page; as
the separate page_cgroup array used to cost one 4B pointer per page, the
net overhead is 4B per page with SLUB.
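
A minimal sketch of that alignment effect; the 32-byte base size is only
an assumption for illustration, since the real struct page size depends
on the config:

#include <stdio.h>

/* round a size up to the double-word alignment that SLUB
 * (CONFIG_HAVE_ALIGNED_STRUCT_PAGE) imposes on struct page */
static unsigned long align_up(unsigned long size, unsigned long align)
{
	return (size + align - 1) / align * align;
}

int main(void)
{
	unsigned long word = 4;			/* sizeof(long) on 32-bit */
	unsigned long base = 32;		/* assumed struct page size */
	unsigned long grown = base + word;	/* + struct mem_cgroup pointer */

	/* SLAB: no extra alignment requirement, grows by the bare 4 bytes */
	printf("SLAB: %lu -> %lu bytes\n", base, grown);
	/* SLUB: aligned to 2 words, so 36 rounds up to 40 (8 bytes extra) */
	printf("SLUB: %lu -> %lu bytes\n", base, align_up(grown, 2 * word));
	return 0;
}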

This doesn't sound too bad to me considering that 64b actually even
saves some space with SLUB and it is at the same level with SLAB and
more importantly gets rid of the lookup in hot paths.

The code size grows (~1.5k) most probably due to struct page pointer
arithmetic (but I haven't checked that) but the data section shrinks
for SLAB. So we have additional 1.6k for SLUB. I guess this is
acceptable.

   text    data     bss     dec     hex filename
8427489  887684 3186688 12501861         bec365 mmotm/vmlinux.slab
8429060  883588 3186688 12499336         beb988 page_cgroup/vmlinux.slab

8438894  883428 3186688 12509010         bedf52 mmotm/vmlinux.slub
8440529  883428 3186688 12510645         bee5b5 page_cgroup/vmlinux.slub

So to me it sounds like the savings for 64b are worth minor inconvenience
for 32b which is clearly on decline and I would definitely not encourage
people to use PAE kernels with a lot of memory where the difference
might matter. For the most x86 32b deployments (laptops with 4G) the
difference shouldn't be noticeable. I am not familiar with other archs
so the situation might be different there.

If this would be a problem for some reason, though, we can reintroduce
the external page descriptor and translation layer conditionally
depending on the arch. It seems there will be some users of the external
descriptors anyway so a struct page_external can hold memcg pointer as
well.

This should probably go into the changelog, I guess.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04  8:36 ` Kamezawa Hiroyuki
@ 2014-11-04 13:27   ` Johannes Weiner
  2014-11-04 13:41     ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Johannes Weiner @ 2014-11-04 13:27 UTC (permalink / raw)
  To: Kamezawa Hiroyuki
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Tue, Nov 04, 2014 at 05:36:39PM +0900, Kamezawa Hiroyuki wrote:
> (2014/11/02 12:15), Johannes Weiner wrote:
> > Memory cgroups used to have 5 per-page pointers.  To allow users to
> > disable that amount of overhead during runtime, those pointers were
> > allocated in a separate array, with a translation layer between them
> > and struct page.
> > 
> > There is now only one page pointer remaining: the memcg pointer, that
> > indicates which cgroup the page is associated with when charged.  The
> > complexity of runtime allocation and the runtime translation overhead
> > is no longer justified to save that *potential* 0.19% of memory.  With
> > CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> > after the page->private member and doesn't even increase struct page,
> > and then this patch actually saves space.  Remaining users that care
> > can still compile their kernels without CONFIG_MEMCG.
> > 
> >     text    data     bss     dec     hex     filename
> > 8828345 1725264  983040 11536649 b00909  vmlinux.old
> > 8827425 1725264  966656 11519345 afc571  vmlinux.new
> > 
> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > ---
> >   include/linux/memcontrol.h  |   6 +-
> >   include/linux/mm_types.h    |   5 +
> >   include/linux/mmzone.h      |  12 --
> >   include/linux/page_cgroup.h |  53 --------
> >   init/main.c                 |   7 -
> >   mm/memcontrol.c             | 124 +++++------------
> >   mm/page_alloc.c             |   2 -
> >   mm/page_cgroup.c            | 319 --------------------------------------------
> >   8 files changed, 41 insertions(+), 487 deletions(-)
> > 
> 
> Great! 
> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Thank you!

> BTW, init/Kconfig comments shouldn't be updated ?
> (I'm sorry if it has been updated since your latest fix.)

Good point.  How about this?

---
From: Johannes Weiner <hannes@cmpxchg.org>
Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix

Remove obsolete memory saving recommendations from the MEMCG Kconfig
help text.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 init/Kconfig | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 01b7f2a6abf7..d68d8b0780b3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -983,18 +983,6 @@ config MEMCG
 	  Provides a memory resource controller that manages both anonymous
 	  memory and page cache. (See Documentation/cgroups/memory.txt)
 
-	  Note that setting this option increases fixed memory overhead
-	  associated with each page of memory in the system. By this,
-	  8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
-	  usage tracking struct at boot. Total amount of this is printed out
-	  at boot.
-
-	  Only enable when you're ok with these trade offs and really
-	  sure you need the memory resource controller. Even when you enable
-	  this, you can set "cgroup_disable=memory" at your boot option to
-	  disable memory resource controller and you can avoid overheads.
-	  (and lose benefits of memory resource controller)
-
 config MEMCG_SWAP
 	bool "Memory Resource Controller Swap Extension"
 	depends on MEMCG && SWAP
-- 
2.1.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04 13:27   ` Johannes Weiner
@ 2014-11-04 13:41     ` Michal Hocko
  2014-11-04 14:09       ` Johannes Weiner
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2014-11-04 13:41 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Kamezawa Hiroyuki, Andrew Morton, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> From: Johannes Weiner <hannes@cmpxchg.org>
> Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
> 
> Remove obsolete memory saving recommendations from the MEMCG Kconfig
> help text.

The memory overhead is still there. So I do not think it is good to
remove the message altogether. The current overhead might be 4 or 8B
depending on the configuration. What about
"
	Note that setting this option might increase the fixed memory
	overhead associated with each page descriptor in the system.
	The memory overhead depends on the architecture and other
	configuration options which influence the size and alignment
	of the page descriptor (struct page). Namely, CONFIG_SLUB
	requires struct page to be aligned to two words, which in turn
	means that 64b systems might not see any memory overhead as
	the additional data fits into the alignment padding. On the
	other hand, 32b systems might see 8B of memory overhead.
"

> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  init/Kconfig | 12 ------------
>  1 file changed, 12 deletions(-)
> 
> diff --git a/init/Kconfig b/init/Kconfig
> index 01b7f2a6abf7..d68d8b0780b3 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -983,18 +983,6 @@ config MEMCG
>  	  Provides a memory resource controller that manages both anonymous
>  	  memory and page cache. (See Documentation/cgroups/memory.txt)
>  
> -	  Note that setting this option increases fixed memory overhead
> -	  associated with each page of memory in the system. By this,
> -	  8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
> -	  usage tracking struct at boot. Total amount of this is printed out
> -	  at boot.
> -
> -	  Only enable when you're ok with these trade offs and really
> -	  sure you need the memory resource controller. Even when you enable
> -	  this, you can set "cgroup_disable=memory" at your boot option to
> -	  disable memory resource controller and you can avoid overheads.
> -	  (and lose benefits of memory resource controller)
> -
>  config MEMCG_SWAP
>  	bool "Memory Resource Controller Swap Extension"
>  	depends on MEMCG && SWAP
> -- 
> 2.1.3
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04 13:06           ` Michal Hocko
@ 2014-11-04 13:48             ` Johannes Weiner
  2014-11-04 14:50               ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Johannes Weiner @ 2014-11-04 13:48 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Miller, kirill, akpm, vdavydov, tj, linux-mm, cgroups,
	linux-kernel

On Tue, Nov 04, 2014 at 02:06:52PM +0100, Michal Hocko wrote:
> The code size grows (~1.5k) most probably due to struct page pointer
> arithmetic (but I haven't checked that) but the data section shrinks
> for SLAB. So we have additional 1.6k for SLUB. I guess this is
> acceptable.
> 
>    text    data     bss     dec     hex filename
> 8427489  887684 3186688 12501861         bec365 mmotm/vmlinux.slab
> 8429060  883588 3186688 12499336         beb988 page_cgroup/vmlinux.slab
> 
> 8438894  883428 3186688 12509010         bedf52 mmotm/vmlinux.slub
> 8440529  883428 3186688 12510645         bee5b5 page_cgroup/vmlinux.slub

That's unexpected.  It's not much, but how could the object size grow
at all when that much code is removed and we replace the lookups with
simple struct member accesses?  Are you positive these are the right
object files, in the right order?

> So to me it sounds like the savings for 64b are worth minor inconvenience
> for 32b which is clearly on decline and I would definitely not encourage
> people to use PAE kernels with a lot of memory where the difference
> might matter. For the most x86 32b deployments (laptops with 4G) the
> difference shouldn't be noticeable. I am not familiar with other archs
> so the situation might be different there.

On 32 bit, the overhead is 0.098% of memory, so 4MB on a 4G machine.
This should be acceptable, even for the three people that run on the
cutting edge of 3.18-based PAE distribution kernels. :-)

> This should probably go into the changelog, I guess.

Which part?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04 13:41     ` Michal Hocko
@ 2014-11-04 14:09       ` Johannes Weiner
  2014-11-04 15:00         ` Michal Hocko
  2014-11-04 16:34         ` David Miller
  0 siblings, 2 replies; 38+ messages in thread
From: Johannes Weiner @ 2014-11-04 14:09 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Kamezawa Hiroyuki, Andrew Morton, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Tue, Nov 04, 2014 at 02:41:10PM +0100, Michal Hocko wrote:
> On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> > From: Johannes Weiner <hannes@cmpxchg.org>
> > Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
> > 
> > Remove obsolete memory saving recommendations from the MEMCG Kconfig
> > help text.
> 
> The memory overhead is still there. So I do not think it is good to
> remove the message altogether. The current overhead might be 4 or 8B
> depending on the configuration. What about
> "
> 	Note that setting this option might increase fixed memory
> 	overhead associated with each page descriptor in the system.
> 	The memory overhead depends on the architecture and other
> 	configuration options which have influence on the size and
> 	alignment on the page descriptor (struct page). Namely
> 	CONFIG_SLUB has a requirement for page alignment to two words
> 	which in turn means that 64b systems might not see any memory
> 	overhead as the additional data fits into alignment. On the
> 	other hand 32b systems might see 8B memory overhead.
> "

What difference does it make whether this feature maybe costs an extra
pointer per page or not?  These texts are supposed to help decide with
the selection, but this is not a "good to have, if affordable" type of
runtime debugging option.  You either need cgroup memory accounting
and limiting or not.  There is no possible trade-off to be had.

Slub and numa balancing don't mention this, either, simply because
this cost is negligible or irrelevant when it comes to these knobs.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04 13:48             ` Johannes Weiner
@ 2014-11-04 14:50               ` Michal Hocko
  0 siblings, 0 replies; 38+ messages in thread
From: Michal Hocko @ 2014-11-04 14:50 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: David Miller, kirill, akpm, vdavydov, tj, linux-mm, cgroups,
	linux-kernel

On Tue 04-11-14 08:48:41, Johannes Weiner wrote:
> On Tue, Nov 04, 2014 at 02:06:52PM +0100, Michal Hocko wrote:
> > The code size grows (~1.5k) most probably due to struct page pointer
> > arithmetic (but I haven't checked that) but the data section shrinks
> > for SLAB. So we have additional 1.6k for SLUB. I guess this is
> > acceptable.
> > 
> >    text    data     bss     dec     hex filename
> > 8427489  887684 3186688 12501861         bec365 mmotm/vmlinux.slab
> > 8429060  883588 3186688 12499336         beb988 page_cgroup/vmlinux.slab
> > 
> > 8438894  883428 3186688 12509010         bedf52 mmotm/vmlinux.slub
> > 8440529  883428 3186688 12510645         bee5b5 page_cgroup/vmlinux.slub
> 
> That's unexpected.  It's not much, but how could the object size grow
> at all when that much code is removed and we replace the lookups with
> simple struct member accesses?  Are you positive these are the right
> object files, in the right order?

Double checked (the base is [1] and page_cgroup refers to these 3
patches). Please note that this is a distribution config (OpenSUSE
13.2), so it enables a lot of things. And I would really expect that 36B
resp. 40B pointer arithmetic takes more instructions than 32B, and this
piles up when it is used all over the place.

memcontrol.o shrinks 0.2k
$ size {mmotm,page_cgroup}/mm/memcontrol.o
   text    data     bss     dec     hex filename
  25337    3095       2   28434    6f12 mmotm/mm/memcontrol.o
  25123    3095       2   28220    6e3c page_cgroup/mm/memcontrol.o

and page_cgroup.o saves 0.5k
$ size mmotm/mm/page_cgroup.o page_cgroup/mm/swap_cgroup.o 
   text    data     bss     dec     hex filename
   1419      24     352    1795     703 mmotm/mm/page_cgroup.o
    849      24     348    1221     4c5 page_cgroup/mm/swap_cgroup.o

But built-in.o files grow or keep the same size (this is with
CONFIG_SLAB and gcc 4.8.2)
$ size {mmotm,page_cgroup}/*/built-in.o | sort -k1 -n | awk '!/text/{new = (i++ % 2); if (!new) {val = $1; last_line=$0} else if ($1-val != 0) {diff = $1 - val; printf("%s\n%s diff %d\n", last_line, $0, diff); sum+=diff}}END{printf("Sum diff %d\n", sum)}'
  14481   19586      81   34148    8564 mmotm/init/built-in.o
  14483   19586      81   34150    8566 page_cgroup/init/built-in.o diff 2
  68679    2082      12   70773   11475 mmotm/crypto/built-in.o
  68711    2082      12   70805   11495 page_cgroup/crypto/built-in.o diff 32
 131583   26496    2376  160455   272c7 mmotm/lib/built-in.o
 131631   26496    2376  160503   272f7 page_cgroup/lib/built-in.o diff 48
 229809   12346    1548  243703   3b7f7 mmotm/block/built-in.o
 229937   12346    1548  243831   3b877 page_cgroup/block/built-in.o diff 128
 308015   20442   16280  344737   542a1 mmotm/security/built-in.o
 308031   20442   16280  344753   542b1 page_cgroup/security/built-in.o diff 16
 507979   47110   27236  582325   8e2b5 mmotm/mm/built-in.o
 508540   47110   27236  582886   8e4e6 page_cgroup/mm/built-in.o diff 561
1033752   77064   13212 1124028  1126bc mmotm/fs/built-in.o
1033784   77064   13212 1124060  1126dc page_cgroup/fs/built-in.o diff 32
1099218   51979   33512 1184709  1213c5 mmotm/net/built-in.o
1099282   51979   33512 1184773  121405 page_cgroup/net/built-in.o diff 64
1180475  127020  705068 2012563  1eb593 mmotm/kernel/built-in.o
1180683  127020  705068 2012771  1eb663 page_cgroup/kernel/built-in.o diff 208
2193400  152698   34856 2380954  24549a mmotm/drivers/built-in.o
2193528  152698   34856 2381082  24551a page_cgroup/drivers/built-in.o diff 128
Sum diff 1219

This is not a complete list, but the mm part eats only 0.5k; the rest is
small but it adds up.

> > So to me it sounds like the savings for 64b are worth minor inconvenience
> > for 32b which is clearly on decline and I would definitely not encourage
> > people to use PAE kernels with a lot of memory where the difference
> > might matter. For the most x86 32b deployments (laptops with 4G) the
> > difference shouldn't be noticeable. I am not familiar with other archs
> > so the situation might be different there.
> 
> On 32 bit, the overhead is 0.098% of memory, so 4MB on a 4G machine.
> This should be acceptable, even for the three people that run on the
> cutting edge of 3.18-based PAE distribution kernels. :-)
> 
> > This should probably go into the changelog, I guess.
> 
> Which part?

About potential increased memory footprint on 32b systems (aka don't
sell it as a full win ;))

---
[1] https://git.kernel.org/cgit/linux/kernel/git/mhocko/mm.git/tag/?h=since-3.17&id=mmotm-2014-10-29-14-19
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04 14:09       ` Johannes Weiner
@ 2014-11-04 15:00         ` Michal Hocko
  2014-11-04 17:46           ` Johannes Weiner
  2014-11-04 16:34         ` David Miller
  1 sibling, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2014-11-04 15:00 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Kamezawa Hiroyuki, Andrew Morton, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Tue 04-11-14 09:09:37, Johannes Weiner wrote:
> On Tue, Nov 04, 2014 at 02:41:10PM +0100, Michal Hocko wrote:
> > On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> > > From: Johannes Weiner <hannes@cmpxchg.org>
> > > Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
> > > 
> > > Remove obsolete memory saving recommendations from the MEMCG Kconfig
> > > help text.
> > 
> > The memory overhead is still there. So I do not think it is good to
> > remove the message altogether. The current overhead might be 4 or 8B
> > depending on the configuration. What about
> > "
> > 	Note that setting this option might increase fixed memory
> > 	overhead associated with each page descriptor in the system.
> > 	The memory overhead depends on the architecture and other
> > 	configuration options which have influence on the size and
> > 	alignment on the page descriptor (struct page). Namely
> > 	CONFIG_SLUB has a requirement for page alignment to two words
> > 	which in turn means that 64b systems might not see any memory
> > 	overhead as the additional data fits into alignment. On the
> > 	other hand 32b systems might see 8B memory overhead.
> > "
> 
> What difference does it make whether this feature maybe costs an extra
> pointer per page or not?  These texts are supposed to help decide with
> the selection, but this is not a "good to have, if affordable" type of
> runtime debugging option.  You either need cgroup memory accounting
> and limiting or not.  There is no possible trade-off to be had.

If you are compiling the kernel for your specific usecase then it
is clear: you enable only what you really need/want. But if you are
providing a pre-built kernel and considering which features to enable,
then information about the overhead might be useful. You can simply
disable the feature for memory-restricted kernel flavors.

> Slub and numa balancing don't mention this, either, simply because
> this cost is negligible or irrelevant when it comes to these knobs.

I agree that the overhead seems negligible but does it hurt us to
mention it though?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04 14:09       ` Johannes Weiner
  2014-11-04 15:00         ` Michal Hocko
@ 2014-11-04 16:34         ` David Miller
  1 sibling, 0 replies; 38+ messages in thread
From: David Miller @ 2014-11-04 16:34 UTC (permalink / raw)
  To: hannes
  Cc: mhocko, kamezawa.hiroyu, akpm, vdavydov, tj, linux-mm, cgroups,
	linux-kernel

From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 4 Nov 2014 09:09:37 -0500

> You either need cgroup memory accounting and limiting or not.  There
> is no possible trade-off to be had.

I couldn't have said it better myself, +1.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-04 15:00         ` Michal Hocko
@ 2014-11-04 17:46           ` Johannes Weiner
  0 siblings, 0 replies; 38+ messages in thread
From: Johannes Weiner @ 2014-11-04 17:46 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Kamezawa Hiroyuki, Andrew Morton, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, linux-kernel

On Tue, Nov 04, 2014 at 04:00:39PM +0100, Michal Hocko wrote:
> On Tue 04-11-14 09:09:37, Johannes Weiner wrote:
> > On Tue, Nov 04, 2014 at 02:41:10PM +0100, Michal Hocko wrote:
> > > On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> > > > From: Johannes Weiner <hannes@cmpxchg.org>
> > > > Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
> > > > 
> > > > Remove obsolete memory saving recommendations from the MEMCG Kconfig
> > > > help text.
> > > 
> > > The memory overhead is still there. So I do not think it is good to
> > > remove the message altogether. The current overhead might be 4 or 8B
> > > depending on the configuration. What about
> > > "
> > > 	Note that setting this option might increase fixed memory
> > > 	overhead associated with each page descriptor in the system.
> > > 	The memory overhead depends on the architecture and other
> > > 	configuration options which have influence on the size and
> > > 	alignment on the page descriptor (struct page). Namely
> > > 	CONFIG_SLUB has a requirement for page alignment to two words
> > > 	which in turn means that 64b systems might not see any memory
> > > 	overhead as the additional data fits into alignment. On the
> > > 	other hand 32b systems might see 8B memory overhead.
> > > "
> > 
> > What difference does it make whether this feature maybe costs an extra
> > pointer per page or not?  These texts are supposed to help decide with
> > the selection, but this is not a "good to have, if affordable" type of
> > runtime debugging option.  You either need cgroup memory accounting
> > and limiting or not.  There is no possible trade-off to be had.
> 
> If you are compiling the kernel for your specific usecase then it
> is clear. You enable only what you really need/want. But if you are
> providing a pre-built kernel and considering which features to enable
> then an information about overhead might be useful. You can simply
> disable the feature for memory restricted kernel flavors.
> 
> > Slub and numa balancing don't mention this, either, simply because
> > this cost is negligible or irrelevant when it comes to these knobs.
> 
> I agree that the overhead seems negligible but does it hurt us to
> mention it though?

Yes, it's fairly misleading.  What about the instructions it adds to
the fault hotpaths?  The additional memory footprint of each cgroup
created?  You're adding 9 lines to point out one specific cost aspect,
when the entire feature is otherwise summed up in two lines.  The
per-page overhead of memcg is not exceptional or unexpected if you
know what it does - which you should when you enable it, even as a
distributor - and such a gross overrepresentation in the help text is
more confusing than helpful.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [patch 1/3] mm: embed the memcg pointer directly into struct page
  2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
                   ` (8 preceding siblings ...)
  2014-11-04  8:36 ` Kamezawa Hiroyuki
@ 2014-11-06 18:55 ` Konstantin Khlebnikov
  9 siblings, 0 replies; 38+ messages in thread
From: Konstantin Khlebnikov @ 2014-11-06 18:55 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Tejun Heo,
	David Miller, linux-mm, cgroups, Linux Kernel Mailing List

On Sun, Nov 2, 2014 at 6:15 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
>
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged.  The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.  With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space.  Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
>
>    text    data     bss     dec     hex     filename
> 8828345 1725264  983040 11536649 b00909  vmlinux.old
> 8827425 1725264  966656 11519345 afc571  vmlinux.new
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Great! Never thought I'd see this. =)

Acked-by: Konstantin Khlebnikov <koct9i@gmail.com>


> ---
>  include/linux/memcontrol.h  |   6 +-
>  include/linux/mm_types.h    |   5 +
>  include/linux/mmzone.h      |  12 --
>  include/linux/page_cgroup.h |  53 --------
>  init/main.c                 |   7 -
>  mm/memcontrol.c             | 124 +++++------------
>  mm/page_alloc.c             |   2 -
>  mm/page_cgroup.c            | 319 --------------------------------------------
>  8 files changed, 41 insertions(+), 487 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d4575a1d6e99..dafba59b31b4 100644




^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2014-11-06 18:56 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-02  3:15 [patch 1/3] mm: embed the memcg pointer directly into struct page Johannes Weiner
2014-11-02  3:15 ` [patch 2/3] mm: page_cgroup: rename file to mm/swap_cgroup.c Johannes Weiner
2014-11-03  4:22   ` David Miller
2014-11-03 16:57   ` Michal Hocko
2014-11-03 17:30   ` Vladimir Davydov
2014-11-04  8:37   ` Kamezawa Hiroyuki
2014-11-02  3:15 ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Johannes Weiner
2014-11-03  4:23   ` David Miller
2014-11-03 16:58   ` Michal Hocko
2014-11-03 17:32   ` Vladimir Davydov
2014-11-03 18:27   ` [patch] mm: move page->mem_cgroup bad page handling into generic code fix Johannes Weiner
2014-11-03 20:10     ` David Miller
2014-11-04  8:39   ` [patch 3/3] mm: move page->mem_cgroup bad page handling into generic code Kamezawa Hiroyuki
2014-11-03  4:22 ` [patch 1/3] mm: embed the memcg pointer directly into struct page David Miller
2014-11-03  8:02 ` Joonsoo Kim
2014-11-03 15:09   ` Johannes Weiner
2014-11-03 16:42     ` David Miller
2014-11-03 17:02     ` Michal Hocko
2014-11-04  0:40     ` Joonsoo Kim
2014-11-03 16:51 ` Michal Hocko
2014-11-03 17:17 ` Michal Hocko
2014-11-03 17:30 ` Vladimir Davydov
2014-11-03 21:06 ` Kirill A. Shutemov
2014-11-03 21:36   ` Johannes Weiner
2014-11-03 21:52     ` Kirill A. Shutemov
2014-11-03 21:58       ` David Miller
2014-11-03 22:36         ` Johannes Weiner
2014-11-04 13:06           ` Michal Hocko
2014-11-04 13:48             ` Johannes Weiner
2014-11-04 14:50               ` Michal Hocko
2014-11-04  8:36 ` Kamezawa Hiroyuki
2014-11-04 13:27   ` Johannes Weiner
2014-11-04 13:41     ` Michal Hocko
2014-11-04 14:09       ` Johannes Weiner
2014-11-04 15:00         ` Michal Hocko
2014-11-04 17:46           ` Johannes Weiner
2014-11-04 16:34         ` David Miller
2014-11-06 18:55 ` Konstantin Khlebnikov
