linux-kernel.vger.kernel.org archive mirror
* [RFC][PATCH] Remove cgroup member from struct page
@ 2008-08-31 17:47 Balbir Singh
  2008-09-01  0:01 ` KAMEZAWA Hiroyuki
                   ` (2 more replies)
  0 siblings, 3 replies; 72+ messages in thread
From: Balbir Singh @ 2008-08-31 17:47 UTC (permalink / raw)
  To: Andrew Morton
  Cc: hugh, kamezawa.hiroyu, menage, xemul, linux-kernel, linux-mm


This is a rewrite of a patch I had written a while back to remove the page_cgroup
pointer from struct page (I shared the patches with Kamezawa, but never posted
them anywhere else). I spent the weekend cleaning them up for 2.6.27-rc5-mmotm
(29 Aug 2008).

I've tested the patches on an x86_64 box: I've run a simple test under the
memory control group, and the same test running concurrently under two
different groups (creating pressure within their groups). I've also compiled
the patch with CGROUP_MEM_RES_CTLR turned off.

Advantages of the patch

1. It removes the extra pointer in struct page

Disadvantages

1. It adds an additional lock structure to struct page_cgroup
2. Radix tree lookup is not an O(1) operation; once the page is known,
   getting to the page_cgroup (pc) is a little more expensive now (see the
   sketch below).
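
To make the cost concrete, the lookup path changes roughly like this (a
sketch distilled from the patch below; it is not additional code):

	/* before: direct pointer stored in struct page (one dereference) */
	pc = (struct page_cgroup *)(page->page_cgroup & ~PAGE_CGROUP_LOCK);

	/* after: pfn-keyed radix tree lookup under rcu_read_lock() */
	rcu_read_lock();
	pc = radix_tree_lookup(&mem_cgroup_tree, page_to_pfn(page));
	rcu_read_unlock();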

This is an initial RFC for comments

TODOs

1. Test the page migration changes
2. Test the performance impact of the patch/approach

Comments/Reviews?

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 include/linux/memcontrol.h |   23 +++++-
 include/linux/mm_types.h   |    4 -
 mm/memcontrol.c            |  165 +++++++++++++++++++++++++++++++++------------
 3 files changed, 144 insertions(+), 48 deletions(-)

diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
--- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-08-30 22:49:28.000000000 +0530
+++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-08-31 23:03:06.000000000 +0530
@@ -24,7 +24,7 @@
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
-#include <linux/bit_spinlock.h>
+#include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
@@ -40,6 +40,9 @@ struct cgroup_subsys mem_cgroup_subsys _
 static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
+static struct radix_tree_root mem_cgroup_tree;
+static spinlock_t mem_cgroup_tree_lock;
+
 /*
  * Statistics for memory cgroup.
  */
@@ -159,6 +162,7 @@ struct page_cgroup {
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
 	int flags;
+	spinlock_t lock;
 };
 #define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
 #define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
@@ -248,35 +252,94 @@ struct mem_cgroup *mem_cgroup_from_task(
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
+static int page_assign_page_cgroup(struct page *page, struct page_cgroup *pc,
+					gfp_t gfp_mask)
 {
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	int err = 0;
+	struct page_cgroup *old_pc;
+
+	if (pc) {
+		err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+		if (err) {
+			printk(KERN_WARNING "could not preload radix tree "
+				"in %s\n", __func__);
+			goto done;
+		}
+	}
+
+	spin_lock_irqsave(&mem_cgroup_tree_lock, flags);
+	old_pc = radix_tree_lookup(&mem_cgroup_tree, pfn);
+	if (pc && old_pc) {
+		err = -EEXIST;
+		goto pc_race;
+	}
+	if (pc) {
+		err = radix_tree_insert(&mem_cgroup_tree, pfn, pc);
+		if (err)
+			printk(KERN_WARNING "Inserting into radix tree failed "
+				"in %s\n", __func__);
+	} else
+		radix_tree_delete(&mem_cgroup_tree, pfn);
+pc_race:
+	spin_unlock_irqrestore(&mem_cgroup_tree_lock, flags);
+done:
+	if (pc)
+		radix_tree_preload_end();
+	return err;
 }
 
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
+struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+						bool trylock)
 {
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
+	unsigned long pfn = page_to_pfn(page);
+	struct page_cgroup *pc;
+	int ret;
+
+	BUG_ON(lock && trylock);
+
+	rcu_read_lock();
+	pc = radix_tree_lookup(&mem_cgroup_tree, pfn);
+
+	if (pc && lock)
+		spin_lock(&pc->lock);
+
+	if (pc && trylock) {
+		ret = spin_trylock(&pc->lock);
+		if (!ret)
+			pc = NULL;
+	}
+
+	rcu_read_unlock();
+
+	return pc;
 }
 
-struct page_cgroup *page_get_page_cgroup(struct page *page)
+static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+	BUG_ON(!pc);
+	spin_lock(&pc->lock);
 }
 
-static void lock_page_cgroup(struct page *page)
+static __always_inline void lock_page_cgroup_irqsave(struct page_cgroup *pc,
+							unsigned long *flags)
 {
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	BUG_ON(!pc);
+	spin_lock_irqsave(&pc->lock, *flags);
 }
 
-static int try_lock_page_cgroup(struct page *page)
+static inline void unlock_page_cgroup_irqrestore(struct page_cgroup *pc,
+							unsigned long flags)
 {
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	BUG_ON(!pc);
+	spin_unlock_irqrestore(&pc->lock, flags);
 }
 
-static void unlock_page_cgroup(struct page *page)
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
 {
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	BUG_ON(!pc);
+	spin_unlock(&pc->lock);
 }
 
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
@@ -377,17 +440,15 @@ void mem_cgroup_move_lists(struct page *
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = page_get_page_cgroup_trylock(page);
+	if (!pc)
 		return;
 
-	pc = page_get_page_cgroup(page);
-	if (pc) {
-		mz = page_cgroup_zoneinfo(pc);
-		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, lru);
-		spin_unlock_irqrestore(&mz->lru_lock, flags);
-	}
-	unlock_page_cgroup(page);
+	mz = page_cgroup_zoneinfo(pc);
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	__mem_cgroup_move_lists(pc, lru);
+	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 }
 
 /*
@@ -516,7 +577,7 @@ static int mem_cgroup_charge_common(stru
 				struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
+	struct page_cgroup *pc, *old_pc;
 	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
@@ -569,6 +630,8 @@ static int mem_cgroup_charge_common(stru
 
 	pc->mem_cgroup = mem;
 	pc->page = page;
+	spin_lock_init(&pc->lock);
+
 	/*
 	 * If a page is accounted as a page cache, insert to inactive list.
 	 * If anon, insert to active list.
@@ -582,22 +645,34 @@ static int mem_cgroup_charge_common(stru
 	} else
 		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
+	old_pc = page_get_page_cgroup_locked(page);
+	if (old_pc) {
+		unlock_page_cgroup(old_pc);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		css_put(&mem->css);
+		kmem_cache_free(page_cgroup_cache, old_pc);
+		goto done;
+	}
+
+	lock_page_cgroup(pc);
+	/*
+	 * page_get_page_cgroup() does not necessarily guarantee that
+	 * there will be no race in checking for pc, page_assign_page_pc()
+	 * will definitely catch it.
+	 */
+	if (page_assign_page_cgroup(page, pc, gfp_mask)) {
+		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
 		kmem_cache_free(page_cgroup_cache, pc);
 		goto done;
 	}
-	page_assign_page_cgroup(page, pc);
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 done:
 	return 0;
 out:
@@ -645,15 +720,14 @@ int mem_cgroup_cache_charge(struct page 
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
+		pc = page_get_page_cgroup_locked(page);
 		if (pc) {
 			VM_BUG_ON(pc->page != page);
 			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
+		unlock_page_cgroup(pc);
 	}
 
 	if (unlikely(!mm))
@@ -673,6 +747,7 @@ __mem_cgroup_uncharge_common(struct page
 	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
+	int ret;
 
 	if (mem_cgroup_subsys.disabled)
 		return;
@@ -680,8 +755,7 @@ __mem_cgroup_uncharge_common(struct page
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
+	pc = page_get_page_cgroup_locked(page);
 	if (unlikely(!pc))
 		goto unlock;
 
@@ -697,8 +771,9 @@ __mem_cgroup_uncharge_common(struct page
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
+	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
+	VM_BUG_ON(ret);
+	unlock_page_cgroup(pc);
 
 	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
@@ -707,7 +782,14 @@ __mem_cgroup_uncharge_common(struct page
 	kmem_cache_free(page_cgroup_cache, pc);
 	return;
 unlock:
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
+}
+
+void page_reset_bad_cgroup(struct page *page)
+{
+	int ret;
+	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
+	VM_BUG_ON(ret);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -734,15 +816,14 @@ int mem_cgroup_prepare_migration(struct 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
+	pc = page_get_page_cgroup_locked(page);
 	if (pc) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
 			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 			ctype, mem);
diff -puN include/linux/memcontrol.h~memcg_move_to_radix_tree include/linux/memcontrol.h
--- linux-2.6.27-rc5/include/linux/memcontrol.h~memcg_move_to_radix_tree	2008-08-30 22:49:28.000000000 +0530
+++ linux-2.6.27-rc5-balbir/include/linux/memcontrol.h	2008-08-31 22:57:08.000000000 +0530
@@ -27,9 +27,28 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
+extern void page_reset_bad_cgroup(struct page *page);
+extern struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+							bool trylock);
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, false);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_trylock(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, true);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_locked(struct page *page)
+{
+	return __page_get_page_cgroup(page, true, false);
+}
 
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
diff -puN include/linux/mm_types.h~memcg_move_to_radix_tree include/linux/mm_types.h
--- linux-2.6.27-rc5/include/linux/mm_types.h~memcg_move_to_radix_tree	2008-08-31 13:30:57.000000000 +0530
+++ linux-2.6.27-rc5-balbir/include/linux/mm_types.h	2008-08-31 13:32:03.000000000 +0530
@@ -92,10 +92,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
-
 #ifdef CONFIG_KMEMCHECK
 	void *shadow;
 #endif
diff -puN mm/page_alloc.c~memcg_move_to_radix_tree mm/page_alloc.c
_

-- 
	Balbir


* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-08-31 17:47 [RFC][PATCH] Remove cgroup member from struct page Balbir Singh
@ 2008-09-01  0:01 ` KAMEZAWA Hiroyuki
  2008-09-01  3:28   ` Balbir Singh
  2008-09-01  6:56   ` Nick Piggin
  2008-09-01  2:39 ` KAMEZAWA Hiroyuki
  2008-09-01  9:03 ` Pavel Emelyanov
  2 siblings, 2 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-01  0:01 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

On Sun, 31 Aug 2008 23:17:56 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> 
> This is a rewrite of a patch I had written long back to remove struct page
> (I shared the patches with Kamezawa, but never posted them anywhere else).
> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> 
It's just because I think there is no strong requirements for 64bit count/mapcount.
There is no ZERO_PAGE() for ANON (by Nick Piggin. I add him to CC.)
(shmem still use it but impact is not big.)

> I've tested the patches on an x86_64 box, I've run a simple test running
> under the memory control group and the same test running concurrently under
> two different groups (and creating pressure within their groups). I've also
> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
> 
> Advantages of the patch
> 
> 1. It removes the extra pointer in struct page
> 
> Disadvantages
> 
> 1. It adds an additional lock structure to struct page_cgroup
> 2. Radix tree lookup is not an O(1) operation, once the page is known
>    getting to the page_cgroup (pc) is a little more expensive now.
> 
> This is an initial RFC for comments
> 
> TODOs
> 
> 1. Test the page migration changes
> 2. Test the performance impact of the patch/approach
> 
> Comments/Reviews?
> 
plz wait until lockless page cgroup....

And If you don't support radix-tree-delete(), pre-allocating all at boot is better.

BTW, why is pc->lock necessary? It increases the size of struct page_cgroup and
reduces the advantage of your patch by half (8 bytes -> 4 bytes).

Thanks,
-Kame






* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-08-31 17:47 [RFC][PATCH] Remove cgroup member from struct page Balbir Singh
  2008-09-01  0:01 ` KAMEZAWA Hiroyuki
@ 2008-09-01  2:39 ` KAMEZAWA Hiroyuki
  2008-09-01  3:42   ` Balbir Singh
  2008-09-01  9:03 ` Pavel Emelyanov
  2 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-01  2:39 UTC (permalink / raw)
  To: balbir; +Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Sun, 31 Aug 2008 23:17:56 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> 
> This is a rewrite of a patch I had written long back to remove struct page
> (I shared the patches with Kamezawa, but never posted them anywhere else).
> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> 
> I've tested the patches on an x86_64 box, I've run a simple test running
> under the memory control group and the same test running concurrently under
> two different groups (and creating pressure within their groups). I've also
> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
> 
> Advantages of the patch
> 
> 1. It removes the extra pointer in struct page
> 
> Disadvantages
> 
> 1. It adds an additional lock structure to struct page_cgroup
> 2. Radix tree lookup is not an O(1) operation, once the page is known
>    getting to the page_cgroup (pc) is a little more expensive now.
> 
> This is an initial RFC for comments
> 
> TODOs
> 
> 1. Test the page migration changes
> 2. Test the performance impact of the patch/approach
> 
> Comments/Reviews?
> 
BTW, how deep is this radix tree on a 4GB/32GB/64GB/256GB machine?

Thanks,
-Kame



* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  0:01 ` KAMEZAWA Hiroyuki
@ 2008-09-01  3:28   ` Balbir Singh
  2008-09-01  4:03     ` KAMEZAWA Hiroyuki
  2008-09-01  6:56   ` Nick Piggin
  1 sibling, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-01  3:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

KAMEZAWA Hiroyuki wrote:
> On Sun, 31 Aug 2008 23:17:56 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> This is a rewrite of a patch I had written long back to remove struct page
>> (I shared the patches with Kamezawa, but never posted them anywhere else).
>> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
>>
> It's just because I think there is no strong requirements for 64bit count/mapcount.
> There is no ZERO_PAGE() for ANON (by Nick Piggin. I add him to CC.)
> (shmem still use it but impact is not big.)
> 

I understand the comment, but not its context. Are you suggesting that the
size of _count and _mapcount can be reduced, and hence that the impact of
having a member in struct page is not all that large? I think the patch is
definitely very important for 32 bit systems.

>> I've tested the patches on an x86_64 box, I've run a simple test running
>> under the memory control group and the same test running concurrently under
>> two different groups (and creating pressure within their groups). I've also
>> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
>>
>> Advantages of the patch
>>
>> 1. It removes the extra pointer in struct page
>>
>> Disadvantages
>>
>> 1. It adds an additional lock structure to struct page_cgroup
>> 2. Radix tree lookup is not an O(1) operation, once the page is known
>>    getting to the page_cgroup (pc) is a little more expensive now.
>>
>> This is an initial RFC for comments
>>
>> TODOs
>>
>> 1. Test the page migration changes
>> 2. Test the performance impact of the patch/approach
>>
>> Comments/Reviews?
>>
> plz wait until lockless page cgroup....
> 

That depends: if we can get the lockless page cgroup done quickly, I don't mind
waiting, but if it is going to take longer, I would rather push these changes
in. There should not be too much overhead in porting the lockless page cgroup
patch on top of this (remove pc->lock and use pc->flags). I'll help out, so as
to avoid wasting your effort.

> And If you don't support radix-tree-delete(), pre-allocating all at boot is better.
> 

We do use radix-tree-delete() in the code; please see below. Pre-allocating has
the disadvantage that we will pre-allocate even for kernel pages, etc.

> BTW, why pc->lock is necessary ? It increases size of struct page_cgroup and reduce 
> the advantege of your patch's to half (8bytes -> 4bytes).
> 

Yes, I've mentioned that as a disadvantage. Are you suggesting that with
lockless page cgroup we won't need pc->lock?

> Thanks,
> -Kame

-- 
	Balbir


* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  2:39 ` KAMEZAWA Hiroyuki
@ 2008-09-01  3:42   ` Balbir Singh
  0 siblings, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-01  3:42 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Sun, 31 Aug 2008 23:17:56 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> This is a rewrite of a patch I had written long back to remove struct page
>> (I shared the patches with Kamezawa, but never posted them anywhere else).
>> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
>>
>> I've tested the patches on an x86_64 box, I've run a simple test running
>> under the memory control group and the same test running concurrently under
>> two different groups (and creating pressure within their groups). I've also
>> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
>>
>> Advantages of the patch
>>
>> 1. It removes the extra pointer in struct page
>>
>> Disadvantages
>>
>> 1. It adds an additional lock structure to struct page_cgroup
>> 2. Radix tree lookup is not an O(1) operation, once the page is known
>>    getting to the page_cgroup (pc) is a little more expensive now.
>>
>> This is an initial RFC for comments
>>
>> TODOs
>>
>> 1. Test the page migration changes
>> 2. Test the performance impact of the patch/approach
>>
>> Comments/Reviews?
>>
> BTW, how deep this radix-tree on 4GB/32GB/64GB/256GB machine ?

Good Question,

My ball-park estimates are

number of pfns = RADIX_TREE_TAG_LONGS/(RADIX_TREE_TAG_LONGS - 1) *
(RADIX_TREE_LONGS^n - 1)

and "n" is the number we are looking for.

For a 64 bit system with 256 GB and 4KB page size, I've calculated it to be 9
levels deep.
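
As a cross-check, here is the usual height estimate in code form (a sketch
for illustration only; it assumes the default RADIX_TREE_MAP_SHIFT of 6,
i.e. 64 slots per node, so the exact figure depends on the kernel
configuration):

/*
 * Illustration only: number of radix tree levels needed to index
 * up to max_pfn, assuming 64 slots (RADIX_TREE_MAP_SHIFT == 6) per node.
 */
static unsigned int pfn_radix_tree_height(unsigned long max_pfn)
{
	unsigned int height = 0;

	while (max_pfn) {
		max_pfn >>= 6;		/* RADIX_TREE_MAP_SHIFT */
		height++;
	}
	return height;
}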

-- 
	Balbir


* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  3:28   ` Balbir Singh
@ 2008-09-01  4:03     ` KAMEZAWA Hiroyuki
  2008-09-01  5:17       ` KAMEZAWA Hiroyuki
  2008-09-01  6:09       ` Balbir Singh
  0 siblings, 2 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-01  4:03 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

On Mon, 01 Sep 2008 08:58:32 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Sun, 31 Aug 2008 23:17:56 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > 
> >> This is a rewrite of a patch I had written long back to remove struct page
> >> (I shared the patches with Kamezawa, but never posted them anywhere else).
> >> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> >>
> > It's just because I think there is no strong requirements for 64bit count/mapcount.
> > There is no ZERO_PAGE() for ANON (by Nick Piggin. I add him to CC.)
> > (shmem still use it but impact is not big.)
> > 
> 
> I understand the comment, but not it's context. Are you suggesting that the
> sizeof _count and _mapcount can be reduced? Hence the impact of having a member
> in struct page is not all that large? I think the patch is definitely very
> important for 32 bit systems.
Maybe they cannot be reduced. For 32bit systems, unless the machine is equipped
with crazy amounts of memory (such as 32GB), I don't think this 32-bit member is
very large.

Let's calculate: 1GB/4096 x 4 bytes = 1MB per 1GB.
But you add a spinlock_t, so what this patch saves is not so big. Maybe only
hundreds of kilobytes. (All pages in HIGHMEM will be used with a struct page_cgroup.)
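
Spelling out the arithmetic (back-of-the-envelope only; it assumes 4KB pages
and a 4-byte spinlock_t, i.e. no lock debugging):

	pages per GB:              1GB / 4KB = 262144
	removed from struct page:  262144 * 4 bytes = 1MB per GB
	added to page_cgroup:      262144 * 4 bytes = up to 1MB per GB,
	                           for pages that actually have a page_cgroup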


> >> I've tested the patches on an x86_64 box, I've run a simple test running
> >> under the memory control group and the same test running concurrently under
> >> two different groups (and creating pressure within their groups). I've also
> >> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
> >>
> >> Advantages of the patch
> >>
> >> 1. It removes the extra pointer in struct page
> >>
> >> Disadvantages
> >>
> >> 1. It adds an additional lock structure to struct page_cgroup
> >> 2. Radix tree lookup is not an O(1) operation, once the page is known
> >>    getting to the page_cgroup (pc) is a little more expensive now.
> >>
> >> This is an initial RFC for comments
> >>
> >> TODOs
> >>
> >> 1. Test the page migration changes
> >> 2. Test the performance impact of the patch/approach
> >>
> >> Comments/Reviews?
> >>
> > plz wait until lockless page cgroup....
> > 
> 
> That depends, if we can get the lockless page cgroup done quickly, I don't mind
> waiting, but if it is going to take longer, I would rather push these changes
> in. 
The development of lockless-page_cgroup is not stalled. I'm just waiting for
my 8cpu box to come back from maintenance...
If you want to see, I'll post v3 with a brief result on a small (2cpu) box.

> There should not be too much overhead in porting lockless page cgroup patch
> on top of this (remove pc->lock and use pc->flags). I'll help out, so as to
> avoid wastage of your effort.
> 
> > And If you don't support radix-tree-delete(), pre-allocating all at boot is better.
> > 
> 
> We do use radix-tree-delete() in the code, please see below. Pre-allocating has
> the disadvantage that we will pre-allocate even for kernel pages, etc.
> 
Sorry. I missed pc==NULL case.


> > BTW, why pc->lock is necessary ? It increases size of struct page_cgroup and reduce 
> > the advantege of your patch's to half (8bytes -> 4bytes).
> > 
> 
> Yes, I've mentioned that as a disadvantage. Are you suggesting that with
> lockless page cgroup we won't need pc->lock?
> 
Not so clear at this stage. 

Thanks,
-Kame





* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  4:03     ` KAMEZAWA Hiroyuki
@ 2008-09-01  5:17       ` KAMEZAWA Hiroyuki
  2008-09-01  6:16         ` Balbir Singh
  2008-09-01  6:09       ` Balbir Singh
  1 sibling, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-01  5:17 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel,
	linux-mm, nickpiggin

On Mon, 1 Sep 2008 13:03:51 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > That depends, if we can get the lockless page cgroup done quickly, I don't mind
> > waiting, but if it is going to take longer, I would rather push these changes
> > in. 
> The development of lockless-page_cgroup is not stalled. I'm just waiting for
> my 8cpu box comes back from maintainance...
> If you want to see, I'll post v3 with brief result on small (2cpu) box.
> 
This is the current status (results of unixbench) on a 2core/1socket x86-64 system.

==
[disabled]
Execl Throughput                           3103.3 lps   (29.7 secs, 3 samples)
C Compiler Throughput                      1052.0 lpm   (60.0 secs, 3 samples)
Shell Scripts (1 concurrent)               5915.0 lpm   (60.0 secs, 3 samples)
Shell Scripts (8 concurrent)               1142.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (16 concurrent)               586.0 lpm   (60.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places         131463.3 lpm   (30.0 secs, 3 samples)

[rc4mm1]
Execl Throughput                           3004.4 lps   (29.6 secs, 3 samples)
C Compiler Throughput                      1017.9 lpm   (60.0 secs, 3 samples)
Shell Scripts (1 concurrent)               5726.3 lpm   (60.0 secs, 3 samples)
Shell Scripts (8 concurrent)               1124.3 lpm   (60.0 secs, 3 samples)
Shell Scripts (16 concurrent)               576.0 lpm   (60.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places         125446.5 lpm   (30.0 secs, 3 samples)

[lockless]
Execl Throughput                           3041.0 lps   (29.8 secs, 3 samples)
C Compiler Throughput                      1025.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (1 concurrent)               5713.6 lpm   (60.0 secs, 3 samples)
Shell Scripts (8 concurrent)               1113.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (16 concurrent)               571.3 lpm   (60.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places         125417.9 lpm   (30.0 secs, 3 samples)
==

From this, the single-thread results are good; the multi-process results are not good ;)
So, I think the number of atomic ops is reduced, but there is still a contention
or cache-bouncing problem that should be fixed. I'd like to fix this and check
on the 8-core system when it is back.
Recently, I wonder whether within-3%-overhead is a realistic goal.

Thanks,
-Kame






* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  4:03     ` KAMEZAWA Hiroyuki
  2008-09-01  5:17       ` KAMEZAWA Hiroyuki
@ 2008-09-01  6:09       ` Balbir Singh
  2008-09-01  6:24         ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-01  6:09 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

KAMEZAWA Hiroyuki wrote:
> On Mon, 01 Sep 2008 08:58:32 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Sun, 31 Aug 2008 23:17:56 +0530
>>> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>>
>>>> This is a rewrite of a patch I had written long back to remove struct page
>>>> (I shared the patches with Kamezawa, but never posted them anywhere else).
>>>> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
>>>>
>>> It's just because I think there is no strong requirements for 64bit count/mapcount.
>>> There is no ZERO_PAGE() for ANON (by Nick Piggin. I add him to CC.)
>>> (shmem still use it but impact is not big.)
>>>
>> I understand the comment, but not it's context. Are you suggesting that the
>> sizeof _count and _mapcount can be reduced? Hence the impact of having a member
>> in struct page is not all that large? I think the patch is definitely very
>> important for 32 bit systems.
> Maybe they cannot be reduced. For 32bit systems, if the machine doesn't equip
> crazy amounts of memory (as 32GB) I don't think this 32bit is not very large.
> 
> Let's calculate. 1GB/4096 x 4 bytes = 1 MB per 1GB. 
> But you adds spinlock_t, then what this patch reduce is not so big. Maybe only
> hundreds of kilobytes. (All pages in HIGHMEM will be used with structpage_cgroup.)
> 

There are other things, like sizeof(struct page) crossing cacheline boundaries;
and if we pass cgroup_disable=memory, we save on the radix tree slots and the
memory used there.

> 
>>>> I've tested the patches on an x86_64 box, I've run a simple test running
>>>> under the memory control group and the same test running concurrently under
>>>> two different groups (and creating pressure within their groups). I've also
>>>> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
>>>>
>>>> Advantages of the patch
>>>>
>>>> 1. It removes the extra pointer in struct page
>>>>
>>>> Disadvantages
>>>>
>>>> 1. It adds an additional lock structure to struct page_cgroup
>>>> 2. Radix tree lookup is not an O(1) operation, once the page is known
>>>>    getting to the page_cgroup (pc) is a little more expensive now.
>>>>
>>>> This is an initial RFC for comments
>>>>
>>>> TODOs
>>>>
>>>> 1. Test the page migration changes
>>>> 2. Test the performance impact of the patch/approach
>>>>
>>>> Comments/Reviews?
>>>>
>>> plz wait until lockless page cgroup....
>>>
>> That depends, if we can get the lockless page cgroup done quickly, I don't mind
>> waiting, but if it is going to take longer, I would rather push these changes
>> in. 
> The development of lockless-page_cgroup is not stalled. I'm just waiting for
> my 8cpu box comes back from maintainance...
> If you want to see, I'll post v3 with brief result on small (2cpu) box.
> 

I understand and I am not pushing you to complete it, but at the same time I
don't want to queue up behind it for long. I suspect the cost of porting the
lockless page cgroup patch on top of my patches should not be high, but I'll
never know till I try :)

>> There should not be too much overhead in porting lockless page cgroup patch
>> on top of this (remove pc->lock and use pc->flags). I'll help out, so as to
>> avoid wastage of your effort.
>>
>>> And If you don't support radix-tree-delete(), pre-allocating all at boot is better.
>>>
>> We do use radix-tree-delete() in the code, please see below. Pre-allocating has
>> the disadvantage that we will pre-allocate even for kernel pages, etc.
>>
> Sorry. I missed pc==NULL case.
> 

No Problem

> 
>>> BTW, why pc->lock is necessary ? It increases size of struct page_cgroup and reduce 
>>> the advantege of your patch's to half (8bytes -> 4bytes).
>>>
>> Yes, I've mentioned that as a disadvantage. Are you suggesting that with
>> lockless page cgroup we won't need pc->lock?
>>
> Not so clear at this stage. 
> 
> Thanks,
> -Kame
> 
> 
> 


-- 
	Balbir


* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  5:17       ` KAMEZAWA Hiroyuki
@ 2008-09-01  6:16         ` Balbir Singh
  0 siblings, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-01  6:16 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

KAMEZAWA Hiroyuki wrote:
> On Mon, 1 Sep 2008 13:03:51 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>>> That depends, if we can get the lockless page cgroup done quickly, I don't mind
>>> waiting, but if it is going to take longer, I would rather push these changes
>>> in. 
>> The development of lockless-page_cgroup is not stalled. I'm just waiting for
>> my 8cpu box comes back from maintainance...
>> If you want to see, I'll post v3 with brief result on small (2cpu) box.
>>
> This is current status (result of unixbench.)
> result of 2core/1socket x86-64 system.
> 
> ==
> [disabled]
> Execl Throughput                           3103.3 lps   (29.7 secs, 3 samples)
> C Compiler Throughput                      1052.0 lpm   (60.0 secs, 3 samples)
> Shell Scripts (1 concurrent)               5915.0 lpm   (60.0 secs, 3 samples)
> Shell Scripts (8 concurrent)               1142.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (16 concurrent)               586.0 lpm   (60.0 secs, 3 samples)
> Dc: sqrt(2) to 99 decimal places         131463.3 lpm   (30.0 secs, 3 samples)
> 
> [rc4mm1]
> Execl Throughput                           3004.4 lps   (29.6 secs, 3 samples)
> C Compiler Throughput                      1017.9 lpm   (60.0 secs, 3 samples)
> Shell Scripts (1 concurrent)               5726.3 lpm   (60.0 secs, 3 samples)
> Shell Scripts (8 concurrent)               1124.3 lpm   (60.0 secs, 3 samples)
> Shell Scripts (16 concurrent)               576.0 lpm   (60.0 secs, 3 samples)
> Dc: sqrt(2) to 99 decimal places         125446.5 lpm   (30.0 secs, 3 samples)
> 
> [lockless]
> Execl Throughput                           3041.0 lps   (29.8 secs, 3 samples)
> C Compiler Throughput                      1025.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (1 concurrent)               5713.6 lpm   (60.0 secs, 3 samples)
> Shell Scripts (8 concurrent)               1113.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (16 concurrent)               571.3 lpm   (60.0 secs, 3 samples)
> Dc: sqrt(2) to 99 decimal places         125417.9 lpm   (30.0 secs, 3 samples)
> ==
> 
> From this, single-thread results are good. multi-process results are not good ;)
> So, I think the number of atomic ops are reduced but I have should-be-fixed
> contention or cache-bouncing problem yet. I'd like to fix this and check on 8 core
> system when it is back.
> Recently, I wonder within-3%-overhead is realistic goal.

It would be nice to be under 3% and lower if possible. I know it is a hard goal
to achieve, but worth striving for. I'll try and extract some numbers with the
radix tree changes and see if I am adding to the overhead (in terms of time) :)

-- 
	Balbir


* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  6:09       ` Balbir Singh
@ 2008-09-01  6:24         ` KAMEZAWA Hiroyuki
  2008-09-01  6:25           ` Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-01  6:24 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

On Mon, 01 Sep 2008 11:39:26 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:


> > The development of lockless-page_cgroup is not stalled. I'm just waiting for
> > my 8cpu box comes back from maintainance...
> > If you want to see, I'll post v3 with brief result on small (2cpu) box.
> > 
> 
> I understand and I am not pushing you to completing it, but at the same time I
> don't want to queue up behind it for long. I suspect the cost of porting
> lockless page cache on top of my patches should not be high, but I'll never know
> till I try :)
> 
My point is, your patch adds a big lock. Then I don't have to make a
meaningless effort to reduce locking.

Thanks,
-Kame



* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  6:24         ` KAMEZAWA Hiroyuki
@ 2008-09-01  6:25           ` Balbir Singh
  2008-09-01  6:59             ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-01  6:25 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

KAMEZAWA Hiroyuki wrote:
> On Mon, 01 Sep 2008 11:39:26 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
> 
>>> The development of lockless-page_cgroup is not stalled. I'm just waiting for
>>> my 8cpu box comes back from maintainance...
>>> If you want to see, I'll post v3 with brief result on small (2cpu) box.
>>>
>> I understand and I am not pushing you to completing it, but at the same time I
>> don't want to queue up behind it for long. I suspect the cost of porting
>> lockless page cache on top of my patches should not be high, but I'll never know
>> till I try :)
>>
> My point is, your patch adds big lock. Then, I don't have to do meaningless effort
> to reduce lock.

My patch does not add a big lock; it moves the lock from struct
page->page_cgroup to struct page_cgroup. The other locking added is the locking
overhead associated with inserting entries into the radix tree, true. I ran
oprofile along with lockdep and lockstats enabled on my patches. I don't see the
radix_tree or page_cgroup->lock showing up; I see __slab_free and __slab_alloc
showing up. I'll poke a little further.

Please don't let my patch stop you; we'll integrate the best of both worlds and
what is good for memcg.

-- 
	Balbir


* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  0:01 ` KAMEZAWA Hiroyuki
  2008-09-01  3:28   ` Balbir Singh
@ 2008-09-01  6:56   ` Nick Piggin
  2008-09-01  7:17     ` Balbir Singh
  2008-09-01  7:19     ` KAMEZAWA Hiroyuki
  1 sibling, 2 replies; 72+ messages in thread
From: Nick Piggin @ 2008-09-01  6:56 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Monday 01 September 2008 10:01, KAMEZAWA Hiroyuki wrote:
> On Sun, 31 Aug 2008 23:17:56 +0530
>
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > This is a rewrite of a patch I had written long back to remove struct
> > page (I shared the patches with Kamezawa, but never posted them anywhere
> > else). I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug
> > 2008).
>
> It's just because I think there is no strong requirements for 64bit
> count/mapcount. There is no ZERO_PAGE() for ANON (by Nick Piggin. I add him
> to CC.) (shmem still use it but impact is not big.)

I think it would be nice to reduce the impact when it is not configured
anyway. Normally I would not mind so much, but this is something that
many distros will want to enable but fewer users will make use of it.

I think it is always a very good idea to try to reduce struct page size.
When looking at the performance impact though, just be careful with the
alignment of struct page... I actually think it is going to be a
performance win in many cases to make struct page 64 bytes.
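
One way to keep an eye on that while testing (a sketch only, not part of the
posted patch; it assumes a 64-byte cacheline and a made-up helper name):

/*
 * Illustration only: fail the build if struct page grows past one
 * 64-byte cacheline on 64-bit.
 */
#include <linux/kernel.h>
#include <linux/mm_types.h>

static inline void check_struct_page_size(void)
{
#if BITS_PER_LONG == 64
	BUILD_BUG_ON(sizeof(struct page) > 64);
#endif
}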


> > I've tested the patches on an x86_64 box, I've run a simple test running
> > under the memory control group and the same test running concurrently
> > under two different groups (and creating pressure within their groups).
> > I've also compiled the patch with CGROUP_MEM_RES_CTLR turned off.
> >
> > Advantages of the patch
> >
> > 1. It removes the extra pointer in struct page
> >
> > Disadvantages
> >
> > 1. It adds an additional lock structure to struct page_cgroup
> > 2. Radix tree lookup is not an O(1) operation, once the page is known
> >    getting to the page_cgroup (pc) is a little more expensive now.
> >
> > This is an initial RFC for comments
> >
> > TODOs
> >
> > 1. Test the page migration changes
> > 2. Test the performance impact of the patch/approach
> >
> > Comments/Reviews?
>
> plz wait until lockless page cgroup....
>
> And If you don't support radix-tree-delete(), pre-allocating all at boot is
> better.

If you do that, it might even be an idea to allocate flat arrays with
bootmem. It would just be slightly more tricky to fit this
in with the memory model. But that's not a requirement, just an idea
for a small optimisation.
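
A minimal sketch of that idea (illustration only; it ignores the memory model
entirely, and the names are made up rather than taken from any patch):

/*
 * Illustration only: boot-time flat array of page_cgroup pointers,
 * indexed by pfn.  A real version would have to respect the memory
 * model (sparsemem sections, holes, hotplug).
 */
#include <linux/bootmem.h>
#include <linux/init.h>

static struct page_cgroup **pfn_to_pc_map;

void __init page_cgroup_map_init(void)
{
	unsigned long size = max_pfn * sizeof(struct page_cgroup *);

	pfn_to_pc_map = alloc_bootmem(size);	/* bootmem memory is zeroed */
}

static inline struct page_cgroup *pfn_to_pc(unsigned long pfn)
{
	return pfn_to_pc_map[pfn];
}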

Thanks,
Nick


* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  6:25           ` Balbir Singh
@ 2008-09-01  6:59             ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-01  6:59 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm, nickpiggin

On Mon, 01 Sep 2008 11:55:39 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Mon, 01 Sep 2008 11:39:26 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > 
> > 
> >>> The development of lockless-page_cgroup is not stalled. I'm just waiting for
> >>> my 8cpu box comes back from maintainance...
> >>> If you want to see, I'll post v3 with brief result on small (2cpu) box.
> >>>
> >> I understand and I am not pushing you to completing it, but at the same time I
> >> don't want to queue up behind it for long. I suspect the cost of porting
> >> lockless page cache on top of my patches should not be high, but I'll never know
> >> till I try :)
> >>
> > My point is, your patch adds big lock. Then, I don't have to do meaningless effort
> > to reduce lock.
> 
> My patch does not add a big lock, it moves the lock from struct
> page->page_cgroup to struct page_cgroup. The other locking added is the locking
> overhead associated with inserting entries into the radix tree, true. I ran
> oprofile along with lockdep and lockstats enabled on my patches. I don't see the
> radix_tree or page_cgroup->lock showing up, I see __slab_free and __slab_alloc
> showing up. I'll poke a little further.
> 
Hmm, one concern I have now is that I don't see any contention on
res_counter->lock in recent lock_stat tests... it was usually at the top of the
list in the past. Did you see it?

> Please don't let my patch stop you, we'll integrate the best of both worlds and
> what is good for memcg.
> 
Thank you.

To be honest, I wonder whether control via page_cgroup may be too rich for 32bit archs ;(.

Regards,
-Kame



* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  6:56   ` Nick Piggin
@ 2008-09-01  7:17     ` Balbir Singh
  2008-09-01  7:19     ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-01  7:17 UTC (permalink / raw)
  To: Nick Piggin
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, hugh, menage, Pavel Emelianov,
	linux-kernel, linux-mm, Andi Kleen

Nick Piggin wrote:
> On Monday 01 September 2008 10:01, KAMEZAWA Hiroyuki wrote:
>> On Sun, 31 Aug 2008 23:17:56 +0530
>>
>> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>> This is a rewrite of a patch I had written long back to remove struct
>>> page (I shared the patches with Kamezawa, but never posted them anywhere
>>> else). I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug
>>> 2008).
>> It's just because I think there is no strong requirements for 64bit
>> count/mapcount. There is no ZERO_PAGE() for ANON (by Nick Piggin. I add him
>> to CC.) (shmem still use it but impact is not big.)
> 
> I think it would be nice to reduce the impact when it is not configured
> anyway. Normally I would not mind so much, but this is something that
> many distros will want to enable but fewer users will make use of it.
> 
> I think it is always a very good idea to try to reduce struct page size.
> When looking at the performance impact though, just be careful with the
> alignment of struct page... I actually think it is going to be a
> performance win in many cases to make struct page 64 bytes.
> 

I agree with the last point, but then when I see
http://www.mail-archive.com/git-commits-head@vger.kernel.org/msg41546.html
I am tempted to reduce the size. I did ask Andi which x86_64 machines we are
exceeding the cache size on, but heard nothing back.

The questions to answer are

1. On 64 bit systems, would it be OK to make the size of struct page 64 bytes?
2. Are we OK with this approach, even though we might penalize 32 bit systems?

> 
>>> I've tested the patches on an x86_64 box, I've run a simple test running
>>> under the memory control group and the same test running concurrently
>>> under two different groups (and creating pressure within their groups).
>>> I've also compiled the patch with CGROUP_MEM_RES_CTLR turned off.
>>>
>>> Advantages of the patch
>>>
>>> 1. It removes the extra pointer in struct page
>>>
>>> Disadvantages
>>>
>>> 1. It adds an additional lock structure to struct page_cgroup
>>> 2. Radix tree lookup is not an O(1) operation, once the page is known
>>>    getting to the page_cgroup (pc) is a little more expensive now.
>>>
>>> This is an initial RFC for comments
>>>
>>> TODOs
>>>
>>> 1. Test the page migration changes
>>> 2. Test the performance impact of the patch/approach
>>>
>>> Comments/Reviews?
>> plz wait until lockless page cgroup....
>>
>> And If you don't support radix-tree-delete(), pre-allocating all at boot is
>> better.
> 
> If you do that, it might even be an idea to allocate flat arrays with
> bootmem. It would just be slightly more tricky to fit this
> in with the memory model. But that's not a requirement, just an idea
> for a small optimisation.

Yes, I thought about it, but dropped it for two reasons

1. We don't want a static overhead (irrespective of the memory used and user pages)
2. Like you said, it means fitting into every memory model to get it right.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  6:56   ` Nick Piggin
  2008-09-01  7:17     ` Balbir Singh
@ 2008-09-01  7:19     ` KAMEZAWA Hiroyuki
  2008-09-01  7:43       ` Nick Piggin
  1 sibling, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-01  7:19 UTC (permalink / raw)
  To: Nick Piggin
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Mon, 1 Sep 2008 16:56:44 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> On Monday 01 September 2008 10:01, KAMEZAWA Hiroyuki wrote:
> > On Sun, 31 Aug 2008 23:17:56 +0530
> >
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > This is a rewrite of a patch I had written long back to remove struct
> > > page (I shared the patches with Kamezawa, but never posted them anywhere
> > > else). I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug
> > > 2008).
> >
> > It's just because I think there is no strong requirements for 64bit
> > count/mapcount. There is no ZERO_PAGE() for ANON (by Nick Piggin. I add him
> > to CC.) (shmem still use it but impact is not big.)
> 
> I think it would be nice to reduce the impact when it is not configured
> anyway. Normally I would not mind so much, but this is something that
> many distros will want to enable but fewer users will make use of it.
> 
> I think it is always a very good idea to try to reduce struct page size.
> When looking at the performance impact though, just be careful with the
> alignment of struct page... I actually think it is going to be a
> performance win in many cases to make struct page 64 bytes.
> 
On 32bit, sizeof(struct page) = 32bytes + 4bytes(page_cgroup)
On 64bit, sizeof(struct page) = 56bytes + 8bytes(page_cgroup)
So, 32bit case is a problem.

> 
> If you do that, it might even be an idea to allocate flat arrays with
> bootmem. It would just be slightly more tricky to fit this
> in with the memory model. But that's not a requirement, just an idea
> for a small optimisation.
> 
If we make mem_res_controller available only under SPARSEMEM, I think we can
do it in a very straightforward way.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  7:19     ` KAMEZAWA Hiroyuki
@ 2008-09-01  7:43       ` Nick Piggin
  2008-09-02  9:24         ` Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: Nick Piggin @ 2008-09-01  7:43 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Monday 01 September 2008 17:19, KAMEZAWA Hiroyuki wrote:
> On Mon, 1 Sep 2008 16:56:44 +1000
>
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > On Monday 01 September 2008 10:01, KAMEZAWA Hiroyuki wrote:
> > > On Sun, 31 Aug 2008 23:17:56 +0530
> > >
> > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > > This is a rewrite of a patch I had written long back to remove struct
> > > > page (I shared the patches with Kamezawa, but never posted them
> > > > anywhere else). I spent the weekend, cleaning them up for
> > > > 2.6.27-rc5-mmotm (29 Aug 2008).
> > >
> > > It's just because I think there is no strong requirements for 64bit
> > > count/mapcount. There is no ZERO_PAGE() for ANON (by Nick Piggin. I add
> > > him to CC.) (shmem still use it but impact is not big.)
> >
> > I think it would be nice to reduce the impact when it is not configured
> > anyway. Normally I would not mind so much, but this is something that
> > many distros will want to enable but fewer users will make use of it.
> >
> > I think it is always a very good idea to try to reduce struct page size.
> > When looking at the performance impact though, just be careful with the
> > alignment of struct page... I actually think it is going to be a
> > performance win in many cases to make struct page 64 bytes.
>
> On 32bit, sizeof(struct page) = 32bytes + 4bytes(page_cgroup)
> On 64bit, sizeof(struct page) = 56bytes + 8bytes(page_cgroup)
> So, 32bit case is a problem.

Right. Well, either one is a problem because we always prefer to have
fewer things in struct page rather than more ;)


> > If you do that, it might even be an idea to allocate flat arrays with
> > bootmem. It would just be slightly more tricky to fit this
> > in with the memory model. But that's not a requirement, just an idea
> > for a small optimisation.
>
> If we make mem_res_controller available only under SPARSEMEM, I think we
> can do it in a very straightforward way.

That could be a reasonable solution.  Balbir has other concerns about
this... so I think it is OK to try the radix tree approach first.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-08-31 17:47 [RFC][PATCH] Remove cgroup member from struct page Balbir Singh
  2008-09-01  0:01 ` KAMEZAWA Hiroyuki
  2008-09-01  2:39 ` KAMEZAWA Hiroyuki
@ 2008-09-01  9:03 ` Pavel Emelyanov
  2008-09-01  9:17   ` Balbir Singh
  2 siblings, 1 reply; 72+ messages in thread
From: Pavel Emelyanov @ 2008-09-01  9:03 UTC (permalink / raw)
  To: Balbir Singh
  Cc: Andrew Morton, hugh, kamezawa.hiroyu, menage, linux-kernel, linux-mm

Balbir Singh wrote:
> This is a rewrite of a patch I had written long back to remove struct page
> (I shared the patches with Kamezawa, but never posted them anywhere else).
> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> 
> I've tested the patches on an x86_64 box, I've run a simple test running
> under the memory control group and the same test running concurrently under
> two different groups (and creating pressure within their groups). I've also
> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
> 
> Advantages of the patch
> 
> 1. It removes the extra pointer in struct page
> 
> Disadvantages
> 
> 1. It adds an additional lock structure to struct page_cgroup
> 2. Radix tree lookup is not an O(1) operation, once the page is known
>    getting to the page_cgroup (pc) is a little more expensive now.

And besides, we also have a global lock that protects even lookups
from this structure. Won't this affect us too much on big-smp nodes?

> This is an initial RFC for comments
> 
> TODOs
> 
> 1. Test the page migration changes
> 2. Test the performance impact of the patch/approach
> 
> Comments/Reviews?
> 
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  9:03 ` Pavel Emelyanov
@ 2008-09-01  9:17   ` Balbir Singh
  2008-09-01  9:43     ` Pavel Emelyanov
  2008-09-01 13:19     ` Peter Zijlstra
  0 siblings, 2 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-01  9:17 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andrew Morton, hugh, kamezawa.hiroyu, menage, linux-kernel, linux-mm

Pavel Emelyanov wrote:
> Balbir Singh wrote:
>> This is a rewrite of a patch I had written long back to remove struct page
>> (I shared the patches with Kamezawa, but never posted them anywhere else).
>> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
>>
>> I've tested the patches on an x86_64 box, I've run a simple test running
>> under the memory control group and the same test running concurrently under
>> two different groups (and creating pressure within their groups). I've also
>> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
>>
>> Advantages of the patch
>>
>> 1. It removes the extra pointer in struct page
>>
>> Disadvantages
>>
>> 1. It adds an additional lock structure to struct page_cgroup
>> 2. Radix tree lookup is not an O(1) operation, once the page is known
>>    getting to the page_cgroup (pc) is a little more expensive now.
> 
> And besides, we also have a global lock, that protects even lookup
> from this structure. Won't this affect us too much on big-smp nodes?

Sorry, not sure I understand. The lookup is done under RCU. Updates are done
using the global lock. It should not be hard to make the radix tree per node
later (as an iterative refinement).
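
For reference, a minimal sketch of that split, reusing the mem_cgroup_tree /
mem_cgroup_tree_lock names from the patch (this is just the locking pattern,
not the full code; the real patch also preloads the radix tree before
insertion and takes the page_cgroup lock before dropping rcu_read_lock()):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct page_cgroup;

static RADIX_TREE(mem_cgroup_tree, GFP_ATOMIC);
static DEFINE_SPINLOCK(mem_cgroup_tree_lock);

/* Lookup side: no global lock, only RCU around the radix tree walk. */
static struct page_cgroup *pc_lookup(unsigned long pfn)
{
	struct page_cgroup *pc;

	rcu_read_lock();
	pc = radix_tree_lookup(&mem_cgroup_tree, pfn);
	/* real code would lock/trylock pc here, before leaving the RCU section */
	rcu_read_unlock();
	return pc;
}

/* Update side: insert/delete are serialized by the single spinlock. */
static int pc_insert(unsigned long pfn, struct page_cgroup *pc)
{
	unsigned long flags;
	int err;

	spin_lock_irqsave(&mem_cgroup_tree_lock, flags);
	err = radix_tree_insert(&mem_cgroup_tree, pfn, pc);
	spin_unlock_irqrestore(&mem_cgroup_tree_lock, flags);
	return err;
}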

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  9:17   ` Balbir Singh
@ 2008-09-01  9:43     ` Pavel Emelyanov
  2008-09-01 13:19     ` Peter Zijlstra
  1 sibling, 0 replies; 72+ messages in thread
From: Pavel Emelyanov @ 2008-09-01  9:43 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, hugh, kamezawa.hiroyu, menage, linux-kernel, linux-mm

Balbir Singh wrote:
> Pavel Emelyanov wrote:
>> Balbir Singh wrote:
>>> This is a rewrite of a patch I had written long back to remove struct page
>>> (I shared the patches with Kamezawa, but never posted them anywhere else).
>>> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
>>>
>>> I've tested the patches on an x86_64 box, I've run a simple test running
>>> under the memory control group and the same test running concurrently under
>>> two different groups (and creating pressure within their groups). I've also
>>> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
>>>
>>> Advantages of the patch
>>>
>>> 1. It removes the extra pointer in struct page
>>>
>>> Disadvantages
>>>
>>> 1. It adds an additional lock structure to struct page_cgroup
>>> 2. Radix tree lookup is not an O(1) operation, once the page is known
>>>    getting to the page_cgroup (pc) is a little more expensive now.
>> And besides, we also have a global lock, that protects even lookup
>> from this structure. Won't this affect us too much on big-smp nodes?
> 
> Sorry, not sure I understand. The lookup is done under RCU. Updates are done
> using the global lock. It should not be hard to make the radix tree per node
> later (as an iterative refinement).
> 

OOPS, indeed. Sorry for the confusion, then. :)

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  9:17   ` Balbir Singh
  2008-09-01  9:43     ` Pavel Emelyanov
@ 2008-09-01 13:19     ` Peter Zijlstra
  2008-09-02  7:35       ` Balbir Singh
  1 sibling, 1 reply; 72+ messages in thread
From: Peter Zijlstra @ 2008-09-01 13:19 UTC (permalink / raw)
  To: balbir
  Cc: Pavel Emelyanov, Andrew Morton, hugh, kamezawa.hiroyu, menage,
	linux-kernel, linux-mm

On Mon, 2008-09-01 at 14:47 +0530, Balbir Singh wrote:
> Pavel Emelyanov wrote:
> > Balbir Singh wrote:
> >> This is a rewrite of a patch I had written long back to remove struct page
> >> (I shared the patches with Kamezawa, but never posted them anywhere else).
> >> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> >>
> >> I've tested the patches on an x86_64 box, I've run a simple test running
> >> under the memory control group and the same test running concurrently under
> >> two different groups (and creating pressure within their groups). I've also
> >> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
> >>
> >> Advantages of the patch
> >>
> >> 1. It removes the extra pointer in struct page
> >>
> >> Disadvantages
> >>
> >> 1. It adds an additional lock structure to struct page_cgroup
> >> 2. Radix tree lookup is not an O(1) operation, once the page is known
> >>    getting to the page_cgroup (pc) is a little more expensive now.
> > 
> > And besides, we also have a global lock, that protects even lookup
> > from this structure. Won't this affect us too much on big-smp nodes?
> 
> Sorry, not sure I understand. The lookup is done under RCU. Updates are done
> using the global lock. It should not be hard to make the radix tree per node
> later (as an iterative refinement).

Or you could have a look at the concurrent radix tree, esp for dense
trees it can save a lot on lock bouncing.

Latest code available at:

http://programming.kicks-ass.net/kernel-patches/concurrent-pagecache/27-rc3/


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01 13:19     ` Peter Zijlstra
@ 2008-09-02  7:35       ` Balbir Singh
  0 siblings, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-02  7:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pavel Emelyanov, Andrew Morton, hugh, kamezawa.hiroyu, menage,
	linux-kernel, linux-mm

Peter Zijlstra wrote:
> On Mon, 2008-09-01 at 14:47 +0530, Balbir Singh wrote:
>> Pavel Emelyanov wrote:
>>> Balbir Singh wrote:
>>>> This is a rewrite of a patch I had written long back to remove struct page
>>>> (I shared the patches with Kamezawa, but never posted them anywhere else).
>>>> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
>>>>
>>>> I've tested the patches on an x86_64 box, I've run a simple test running
>>>> under the memory control group and the same test running concurrently under
>>>> two different groups (and creating pressure within their groups). I've also
>>>> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
>>>>
>>>> Advantages of the patch
>>>>
>>>> 1. It removes the extra pointer in struct page
>>>>
>>>> Disadvantages
>>>>
>>>> 1. It adds an additional lock structure to struct page_cgroup
>>>> 2. Radix tree lookup is not an O(1) operation, once the page is known
>>>>    getting to the page_cgroup (pc) is a little more expensive now.
>>> And besides, we also have a global lock, that protects even lookup
>>> from this structure. Won't this affect us too much on big-smp nodes?
>> Sorry, not sure I understand. The lookup is done under RCU. Updates are done
>> using the global lock. It should not be hard to make the radix tree per node
>> later (as an iterative refinement).
> 
> Or you could have a look at the concurrent radix tree, esp for dense
> trees it can save a lot on lock bouncing.
> 
> Latest code available at:
> 
> http://programming.kicks-ass.net/kernel-patches/concurrent-pagecache/27-rc3/

I'll try them later. I see a lot of slab allocations/frees (expected). I am
trying to experiment with alternatives to reduce them.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-01  7:43       ` Nick Piggin
@ 2008-09-02  9:24         ` Balbir Singh
  2008-09-02 10:02           ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-02  9:24 UTC (permalink / raw)
  To: Nick Piggin
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

Nick Piggin wrote:
> That could be a reasonable solution.  Balbir has other concerns about
> this... so I think it is OK to try the radix tree approach first.

Thanks, Nick!

Kamezawa-San, I would like to integrate the radix tree patches after review and
some more testing then integrate your patchset on top of it. Do you have any
objections/concerns with the suggested approach?

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-02 10:02           ` KAMEZAWA Hiroyuki
@ 2008-09-02  9:58             ` Balbir Singh
  2008-09-02 10:07               ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-02  9:58 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Tue, 02 Sep 2008 14:54:17 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> Nick Piggin wrote:
>>> That could be a reasonable solution.  Balbir has other concerns about
>>> this... so I think it is OK to try the radix tree approach first.
>> Thanks, Nick!
>>
>> Kamezawa-San, I would like to integrate the radix tree patches after review and
>> some more testing then integrate your patchset on top of it. Do you have any
>> objections/concerns with the suggested approach?
>>
> please show performance number first.

Yes, that is why I said some more testing. I am running lmbench and kernbench on
it and some other tests, I'll get back with numbers.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-02  9:24         ` Balbir Singh
@ 2008-09-02 10:02           ` KAMEZAWA Hiroyuki
  2008-09-02  9:58             ` Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-02 10:02 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 02 Sep 2008 14:54:17 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> Nick Piggin wrote:
> > That could be a reasonable solution.  Balbir has other concerns about
> > this... so I think it is OK to try the radix tree approach first.
> 
> Thanks, Nick!
> 
> Kamezawa-San, I would like to integrate the radix tree patches after review and
> some more testing then integrate your patchset on top of it. Do you have any
> objections/concerns with the suggested approach?
> 
please show performance number first.

Thanks,
-kame


> -- 
> 	Balbir
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-02  9:58             ` Balbir Singh
@ 2008-09-02 10:07               ` KAMEZAWA Hiroyuki
  2008-09-02 10:12                 ` Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-02 10:07 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 02 Sep 2008 15:28:34 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 02 Sep 2008 14:54:17 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > 
> >> Nick Piggin wrote:
> >>> That could be a reasonable solution.  Balbir has other concerns about
> >>> this... so I think it is OK to try the radix tree approach first.
> >> Thanks, Nick!
> >>
> >> Kamezawa-San, I would like to integrate the radix tree patches after review and
> >> some more testing then integrate your patchset on top of it. Do you have any
> >> objections/concerns with the suggested approach?
> >>
> > please show performance number first.
> 
> Yes, that is why I said some more testing. I am running lmbench and kernbench on
> it and some other tests, I'll get back with numbers.
> 
A test which does not suffer much from I/O is better.
And please don't worry about my patches. I'll reschedule if yours goes first.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-02 10:07               ` KAMEZAWA Hiroyuki
@ 2008-09-02 10:12                 ` Balbir Singh
  2008-09-02 10:57                   ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-02 10:12 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Tue, 02 Sep 2008 15:28:34 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Tue, 02 Sep 2008 14:54:17 +0530
>>> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>>
>>>> Nick Piggin wrote:
>>>>> That could be a reasonable solution.  Balbir has other concerns about
>>>>> this... so I think it is OK to try the radix tree approach first.
>>>> Thanks, Nick!
>>>>
>>>> Kamezawa-San, I would like to integrate the radix tree patches after review and
>>>> some more testing then integrate your patchset on top of it. Do you have any
>>>> objections/concerns with the suggested approach?
>>>>
>>> please show performance number first.
>> Yes, that is why I said some more testing. I am running lmbench and kernbench on
>> it and some other tests, I'll get back with numbers.
>>
> A test which does not suffer much from I/O is better.
> And please don't worry about my patches. I'll reschedule if yours goes first.
> 
Thanks, I'll try and find the right set of tests.
-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-02 10:12                 ` Balbir Singh
@ 2008-09-02 10:57                   ` KAMEZAWA Hiroyuki
  2008-09-02 12:37                     ` Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-02 10:57 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 02 Sep 2008 15:42:43 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >>>> Kamezawa-San, I would like to integrate the radix tree patches after review and
> >>>> some more testing then integrate your patchset on top of it. Do you have any
> >>>> objections/concerns with the suggested approach?
> >>>>
> >>> please show performance number first.
> >> Yes, that is why I said some more testing. I am running lmbench and kernbench on
> >> it and some other tests, I'll get back with numbers.
> >>
> > A test which does not suffer much from I/O is better.
> > And please don't worry about my patches. I'll reschedule if yours goes first.
> > 
> Thanks, I'll try and find the right set of tests.

Maybe it's a good time to share my concerns.

IMHO, the memory resource controller is for dividing memory into groups.

We have the following choices for dividing memory into groups now.
  - cpuset (+ fake NUMA)
  - VMs (kvm, Xen, etc...)
  - memory resource controller (memcg)

Considering 3 aspects, performance, flexibility and isolation (security),
my expectation is

performance    : cpuset > memcg >> VMs
flexibility    : memcg  > VMs >> cpuset
isolation      : VMs >> cpuset >= memcg

The word 'flexibility' sounds sweet *but* it's just one of the aspects.
If the performance is cpuset > VMs > memcg, I'll advise users to use VMs.

I think VMs are getting faster and faster. memcg will become slower as we add more
'fancy' features. (I think we need some more features.)
So, I want to keep memcg as fast as possible at this stage.

But yes, the memory usage overhead of page->page_cgroup and struct page_cgroup is big
on 32bit archs. Maybe I'll tell users to use VMs ;)

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-02 10:57                   ` KAMEZAWA Hiroyuki
@ 2008-09-02 12:37                     ` Balbir Singh
  2008-09-03  3:33                       ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-02 12:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Tue, 02 Sep 2008 15:42:43 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>>>>> Kamezawa-San, I would like to integrate the radix tree patches after review and
>>>>>> some more testing then integrate your patchset on top of it. Do you have any
>>>>>> objections/concerns with the suggested approach?
>>>>>>
>>>>> please show performance number first.
>>>> Yes, that is why I said some more testing. I am running lmbench and kernbench on
>>>> it and some other tests, I'll get back with numbers.
>>>>
>>> A test which does not suffer much from I/O is better.
>>> And please don't worry about my patches. I'll reschedule if yours goes first.
>>>
>> Thanks, I'll try and find the right set of tests.
> 
> Maybe it's a good time to share my concerns.
> 
> IMHO, the memory resource controller is for dividing memory into groups.
> 
> We have the following choices for dividing memory into groups now.
>   - cpuset (+ fake NUMA)
>   - VMs (kvm, Xen, etc...)
>   - memory resource controller (memcg)
> 
> Considering 3 aspects, performance, flexibility and isolation (security),
> my expectation is
> 
> performance    : cpuset > memcg >> VMs
> flexibility    : memcg  > VMs >> cpuset
> isolation      : VMs >> cpuset >= memcg
> 
> The word 'flexibility' sounds sweet *but* it's just one of the aspects.
> If the performance is cpuset > VMs > memcg, I'll advise users to use VMs.
> 
> I think VMs are getting faster and faster. memcg will become slower as we add more
> 'fancy' features. (I think we need some more features.)
> So, I want to keep memcg as fast as possible at this stage.
> 
> But yes, the memory usage overhead of page->page_cgroup and struct page_cgroup is big
> on 32bit archs. Maybe I'll tell users to use VMs ;)

I understand your concern and I am not trying to reduce memcg's performance - or
add a fancy feature. I am trying to make memcg more friendly for distros. I see
your point about the overhead. I just got back my results - I see a 4% overhead
with the patches. Let me see if I can rework them for better performance.


-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-02 12:37                     ` Balbir Singh
@ 2008-09-03  3:33                       ` KAMEZAWA Hiroyuki
  2008-09-03  7:31                         ` Balbir Singh
  2008-09-08 15:28                         ` Balbir Singh
  0 siblings, 2 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-03  3:33 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 02 Sep 2008 18:07:18 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> I understand your concern and I am not trying to reduce memcg's performance - or
> add a fancy feature. I am trying to make memcg more friendly for distros. I see
> your point about the overhead. I just got back my results - I see a 4% overhead
> with the patches. Let me see if I can rework them for better performance.
> 
Just an idea: by using the atomic_ops page_cgroup patch, you can encode page_cgroup->lock
into page_cgroup->flags and use bit_spin_lock(), I think.
(my new patch set uses a bit spinlock on page_cgroup->flags to avoid some races.)

This will save extra 4 bytes.
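
(For the record, a rough sketch of what that encoding looks like; PCG_LOCK_BIT
and the struct name below are made up for illustration, the real flag names
are whatever the patch ends up using.)

#include <linux/bit_spinlock.h>

#define PCG_LOCK_BIT	0		/* bit 0 of ->flags acts as the lock */

struct page_cgroup_sketch {		/* illustrative stand-in for page_cgroup */
	unsigned long flags;		/* lock bit and state bits share this word */
	/* ... */
};

static inline void pcg_lock(struct page_cgroup_sketch *pc)
{
	bit_spin_lock(PCG_LOCK_BIT, &pc->flags);
}

static inline void pcg_unlock(struct page_cgroup_sketch *pc)
{
	bit_spin_unlock(PCG_LOCK_BIT, &pc->flags);
}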

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-03  3:33                       ` KAMEZAWA Hiroyuki
@ 2008-09-03  7:31                         ` Balbir Singh
  2008-09-08 15:28                         ` Balbir Singh
  1 sibling, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-03  7:31 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Tue, 02 Sep 2008 18:07:18 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> I understand your concern and I am not trying to reduce memcg's performance - or
>> add a fancy feature. I am trying to make memcg more friendly for distros. I see
>> your point about the overhead. I just got back my results - I see a 4% overhead
>> with the patches. Let me see if I can rework them for better performance.
>>
> Just an idea: by using the atomic_ops page_cgroup patch, you can encode page_cgroup->lock
> into page_cgroup->flags and use bit_spin_lock(), I think.
> (my new patch set uses a bit spinlock on page_cgroup->flags to avoid some races.)
> 
> This will save extra 4 bytes.

Exactly the next step I was thinking about (since we already use it, in the
current form). Thanks for the suggestion!

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-03  3:33                       ` KAMEZAWA Hiroyuki
  2008-09-03  7:31                         ` Balbir Singh
@ 2008-09-08 15:28                         ` Balbir Singh
  2008-09-09  3:57                           ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-08 15:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-03 12:33:06]:

> On Tue, 02 Sep 2008 18:07:18 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > I understand your concern and I am not trying to reduce memcg's performance - or
> > add a fancy feature. I am trying to make memcg more friendly for distros. I see
> > your point about the overhead. I just got back my results - I see a 4% overhead
> > with the patches. Let me see if I can rework them for better performance.
> > 
> Just an idea: by using the atomic_ops page_cgroup patch, you can encode page_cgroup->lock
> into page_cgroup->flags and use bit_spin_lock(), I think.
> (my new patch set uses a bit spinlock on page_cgroup->flags to avoid some races.)
> 
> This will save extra 4 bytes.

Sorry for the delay in sending out the new patch, I am traveling and
thus a little less responsive. Here is the updated patch


v3...v2
1. Convert flags to unsigned long
2. Move page_cgroup->lock to a bit spin lock in flags

v2...v1

1. Fix a small bug, don't call radix_tree_preload_end(), if preload fails

This is a rewrite of a patch I had written long back to remove struct page
(I shared the patches with Kamezawa, but never posted them anywhere else).
I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).

I've tested the patches on an x86_64 box, I've run a simple test running
under the memory control group and the same test running concurrently under
two different groups (and creating pressure within their groups). I've also
compiled the patch with CGROUP_MEM_RES_CTLR turned off.

Advantages of the patch

1. It removes the extra pointer in struct page

Disadvantages

1. Radix tree lookup is not an O(1) operation, once the page is known
   getting to the page_cgroup (pc) is a little more expensive now.

This is an initial RFC for comments

TODOs

1. Test the page migration changes

Performance

In a unixbench run, these patches had a performance impact of 2% (slowdown).

Comments/Reviews?

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 include/linux/memcontrol.h |   23 +++++
 include/linux/mm_types.h   |    4 
 mm/memcontrol.c            |  187 +++++++++++++++++++++++++++++----------------
 3 files changed, 144 insertions(+), 70 deletions(-)

diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
--- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-09-04 15:45:54.000000000 +0530
+++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-09-08 20:15:30.000000000 +0530
@@ -24,6 +24,7 @@
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
+#include <linux/radix-tree.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
@@ -40,6 +41,9 @@ struct cgroup_subsys mem_cgroup_subsys _
 static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
+static struct radix_tree_root mem_cgroup_tree;
+static spinlock_t mem_cgroup_tree_lock;
+
 /*
  * Statistics for memory cgroup.
  */
@@ -137,20 +141,6 @@ struct mem_cgroup {
 static struct mem_cgroup init_mem_cgroup;
 
 /*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT 	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -158,12 +148,17 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
-	int flags;
+	unsigned long flags;
 };
-#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
-#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
-#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
+
+/*
+ * LOCK_BIT is 0, with value 1
+ */
+#define PAGE_CGROUP_FLAG_LOCK_BIT   (0x0)  /* lock bit */
+#define PAGE_CGROUP_FLAG_CACHE	   (0x2)   /* charged as cache */
+#define PAGE_CGROUP_FLAG_ACTIVE    (0x4)   /* page is active in this cgroup */
+#define PAGE_CGROUP_FLAG_FILE	   (0x8)   /* page is file system backed */
+#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10)/* page is unevictableable */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -248,35 +243,81 @@ struct mem_cgroup *mem_cgroup_from_task(
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
+static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
 {
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
+	return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-struct page_cgroup *page_get_page_cgroup(struct page *page)
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
 {
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+	bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-static void lock_page_cgroup(struct page *page)
+static int page_assign_page_cgroup(struct page *page, struct page_cgroup *pc,
+					gfp_t gfp_mask)
 {
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	int err = 0;
+	struct page_cgroup *old_pc;
 
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	if (pc) {
+		err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+		if (err) {
+			printk(KERN_WARNING "could not preload radix tree "
+				"in %s\n", __func__);
+			goto done;
+		}
+	}
+
+	spin_lock_irqsave(&mem_cgroup_tree_lock, flags);
+	old_pc = radix_tree_lookup(&mem_cgroup_tree, pfn);
+	if (pc && old_pc) {
+		err = -EEXIST;
+		goto pc_race;
+	}
+	if (pc) {
+		err = radix_tree_insert(&mem_cgroup_tree, pfn, pc);
+		if (err)
+			printk(KERN_WARNING "Inserting into radix tree failed "
+				"in %s\n", __func__);
+	} else
+		radix_tree_delete(&mem_cgroup_tree, pfn);
+pc_race:
+	spin_unlock_irqrestore(&mem_cgroup_tree_lock, flags);
+	if (pc)
+		radix_tree_preload_end();
+done:
+	return err;
 }
 
-static void unlock_page_cgroup(struct page *page)
+struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+						bool trylock)
 {
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	unsigned long pfn = page_to_pfn(page);
+	struct page_cgroup *pc;
+	int ret;
+
+	rcu_read_lock();
+	pc = radix_tree_lookup(&mem_cgroup_tree, pfn);
+
+	if (pc && lock)
+		lock_page_cgroup(pc);
+
+	if (pc && trylock) {
+		ret = trylock_page_cgroup(pc);
+		if (!ret)
+			pc = NULL;
+	}
+
+	rcu_read_unlock();
+
+	return pc;
 }
 
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
@@ -377,17 +418,15 @@ void mem_cgroup_move_lists(struct page *
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = page_get_page_cgroup_trylock(page);
+	if (!pc)
 		return;
 
-	pc = page_get_page_cgroup(page);
-	if (pc) {
-		mz = page_cgroup_zoneinfo(pc);
-		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, lru);
-		spin_unlock_irqrestore(&mz->lru_lock, flags);
-	}
-	unlock_page_cgroup(page);
+	mz = page_cgroup_zoneinfo(pc);
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	__mem_cgroup_move_lists(pc, lru);
+	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 }
 
 /*
@@ -516,7 +555,7 @@ static int mem_cgroup_charge_common(stru
 				struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
+	struct page_cgroup *pc, *old_pc;
 	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
@@ -569,35 +608,49 @@ static int mem_cgroup_charge_common(stru
 
 	pc->mem_cgroup = mem;
 	pc->page = page;
+	pc->flags = 0;		/* No lock, no other bits either */
+
 	/*
 	 * If a page is accounted as a page cache, insert to inactive list.
 	 * If anon, insert to active list.
 	 */
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
-		pc->flags = PAGE_CGROUP_FLAG_CACHE;
+		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
 		if (page_is_file_cache(page))
 			pc->flags |= PAGE_CGROUP_FLAG_FILE;
 		else
 			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
 	} else
-		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
+		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
+
+	old_pc = page_get_page_cgroup_locked(page);
+	if (old_pc) {
+		unlock_page_cgroup(old_pc);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		css_put(&mem->css);
+		kmem_cache_free(page_cgroup_cache, pc);
+		goto done;
+	}
 
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
+	lock_page_cgroup(pc);
+	/*
+	 * page_get_page_cgroup() does not necessarily guarantee that
+	 * there will be no race in checking for pc, page_assign_page_pc()
+	 * will definitely catch it.
+	 */
+	if (page_assign_page_cgroup(page, pc, gfp_mask)) {
+		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
 		kmem_cache_free(page_cgroup_cache, pc);
 		goto done;
 	}
-	page_assign_page_cgroup(page, pc);
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 done:
 	return 0;
 out:
@@ -645,15 +698,13 @@ int mem_cgroup_cache_charge(struct page 
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
+		pc = page_get_page_cgroup_locked(page);
 		if (pc) {
 			VM_BUG_ON(pc->page != page);
 			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
 	}
 
 	if (unlikely(!mm))
@@ -673,6 +724,7 @@ __mem_cgroup_uncharge_common(struct page
 	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
+	int ret;
 
 	if (mem_cgroup_subsys.disabled)
 		return;
@@ -680,8 +732,7 @@ __mem_cgroup_uncharge_common(struct page
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
+	pc = page_get_page_cgroup_locked(page);
 	if (unlikely(!pc))
 		goto unlock;
 
@@ -697,8 +748,9 @@ __mem_cgroup_uncharge_common(struct page
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
+	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
+	VM_BUG_ON(ret);
+	unlock_page_cgroup(pc);
 
 	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
@@ -707,7 +759,14 @@ __mem_cgroup_uncharge_common(struct page
 	kmem_cache_free(page_cgroup_cache, pc);
 	return;
 unlock:
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
+}
+
+void page_reset_bad_cgroup(struct page *page)
+{
+	int ret;
+	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
+	VM_BUG_ON(ret);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -734,15 +793,14 @@ int mem_cgroup_prepare_migration(struct 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
+	pc = page_get_page_cgroup_locked(page);
 	if (pc) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
 			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		unlock_page_cgroup(pc);
 	}
-	unlock_page_cgroup(page);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 			ctype, mem);
@@ -1107,6 +1165,7 @@ mem_cgroup_create(struct cgroup_subsys *
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
 		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
+		spin_lock_init(&mem_cgroup_tree_lock);
 	} else {
 		mem = mem_cgroup_alloc();
 		if (!mem)
diff -puN include/linux/memcontrol.h~memcg_move_to_radix_tree include/linux/memcontrol.h
--- linux-2.6.27-rc5/include/linux/memcontrol.h~memcg_move_to_radix_tree	2008-09-04 15:45:54.000000000 +0530
+++ linux-2.6.27-rc5-balbir/include/linux/memcontrol.h	2008-09-04 15:45:54.000000000 +0530
@@ -27,9 +27,28 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
+extern void page_reset_bad_cgroup(struct page *page);
+extern struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+							bool trylock);
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, false);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_trylock(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, true);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_locked(struct page *page)
+{
+	return __page_get_page_cgroup(page, true, false);
+}
 
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
diff -puN include/linux/mm_types.h~memcg_move_to_radix_tree include/linux/mm_types.h
--- linux-2.6.27-rc5/include/linux/mm_types.h~memcg_move_to_radix_tree	2008-09-04 15:45:54.000000000 +0530
+++ linux-2.6.27-rc5-balbir/include/linux/mm_types.h	2008-09-04 15:45:54.000000000 +0530
@@ -92,10 +92,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
-
 #ifdef CONFIG_KMEMCHECK
 	void *shadow;
 #endif
diff -puN mm/page_alloc.c~memcg_move_to_radix_tree mm/page_alloc.c
_

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-08 15:28                         ` Balbir Singh
@ 2008-09-09  3:57                           ` KAMEZAWA Hiroyuki
  2008-09-09  3:58                             ` Nick Piggin
  2008-09-09  4:18                             ` Balbir Singh
  0 siblings, 2 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-09  3:57 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Mon, 8 Sep 2008 20:58:10 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> Sorry for the delay in sending out the new patch, I am traveling and
> thus a little less responsive. Here is the updated patch
> 
> 
Hmm.. I've considered this approach for a while and my answer is that
this is not what you really want.

Because you just move the pointer from the memmap to the radix_tree, and both
are GFP_KERNEL allocations, total kernel memory usage is not changed.
So, at least, you have to add some address calculation (as I did in March)
to get the address of the page_cgroup. But page_cgroup itself consumes 32 bytes
per page. Then.....

My proposal for 32bit systems is the following
 - remove page_cgroup completely.
   - As a result, there is no per-cgroup LRU. But it will not be bad
     because the number of cgroups and pages is not big;
     it is just a trade-off between kernel memory space and speed.
   - Remove page_cgroup and just remember the address of the mem_cgroup per page.

What do you think? Roughly, I have something like the sketch below in mind.
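
(A rough, hypothetical sketch only; the names are invented, and locking and the
SPARSEMEM/FLATMEM memory model are ignored. On 32bit this is one 4-byte pointer
per page instead of a 32-byte page_cgroup plus the pointer in struct page.)

#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>

struct mem_cgroup;				/* opaque here */

static struct mem_cgroup **pfn_to_memcg;	/* one pointer per pfn */

void __init memcg_map_init(unsigned long max_pfn)
{
	pfn_to_memcg = alloc_bootmem(max_pfn * sizeof(*pfn_to_memcg));
}

static inline struct mem_cgroup *page_memcg(struct page *page)
{
	return pfn_to_memcg[page_to_pfn(page)];
}

static inline void page_set_memcg(struct page *page, struct mem_cgroup *memcg)
{
	pfn_to_memcg[page_to_pfn(page)] = memcg;
}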

Thanks,
-Kame


> v3...v2
> 1. Convert flags to unsigned long
> 2. Move page_cgroup->lock to a bit spin lock in flags
> 
> v2...v1
> 
> 1. Fix a small bug, don't call radix_tree_preload_end(), if preload fails
> 
> This is a rewrite of a patch I had written long back to remove struct page
> (I shared the patches with Kamezawa, but never posted them anywhere else).
> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> 
> I've tested the patches on an x86_64 box, I've run a simple test running
> under the memory control group and the same test running concurrently under
> two different groups (and creating pressure within their groups). I've also
> compiled the patch with CGROUP_MEM_RES_CTLR turned off.
> 
> Advantages of the patch
> 
> 1. It removes the extra pointer in struct page
> 
> Disadvantages
> 
> 1. Radix tree lookup is not an O(1) operation, once the page is known
>    getting to the page_cgroup (pc) is a little more expensive now.
> 
> This is an initial RFC for comments
> 
> TODOs
> 
> 1. Test the page migration changes
> 
> Performance
> 
> In a unixbench run, these patches had a performance impact of 2% (slowdown).
> 
> Comments/Reviews?
> 
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> ---
> 
>  include/linux/memcontrol.h |   23 +++++
>  include/linux/mm_types.h   |    4 
>  mm/memcontrol.c            |  187 +++++++++++++++++++++++++++++----------------
>  3 files changed, 144 insertions(+), 70 deletions(-)
> 
> diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
> --- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-09-04 15:45:54.000000000 +0530
> +++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-09-08 20:15:30.000000000 +0530
> @@ -24,6 +24,7 @@
>  #include <linux/smp.h>
>  #include <linux/page-flags.h>
>  #include <linux/backing-dev.h>
> +#include <linux/radix-tree.h>
>  #include <linux/bit_spinlock.h>
>  #include <linux/rcupdate.h>
>  #include <linux/slab.h>
> @@ -40,6 +41,9 @@ struct cgroup_subsys mem_cgroup_subsys _
>  static struct kmem_cache *page_cgroup_cache __read_mostly;
>  #define MEM_CGROUP_RECLAIM_RETRIES	5
>  
> +static struct radix_tree_root mem_cgroup_tree;
> +static spinlock_t mem_cgroup_tree_lock;
> +
>  /*
>   * Statistics for memory cgroup.
>   */
> @@ -137,20 +141,6 @@ struct mem_cgroup {
>  static struct mem_cgroup init_mem_cgroup;
>  
>  /*
> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
> - * lock.  We need to ensure that page->page_cgroup is at least two
> - * byte aligned (based on comments from Nick Piggin).  But since
> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
> - * uniprocessor kernel, we should avoid setting it here too.
> - */
> -#define PAGE_CGROUP_LOCK_BIT 	0x0
> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> -#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
> -#else
> -#define PAGE_CGROUP_LOCK	0x0
> -#endif
> -
> -/*
>   * A page_cgroup page is associated with every page descriptor. The
>   * page_cgroup helps us identify information about the cgroup
>   */
> @@ -158,12 +148,17 @@ struct page_cgroup {
>  	struct list_head lru;		/* per cgroup LRU list */
>  	struct page *page;
>  	struct mem_cgroup *mem_cgroup;
> -	int flags;
> +	unsigned long flags;
>  };
> -#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
> -#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
> -#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
> -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
> +
> +/*
> + * LOCK_BIT is 0, with value 1
> + */
> +#define PAGE_CGROUP_FLAG_LOCK_BIT   (0x0)  /* lock bit */
> +#define PAGE_CGROUP_FLAG_CACHE	   (0x2)   /* charged as cache */
> +#define PAGE_CGROUP_FLAG_ACTIVE    (0x4)   /* page is active in this cgroup */
> +#define PAGE_CGROUP_FLAG_FILE	   (0x8)   /* page is file system backed */
> +#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10)/* page is unevictableable */
>  
>  static int page_cgroup_nid(struct page_cgroup *pc)
>  {
> @@ -248,35 +243,81 @@ struct mem_cgroup *mem_cgroup_from_task(
>  				struct mem_cgroup, css);
>  }
>  
> -static inline int page_cgroup_locked(struct page *page)
> +static inline void lock_page_cgroup(struct page_cgroup *pc)
>  {
> -	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
>  {
> -	VM_BUG_ON(!page_cgroup_locked(page));
> -	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> +	return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -struct page_cgroup *page_get_page_cgroup(struct page *page)
> +static inline void unlock_page_cgroup(struct page_cgroup *pc)
>  {
> -	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> +	bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -static void lock_page_cgroup(struct page *page)
> +static int page_assign_page_cgroup(struct page *page, struct page_cgroup *pc,
> +					gfp_t gfp_mask)
>  {
> -	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long flags;
> +	int err = 0;
> +	struct page_cgroup *old_pc;
>  
> -static int try_lock_page_cgroup(struct page *page)
> -{
> -	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	if (pc) {
> +		err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
> +		if (err) {
> +			printk(KERN_WARNING "could not preload radix tree "
> +				"in %s\n", __func__);
> +			goto done;
> +		}
> +	}
> +
> +	spin_lock_irqsave(&mem_cgroup_tree_lock, flags);
> +	old_pc = radix_tree_lookup(&mem_cgroup_tree, pfn);
> +	if (pc && old_pc) {
> +		err = -EEXIST;
> +		goto pc_race;
> +	}
> +	if (pc) {
> +		err = radix_tree_insert(&mem_cgroup_tree, pfn, pc);
> +		if (err)
> +			printk(KERN_WARNING "Inserting into radix tree failed "
> +				"in %s\n", __func__);
> +	} else
> +		radix_tree_delete(&mem_cgroup_tree, pfn);
> +pc_race:
> +	spin_unlock_irqrestore(&mem_cgroup_tree_lock, flags);
> +	if (pc)
> +		radix_tree_preload_end();
> +done:
> +	return err;
>  }
>  
> -static void unlock_page_cgroup(struct page *page)
> +struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
> +						bool trylock)
>  {
> -	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	unsigned long pfn = page_to_pfn(page);
> +	struct page_cgroup *pc;
> +	int ret;
> +
> +	rcu_read_lock();
> +	pc = radix_tree_lookup(&mem_cgroup_tree, pfn);
> +
> +	if (pc && lock)
> +		lock_page_cgroup(pc);
> +
> +	if (pc && trylock) {
> +		ret = trylock_page_cgroup(pc);
> +		if (!ret)
> +			pc = NULL;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return pc;
>  }
>  
>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
> @@ -377,17 +418,15 @@ void mem_cgroup_move_lists(struct page *
>  	 * safely get to page_cgroup without it, so just try_lock it:
>  	 * mem_cgroup_isolate_pages allows for page left on wrong list.
>  	 */
> -	if (!try_lock_page_cgroup(page))
> +	pc = page_get_page_cgroup_trylock(page);
> +	if (!pc)
>  		return;
>  
> -	pc = page_get_page_cgroup(page);
> -	if (pc) {
> -		mz = page_cgroup_zoneinfo(pc);
> -		spin_lock_irqsave(&mz->lru_lock, flags);
> -		__mem_cgroup_move_lists(pc, lru);
> -		spin_unlock_irqrestore(&mz->lru_lock, flags);
> -	}
> -	unlock_page_cgroup(page);
> +	mz = page_cgroup_zoneinfo(pc);
> +	spin_lock_irqsave(&mz->lru_lock, flags);
> +	__mem_cgroup_move_lists(pc, lru);
> +	spin_unlock_irqrestore(&mz->lru_lock, flags);
> +	unlock_page_cgroup(pc);
>  }
>  
>  /*
> @@ -516,7 +555,7 @@ static int mem_cgroup_charge_common(stru
>  				struct mem_cgroup *memcg)
>  {
>  	struct mem_cgroup *mem;
> -	struct page_cgroup *pc;
> +	struct page_cgroup *pc, *old_pc;
>  	unsigned long flags;
>  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
>  	struct mem_cgroup_per_zone *mz;
> @@ -569,35 +608,49 @@ static int mem_cgroup_charge_common(stru
>  
>  	pc->mem_cgroup = mem;
>  	pc->page = page;
> +	pc->flags = 0;		/* No lock, no other bits either */
> +
>  	/*
>  	 * If a page is accounted as a page cache, insert to inactive list.
>  	 * If anon, insert to active list.
>  	 */
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> -		pc->flags = PAGE_CGROUP_FLAG_CACHE;
> +		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
>  		if (page_is_file_cache(page))
>  			pc->flags |= PAGE_CGROUP_FLAG_FILE;
>  		else
>  			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>  	} else
> -		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> +		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> +
> +	old_pc = page_get_page_cgroup_locked(page);
> +	if (old_pc) {
> +		unlock_page_cgroup(old_pc);
> +		res_counter_uncharge(&mem->res, PAGE_SIZE);
> +		css_put(&mem->css);
> +		kmem_cache_free(page_cgroup_cache, pc);
> +		goto done;
> +	}
>  
> -	lock_page_cgroup(page);
> -	if (unlikely(page_get_page_cgroup(page))) {
> -		unlock_page_cgroup(page);
> +	lock_page_cgroup(pc);
> +	/*
> +	 * page_get_page_cgroup() does not necessarily guarantee that
> +	 * there will be no race in checking for pc, page_assign_page_pc()
> +	 * will definitely catch it.
> +	 */
> +	if (page_assign_page_cgroup(page, pc, gfp_mask)) {
> +		unlock_page_cgroup(pc);
>  		res_counter_uncharge(&mem->res, PAGE_SIZE);
>  		css_put(&mem->css);
>  		kmem_cache_free(page_cgroup_cache, pc);
>  		goto done;
>  	}
> -	page_assign_page_cgroup(page, pc);
>  
>  	mz = page_cgroup_zoneinfo(pc);
>  	spin_lock_irqsave(&mz->lru_lock, flags);
>  	__mem_cgroup_add_list(mz, pc);
>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -	unlock_page_cgroup(page);
> +	unlock_page_cgroup(pc);
>  done:
>  	return 0;
>  out:
> @@ -645,15 +698,13 @@ int mem_cgroup_cache_charge(struct page 
>  	if (!(gfp_mask & __GFP_WAIT)) {
>  		struct page_cgroup *pc;
>  
> -		lock_page_cgroup(page);
> -		pc = page_get_page_cgroup(page);
> +		pc = page_get_page_cgroup_locked(page);
>  		if (pc) {
>  			VM_BUG_ON(pc->page != page);
>  			VM_BUG_ON(!pc->mem_cgroup);
> -			unlock_page_cgroup(page);
> +			unlock_page_cgroup(pc);
>  			return 0;
>  		}
> -		unlock_page_cgroup(page);
>  	}
>  
>  	if (unlikely(!mm))
> @@ -673,6 +724,7 @@ __mem_cgroup_uncharge_common(struct page
>  	struct mem_cgroup *mem;
>  	struct mem_cgroup_per_zone *mz;
>  	unsigned long flags;
> +	int ret;
>  
>  	if (mem_cgroup_subsys.disabled)
>  		return;
> @@ -680,8 +732,7 @@ __mem_cgroup_uncharge_common(struct page
>  	/*
>  	 * Check if our page_cgroup is valid
>  	 */
> -	lock_page_cgroup(page);
> -	pc = page_get_page_cgroup(page);
> +	pc = page_get_page_cgroup_locked(page);
>  	if (unlikely(!pc))
>  		goto unlock;
>  
> @@ -697,8 +748,9 @@ __mem_cgroup_uncharge_common(struct page
>  	__mem_cgroup_remove_list(mz, pc);
>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
>  
> -	page_assign_page_cgroup(page, NULL);
> -	unlock_page_cgroup(page);
> +	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
> +	VM_BUG_ON(ret);
> +	unlock_page_cgroup(pc);
>  
>  	mem = pc->mem_cgroup;
>  	res_counter_uncharge(&mem->res, PAGE_SIZE);
> @@ -707,7 +759,14 @@ __mem_cgroup_uncharge_common(struct page
>  	kmem_cache_free(page_cgroup_cache, pc);
>  	return;
>  unlock:
> -	unlock_page_cgroup(page);
> +	unlock_page_cgroup(pc);
> +}
> +
> +void page_reset_bad_cgroup(struct page *page)
> +{
> +	int ret;
> +	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
> +	VM_BUG_ON(ret);
>  }
>  
>  void mem_cgroup_uncharge_page(struct page *page)
> @@ -734,15 +793,14 @@ int mem_cgroup_prepare_migration(struct 
>  	if (mem_cgroup_subsys.disabled)
>  		return 0;
>  
> -	lock_page_cgroup(page);
> -	pc = page_get_page_cgroup(page);
> +	pc = page_get_page_cgroup_locked(page);
>  	if (pc) {
>  		mem = pc->mem_cgroup;
>  		css_get(&mem->css);
>  		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
>  			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
> +		unlock_page_cgroup(pc);
>  	}
> -	unlock_page_cgroup(page);
>  	if (mem) {
>  		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
>  			ctype, mem);
> @@ -1107,6 +1165,7 @@ mem_cgroup_create(struct cgroup_subsys *
>  	if (unlikely((cont->parent) == NULL)) {
>  		mem = &init_mem_cgroup;
>  		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
> +		spin_lock_init(&mem_cgroup_tree_lock);
>  	} else {
>  		mem = mem_cgroup_alloc();
>  		if (!mem)
> diff -puN include/linux/memcontrol.h~memcg_move_to_radix_tree include/linux/memcontrol.h
> --- linux-2.6.27-rc5/include/linux/memcontrol.h~memcg_move_to_radix_tree	2008-09-04 15:45:54.000000000 +0530
> +++ linux-2.6.27-rc5-balbir/include/linux/memcontrol.h	2008-09-04 15:45:54.000000000 +0530
> @@ -27,9 +27,28 @@ struct mm_struct;
>  
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  
> -#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
> +extern void page_reset_bad_cgroup(struct page *page);
> +extern struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
> +							bool trylock);
> +
> +static __always_inline
> +struct page_cgroup *page_get_page_cgroup(struct page *page)
> +{
> +	return __page_get_page_cgroup(page, false, false);
> +}
> +
> +static __always_inline
> +struct page_cgroup *page_get_page_cgroup_trylock(struct page *page)
> +{
> +	return __page_get_page_cgroup(page, false, true);
> +}
> +
> +static __always_inline
> +struct page_cgroup *page_get_page_cgroup_locked(struct page *page)
> +{
> +	return __page_get_page_cgroup(page, true, false);
> +}
>  
> -extern struct page_cgroup *page_get_page_cgroup(struct page *page);
>  extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
>  				gfp_t gfp_mask);
>  extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> diff -puN include/linux/mm_types.h~memcg_move_to_radix_tree include/linux/mm_types.h
> --- linux-2.6.27-rc5/include/linux/mm_types.h~memcg_move_to_radix_tree	2008-09-04 15:45:54.000000000 +0530
> +++ linux-2.6.27-rc5-balbir/include/linux/mm_types.h	2008-09-04 15:45:54.000000000 +0530
> @@ -92,10 +92,6 @@ struct page {
>  	void *virtual;			/* Kernel virtual address (NULL if
>  					   not kmapped, ie. highmem) */
>  #endif /* WANT_PAGE_VIRTUAL */
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> -	unsigned long page_cgroup;
> -#endif
> -
>  #ifdef CONFIG_KMEMCHECK
>  	void *shadow;
>  #endif
> diff -puN mm/page_alloc.c~memcg_move_to_radix_tree mm/page_alloc.c
> _
> 
> -- 
> 	Balbir
> 


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  3:57                           ` KAMEZAWA Hiroyuki
@ 2008-09-09  3:58                             ` Nick Piggin
  2008-09-09  4:53                               ` KAMEZAWA Hiroyuki
  2008-09-09  4:18                             ` Balbir Singh
  1 sibling, 1 reply; 72+ messages in thread
From: Nick Piggin @ 2008-09-09  3:58 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tuesday 09 September 2008 13:57, KAMEZAWA Hiroyuki wrote:
> On Mon, 8 Sep 2008 20:58:10 +0530
>
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > Sorry for the delay in sending out the new patch, I am traveling and
> > thus a little less responsive. Here is the update patch
>
> Hmm.. I've considered this approach for a while and my answer is that
> this is not what you really want.
>
> Because you just moves the placement of pointer from memmap to
> radix_tree both in GFP_KERNEL, total kernel memory usage is not changed.
> So, at least, you have to add some address calculation (as I did in March)
> to getting address of page_cgroup. But page_cgroup itself consumes 32bytes
> per page. Then.....

Just keep in mind that an important point is to make it more attractive
to configure cgroup into the kernel, but have it disabled or unused at
runtime.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  3:57                           ` KAMEZAWA Hiroyuki
  2008-09-09  3:58                             ` Nick Piggin
@ 2008-09-09  4:18                             ` Balbir Singh
  2008-09-09  4:55                               ` KAMEZAWA Hiroyuki
  2008-09-09  7:37                               ` KAMEZAWA Hiroyuki
  1 sibling, 2 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-09  4:18 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Mon, 8 Sep 2008 20:58:10 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> Sorry for the delay in sending out the new patch, I am traveling and
>> thus a little less responsive. Here is the update patch
>>
>>
> Hmm.. I've considered this approach for a while and my answer is that
> this is not what you really want.
> 
> Because you just moves the placement of pointer from memmap to
> radix_tree both in GFP_KERNEL, total kernel memory usage is not changed.

Agreed, but we do reduce the sizeof(struct page) without adding on to
page_cgroup's size. So why don't we want this?

> So, at least, you have to add some address calculation (as I did in March)
> to getting address of page_cgroup.

What address calculation do we need? Sorry, I don't recollect it.

> But page_cgroup itself consumes 32bytes
> per page. Then.....
> 
> My proposal to 32bit system is following 
>  - remove page_cgroup completely.
>    - As a result, there is no per-cgroup lru. But it will not be bad
>      bacause the number of cgroups and pages are not big.
>      just a trade-off between kernel-memory-space v.s. speed.

32-bit systems with PAE can support quite a lot of memory, so I am not sure I
agree. I don't like this approach.

>    - Removing page_cgroup and just remember address of mem_cgroup per page.
> 

This is on top of the suggested approach above?

> How do you think ?
> 

I don't like the approach.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  3:58                             ` Nick Piggin
@ 2008-09-09  4:53                               ` KAMEZAWA Hiroyuki
  2008-09-09  5:00                                 ` Nick Piggin
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-09  4:53 UTC (permalink / raw)
  To: Nick Piggin
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 9 Sep 2008 13:58:27 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> On Tuesday 09 September 2008 13:57, KAMEZAWA Hiroyuki wrote:
> > On Mon, 8 Sep 2008 20:58:10 +0530
> >
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > Sorry for the delay in sending out the new patch, I am traveling and
> > > thus a little less responsive. Here is the update patch
> >
> > Hmm.. I've considered this approach for a while and my answer is that
> > this is not what you really want.
> >
> > Because you just moves the placement of pointer from memmap to
> > radix_tree both in GFP_KERNEL, total kernel memory usage is not changed.
> > So, at least, you have to add some address calculation (as I did in March)
> > to getting address of page_cgroup. But page_cgroup itself consumes 32bytes
> > per page. Then.....
> 
> Just keep in mind that an important point is to make it more attractive
> to configure cgroup into the kernel, but have it disabled or unused at
> runtime.
> 

Hmm..kicking out 4bytes per 4096bytes if disabled ?
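(That is roughly 4/4096 of RAM: about 1MB of struct page space per 1GB of memory
with a 4-byte pointer, or 2MB per 1GB with an 8-byte one.)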

maybe a routine like SPARSEMEM is a choice.

Following is pointer pre-allocation. (just pointer, not page_cgroup itself)
==
#define PCG_SECTION_SHIFT	(10)
#define PCG_SECTION_SIZE	(1 << PCG_SECTION_SHIFT)

struct pcg_section {
	struct page_cgroup *map[PCG_SECTION_SIZE]; //one pointer per page in the section.
};

static struct pcg_section **pcg_section; //[max_pfn >> PCG_SECTION_SHIFT], allocated at boot.

struct page_cgroup *get_page_cgroup(unsigned long pfn)
{
	struct pcg_section *sec;

	sec = pcg_section[pfn >> PCG_SECTION_SHIFT];
	return sec->map[pfn & (PCG_SECTION_SIZE - 1)];
}
==
If we go extreme, we can use kmap_atomic() for pointer array.

Overhead of pointer-walk is not so bad, maybe.

For 64bit systems, we can find a way like SPARSEMEM_VMEMMAP.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  4:18                             ` Balbir Singh
@ 2008-09-09  4:55                               ` KAMEZAWA Hiroyuki
  2008-09-09  7:37                               ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-09  4:55 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Mon, 08 Sep 2008 21:18:37 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Mon, 8 Sep 2008 20:58:10 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >> Sorry for the delay in sending out the new patch, I am traveling and
> >> thus a little less responsive. Here is the update patch
> >>
> >>
> > Hmm.. I've considered this approach for a while and my answer is that
> > this is not what you really want.
> > 
> > Because you just moves the placement of pointer from memmap to
> > radix_tree both in GFP_KERNEL, total kernel memory usage is not changed.
> 
> Agreed, but we do reduce the sizeof(struct page) without adding on to
> page_cgroup's size. So why don't we want this?
> 
> > So, at least, you have to add some address calculation (as I did in March)
> > to getting address of page_cgroup.
> 
> What address calculation do we need, sorry I don't recollect it.
> 
   base_address = base_address_of_page_cgroup_chunk_of_pfn.
   base_address + offset_of_pfn_from_base_pfn * sizeof (struct page_cgroup).
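In C pointer arithmetic that is just base + offset; a minimal sketch, assuming
each chunk records its base pfn and the base of its pre-allocated page_cgroup
array (the helper and parameter names are illustrative, not from any posted patch):

	static struct page_cgroup *lookup_page_cgroup(unsigned long pfn,
				struct page_cgroup *chunk_base,
				unsigned long chunk_base_pfn)
	{
		/* pointer arithmetic already scales by sizeof(struct page_cgroup) */
		return chunk_base + (pfn - chunk_base_pfn);
	}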

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  4:53                               ` KAMEZAWA Hiroyuki
@ 2008-09-09  5:00                                 ` Nick Piggin
  2008-09-09  5:12                                   ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Nick Piggin @ 2008-09-09  5:00 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tuesday 09 September 2008 14:53, KAMEZAWA Hiroyuki wrote:
> On Tue, 9 Sep 2008 13:58:27 +1000
>
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > On Tuesday 09 September 2008 13:57, KAMEZAWA Hiroyuki wrote:
> > > On Mon, 8 Sep 2008 20:58:10 +0530
> > >
> > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > > Sorry for the delay in sending out the new patch, I am traveling and
> > > > thus a little less responsive. Here is the update patch
> > >
> > > Hmm.. I've considered this approach for a while and my answer is that
> > > this is not what you really want.
> > >
> > > Because you just moves the placement of pointer from memmap to
> > > radix_tree both in GFP_KERNEL, total kernel memory usage is not
> > > changed. So, at least, you have to add some address calculation (as I
> > > did in March) to getting address of page_cgroup. But page_cgroup itself
> > > consumes 32bytes per page. Then.....
> >
> > Just keep in mind that an important point is to make it more attractive
> > to configure cgroup into the kernel, but have it disabled or unused at
> > runtime.
>
> Hmm..kicking out 4bytes per 4096bytes if disabled ?

Yeah of course. 4 or 8 bytes. Everything adds up. There is nothing special
about cgroups that says it is allowed to use fields in struct page where
others cannot. Put it in perspective: we try very hard not to allocate new
*bits* in page flags, which is only 4 bytes per 131072 bytes.
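(That arithmetic: a new flag is 1 bit per 4096-byte page, so 4 bytes of flag
space cover 4 * 8 * 4096 = 131072 bytes of memory; a 4-byte pointer in struct
page costs 32 times as much per page.)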


> maybe a routine like SPARSEMEM is a choice.
>
> Following is pointer pre-allocation. (just pointer, not page_cgroup itself)
> ==
> #define PCG_SECTION_SHIFT	(10)
> #define PCG_SECTION_SIZE	(1 << PCG_SECTION_SHIFT)
>
> struct pcg_section {
> 	struct page_cgroup **map[PCG_SECTION_SHIFT]; //array of pointer.
> };
>
> struct page_cgroup *get_page_cgroup(unsigned long pfn)
> {
> 	struct pcg_section *sec;
> 	sec = pcg_section[(pfn >> PCG_SECTION_SHIFT)];
> 	return *sec->page_cgroup[(pfn & ((1 << PCG_SECTTION_SHIFT) - 1];
> }
> ==
> If we go extreme, we can use kmap_atomic() for pointer array.
>
> Overhead of pointer-walk is not so bad, maybe.
>
> For 64bit systems, we can find a way like SPARSEMEM_VMEMMAP.

Yes I too think that would be the ideal way to go to get the best of
performance in the enabled case. However Balbir I believe is interested
in memory savings if not all pages have cgroups... I don't know, I don't
care so much about the "enabled" case, so I'll leave you two to fight it
out :)

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  5:00                                 ` Nick Piggin
@ 2008-09-09  5:12                                   ` KAMEZAWA Hiroyuki
  2008-09-09 12:24                                     ` Balbir Singh
  2008-09-09 12:30                                     ` kamezawa.hiroyu
  0 siblings, 2 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-09  5:12 UTC (permalink / raw)
  To: Nick Piggin
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 9 Sep 2008 15:00:10 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > maybe a routine like SPARSEMEM is a choice.
> >
> > Following is pointer pre-allocation. (just pointer, not page_cgroup itself)
> > ==
> > #define PCG_SECTION_SHIFT	(10)
> > #define PCG_SECTION_SIZE	(1 << PCG_SECTION_SHIFT)
> >
> > struct pcg_section {
> > 	struct page_cgroup **map[PCG_SECTION_SHIFT]; //array of pointer.
> > };
> >
> > struct page_cgroup *get_page_cgroup(unsigned long pfn)
> > {
> > 	struct pcg_section *sec;
> > 	sec = pcg_section[(pfn >> PCG_SECTION_SHIFT)];
> > 	return *sec->page_cgroup[(pfn & ((1 << PCG_SECTTION_SHIFT) - 1];
> > }
> > ==
> > If we go extreme, we can use kmap_atomic() for pointer array.
> >
> > Overhead of pointer-walk is not so bad, maybe.
> >
> > For 64bit systems, we can find a way like SPARSEMEM_VMEMMAP.
> 
> Yes I too think that would be the ideal way to go to get the best of
> performance in the enabled case. However Balbir I believe is interested
> in memory savings if not all pages have cgroups... I don't know, I don't
> care so much about the "enabled" case, so I'll leave you two to fight it
> out :)
> 
I'll add a new patch on my set.

Balbir, are you ok to CONFIG_CGROUP_MEM_RES_CTLR depends on CONFIG_SPARSEMEM ?
I think SPARSEMEM(SPARSEMEM_VMEMMAP) is widely used in various archs now.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  4:18                             ` Balbir Singh
  2008-09-09  4:55                               ` KAMEZAWA Hiroyuki
@ 2008-09-09  7:37                               ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-09  7:37 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Mon, 08 Sep 2008 21:18:37 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Mon, 8 Sep 2008 20:58:10 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >> Sorry for the delay in sending out the new patch, I am traveling and
> >> thus a little less responsive. Here is the update patch
> >>
> >>
> > Hmm.. I've considered this approach for a while and my answer is that
> > this is not what you really want.
> > 
> > Because you just moves the placement of pointer from memmap to
> > radix_tree both in GFP_KERNEL, total kernel memory usage is not changed.
> 
> Agreed, but we do reduce the sizeof(struct page) without adding on to
> page_cgroup's size. So why don't we want this?
> 
Because it depends on CONFIG.

But OK, I'll recall my patches from the grave. Maybe they can be a hint for you.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  5:12                                   ` KAMEZAWA Hiroyuki
@ 2008-09-09 12:24                                     ` Balbir Singh
  2008-09-09 12:28                                       ` Nick Piggin
  2008-09-09 12:30                                     ` kamezawa.hiroyu
  1 sibling, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-09 12:24 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Tue, 9 Sep 2008 15:00:10 +1000
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>> maybe a routine like SPARSEMEM is a choice.
>>>
>>> Following is pointer pre-allocation. (just pointer, not page_cgroup itself)
>>> ==
>>> #define PCG_SECTION_SHIFT	(10)
>>> #define PCG_SECTION_SIZE	(1 << PCG_SECTION_SHIFT)
>>>
>>> struct pcg_section {
>>> 	struct page_cgroup **map[PCG_SECTION_SHIFT]; //array of pointer.
>>> };
>>>
>>> struct page_cgroup *get_page_cgroup(unsigned long pfn)
>>> {
>>> 	struct pcg_section *sec;
>>> 	sec = pcg_section[(pfn >> PCG_SECTION_SHIFT)];
>>> 	return *sec->page_cgroup[(pfn & ((1 << PCG_SECTTION_SHIFT) - 1];
>>> }
>>> ==
>>> If we go extreme, we can use kmap_atomic() for pointer array.
>>>
>>> Overhead of pointer-walk is not so bad, maybe.
>>>
>>> For 64bit systems, we can find a way like SPARSEMEM_VMEMMAP.
>> Yes I too think that would be the ideal way to go to get the best of
>> performance in the enabled case. However Balbir I believe is interested
>> in memory savings if not all pages have cgroups... I don't know, I don't
>> care so much about the "enabled" case, so I'll leave you two to fight it
>> out :)
>>
> I'll add a new patch on my set.
> 
> Balbir, are you ok to CONFIG_CGROUP_MEM_RES_CTLR depends on CONFIG_SPARSEMEM ?
> I thinks SPARSEMEM(SPARSEMEM_VMEMMAP) is widely used in various archs now.

Can't we make it more generic? I was thinking of allocating memory for each node
for page_cgroups (of the size of spanned_pages) at initialization time. I've not
yet prototyped the idea. BTW, even with your approach I fail to see why we need
to add a dependency on CONFIG_SPARSEMEM (but again it is 4:30 in the morning and
I might be missing the obvious)

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09 12:24                                     ` Balbir Singh
@ 2008-09-09 12:28                                       ` Nick Piggin
  0 siblings, 0 replies; 72+ messages in thread
From: Nick Piggin @ 2008-09-09 12:28 UTC (permalink / raw)
  To: balbir
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

On Tuesday 09 September 2008 22:24, Balbir Singh wrote:
> KAMEZAWA Hiroyuki wrote:

> > Balbir, are you ok to CONFIG_CGROUP_MEM_RES_CTLR depends on
> > CONFIG_SPARSEMEM ? I thinks SPARSEMEM(SPARSEMEM_VMEMMAP) is widely used
> > in various archs now.
>
> Can't we make it more generic. I was thinking of allocating memory for each
> node for page_cgroups (of the size of spanned_pages) at initialization
> time. I've not yet prototyped the idea. BTW, even with your approach I fail
> to see why we need to add a dependency on CONFIG_SPARSEMEM (but again it is
> 4:30 in the morning and I might be missing the obvious)

I guess it would be just a matter of coding up the implementation for
each model you want to support. In some cases, you might lose memory
(eg in the case of flatmem where you have holes in ram), but in those
cases you lose memory from the struct pages anyway so I wouldn't worry
too much.

I think it would be reasonable to provide an implementation for flatmem
as well (which AFAIK is the other non-deprecated memory model). It
should not be hard to code AFAIKS.
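For flatmem the lookup could be little more than indexing one boot-time array by
pfn; a minimal sketch under that assumption (illustrative names, not from any
posted patch):

	/* Assumes a single contiguous array sized for max_pfn is allocated at
	 * boot; a real version would subtract ARCH_PFN_OFFSET and handle
	 * allocation failure. */
	static struct page_cgroup *flat_pcg_map;

	static struct page_cgroup *flat_lookup_page_cgroup(struct page *page)
	{
		return flat_pcg_map + page_to_pfn(page);
	}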

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09  5:12                                   ` KAMEZAWA Hiroyuki
  2008-09-09 12:24                                     ` Balbir Singh
@ 2008-09-09 12:30                                     ` kamezawa.hiroyu
  2008-09-09 12:34                                       ` Balbir Singh
  2008-09-10  1:20                                       ` [Approach #2] " Balbir Singh
  1 sibling, 2 replies; 72+ messages in thread
From: kamezawa.hiroyu @ 2008-09-09 12:30 UTC (permalink / raw)
  To: balbir
  Cc: KAMEZAWA Hiroyuki, Nick Piggin, Andrew Morton, hugh, menage,
	xemul, linux-kernel, linux-mm

----- Original Message -----
>> Balbir, are you ok to CONFIG_CGROUP_MEM_RES_CTLR depends on CONFIG_SPARSEMEM ?
>> I thinks SPARSEMEM(SPARSEMEM_VMEMMAP) is widely used in various archs now.
>
>Can't we make it more generic. I was thinking of allocating memory for each node
>for page_cgroups (of the size of spanned_pages) at initialization time. I've not
>yet prototyped the idea. BTW, even with your approach I fail to see why we need
>to add a dependency on CONFIG_SPARSEMEM (but again it is 4:30 in the morning and
>I might be missing the obvious)

Doesn't have big issue without CONFIG_SPARSEMEM, maybe.
Sorry for my confusion.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09 12:30                                     ` kamezawa.hiroyu
@ 2008-09-09 12:34                                       ` Balbir Singh
  2008-09-10  1:20                                       ` [Approach #2] " Balbir Singh
  1 sibling, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-09 12:34 UTC (permalink / raw)
  To: kamezawa.hiroyu
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

kamezawa.hiroyu@jp.fujitsu.com wrote:
> ----- Original Message -----
>>> Balbir, are you ok to CONFIG_CGROUP_MEM_RES_CTLR depends on CONFIG_SPARSEMEM ?
>>> I thinks SPARSEMEM(SPARSEMEM_VMEMMAP) is widely used in various archs now.
>> Can't we make it more generic. I was thinking of allocating memory for each node
>> for page_cgroups (of the size of spanned_pages) at initialization time. I've not
>> yet prototyped the idea. BTW, even with your approach I fail to see why we need
>> to add a dependency on CONFIG_SPARSEMEM (but again it is 4:30 in the morning and
>> I might be missing the obvious)
> 
> Doesn't have big issue without CONFIG_SPARSEMEM, maybe.
> Sorry for my confusion.

No problem, I am glad to know that we are not limited to a particular model.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-09 12:30                                     ` kamezawa.hiroyu
  2008-09-09 12:34                                       ` Balbir Singh
@ 2008-09-10  1:20                                       ` Balbir Singh
  2008-09-10  1:49                                         ` KAMEZAWA Hiroyuki
  2008-09-10 22:21                                         ` Dave Hansen
  1 sibling, 2 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-10  1:20 UTC (permalink / raw)
  To: kamezawa.hiroyu
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09 21:30:12]:

> ----- Original Message -----
> >> Balbir, are you ok to CONFIG_CGROUP_MEM_RES_CTLR depends on CONFIG_SPARSEMEM ?
> >> I thinks SPARSEMEM(SPARSEMEM_VMEMMAP) is widely used in various archs now.
> >
> >Can't we make it more generic. I was thinking of allocating memory for each node
> >for page_cgroups (of the size of spanned_pages) at initialization time. I've not
> >yet prototyped the idea. BTW, even with your approach I fail to see why we need
> >to add a dependency on CONFIG_SPARSEMEM (but again it is 4:30 in the morning and
> >I might be missing the obvious)
> 
> Doesn't have big issue without CONFIG_SPARSEMEM, maybe.
> Sorry for my confusion.
>

OK, here is approach #2, it works for me and gives me really good
performance (surpassing even the current memory controller). I am
seeing almost a 7% increase

Caveats

1. Uses more memory (since it allocates memory for each node based on
   spanned_pages). Ignores holes, so it might not be the most efficient,
   but it is a tradeoff of complexity versus space; I propose refining it
   as we go along (see the rough arithmetic below).
2. Does not currently handle alloc_bootmem failure
3. Needs lots of testing/tuning and polishing
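Rough arithmetic for caveat 1: struct page_cgroup here is a list_head, two
pointers and a flags word, i.e. roughly 40 bytes on 64-bit, so a 4G machine with
about one million 4K pages pre-allocates on the order of 40MB (about 1% of RAM),
whether or not any of it is ever charged.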

I've tested it on an x86_64 box with 4G of memory

Again, this is an early RFC patch, please review and test.

Comments/Reviews?

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 include/linux/memcontrol.h |   32 ++++++
 include/linux/mm_types.h   |    4 
 mm/memcontrol.c            |  212 +++++++++++++++++++++++++++------------------
 mm/page_alloc.c            |   10 --
 4 files changed, 162 insertions(+), 96 deletions(-)

diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
--- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-09-04 03:15:54.000000000 -0700
+++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-09-09 17:56:54.000000000 -0700
@@ -18,6 +18,7 @@
  */
 
 #include <linux/res_counter.h>
+#include <linux/bootmem.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
@@ -37,9 +38,10 @@
 #include <asm/uaccess.h>
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
+static struct page_cgroup *pcg_map[MAX_NUMNODES];
+
 /*
  * Statistics for memory cgroup.
  */
@@ -137,20 +139,6 @@ struct mem_cgroup {
 static struct mem_cgroup init_mem_cgroup;
 
 /*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT 	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -158,12 +146,26 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
-	int flags;
+	unsigned long flags;
 };
-#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
-#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
-#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
+
+/*
+ * LOCK_BIT is 0, with value 1
+ */
+#define PAGE_CGROUP_FLAG_LOCK_BIT  (0x0)  /* lock bit */
+
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#define PAGE_CGROUP_FLAG_LOCK      (0x1)  /* lock value */
+#else
+#define PAGE_CGROUP_FLAG_LOCK      (0x0)  /* lock value */
+#endif
+
+#define PAGE_CGROUP_FLAG_CACHE	   (0x2)   /* charged as cache */
+#define PAGE_CGROUP_FLAG_ACTIVE    (0x4)   /* page is active in this cgroup */
+#define PAGE_CGROUP_FLAG_FILE	   (0x8)   /* page is file system backed */
+#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10)/* page is unevictableable */
+#define PAGE_CGROUP_FLAG_INUSE     (0x20)/* pc is allocated and in use */
+#define PAGE_CGROUP_FLAG_VALID     (0x40)/* pc is allocated and in use */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -248,35 +250,99 @@ struct mem_cgroup *mem_cgroup_from_task(
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
+static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
 {
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
+	return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-struct page_cgroup *page_get_page_cgroup(struct page *page)
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
 {
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+	bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-static void lock_page_cgroup(struct page *page)
+/*
+ * Called from memmap_init_zone(), has the advantage of dealing with
+ * memory_hotplug (Addition of memory)
+ */
+int page_cgroup_alloc(int n)
 {
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	struct pglist_data *pgdat;
+	unsigned long size, start, end;
+
+	if (mem_cgroup_subsys.disabled)
+		return;
+
+	pgdat = NODE_DATA(n);
+	/*
+	 * Already allocated, leave
+	 */
+	if (pcg_map[n])
+		return 0;
+
+	start = pgdat->node_start_pfn;
+	end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	size = (end - start) * sizeof(struct page_cgroup);
+	printk("Allocating %lu bytes for node %d\n", size, n);
+	pcg_map[n] = alloc_bootmem_node(pgdat, size);
+	/*
+	 * We can do smoother recovery
+	 */
+	BUG_ON(!pcg_map[n]);
+	return 0;
 }
 
-static int try_lock_page_cgroup(struct page *page)
+void page_cgroup_init(int nid, unsigned long pfn)
 {
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	unsigned long node_pfn;
+	struct page_cgroup *pc;
+
+	if (mem_cgroup_subsys.disabled)
+		return;
+
+	node_pfn = pfn - NODE_DATA(nid)->node_start_pfn;
+	pc = &pcg_map[nid][node_pfn];
+
+	BUG_ON(!pc);
+	pc->flags = PAGE_CGROUP_FLAG_VALID;
+	INIT_LIST_HEAD(&pc->lru);
+	pc->page = NULL;
+	pc->mem_cgroup = NULL;
 }
 
-static void unlock_page_cgroup(struct page *page)
+struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+						bool trylock)
 {
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	struct page_cgroup *pc;
+	int ret;
+	int node = page_to_nid(page);
+	unsigned long pfn;
+
+	pfn = page_to_pfn(page) - NODE_DATA(node)->node_start_pfn;
+	pc = &pcg_map[node][pfn];
+	BUG_ON(!(pc->flags & PAGE_CGROUP_FLAG_VALID));
+	if (lock)
+		lock_page_cgroup(pc);
+	else if (trylock) {
+		ret = trylock_page_cgroup(pc);
+		if (!ret)
+			pc = NULL;
+	}
+
+	return pc;
+}
+
+/*
+ * Should be called with page_cgroup lock held. Any additions to pc->flags
+ * should be reflected here. This might seem ugly, refine it later.
+ */
+void page_clear_page_cgroup(struct page_cgroup *pc)
+{
+	pc->flags &= ~PAGE_CGROUP_FLAG_INUSE;
 }
 
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
@@ -377,17 +443,15 @@ void mem_cgroup_move_lists(struct page *
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = page_get_page_cgroup_trylock(page);
+	if (!pc)
 		return;
 
-	pc = page_get_page_cgroup(page);
-	if (pc) {
-		mz = page_cgroup_zoneinfo(pc);
-		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, lru);
-		spin_unlock_irqrestore(&mz->lru_lock, flags);
-	}
-	unlock_page_cgroup(page);
+	mz = page_cgroup_zoneinfo(pc);
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	__mem_cgroup_move_lists(pc, lru);
+	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 }
 
 /*
@@ -521,10 +585,6 @@ static int mem_cgroup_charge_common(stru
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
 
-	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
-	if (unlikely(pc == NULL))
-		goto err;
-
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
@@ -567,43 +627,40 @@ static int mem_cgroup_charge_common(stru
 		}
 	}
 
+	pc = page_get_page_cgroup_locked(page);
+	if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
+		unlock_page_cgroup(pc);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		css_put(&mem->css);
+		goto done;
+	}
+
 	pc->mem_cgroup = mem;
 	pc->page = page;
+	pc->flags |= PAGE_CGROUP_FLAG_INUSE;
+
 	/*
 	 * If a page is accounted as a page cache, insert to inactive list.
 	 * If anon, insert to active list.
 	 */
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
-		pc->flags = PAGE_CGROUP_FLAG_CACHE;
+		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
 		if (page_is_file_cache(page))
 			pc->flags |= PAGE_CGROUP_FLAG_FILE;
 		else
 			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
 	} else
-		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
-
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		css_put(&mem->css);
-		kmem_cache_free(page_cgroup_cache, pc);
-		goto done;
-	}
-	page_assign_page_cgroup(page, pc);
+		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 done:
 	return 0;
 out:
 	css_put(&mem->css);
-	kmem_cache_free(page_cgroup_cache, pc);
-err:
 	return -ENOMEM;
 }
 
@@ -645,15 +702,14 @@ int mem_cgroup_cache_charge(struct page 
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
-		if (pc) {
+		pc = page_get_page_cgroup_locked(page);
+		if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
 			VM_BUG_ON(pc->page != page);
 			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
+		unlock_page_cgroup(pc);
 	}
 
 	if (unlikely(!mm))
@@ -680,34 +736,30 @@ __mem_cgroup_uncharge_common(struct page
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (unlikely(!pc))
-		goto unlock;
-
+	pc = page_get_page_cgroup_locked(page);
 	VM_BUG_ON(pc->page != page);
+	VM_BUG_ON(!(pc->flags & PAGE_CGROUP_FLAG_INUSE));
 
 	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
 	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
 		|| page_mapped(page)))
 		goto unlock;
 
+	page_clear_page_cgroup(pc);
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 
 	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	kmem_cache_free(page_cgroup_cache, pc);
 	return;
 unlock:
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -734,15 +786,14 @@ int mem_cgroup_prepare_migration(struct 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	pc = page_get_page_cgroup_locked(page);
+	if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
 			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 			ctype, mem);
@@ -1106,7 +1157,6 @@ mem_cgroup_create(struct cgroup_subsys *
 
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
-		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
 	} else {
 		mem = mem_cgroup_alloc();
 		if (!mem)
diff -puN include/linux/memcontrol.h~memcg_move_to_radix_tree include/linux/memcontrol.h
--- linux-2.6.27-rc5/include/linux/memcontrol.h~memcg_move_to_radix_tree	2008-09-04 03:15:54.000000000 -0700
+++ linux-2.6.27-rc5-balbir/include/linux/memcontrol.h	2008-09-09 13:02:29.000000000 -0700
@@ -27,9 +27,29 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
+extern void page_cgroup_init(int nid, unsigned long pfn);
+extern int page_cgroup_alloc(int n);
+extern struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+							bool trylock);
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, false);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_trylock(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, true);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_locked(struct page *page)
+{
+	return __page_get_page_cgroup(page, true, false);
+}
 
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -73,7 +93,13 @@ extern long mem_cgroup_calc_reclaim(stru
 					int priority, enum lru_list lru);
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline void page_reset_bad_cgroup(struct page *page)
+
+static inline int page_cgroup_alloc(int n)
+{
+	return 0;
+}
+
+static inline void page_cgroup_init(int nid, unsigned long pfn)
 {
 }
 
diff -puN include/linux/mm_types.h~memcg_move_to_radix_tree include/linux/mm_types.h
--- linux-2.6.27-rc5/include/linux/mm_types.h~memcg_move_to_radix_tree	2008-09-04 03:15:54.000000000 -0700
+++ linux-2.6.27-rc5-balbir/include/linux/mm_types.h	2008-09-04 03:15:54.000000000 -0700
@@ -92,10 +92,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
-
 #ifdef CONFIG_KMEMCHECK
 	void *shadow;
 #endif
diff -puN mm/page_alloc.c~memcg_move_to_radix_tree mm/page_alloc.c
--- linux-2.6.27-rc5/mm/page_alloc.c~memcg_move_to_radix_tree	2008-09-04 03:15:54.000000000 -0700
+++ linux-2.6.27-rc5-balbir/mm/page_alloc.c	2008-09-09 13:02:50.000000000 -0700
@@ -223,17 +223,11 @@ static inline int bad_range(struct zone 
 
 static void bad_page(struct page *page)
 {
-	void *pc = page_get_page_cgroup(page);
-
 	printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
 		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
 		current->comm, page, (int)(2*sizeof(unsigned long)),
 		(unsigned long)page->flags, page->mapping,
 		page_mapcount(page), page_count(page));
-	if (pc) {
-		printk(KERN_EMERG "cgroup:%p\n", pc);
-		page_reset_bad_cgroup(page);
-	}
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
 		KERN_EMERG "Backtrace:\n");
 	dump_stack();
@@ -472,7 +466,6 @@ static inline void free_pages_check(stru
 	free_page_mlock(page);
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
 		bad_page(page);
@@ -609,7 +602,6 @@ static void prep_new_page(struct page *p
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
 		bad_page(page);
@@ -2652,6 +2644,7 @@ void __meminit memmap_init_zone(unsigned
 	unsigned long pfn;
 	struct zone *z;
 
+	page_cgroup_alloc(nid);
 	z = &NODE_DATA(nid)->node_zones[zone];
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		/*
@@ -2670,6 +2663,7 @@ void __meminit memmap_init_zone(unsigned
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		page_cgroup_init(nid, pfn);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
_

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10  1:20                                       ` [Approach #2] " Balbir Singh
@ 2008-09-10  1:49                                         ` KAMEZAWA Hiroyuki
  2008-09-10  2:11                                           ` Balbir Singh
  2008-09-10 20:44                                           ` Nick Piggin
  2008-09-10 22:21                                         ` Dave Hansen
  1 sibling, 2 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-10  1:49 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 9 Sep 2008 18:20:48 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09 21:30:12]:
> OK, here is approach #2, it works for me and gives me really good
> performance (surpassing even the current memory controller). I am
> seeing almost a 7% increase
This number is from pre-allocation, maybe.
Do we really want to alloc-at-boot all page_cgroups? This seems a big change.

> 
> Caveats
> 
> 1. Uses more memory (since it allocates memory for each node based on
>    spanned_pages. Ignores holes, so might not be the most efficient,
>    but it is a tradeoff of complexity versus space. I propose refining it
>    as we go along.
> 2. Does not currently handle alloc_bootmem failure
> 3. Needs lots of testing/tuning and polishing
> 
If we can do alloc-at-boot, we can make memcg much simpler.



> I've tested it on an x86_64 box with 4G of memory
> 
> Again, this is an early RFC patch, please review test. 
> 
> Comments/Reviews?
> 
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> ---
> 
>  include/linux/memcontrol.h |   32 ++++++
>  include/linux/mm_types.h   |    4 
>  mm/memcontrol.c            |  212 +++++++++++++++++++++++++++------------------
>  mm/page_alloc.c            |   10 --
>  4 files changed, 162 insertions(+), 96 deletions(-)
> 
> diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
> --- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-09-04 03:15:54.000000000 -0700
> +++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-09-09 17:56:54.000000000 -0700
> @@ -18,6 +18,7 @@
>   */
>  
>  #include <linux/res_counter.h>
> +#include <linux/bootmem.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
>  #include <linux/mm.h>
> @@ -37,9 +38,10 @@
>  #include <asm/uaccess.h>
>  
>  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
> -static struct kmem_cache *page_cgroup_cache __read_mostly;
>  #define MEM_CGROUP_RECLAIM_RETRIES	5
>  
> +static struct page_cgroup *pcg_map[MAX_NUMNODES];
> +
>  /*
>   * Statistics for memory cgroup.
>   */
> @@ -137,20 +139,6 @@ struct mem_cgroup {
>  static struct mem_cgroup init_mem_cgroup;
>  
>  /*
> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
> - * lock.  We need to ensure that page->page_cgroup is at least two
> - * byte aligned (based on comments from Nick Piggin).  But since
> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
> - * uniprocessor kernel, we should avoid setting it here too.
> - */
> -#define PAGE_CGROUP_LOCK_BIT 	0x0
> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> -#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
> -#else
> -#define PAGE_CGROUP_LOCK	0x0
> -#endif
> -
> -/*
>   * A page_cgroup page is associated with every page descriptor. The
>   * page_cgroup helps us identify information about the cgroup
>   */
> @@ -158,12 +146,26 @@ struct page_cgroup {
>  	struct list_head lru;		/* per cgroup LRU list */
>  	struct page *page;
>  	struct mem_cgroup *mem_cgroup;
> -	int flags;
> +	unsigned long flags;
>  };
> -#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
> -#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
> -#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
> -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
> +
> +/*
> + * LOCK_BIT is 0, with value 1
> + */
> +#define PAGE_CGROUP_FLAG_LOCK_BIT  (0x0)  /* lock bit */
> +
> +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> +#define PAGE_CGROUP_FLAG_LOCK      (0x1)  /* lock value */
> +#else
> +#define PAGE_CGROUP_FLAG_LOCK      (0x0)  /* lock value */
> +#endif
> +
> +#define PAGE_CGROUP_FLAG_CACHE	   (0x2)   /* charged as cache */
> +#define PAGE_CGROUP_FLAG_ACTIVE    (0x4)   /* page is active in this cgroup */
> +#define PAGE_CGROUP_FLAG_FILE	   (0x8)   /* page is file system backed */
> +#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10)/* page is unevictableable */
> +#define PAGE_CGROUP_FLAG_INUSE     (0x20)/* pc is allocated and in use */
> +#define PAGE_CGROUP_FLAG_VALID     (0x40)/* pc is allocated and in use */
>  
>  static int page_cgroup_nid(struct page_cgroup *pc)
>  {
> @@ -248,35 +250,99 @@ struct mem_cgroup *mem_cgroup_from_task(
>  				struct mem_cgroup, css);
>  }
>  
> -static inline int page_cgroup_locked(struct page *page)
> +static inline void lock_page_cgroup(struct page_cgroup *pc)
>  {
> -	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
>  {
> -	VM_BUG_ON(!page_cgroup_locked(page));
> -	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> +	return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -struct page_cgroup *page_get_page_cgroup(struct page *page)
> +static inline void unlock_page_cgroup(struct page_cgroup *pc)
>  {
> -	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> +	bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -static void lock_page_cgroup(struct page *page)
> +/*
> + * Called from memmap_init_zone(), has the advantage of dealing with
> + * memory_hotplug (Addition of memory)
> + */
> +int page_cgroup_alloc(int n)
>  {
> -	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	struct pglist_data *pgdat;
> +	unsigned long size, start, end;
> +
> +	if (mem_cgroup_subsys.disabled)
> +		return;
> +
> +	pgdat = NODE_DATA(n);
> +	/*
> +	 * Already allocated, leave
> +	 */
> +	if (pcg_map[n])
> +		return 0;
> +
> +	start = pgdat->node_start_pfn;
> +	end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> +	size = (end - start) * sizeof(struct page_cgroup);
> +	printk("Allocating %lu bytes for node %d\n", size, n);
^^^^^

1. This is nonsense... do you know the memory map of IBM's (maybe ppc) machines?
A node's memory can be split into several pieces and is not ordered by node number,
for example:
   Node 0 | Node 1 | Node 2 | Node 1 | Node 2 |

This seems special, but when I helped with SPARSEMEM and MEMORY_HOTPLUG
I saw mannnny kinds of memory map. As you wrote, this should be re-designed.

2. If pre-allocating all of them is OK, I will stop my work. Mine is of no use.
But you have to know that by pre-allocating, we can't avoid the lru_lock by
batching, like the pagevec technique. We can't delay uncharge because a page
can be reused soon.
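For reference, the pagevec-style batching referred to here takes the zone's
lru_lock once per batch of pages rather than once per page. A minimal sketch of
the pattern, reusing names from the patch above (illustrative only, not itself a
posted patch, and assuming every entry in a batch targets the same
mem_cgroup_per_zone):

	#define PC_BATCH	14

	struct pc_batch {
		int nr;
		struct page_cgroup *pcs[PC_BATCH];
	};

	static void pc_batch_drain(struct pc_batch *b, struct mem_cgroup_per_zone *mz)
	{
		unsigned long flags;
		int i;

		/* one lock round-trip covers the whole batch */
		spin_lock_irqsave(&mz->lru_lock, flags);
		for (i = 0; i < b->nr; i++)
			__mem_cgroup_add_list(mz, b->pcs[i]);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
		b->nr = 0;
	}

The point above is that with pre-allocated, immediately reusable page_cgroups the
uncharge side cannot sit in such a batch, because the same pc may be recharged
before the batch is drained.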




> +	pcg_map[n] = alloc_bootmem_node(pgdat, size);
> +	/*
> +	 * We can do smoother recovery
> +	 */
> +	BUG_ON(!pcg_map[n]);
> +	return 0;
>  }
>  
> -static int try_lock_page_cgroup(struct page *page)
> +void page_cgroup_init(int nid, unsigned long pfn)
>  {
> -	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	unsigned long node_pfn;
> +	struct page_cgroup *pc;
> +
> +	if (mem_cgroup_subsys.disabled)
> +		return;
> +
> +	node_pfn = pfn - NODE_DATA(nid)->node_start_pfn;
> +	pc = &pcg_map[nid][node_pfn];
> +
> +	BUG_ON(!pc);
> +	pc->flags = PAGE_CGROUP_FLAG_VALID;
> +	INIT_LIST_HEAD(&pc->lru);
> +	pc->page = NULL;
This NULL is unnecessary. pc->page = pfn_to_page(pfn) always.


> +	pc->mem_cgroup = NULL;
>  }
>  
> -static void unlock_page_cgroup(struct page *page)
> +struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
> +						bool trylock)
>  {
> -	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	struct page_cgroup *pc;
> +	int ret;
> +	int node = page_to_nid(page);
> +	unsigned long pfn;
> +
> +	pfn = page_to_pfn(page) - NODE_DATA(node)->node_start_pfn;
> +	pc = &pcg_map[node][pfn];
> +	BUG_ON(!(pc->flags & PAGE_CGROUP_FLAG_VALID));
> +	if (lock)
> +		lock_page_cgroup(pc);
> +	else if (trylock) {
> +		ret = trylock_page_cgroup(pc);
> +		if (!ret)
> +			pc = NULL;
> +	}
> +
> +	return pc;
> +}
> +
> +/*
> + * Should be called with page_cgroup lock held. Any additions to pc->flags
> + * should be reflected here. This might seem ugly, refine it later.
> + */
> +void page_clear_page_cgroup(struct page_cgroup *pc)
> +{
> +	pc->flags &= ~PAGE_CGROUP_FLAG_INUSE;
>  }
>  
>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
> @@ -377,17 +443,15 @@ void mem_cgroup_move_lists(struct page *
>  	 * safely get to page_cgroup without it, so just try_lock it:
>  	 * mem_cgroup_isolate_pages allows for page left on wrong list.
>  	 */
> -	if (!try_lock_page_cgroup(page))
> +	pc = page_get_page_cgroup_trylock(page);
> +	if (!pc)
>  		return;
>  
> -	pc = page_get_page_cgroup(page);
> -	if (pc) {
> -		mz = page_cgroup_zoneinfo(pc);
> -		spin_lock_irqsave(&mz->lru_lock, flags);
> -		__mem_cgroup_move_lists(pc, lru);
> -		spin_unlock_irqrestore(&mz->lru_lock, flags);
> -	}
> -	unlock_page_cgroup(page);
> +	mz = page_cgroup_zoneinfo(pc);
> +	spin_lock_irqsave(&mz->lru_lock, flags);
> +	__mem_cgroup_move_lists(pc, lru);
> +	spin_unlock_irqrestore(&mz->lru_lock, flags);
> +	unlock_page_cgroup(pc);

What is this lock/unlock_page_cgroup protecting against?

>  }
>  
>  /*
> @@ -521,10 +585,6 @@ static int mem_cgroup_charge_common(stru
>  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
>  	struct mem_cgroup_per_zone *mz;
>  
> -	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> -	if (unlikely(pc == NULL))
> -		goto err;
> -
>  	/*
>  	 * We always charge the cgroup the mm_struct belongs to.
>  	 * The mm_struct's mem_cgroup changes on task migration if the
> @@ -567,43 +627,40 @@ static int mem_cgroup_charge_common(stru
>  		}
>  	}
>  
> +	pc = page_get_page_cgroup_locked(page);
> +	if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
> +		unlock_page_cgroup(pc);
> +		res_counter_uncharge(&mem->res, PAGE_SIZE);
> +		css_put(&mem->css);
> +		goto done;
> +	}
> +
Can this happen ? Our direction should be
VM_BUG_ON(pc->flags & PAGE_CGROUP_FLAG_INUSE)



>  	pc->mem_cgroup = mem;
>  	pc->page = page;
> +	pc->flags |= PAGE_CGROUP_FLAG_INUSE;
> +
>  	/*
>  	 * If a page is accounted as a page cache, insert to inactive list.
>  	 * If anon, insert to active list.
>  	 */
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> -		pc->flags = PAGE_CGROUP_FLAG_CACHE;
> +		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
>  		if (page_is_file_cache(page))
>  			pc->flags |= PAGE_CGROUP_FLAG_FILE;
>  		else
>  			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>  	} else
> -		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> -
> -	lock_page_cgroup(page);
> -	if (unlikely(page_get_page_cgroup(page))) {
> -		unlock_page_cgroup(page);
> -		res_counter_uncharge(&mem->res, PAGE_SIZE);
> -		css_put(&mem->css);
> -		kmem_cache_free(page_cgroup_cache, pc);
> -		goto done;
> -	}
> -	page_assign_page_cgroup(page, pc);
> +		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>  
>  	mz = page_cgroup_zoneinfo(pc);
>  	spin_lock_irqsave(&mz->lru_lock, flags);
>  	__mem_cgroup_add_list(mz, pc);
>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -	unlock_page_cgroup(page);
> +	unlock_page_cgroup(pc);

What kind of race is this lock/unlock_page_cgroup for?

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10  1:49                                         ` KAMEZAWA Hiroyuki
@ 2008-09-10  2:11                                           ` Balbir Singh
  2008-09-10  2:35                                             ` KAMEZAWA Hiroyuki
  2008-09-10 20:44                                           ` Nick Piggin
  1 sibling, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-10  2:11 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Tue, 9 Sep 2008 18:20:48 -0700
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09 21:30:12]:
>> OK, here is approach #2, it works for me and gives me really good
>> performance (surpassing even the current memory controller). I am
>> seeing almost a 7% increase
> This number is from pre-allcation, maybe.
> We really do alloc-at-boot all page_cgroup ? This seems a big change.
> 
>> Caveats
>>
>> 1. Uses more memory (since it allocates memory for each node based on
>>    spanned_pages. Ignores holes, so might not be the most efficient,
>>    but it is a tradeoff of complexity versus space. I propose refining it
>>    as we go along.
>> 2. Does not currently handle alloc_bootmem failure
>> 3. Needs lots of testing/tuning and polishing
>>
> If we can do alloc-at-boot, we can make memcg much simpler.
> 
> 
> 
>> I've tested it on an x86_64 box with 4G of memory
>>
>> Again, this is an early RFC patch, please review test. 
>>
>> Comments/Reviews?
>>
>> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
>> ---
>>
>>  include/linux/memcontrol.h |   32 ++++++
>>  include/linux/mm_types.h   |    4 
>>  mm/memcontrol.c            |  212 +++++++++++++++++++++++++++------------------
>>  mm/page_alloc.c            |   10 --
>>  4 files changed, 162 insertions(+), 96 deletions(-)
>>
>> diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
>> --- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-09-04 03:15:54.000000000 -0700
>> +++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-09-09 17:56:54.000000000 -0700
>> @@ -18,6 +18,7 @@
>>   */
>>  
>>  #include <linux/res_counter.h>
>> +#include <linux/bootmem.h>
>>  #include <linux/memcontrol.h>
>>  #include <linux/cgroup.h>
>>  #include <linux/mm.h>
>> @@ -37,9 +38,10 @@
>>  #include <asm/uaccess.h>
>>  
>>  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
>> -static struct kmem_cache *page_cgroup_cache __read_mostly;
>>  #define MEM_CGROUP_RECLAIM_RETRIES	5
>>  
>> +static struct page_cgroup *pcg_map[MAX_NUMNODES];
>> +
>>  /*
>>   * Statistics for memory cgroup.
>>   */
>> @@ -137,20 +139,6 @@ struct mem_cgroup {
>>  static struct mem_cgroup init_mem_cgroup;
>>  
>>  /*
>> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
>> - * lock.  We need to ensure that page->page_cgroup is at least two
>> - * byte aligned (based on comments from Nick Piggin).  But since
>> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
>> - * uniprocessor kernel, we should avoid setting it here too.
>> - */
>> -#define PAGE_CGROUP_LOCK_BIT 	0x0
>> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
>> -#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
>> -#else
>> -#define PAGE_CGROUP_LOCK	0x0
>> -#endif
>> -
>> -/*
>>   * A page_cgroup page is associated with every page descriptor. The
>>   * page_cgroup helps us identify information about the cgroup
>>   */
>> @@ -158,12 +146,26 @@ struct page_cgroup {
>>  	struct list_head lru;		/* per cgroup LRU list */
>>  	struct page *page;
>>  	struct mem_cgroup *mem_cgroup;
>> -	int flags;
>> +	unsigned long flags;
>>  };
>> -#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
>> -#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
>> -#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
>> -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
>> +
>> +/*
>> + * LOCK_BIT is 0, with value 1
>> + */
>> +#define PAGE_CGROUP_FLAG_LOCK_BIT  (0x0)  /* lock bit */
>> +
>> +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
>> +#define PAGE_CGROUP_FLAG_LOCK      (0x1)  /* lock value */
>> +#else
>> +#define PAGE_CGROUP_FLAG_LOCK      (0x0)  /* lock value */
>> +#endif
>> +
>> +#define PAGE_CGROUP_FLAG_CACHE	   (0x2)   /* charged as cache */
>> +#define PAGE_CGROUP_FLAG_ACTIVE    (0x4)   /* page is active in this cgroup */
>> +#define PAGE_CGROUP_FLAG_FILE	   (0x8)   /* page is file system backed */
>> +#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10)/* page is unevictableable */
>> +#define PAGE_CGROUP_FLAG_INUSE     (0x20)/* pc is allocated and in use */
>> +#define PAGE_CGROUP_FLAG_VALID     (0x40)/* pc is allocated and in use */
>>  
>>  static int page_cgroup_nid(struct page_cgroup *pc)
>>  {
>> @@ -248,35 +250,99 @@ struct mem_cgroup *mem_cgroup_from_task(
>>  				struct mem_cgroup, css);
>>  }
>>  
>> -static inline int page_cgroup_locked(struct page *page)
>> +static inline void lock_page_cgroup(struct page_cgroup *pc)
>>  {
>> -	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
>> +	bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>>  }
>>  
>> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
>> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
>>  {
>> -	VM_BUG_ON(!page_cgroup_locked(page));
>> -	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
>> +	return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>>  }
>>  
>> -struct page_cgroup *page_get_page_cgroup(struct page *page)
>> +static inline void unlock_page_cgroup(struct page_cgroup *pc)
>>  {
>> -	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
>> +	bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>>  }
>>  
>> -static void lock_page_cgroup(struct page *page)
>> +/*
>> + * Called from memmap_init_zone(), has the advantage of dealing with
>> + * memory_hotplug (Addition of memory)
>> + */
>> +int page_cgroup_alloc(int n)
>>  {
>> -	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
>> +	struct pglist_data *pgdat;
>> +	unsigned long size, start, end;
>> +
>> +	if (mem_cgroup_subsys.disabled)
>> +		return;
>> +
>> +	pgdat = NODE_DATA(n);
>> +	/*
>> +	 * Already allocated, leave
>> +	 */
>> +	if (pcg_map[n])
>> +		return 0;
>> +
>> +	start = pgdat->node_start_pfn;
>> +	end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
>> +	size = (end - start) * sizeof(struct page_cgroup);
>> +	printk("Allocating %lu bytes for node %d\n", size, n);
> ^^^^^
> 
> 1. This is nonsense...do you know the memory map of IBM's (maybe ppc) machine ?
> Node's memory are splitted into several pieces and not ordered by node number.
> example)
>    Node 0 | Node 1 | Node 2 | Node 1 | Node 2 | 
> 
> This seems special but when I helped SPARSEMEM and MEMORY_HOTPLUG,
> I saw mannnny kinds of memory map. As you wrote, this should be re-designed.
> 

Thanks, so that means we cannot predict the size of pcg_map[n] beforehand;
we'll need to do incremental additions to pcg_map?

> 2. If pre-allocating all is ok, I stop my work. Mine is of-no-use.

One of the goals of this patch is refinement; it is a starting piece, something
I shared very early. I am not asking you to stop your work. While I think
pre-allocating is not the best way to do this, the trade-off is against the
sparseness of the machine. I don't mind doing it in other ways, but we'll still
need to do some batched preallocation (of a smaller size, maybe).

> But you have to know that by pre-allocationg, we can't use avoid-lru-lock
> by batch like page_vec technique. We can't delay uncharge because a page
> can be reused soon.
> 
> 

Care to elaborate on this? Why not? If the page is reused, we act on the batch
and sync it up.

> 
> 
>> +	pcg_map[n] = alloc_bootmem_node(pgdat, size);
>> +	/*
>> +	 * We can do smoother recovery
>> +	 */
>> +	BUG_ON(!pcg_map[n]);
>> +	return 0;
>>  }
>>  
>> -static int try_lock_page_cgroup(struct page *page)
>> +void page_cgroup_init(int nid, unsigned long pfn)
>>  {
>> -	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
>> +	unsigned long node_pfn;
>> +	struct page_cgroup *pc;
>> +
>> +	if (mem_cgroup_subsys.disabled)
>> +		return;
>> +
>> +	node_pfn = pfn - NODE_DATA(nid)->node_start_pfn;
>> +	pc = &pcg_map[nid][node_pfn];
>> +
>> +	BUG_ON(!pc);
>> +	pc->flags = PAGE_CGROUP_FLAG_VALID;
>> +	INIT_LIST_HEAD(&pc->lru);
>> +	pc->page = NULL;
> This NULL is unnecessary. pc->page = pnf_to_page(pfn) always.
> 

OK

> 
>> +	pc->mem_cgroup = NULL;
>>  }
>>  
>> -static void unlock_page_cgroup(struct page *page)
>> +struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
>> +						bool trylock)
>>  {
>> -	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
>> +	struct page_cgroup *pc;
>> +	int ret;
>> +	int node = page_to_nid(page);
>> +	unsigned long pfn;
>> +
>> +	pfn = page_to_pfn(page) - NODE_DATA(node)->node_start_pfn;
>> +	pc = &pcg_map[node][pfn];
>> +	BUG_ON(!(pc->flags & PAGE_CGROUP_FLAG_VALID));
>> +	if (lock)
>> +		lock_page_cgroup(pc);
>> +	else if (trylock) {
>> +		ret = trylock_page_cgroup(pc);
>> +		if (!ret)
>> +			pc = NULL;
>> +	}
>> +
>> +	return pc;
>> +}
>> +
>> +/*
>> + * Should be called with page_cgroup lock held. Any additions to pc->flags
>> + * should be reflected here. This might seem ugly, refine it later.
>> + */
>> +void page_clear_page_cgroup(struct page_cgroup *pc)
>> +{
>> +	pc->flags &= ~PAGE_CGROUP_FLAG_INUSE;
>>  }
>>  
>>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
>> @@ -377,17 +443,15 @@ void mem_cgroup_move_lists(struct page *
>>  	 * safely get to page_cgroup without it, so just try_lock it:
>>  	 * mem_cgroup_isolate_pages allows for page left on wrong list.
>>  	 */
>> -	if (!try_lock_page_cgroup(page))
>> +	pc = page_get_page_cgroup_trylock(page);
>> +	if (!pc)
>>  		return;
>>  
>> -	pc = page_get_page_cgroup(page);
>> -	if (pc) {
>> -		mz = page_cgroup_zoneinfo(pc);
>> -		spin_lock_irqsave(&mz->lru_lock, flags);
>> -		__mem_cgroup_move_lists(pc, lru);
>> -		spin_unlock_irqrestore(&mz->lru_lock, flags);
>> -	}
>> -	unlock_page_cgroup(page);
>> +	mz = page_cgroup_zoneinfo(pc);
>> +	spin_lock_irqsave(&mz->lru_lock, flags);
>> +	__mem_cgroup_move_lists(pc, lru);
>> +	spin_unlock_irqrestore(&mz->lru_lock, flags);
>> +	unlock_page_cgroup(pc);
> 
> This lock/unlock_page_cgroup is against what ?
> 

We use page_cgroup_zoneinfo(pc); we want to make sure pc does not disappear or
change from underneath us.

>>  }
>>  
>>  /*
>> @@ -521,10 +585,6 @@ static int mem_cgroup_charge_common(stru
>>  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
>>  	struct mem_cgroup_per_zone *mz;
>>  
>> -	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
>> -	if (unlikely(pc == NULL))
>> -		goto err;
>> -
>>  	/*
>>  	 * We always charge the cgroup the mm_struct belongs to.
>>  	 * The mm_struct's mem_cgroup changes on task migration if the
>> @@ -567,43 +627,40 @@ static int mem_cgroup_charge_common(stru
>>  		}
>>  	}
>>  
>> +	pc = page_get_page_cgroup_locked(page);
>> +	if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
>> +		unlock_page_cgroup(pc);
>> +		res_counter_uncharge(&mem->res, PAGE_SIZE);
>> +		css_put(&mem->css);
>> +		goto done;
>> +	}
>> +
> Can this happen ? Our direction should be
> VM_BUG_ON(pc->flags & PAGE_CGROUP_FLAG_INUSE)
> 

Yes, it can: several tasks could be trying to map the same page at once. Can't
we race doing that?

> 
> 
>>  	pc->mem_cgroup = mem;
>>  	pc->page = page;
>> +	pc->flags |= PAGE_CGROUP_FLAG_INUSE;
>> +
>>  	/*
>>  	 * If a page is accounted as a page cache, insert to inactive list.
>>  	 * If anon, insert to active list.
>>  	 */
>>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
>> -		pc->flags = PAGE_CGROUP_FLAG_CACHE;
>> +		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
>>  		if (page_is_file_cache(page))
>>  			pc->flags |= PAGE_CGROUP_FLAG_FILE;
>>  		else
>>  			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>>  	} else
>> -		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
>> -
>> -	lock_page_cgroup(page);
>> -	if (unlikely(page_get_page_cgroup(page))) {
>> -		unlock_page_cgroup(page);
>> -		res_counter_uncharge(&mem->res, PAGE_SIZE);
>> -		css_put(&mem->css);
>> -		kmem_cache_free(page_cgroup_cache, pc);
>> -		goto done;
>> -	}
>> -	page_assign_page_cgroup(page, pc);
>> +		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>>  
>>  	mz = page_cgroup_zoneinfo(pc);
>>  	spin_lock_irqsave(&mz->lru_lock, flags);
>>  	__mem_cgroup_add_list(mz, pc);
>>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
>> -
>> -	unlock_page_cgroup(page);
>> +	unlock_page_cgroup(pc);
> 
> Is this lock/unlock_page_cgroup is for what kind of race ?

for setting pc->flags and for setting pc->page and pc->mem_cgroup.


-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10  2:11                                           ` Balbir Singh
@ 2008-09-10  2:35                                             ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-10  2:35 UTC (permalink / raw)
  To: balbir
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Tue, 09 Sep 2008 19:11:09 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > 1. This is nonsense...do you know the memory map of IBM's (maybe ppc) machine ?
> > Node's memory are splitted into several pieces and not ordered by node number.
> > example)
> >    Node 0 | Node 1 | Node 2 | Node 1 | Node 2 | 
> > 
> > This seems special but when I helped SPARSEMEM and MEMORY_HOTPLUG,
> > I saw mannnny kinds of memory map. As you wrote, this should be re-designed.
> > 
> 
> Thanks, so that means that we cannot before hand predict the size of pcg_map[n],
> we'll need to do an incremental addition to pcg_map?
Or use some "allocate a chunk of page_cgroup for a chunk of continuous pages".
(This is the reason I mentioned SPARSEMEM.)
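Something like this, maybe (a very rough sketch, not from any posted patch;
the chunk size, table size and helper names are all made up for illustration):

#include <linux/bootmem.h>
#include <linux/mmzone.h>

/*
 * Sketch: split the pfn space into fixed-size chunks and allocate one
 * page_cgroup array per chunk that actually has memory, instead of one
 * flat array per node.  A hole then costs one NULL pointer per chunk
 * rather than a pile of unused page_cgroups.  In real code the top-level
 * table could itself be allocated at boot rather than sitting in .bss.
 */
#define PCG_CHUNK_SHIFT	15	/* 32768 pages = 128MB per chunk (4KB pages) */
#define PCG_CHUNK_SIZE	(1UL << PCG_CHUNK_SHIFT)
#define PCG_CHUNK_MASK	(PCG_CHUNK_SIZE - 1)
#define MAX_PCG_CHUNKS	(1UL << 19)	/* enough for 64TB; an assumption */

static struct page_cgroup *pcg_chunk_map[MAX_PCG_CHUNKS];

/* called once per chunk of present pages, e.g. while initializing memmap */
static int __init pcg_alloc_chunk(int nid, unsigned long start_pfn)
{
	unsigned long chunk = start_pfn >> PCG_CHUNK_SHIFT;

	if (pcg_chunk_map[chunk])
		return 0;
	pcg_chunk_map[chunk] = alloc_bootmem_node(NODE_DATA(nid),
				PCG_CHUNK_SIZE * sizeof(struct page_cgroup));
	return pcg_chunk_map[chunk] ? 0 : -ENOMEM;
}

static struct page_cgroup *pfn_to_page_cgroup(unsigned long pfn)
{
	struct page_cgroup *base = pcg_chunk_map[pfn >> PCG_CHUNK_SHIFT];

	return base ? base + (pfn & PCG_CHUNK_MASK) : NULL;
}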

> 
> > 2. If pre-allocating all is ok, I stop my work. Mine is of-no-use.
> 
> One of the goals of this patch is refinement, it is a starting piece, something
> I shared very early. I am not asking you to stop your work. While I think
> pre-allocating is not the best way to do this, the trade off is the sparseness
> of the machine. I don't mind doing it in other ways, but we'll still need to do
> some batch'ed preallocation (of a smaller size maybe).
> 
Hmm, maybe clarifying the trade-offs and comparing them is the first step.
I'll post my idea when it comes.

> > But you have to know that by pre-allocationg, we can't use avoid-lru-lock
> > by batch like page_vec technique. We can't delay uncharge because a page
> > can be reused soon.
> > 
> > 
> 
> Care to elaborate on this? Why not? If the page is reused, we act on the batch
> and sync it up
> 
And touch the vec on another CPU? The reason the "vec" is fast is that it's per-cpu.
If we want to use "delaying", we'll have to mark the page_cgroup unused and not-on-LRU
when its page is added to the free queue.
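To be concrete, the kind of per-cpu batching I mean would look roughly like
this (hypothetical sketch; the helper name and batch size are invented), and
the last comment is exactly the problem:

#include <linux/percpu.h>

#define MEMCG_UNCHARGE_BATCH	16

struct memcg_uncharge_vec {
	int nr;
	struct page_cgroup *pcs[MEMCG_UNCHARGE_BATCH];
};
static DEFINE_PER_CPU(struct memcg_uncharge_vec, memcg_uncharge_vec);

static void memcg_drain_uncharge_vec(struct memcg_uncharge_vec *vec)
{
	int i;

	/* take lru_lock etc. once for the whole batch */
	for (i = 0; i < vec->nr; i++)
		__mem_cgroup_uncharge_one(vec->pcs[i]);	/* assumed helper */
	vec->nr = 0;
}

static void memcg_uncharge_deferred(struct page_cgroup *pc)
{
	struct memcg_uncharge_vec *vec = &get_cpu_var(memcg_uncharge_vec);

	vec->pcs[vec->nr++] = pc;
	if (vec->nr == MEMCG_UNCHARGE_BATCH)
		memcg_drain_uncharge_vec(vec);
	put_cpu_var(memcg_uncharge_vec);
	/*
	 * With pre-allocated page_cgroups the freed page can be re-allocated
	 * and charged again on another CPU while its pc still sits in this
	 * CPU's vec, so the charge path would have to drain a remote vec --
	 * which throws away the per-cpu benefit.
	 */
}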

> > 
> > 
> >> +	pcg_map[n] = alloc_bootmem_node(pgdat, size);
> >> +	/*
> >> +	 * We can do smoother recovery
> >> +	 */
> >> +	BUG_ON(!pcg_map[n]);
> >> +	return 0;
> >>  }
> >>  
> >> -static int try_lock_page_cgroup(struct page *page)
> >> +void page_cgroup_init(int nid, unsigned long pfn)
> >>  {
> >> -	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> >> +	unsigned long node_pfn;
> >> +	struct page_cgroup *pc;
> >> +
> >> +	if (mem_cgroup_subsys.disabled)
> >> +		return;
> >> +
> >> +	node_pfn = pfn - NODE_DATA(nid)->node_start_pfn;
> >> +	pc = &pcg_map[nid][node_pfn];
> >> +
> >> +	BUG_ON(!pc);
> >> +	pc->flags = PAGE_CGROUP_FLAG_VALID;
> >> +	INIT_LIST_HEAD(&pc->lru);
> >> +	pc->page = NULL;
> > This NULL is unnecessary. pc->page = pnf_to_page(pfn) always.
> > 
> 
> OK
> 
> > 
> >> +	pc->mem_cgroup = NULL;
> >>  }
> >>  
> >> -static void unlock_page_cgroup(struct page *page)
> >> +struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
> >> +						bool trylock)
> >>  {
> >> -	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> >> +	struct page_cgroup *pc;
> >> +	int ret;
> >> +	int node = page_to_nid(page);
> >> +	unsigned long pfn;
> >> +
> >> +	pfn = page_to_pfn(page) - NODE_DATA(node)->node_start_pfn;
> >> +	pc = &pcg_map[node][pfn];
> >> +	BUG_ON(!(pc->flags & PAGE_CGROUP_FLAG_VALID));
> >> +	if (lock)
> >> +		lock_page_cgroup(pc);
> >> +	else if (trylock) {
> >> +		ret = trylock_page_cgroup(pc);
> >> +		if (!ret)
> >> +			pc = NULL;
> >> +	}
> >> +
> >> +	return pc;
> >> +}
> >> +
> >> +/*
> >> + * Should be called with page_cgroup lock held. Any additions to pc->flags
> >> + * should be reflected here. This might seem ugly, refine it later.
> >> + */
> >> +void page_clear_page_cgroup(struct page_cgroup *pc)
> >> +{
> >> +	pc->flags &= ~PAGE_CGROUP_FLAG_INUSE;
> >>  }
> >>  
> >>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
> >> @@ -377,17 +443,15 @@ void mem_cgroup_move_lists(struct page *
> >>  	 * safely get to page_cgroup without it, so just try_lock it:
> >>  	 * mem_cgroup_isolate_pages allows for page left on wrong list.
> >>  	 */
> >> -	if (!try_lock_page_cgroup(page))
> >> +	pc = page_get_page_cgroup_trylock(page);
> >> +	if (!pc)
> >>  		return;
> >>  
> >> -	pc = page_get_page_cgroup(page);
> >> -	if (pc) {
> >> -		mz = page_cgroup_zoneinfo(pc);
> >> -		spin_lock_irqsave(&mz->lru_lock, flags);
> >> -		__mem_cgroup_move_lists(pc, lru);
> >> -		spin_unlock_irqrestore(&mz->lru_lock, flags);
> >> -	}
> >> -	unlock_page_cgroup(page);
> >> +	mz = page_cgroup_zoneinfo(pc);
> >> +	spin_lock_irqsave(&mz->lru_lock, flags);
> >> +	__mem_cgroup_move_lists(pc, lru);
> >> +	spin_unlock_irqrestore(&mz->lru_lock, flags);
> >> +	unlock_page_cgroup(pc);
> > 
> > This lock/unlock_page_cgroup is against what ?
> > 
> 
> We use page_cgroup_zoneinfo(pc), we want to make sure pc does not disappear or
> change from underneath us.
> 
> >>  }
> >>  
> >>  /*
> >> @@ -521,10 +585,6 @@ static int mem_cgroup_charge_common(stru
> >>  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> >>  	struct mem_cgroup_per_zone *mz;
> >>  
> >> -	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> >> -	if (unlikely(pc == NULL))
> >> -		goto err;
> >> -
> >>  	/*
> >>  	 * We always charge the cgroup the mm_struct belongs to.
> >>  	 * The mm_struct's mem_cgroup changes on task migration if the
> >> @@ -567,43 +627,40 @@ static int mem_cgroup_charge_common(stru
> >>  		}
> >>  	}
> >>  
> >> +	pc = page_get_page_cgroup_locked(page);
> >> +	if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
> >> +		unlock_page_cgroup(pc);
> >> +		res_counter_uncharge(&mem->res, PAGE_SIZE);
> >> +		css_put(&mem->css);
> >> +		goto done;
> >> +	}
> >> +
> > Can this happen ? Our direction should be
> > VM_BUG_ON(pc->flags & PAGE_CGROUP_FLAG_INUSE)
> > 
> 
> Yes, it can.. several people trying to map the same page at once. Can't we race
> doing that?
> 
I'll dig into this. My version (lockless) already removed this and uses VM_BUG_ON().

> > 
> > 
> >>  	pc->mem_cgroup = mem;
> >>  	pc->page = page;
> >> +	pc->flags |= PAGE_CGROUP_FLAG_INUSE;
> >> +
> >>  	/*
> >>  	 * If a page is accounted as a page cache, insert to inactive list.
> >>  	 * If anon, insert to active list.
> >>  	 */
> >>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> >> -		pc->flags = PAGE_CGROUP_FLAG_CACHE;
> >> +		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
> >>  		if (page_is_file_cache(page))
> >>  			pc->flags |= PAGE_CGROUP_FLAG_FILE;
> >>  		else
> >>  			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> >>  	} else
> >> -		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> >> -
> >> -	lock_page_cgroup(page);
> >> -	if (unlikely(page_get_page_cgroup(page))) {
> >> -		unlock_page_cgroup(page);
> >> -		res_counter_uncharge(&mem->res, PAGE_SIZE);
> >> -		css_put(&mem->css);
> >> -		kmem_cache_free(page_cgroup_cache, pc);
> >> -		goto done;
> >> -	}
> >> -	page_assign_page_cgroup(page, pc);
> >> +		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> >>  
> >>  	mz = page_cgroup_zoneinfo(pc);
> >>  	spin_lock_irqsave(&mz->lru_lock, flags);
> >>  	__mem_cgroup_add_list(mz, pc);
> >>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
> >> -
> >> -	unlock_page_cgroup(page);
> >> +	unlock_page_cgroup(pc);
> > 
> > Is this lock/unlock_page_cgroup is for what kind of race ?
> 
> for setting pc->flags and for setting pc->page and pc->mem_cgroup.
> 

Hmm...there is some confusion, maybe.

The page_cgroup is now 1:1 with struct page. Then, we can guarantee that

- There is no race between charge vs. uncharge.

The only problem is force_empty. (But that is difficult..)

This means pc->mem_cgroup is safe here.
And pc->flags should be atomic flags anyway. I believe we will have to record
a "Dirty bit" and the like later.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 20:44                                           ` Nick Piggin
@ 2008-09-10 11:03                                             ` KAMEZAWA Hiroyuki
  2008-09-10 21:02                                               ` Nick Piggin
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-10 11:03 UTC (permalink / raw)
  To: Nick Piggin
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Thu, 11 Sep 2008 06:44:37 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> On Wednesday 10 September 2008 11:49, KAMEZAWA Hiroyuki wrote:
> > On Tue, 9 Sep 2008 18:20:48 -0700
> >
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09
> > > 21:30:12]: OK, here is approach #2, it works for me and gives me really
> > > good performance (surpassing even the current memory controller). I am
> > > seeing almost a 7% increase
> >
> > This number is from pre-allcation, maybe.
> > We really do alloc-at-boot all page_cgroup ? This seems a big change.
> 
> It seems really nice to me -- we get the best of both worlds, less overhead
> for those who don't enable the memory controller, and even better
> performance for those who do.

No troubles for me with the allocate-all-at-boot policy.
My small concerns are
  - wasting page_cgroups for hugepage areas.
  - memory hotplug

> 
> Are you expecting many users to want to turn this on and off at runtime?
> I wouldn't expect so, but I don't know enough about them.
> 
There is no runtime switch, only a boot option.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 21:02                                               ` Nick Piggin
@ 2008-09-10 11:27                                                 ` KAMEZAWA Hiroyuki
  2008-09-10 14:34                                                   ` Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-10 11:27 UTC (permalink / raw)
  To: Nick Piggin
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Thu, 11 Sep 2008 07:02:44 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> On Wednesday 10 September 2008 21:03, KAMEZAWA Hiroyuki wrote:
> > On Thu, 11 Sep 2008 06:44:37 +1000
> >
> > Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > > On Wednesday 10 September 2008 11:49, KAMEZAWA Hiroyuki wrote:
> > > > On Tue, 9 Sep 2008 18:20:48 -0700
> > > >
> > > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > > > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09
> > > > > 21:30:12]: OK, here is approach #2, it works for me and gives me
> > > > > really good performance (surpassing even the current memory
> > > > > controller). I am seeing almost a 7% increase
> > > >
> > > > This number is from pre-allcation, maybe.
> > > > We really do alloc-at-boot all page_cgroup ? This seems a big change.
> > >
> > > It seems really nice to me -- we get the best of both worlds, less
> > > overhead for those who don't enable the memory controller, and even
> > > better performance for those who do.
> >
> > No trobles for me for allocating-all-at-boot policy.
> > My small concern is
> >   - wasting page_cgroup for hugepage area.
> >   - memory hotplug
> 
> In those cases you still waste the struct page area too. I realise that
> isn't a good way to justify even more wastage. But I guess it is
> relatively low. At least, I would think the users would be more happy to
> get a 7% performance increase for small pages! :)
> 
I guess the increase is mostly because we can completely avoid the kmalloc/kfree slow path.

Balbir, how about fixing our approach as the allocate-all-at-boot policy?
If you say yes, I think I can help you, and I'll dig out the usable parts from my garbage.

The following are results for the lockless + remove-page-cgroup-pointer-from-struct-page patches.

rc5-mm1
==
Execl Throughput                           3006.5 lps   (29.8 secs, 3 samples)
C Compiler Throughput                      1006.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (1 concurrent)               4863.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (8 concurrent)                943.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (16 concurrent)               482.7 lpm   (60.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places         124804.9 lpm   (30.0 secs, 3 samples)

lockless
==
Execl Throughput                           3035.5 lps   (29.6 secs, 3 samples)
C Compiler Throughput                      1010.3 lpm   (60.0 secs, 3 samples)
Shell Scripts (1 concurrent)               4881.0 lpm   (60.0 secs, 3 samples)
Shell Scripts (8 concurrent)                947.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (16 concurrent)               485.0 lpm   (60.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places         125437.9 lpm   (30.0 secs, 3 samples)

lockless + remove page cgroup pointer (my version).
==
Execl Throughput                           3021.1 lps   (29.5 secs, 3 samples)
C Compiler Throughput                       980.3 lpm   (60.0 secs, 3 samples)
Shell Scripts (1 concurrent)               4600.0 lpm   (60.0 secs, 3 samples)
Shell Scripts (8 concurrent)                915.7 lpm   (60.0 secs, 3 samples)
Shell Scripts (16 concurrent)               468.3 lpm   (60.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places         124909.1 lpm   (30.0 secs, 3 samples)

Oh, yes. A significant slowdown. I'm glad to kick this patch out to the trash box.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 11:27                                                 ` KAMEZAWA Hiroyuki
@ 2008-09-10 14:34                                                   ` Balbir Singh
  0 siblings, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-10 14:34 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nick Piggin, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Thu, 11 Sep 2008 07:02:44 +1000
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
>> On Wednesday 10 September 2008 21:03, KAMEZAWA Hiroyuki wrote:
>>> On Thu, 11 Sep 2008 06:44:37 +1000
>>>
>>> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>>> On Wednesday 10 September 2008 11:49, KAMEZAWA Hiroyuki wrote:
>>>>> On Tue, 9 Sep 2008 18:20:48 -0700
>>>>>
>>>>> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>>>>> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09
>>>>>> 21:30:12]: OK, here is approach #2, it works for me and gives me
>>>>>> really good performance (surpassing even the current memory
>>>>>> controller). I am seeing almost a 7% increase
>>>>> This number is from pre-allcation, maybe.
>>>>> We really do alloc-at-boot all page_cgroup ? This seems a big change.
>>>> It seems really nice to me -- we get the best of both worlds, less
>>>> overhead for those who don't enable the memory controller, and even
>>>> better performance for those who do.
>>> No trobles for me for allocating-all-at-boot policy.
>>> My small concern is
>>>   - wasting page_cgroup for hugepage area.
>>>   - memory hotplug
>> In those cases you still waste the struct page area too. I realise that
>> isn't a good way to justify even more wastage. But I guess it is
>> relatively low. At least, I would think the users would be more happy to
>> get a 7% performance increase for small pages! :)
>>
> I guess the increase mostly because we can completely avoid kmalloc/kfree slow path.
> 

Correct

> Balbir, how about fix our way to allocate-all-at-boot-policy ?
> If you say yes, I think I can help you and I'll find usable part from my garbage.
> 

I am perfectly fine with it; I'll need your expertise to get the
alloc-at-boot policy correct.

> Following is lockless+remove-page-cgroup-pointer-from-page-struct patch's result.
> 
> rc5-mm1
> ==
> Execl Throughput                           3006.5 lps   (29.8 secs, 3 samples)
> C Compiler Throughput                      1006.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (1 concurrent)               4863.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (8 concurrent)                943.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (16 concurrent)               482.7 lpm   (60.0 secs, 3 samples)
> Dc: sqrt(2) to 99 decimal places         124804.9 lpm   (30.0 secs, 3 samples)
> 
> lockless
> ==
> Execl Throughput                           3035.5 lps   (29.6 secs, 3 samples)
> C Compiler Throughput                      1010.3 lpm   (60.0 secs, 3 samples)
> Shell Scripts (1 concurrent)               4881.0 lpm   (60.0 secs, 3 samples)
> Shell Scripts (8 concurrent)                947.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (16 concurrent)               485.0 lpm   (60.0 secs, 3 samples)
> Dc: sqrt(2) to 99 decimal places         125437.9 lpm   (30.0 secs, 3 samples)
> 
> lockless + remove page cgroup pointer (my version).
> ==
> Execl Throughput                           3021.1 lps   (29.5 secs, 3 samples)
> C Compiler Throughput                       980.3 lpm   (60.0 secs, 3 samples)
> Shell Scripts (1 concurrent)               4600.0 lpm   (60.0 secs, 3 samples)
> Shell Scripts (8 concurrent)                915.7 lpm   (60.0 secs, 3 samples)
> Shell Scripts (16 concurrent)               468.3 lpm   (60.0 secs, 3 samples)
> Dc: sqrt(2) to 99 decimal places         124909.1 lpm   (30.0 secs, 3 samples)
> 
> Oh,yes. siginificant slow down. I'm glad to kick this patch out to trash box.
> 
> Thanks,
> -Kame
> 


-- 
	Thanks,
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10  1:49                                         ` KAMEZAWA Hiroyuki
  2008-09-10  2:11                                           ` Balbir Singh
@ 2008-09-10 20:44                                           ` Nick Piggin
  2008-09-10 11:03                                             ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 72+ messages in thread
From: Nick Piggin @ 2008-09-10 20:44 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Wednesday 10 September 2008 11:49, KAMEZAWA Hiroyuki wrote:
> On Tue, 9 Sep 2008 18:20:48 -0700
>
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09
> > 21:30:12]: OK, here is approach #2, it works for me and gives me really
> > good performance (surpassing even the current memory controller). I am
> > seeing almost a 7% increase
>
> This number is from pre-allcation, maybe.
> We really do alloc-at-boot all page_cgroup ? This seems a big change.

It seems really nice to me -- we get the best of both worlds, less overhead
for those who don't enable the memory controller, and even better
performance for those who do.

Are you expecting many users to want to turn this on and off at runtime?
I wouldn't expect so, but I don't know enough about them.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 11:03                                             ` KAMEZAWA Hiroyuki
@ 2008-09-10 21:02                                               ` Nick Piggin
  2008-09-10 11:27                                                 ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Nick Piggin @ 2008-09-10 21:02 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Andrew Morton, hugh, menage, xemul, linux-kernel, linux-mm

On Wednesday 10 September 2008 21:03, KAMEZAWA Hiroyuki wrote:
> On Thu, 11 Sep 2008 06:44:37 +1000
>
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > On Wednesday 10 September 2008 11:49, KAMEZAWA Hiroyuki wrote:
> > > On Tue, 9 Sep 2008 18:20:48 -0700
> > >
> > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2008-09-09
> > > > 21:30:12]: OK, here is approach #2, it works for me and gives me
> > > > really good performance (surpassing even the current memory
> > > > controller). I am seeing almost a 7% increase
> > >
> > > This number is from pre-allcation, maybe.
> > > We really do alloc-at-boot all page_cgroup ? This seems a big change.
> >
> > It seems really nice to me -- we get the best of both worlds, less
> > overhead for those who don't enable the memory controller, and even
> > better performance for those who do.
>
> No trobles for me for allocating-all-at-boot policy.
> My small concern is
>   - wasting page_cgroup for hugepage area.
>   - memory hotplug

In those cases you still waste the struct page area too. I realise that
isn't a good way to justify even more wastage. But I guess it is
relatively low. At least, I would think the users would be more happy to
get a 7% performance increase for small pages! :)


> > Are you expecting many users to want to turn this on and off at runtime?
> > I wouldn't expect so, but I don't know enough about them.
>
> There is no runtime switch. only at boot.

Fine


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10  1:20                                       ` [Approach #2] " Balbir Singh
  2008-09-10  1:49                                         ` KAMEZAWA Hiroyuki
@ 2008-09-10 22:21                                         ` Dave Hansen
  2008-09-10 22:31                                           ` David Miller
                                                             ` (2 more replies)
  1 sibling, 3 replies; 72+ messages in thread
From: Dave Hansen @ 2008-09-10 22:21 UTC (permalink / raw)
  To: balbir
  Cc: kamezawa.hiroyu, Nick Piggin, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

On Tue, 2008-09-09 at 18:20 -0700, Balbir Singh wrote:
> +       start = pgdat->node_start_pfn;
> +       end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> +       size = (end - start) * sizeof(struct page_cgroup);
> +       printk("Allocating %lu bytes for node %d\n", size, n);
> +       pcg_map[n] = alloc_bootmem_node(pgdat, size);
> +       /*
> +        * We can do smoother recovery
> +        */
> +       BUG_ON(!pcg_map[n]);
> +       return 0;
>  }

This will really suck for sparse memory machines.  Imagine a machine
with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
address space.
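
Rough arithmetic for that layout, just to put a number on it (illustrative,
assuming 4KB pages and a ~40-byte struct page_cgroup on 64-bit): the node
then spans roughly 1TB of pfn space, which is about 256M pages, so the
spanned_pages-based allocation above asks bootmem for roughly 10GB of
page_cgroups to describe 2GB of actual memory.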

You also need to consider how it works with memory hotplug and how
you're going to grow it at runtime.

Oh, and doesn't alloc_bootmem() panic() if it fails internally anyway?

I need to look at your other approach. :)

-- Dave


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 22:21                                         ` Dave Hansen
@ 2008-09-10 22:31                                           ` David Miller
  2008-09-10 22:36                                           ` Balbir Singh
  2008-09-10 22:38                                           ` [Approach #2] [RFC][PATCH] Remove cgroup member from struct page Nick Piggin
  2 siblings, 0 replies; 72+ messages in thread
From: David Miller @ 2008-09-10 22:31 UTC (permalink / raw)
  To: dave
  Cc: balbir, kamezawa.hiroyu, nickpiggin, akpm, hugh, menage, xemul,
	linux-kernel, linux-mm

From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Wed, 10 Sep 2008 15:21:00 -0700

> This will really suck for sparse memory machines.  Imagine a machine
> with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
> address space.

You just described the workstation I am typing this from :-)

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 22:21                                         ` Dave Hansen
  2008-09-10 22:31                                           ` David Miller
@ 2008-09-10 22:36                                           ` Balbir Singh
  2008-09-10 22:56                                             ` Dave Hansen
  2008-09-10 22:38                                           ` [Approach #2] [RFC][PATCH] Remove cgroup member from struct page Nick Piggin
  2 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-10 22:36 UTC (permalink / raw)
  To: Dave Hansen
  Cc: kamezawa.hiroyu, Nick Piggin, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

Dave Hansen wrote:
> On Tue, 2008-09-09 at 18:20 -0700, Balbir Singh wrote:
>> +       start = pgdat->node_start_pfn;
>> +       end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
>> +       size = (end - start) * sizeof(struct page_cgroup);
>> +       printk("Allocating %lu bytes for node %d\n", size, n);
>> +       pcg_map[n] = alloc_bootmem_node(pgdat, size);
>> +       /*
>> +        * We can do smoother recovery
>> +        */
>> +       BUG_ON(!pcg_map[n]);
>> +       return 0;
>>  }
> 
> This will really suck for sparse memory machines.  Imagine a machine
> with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
> address space.
> 

I would hate to re-implement the entire sparsemem code :(
Kame did suggest making the memory controller depend on sparsemem (to hook in
from there for allocations)

> You also need to consider how it works with memory hotplug and how
> you're going to grow it at runtime.
> 

Yes, true. This is not the final version; it is a very, very early version that I
posted for initial comments.

> Oh, and doesn't alloc_bootmem() panic() if it fails internally anyway?
> 
> I need to look at your other approach. :)

We'll need the sort of slab_is_available() checks that sparse.c uses, and we'll
also need to deal with memory hotplug add and remove.
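
For the first point, the pattern sparse.c uses is roughly this (a sketch,
with a hypothetical pcg_alloc() helper on our side):

#include <linux/bootmem.h>
#include <linux/slab.h>

/*
 * Allocate page_cgroup storage both early (bootmem) and late (slab), the
 * way sparse_index_alloc() does for mem_section, so hotplug-added memory
 * can be covered after boot as well.
 */
static void *pcg_alloc(size_t size, int nid)
{
	if (slab_is_available())
		return kzalloc_node(size, GFP_KERNEL, nid);
	return alloc_bootmem_node(NODE_DATA(nid), size);
}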

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 22:21                                         ` Dave Hansen
  2008-09-10 22:31                                           ` David Miller
  2008-09-10 22:36                                           ` Balbir Singh
@ 2008-09-10 22:38                                           ` Nick Piggin
  2 siblings, 0 replies; 72+ messages in thread
From: Nick Piggin @ 2008-09-10 22:38 UTC (permalink / raw)
  To: Dave Hansen
  Cc: balbir, kamezawa.hiroyu, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

On Thursday 11 September 2008 08:21, Dave Hansen wrote:
> On Tue, 2008-09-09 at 18:20 -0700, Balbir Singh wrote:
> > +       start = pgdat->node_start_pfn;
> > +       end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> > +       size = (end - start) * sizeof(struct page_cgroup);
> > +       printk("Allocating %lu bytes for node %d\n", size, n);
> > +       pcg_map[n] = alloc_bootmem_node(pgdat, size);
> > +       /*
> > +        * We can do smoother recovery
> > +        */
> > +       BUG_ON(!pcg_map[n]);
> > +       return 0;
> >  }
>
> This will really suck for sparse memory machines.  Imagine a machine
> with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
> address space.
>
> You also need to consider how it works with memory hotplug and how
> you're going to grow it at runtime.

I think it should try to hook into the physical memory model
code. I thought it was going to do that but didn't look at the
details...

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 22:36                                           ` Balbir Singh
@ 2008-09-10 22:56                                             ` Dave Hansen
  2008-09-11  1:35                                               ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Dave Hansen @ 2008-09-10 22:56 UTC (permalink / raw)
  To: balbir
  Cc: kamezawa.hiroyu, Nick Piggin, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 2008-09-10 at 15:36 -0700, Balbir Singh wrote:
> Dave Hansen wrote:
> > On Tue, 2008-09-09 at 18:20 -0700, Balbir Singh wrote:
> >> +       start = pgdat->node_start_pfn;
> >> +       end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> >> +       size = (end - start) * sizeof(struct page_cgroup);
> >> +       printk("Allocating %lu bytes for node %d\n", size, n);
> >> +       pcg_map[n] = alloc_bootmem_node(pgdat, size);
> >> +       /*
> >> +        * We can do smoother recovery
> >> +        */
> >> +       BUG_ON(!pcg_map[n]);
> >> +       return 0;
> >>  }
> > 
> > This will really suck for sparse memory machines.  Imagine a machine
> > with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
> > address space.
> > 
> 
> I would hate to re-implement the entire sparsemem code :(
> Kame did suggest making the memory controller depend on sparsemem (to hook in
> from there for allocations)

Yeah, you could just make another mem_section member.  Or, you could
work to abstract the sparsemem code so that other people can use it, or
maybe make it more dynamic so we can have multiple pfn->object lookups
in parallel.  Adding the struct member is obviously easier.
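
To sketch the first option (hypothetical; the member and helper names are
mine, it obviously does not compile without the new member, and the real
section_mem_map encoding is more involved than shown):

/*
 * hypothetical extra member added to sparsemem's struct mem_section:
 *
 *	struct mem_section {
 *		unsigned long section_mem_map;
 *		unsigned long *pageblock_flags;
 *		struct page_cgroup *page_cgroup;   <-- new, one array per section
 *	};
 */

/* pfn -> page_cgroup via the same section lookup used for pfn_to_page() */
static inline struct page_cgroup *pfn_to_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	if (!ms || !ms->page_cgroup)
		return NULL;
	return ms->page_cgroup + (pfn & (PAGES_PER_SECTION - 1));
}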

-- Dave


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-10 22:56                                             ` Dave Hansen
@ 2008-09-11  1:35                                               ` KAMEZAWA Hiroyuki
  2008-09-11  1:47                                                 ` Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-11  1:35 UTC (permalink / raw)
  To: Dave Hansen
  Cc: balbir, Nick Piggin, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 10 Sep 2008 15:56:48 -0700
Dave Hansen <dave@linux.vnet.ibm.com> wrote:

> On Wed, 2008-09-10 at 15:36 -0700, Balbir Singh wrote:
> > Dave Hansen wrote:
> > > On Tue, 2008-09-09 at 18:20 -0700, Balbir Singh wrote:
> > >> +       start = pgdat->node_start_pfn;
> > >> +       end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> > >> +       size = (end - start) * sizeof(struct page_cgroup);
> > >> +       printk("Allocating %lu bytes for node %d\n", size, n);
> > >> +       pcg_map[n] = alloc_bootmem_node(pgdat, size);
> > >> +       /*
> > >> +        * We can do smoother recovery
> > >> +        */
> > >> +       BUG_ON(!pcg_map[n]);
> > >> +       return 0;
> > >>  }
> > > 
> > > This will really suck for sparse memory machines.  Imagine a machine
> > > with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
> > > address space.
> > > 
> > 
> > I would hate to re-implement the entire sparsemem code :(
> > Kame did suggest making the memory controller depend on sparsemem (to hook in
> > from there for allocations)
> 
> Yeah, you could just make another mem_section member.  Or, you could
> work to abstract the sparsemem code so that other people can use it, or
> maybe make it more dynamic so we can have multiple pfn->object lookups
> in parallel.  Adding the struct member is obviously easier.
> 
Don't worry. I'll take care of the sparse memory map and hotplug.
But whether to make this depend on SPARSEMEM or not is not decided yet.
I'll try a generic one at first. If it turns out dirty, we can start the discussion about SPARSEMEM.

(Honestly, I love sparsemem more than the others ;)

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-11  1:35                                               ` KAMEZAWA Hiroyuki
@ 2008-09-11  1:47                                                 ` Balbir Singh
  2008-09-11  1:56                                                   ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-11  1:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Hansen, Nick Piggin, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Wed, 10 Sep 2008 15:56:48 -0700
> Dave Hansen <dave@linux.vnet.ibm.com> wrote:
> 
>> On Wed, 2008-09-10 at 15:36 -0700, Balbir Singh wrote:
>>> Dave Hansen wrote:
>>>> On Tue, 2008-09-09 at 18:20 -0700, Balbir Singh wrote:
>>>>> +       start = pgdat->node_start_pfn;
>>>>> +       end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
>>>>> +       size = (end - start) * sizeof(struct page_cgroup);
>>>>> +       printk("Allocating %lu bytes for node %d\n", size, n);
>>>>> +       pcg_map[n] = alloc_bootmem_node(pgdat, size);
>>>>> +       /*
>>>>> +        * We can do smoother recovery
>>>>> +        */
>>>>> +       BUG_ON(!pcg_map[n]);
>>>>> +       return 0;
>>>>>  }
>>>> This will really suck for sparse memory machines.  Imagine a machine
>>>> with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
>>>> address space.
>>>>
>>> I would hate to re-implement the entire sparsemem code :(
>>> Kame did suggest making the memory controller depend on sparsemem (to hook in
>>> from there for allocations)
>> Yeah, you could just make another mem_section member.  Or, you could
>> work to abstract the sparsemem code so that other people can use it, or
>> maybe make it more dynamic so we can have multiple pfn->object lookups
>> in parallel.  Adding the struct member is obviously easier.
>>
> Don't worry. I'll care sparse memory map and hotplug.
> But whether making this depends on SPARSEMEM or not is not fixed yet.
> I'll try generic one, at first. If it's dirty, start discussion about SPARSEMEM.
> 
> (Honestly, I love sparsemem than others ;)

My concern is that if we depend on sparsemem, then we force distros to turn on
sparsemem (which might be the default, but not on all architectures); we might
end up losing those architectures (w.r.t. turning on the memory controller)
where sparsemem is not the default in the distro.


-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page
  2008-09-11  1:47                                                 ` Balbir Singh
@ 2008-09-11  1:56                                                   ` KAMEZAWA Hiroyuki
  2008-09-17 23:28                                                     ` [RFC][PATCH] Remove cgroup member from struct page (v3) Balbir Singh
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-11  1:56 UTC (permalink / raw)
  To: balbir
  Cc: Dave Hansen, Nick Piggin, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 10 Sep 2008 18:47:25 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Wed, 10 Sep 2008 15:56:48 -0700
> > Dave Hansen <dave@linux.vnet.ibm.com> wrote:
> > 
> >> On Wed, 2008-09-10 at 15:36 -0700, Balbir Singh wrote:
> >>> Dave Hansen wrote:
> >>>> On Tue, 2008-09-09 at 18:20 -0700, Balbir Singh wrote:
> >>>>> +       start = pgdat->node_start_pfn;
> >>>>> +       end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> >>>>> +       size = (end - start) * sizeof(struct page_cgroup);
> >>>>> +       printk("Allocating %lu bytes for node %d\n", size, n);
> >>>>> +       pcg_map[n] = alloc_bootmem_node(pgdat, size);
> >>>>> +       /*
> >>>>> +        * We can do smoother recovery
> >>>>> +        */
> >>>>> +       BUG_ON(!pcg_map[n]);
> >>>>> +       return 0;
> >>>>>  }
> >>>> This will really suck for sparse memory machines.  Imagine a machine
> >>>> with 1GB of memory at 0x0 and another 1GB of memory at 1TB up in the
> >>>> address space.
> >>>>
> >>> I would hate to re-implement the entire sparsemem code :(
> >>> Kame did suggest making the memory controller depend on sparsemem (to hook in
> >>> from there for allocations)
> >> Yeah, you could just make another mem_section member.  Or, you could
> >> work to abstract the sparsemem code so that other people can use it, or
> >> maybe make it more dynamic so we can have multiple pfn->object lookups
> >> in parallel.  Adding the struct member is obviously easier.
> >>
> > Don't worry. I'll care sparse memory map and hotplug.
> > But whether making this depends on SPARSEMEM or not is not fixed yet.
> > I'll try generic one, at first. If it's dirty, start discussion about SPARSEMEM.
> > 
> > (Honestly, I love sparsemem than others ;)
> 
> My concern is that if we depend on sparsemem, then we force distros to turn on
> sparsemem (which might be the default, but not on all architectures), we might
> end up losing those architectures (w.r.t. turning on the memory controller)
> where sparsemem is not the default on the distro.
> 
Yes. I share your concern. Then, I'll try a not-on-sparsemem version at first.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-11  1:56                                                   ` KAMEZAWA Hiroyuki
@ 2008-09-17 23:28                                                     ` Balbir Singh
  2008-09-18  1:40                                                       ` Andrew Morton
  0 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-17 23:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Hansen, Nick Piggin, Andrew Morton, hugh, menage, xemul,
	linux-kernel, linux-mm

Before trying the sparsemem approach, I tried a radix tree per node, per zone,
and I seemed to actually get some performance improvement (1.5%, possibly
noise).

But please do see and review (tested on my x86_64 box with unixbench
and some other simple tests).

v4..v3
1. Use a radix tree per node, per zone

v3...v2
1. Convert flags to unsigned long
2. Move page_cgroup->lock to a bit spin lock in flags

v2...v1

1. Fix a small bug, don't call radix_tree_preload_end(), if preload fails

This is a rewrite of a patch I had written long back to remove the cgroup
pointer from struct page (I shared the patches with Kamezawa, but never posted
them anywhere else). I spent the weekend cleaning them up for 2.6.27-rc5-mmotm
(29 Aug 2008).

I've tested the patches on an x86_64 box, I've run a simple test running
under the memory control group and the same test running concurrently under
two different groups (and creating pressure within their groups).

Advantages of the patch

1. It removes the extra pointer in struct page

Disadvantages

1. Radix tree lookup is not an O(1) operation; once the page is known,
   getting to the page_cgroup (pc) is a little more expensive now.

This is an initial RFC (version 3) for comments

TODOs

1. Test the page migration changes

Performance

In a unixbench run, these patches had a performance impact of 2% (slowdown).

Comments/Reviews?

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 include/linux/memcontrol.h |   23 +++
 include/linux/mm_types.h   |    4 
 mm/memcontrol.c            |  264 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 221 insertions(+), 70 deletions(-)

diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
--- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-09-16 17:10:15.000000000 -0700
+++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-09-17 16:15:01.000000000 -0700
@@ -24,6 +24,7 @@
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
+#include <linux/radix-tree.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
@@ -41,6 +42,24 @@ static struct kmem_cache *page_cgroup_ca
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
 /*
+ * MAX_NR_ZONES might be an invalid optimization, since we don't use
+ * allocations from __GFP_HIGHMEM
+ */
+
+struct mem_cgroup_radix_tree_per_zone {
+	struct radix_tree_root tree;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_radix_tree_per_node {
+	struct mem_cgroup_radix_tree_per_zone per_zone[MAX_NR_ZONES];
+};
+
+static struct mem_cgroup_radix_tree_per_node
+*mem_cgroup_radix_tree_info[MAX_NUMNODES];
+static int radix_tree_initialized;
+
+/*
  * Statistics for memory cgroup.
  */
 enum mem_cgroup_stat_index {
@@ -137,20 +156,6 @@ struct mem_cgroup {
 static struct mem_cgroup init_mem_cgroup;
 
 /*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT 	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -158,12 +163,17 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
-	int flags;
+	unsigned long flags;
 };
-#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
-#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
-#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
+
+/*
+ * LOCK_BIT is 0, with value 1
+ */
+#define PAGE_CGROUP_FLAG_LOCK_BIT  (0x0)    /* lock bit */
+#define PAGE_CGROUP_FLAG_CACHE	   (0x2)    /* charged as cache */
+#define PAGE_CGROUP_FLAG_ACTIVE    (0x4)    /* page is active in this cgroup */
+#define PAGE_CGROUP_FLAG_FILE	   (0x8)    /* page is file system backed */
+#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10) /* page is unevictableable */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -248,35 +258,103 @@ struct mem_cgroup *mem_cgroup_from_task(
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
+static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	BUG_ON(!pc);
+	bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
 {
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
+	BUG_ON(!pc);
+	return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-struct page_cgroup *page_get_page_cgroup(struct page *page)
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
 {
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+	BUG_ON(!pc);
+	BUG_ON(!bit_spin_is_locked(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags));
+	bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
 }
 
-static void lock_page_cgroup(struct page *page)
+static int page_assign_page_cgroup(struct page *page, struct page_cgroup *pc,
+					gfp_t gfp_mask)
 {
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	int err = 0;
+	struct page_cgroup *old_pc;
+	int node = page_to_nid(page);
+	int zone = page_zonenum(page);
+	struct mem_cgroup_radix_tree_per_node *pn;
 
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	if (pc) {
+		err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+		if (err) {
+			printk(KERN_WARNING "could not preload radix tree "
+				"in %s\n", __func__);
+			goto done;
+		}
+	}
+
+	pn = mem_cgroup_radix_tree_info[node];
+	spin_lock_irqsave(&pn->per_zone[zone].lock, flags);
+	old_pc = radix_tree_lookup(&pn->per_zone[zone].tree, pfn);
+	if (pc && old_pc) {
+		err = -EEXIST;
+		goto pc_race;
+	}
+	if (pc) {
+		err = radix_tree_insert(&pn->per_zone[zone].tree, pfn, pc);
+		if (err)
+			printk(KERN_WARNING "Inserting into radix tree failed "
+				"in %s\n", __func__);
+	} else
+		radix_tree_delete(&pn->per_zone[zone].tree, pfn);
+pc_race:
+	spin_unlock_irqrestore(&pn->per_zone[zone].lock, flags);
+	if (pc)
+		radix_tree_preload_end();
+done:
+	return err;
 }
 
-static void unlock_page_cgroup(struct page *page)
+struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+						bool trylock)
 {
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	unsigned long pfn = page_to_pfn(page);
+	struct page_cgroup *pc;
+	int ret;
+	int node = page_to_nid(page);
+	int zone = page_zonenum(page);
+	struct mem_cgroup_radix_tree_per_node *pn;
+
+	/*
+	 * If radix tree is not initialized, then there is no association
+	 * between page_cgroups and pages. This is likely to occur at
+	 * boot time (from free_all_bootmem... leading to free_hot_cold_page)
+	 */
+	if (!radix_tree_initialized)
+		return NULL;
+
+	BUG_ON(lock && trylock);
+
+	pn = mem_cgroup_radix_tree_info[node];
+	rcu_read_lock();
+	pc = radix_tree_lookup(&pn->per_zone[zone].tree, pfn);
+
+	if (pc && lock)
+		lock_page_cgroup(pc);
+
+	if (pc && trylock) {
+		ret = trylock_page_cgroup(pc);
+		if (!ret)
+			pc = NULL;
+	}
+
+	rcu_read_unlock();
+
+	return pc;
 }
 
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
@@ -377,17 +455,15 @@ void mem_cgroup_move_lists(struct page *
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = page_get_page_cgroup_trylock(page);
+	if (!pc)
 		return;
 
-	pc = page_get_page_cgroup(page);
-	if (pc) {
-		mz = page_cgroup_zoneinfo(pc);
-		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, lru);
-		spin_unlock_irqrestore(&mz->lru_lock, flags);
-	}
-	unlock_page_cgroup(page);
+	mz = page_cgroup_zoneinfo(pc);
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	__mem_cgroup_move_lists(pc, lru);
+	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 }
 
 /*
@@ -516,7 +592,7 @@ static int mem_cgroup_charge_common(stru
 				struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
+	struct page_cgroup *pc, *old_pc;
 	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
@@ -569,35 +645,49 @@ static int mem_cgroup_charge_common(stru
 
 	pc->mem_cgroup = mem;
 	pc->page = page;
+	pc->flags = 0;		/* No lock, no other bits either */
+
 	/*
 	 * If a page is accounted as a page cache, insert to inactive list.
 	 * If anon, insert to active list.
 	 */
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
-		pc->flags = PAGE_CGROUP_FLAG_CACHE;
+		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
 		if (page_is_file_cache(page))
 			pc->flags |= PAGE_CGROUP_FLAG_FILE;
 		else
 			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
 	} else
-		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
+		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
 
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
+	old_pc = page_get_page_cgroup_locked(page);
+	if (old_pc) {
+		unlock_page_cgroup(old_pc);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		css_put(&mem->css);
+		kmem_cache_free(page_cgroup_cache, pc);
+		goto done;
+	}
+
+	lock_page_cgroup(pc);
+	/*
+	 * page_get_page_cgroup() does not necessarily guarantee that
+	 * there will be no race in checking for pc, page_assign_page_pc()
+	 * will definitely catch it.
+	 */
+	if (page_assign_page_cgroup(page, pc, gfp_mask)) {
+		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
 		kmem_cache_free(page_cgroup_cache, pc);
 		goto done;
 	}
-	page_assign_page_cgroup(page, pc);
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 done:
 	return 0;
 out:
@@ -645,15 +735,13 @@ int mem_cgroup_cache_charge(struct page 
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
+		pc = page_get_page_cgroup_locked(page);
 		if (pc) {
 			VM_BUG_ON(pc->page != page);
 			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
 	}
 
 	if (unlikely(!mm))
@@ -673,6 +761,7 @@ __mem_cgroup_uncharge_common(struct page
 	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
+	int ret;
 
 	if (mem_cgroup_subsys.disabled)
 		return;
@@ -680,8 +769,7 @@ __mem_cgroup_uncharge_common(struct page
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
+	pc = page_get_page_cgroup_locked(page);
 	if (unlikely(!pc))
 		goto unlock;
 
@@ -697,8 +785,9 @@ __mem_cgroup_uncharge_common(struct page
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
+	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
+	VM_BUG_ON(ret);
+	unlock_page_cgroup(pc);
 
 	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
@@ -707,7 +796,14 @@ __mem_cgroup_uncharge_common(struct page
 	kmem_cache_free(page_cgroup_cache, pc);
 	return;
 unlock:
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
+}
+
+void page_reset_bad_cgroup(struct page *page)
+{
+	int ret;
+	ret = page_assign_page_cgroup(page, NULL, GFP_KERNEL);
+	VM_BUG_ON(ret);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
@@ -734,15 +830,14 @@ int mem_cgroup_prepare_migration(struct 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
+	pc = page_get_page_cgroup_locked(page);
 	if (pc) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
 			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		unlock_page_cgroup(pc);
 	}
-	unlock_page_cgroup(page);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 			ctype, mem);
@@ -1038,6 +1133,38 @@ static struct cftype mem_cgroup_files[] 
 	},
 };
 
+/**
+ * alloc_mem_cgroup_per_zone_radix_tree_info - allocate radix tree info
+ * @node: node for which we intend to alloc radix tree info
+ *
+ * NOTE: using per zone radix trees might not be such a great idea, since
+ * we don't allocate any of the page cgroups using __GFP_HIGHMEM
+ */
+static int alloc_mem_cgroup_per_zone_radix_tree_info(int node)
+{
+	struct mem_cgroup_radix_tree_per_node *pn;
+	int n = node;
+	int zone;
+
+	if (!node_state(node, N_NORMAL_MEMORY))
+		n = -1;
+
+	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, n);
+	if (!pn)
+		return -ENOMEM;
+
+	mem_cgroup_radix_tree_info[node] = pn;
+
+	for (zone = 0; zone < MAX_NR_ZONES; zone++)
+		spin_lock_init(&pn->per_zone[zone].lock);
+	return 0;
+}
+
+static void free_mem_cgroup_per_zone_radix_tree_info(int node)
+{
+	kfree(mem_cgroup_radix_tree_info[node]);
+}
+
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
@@ -1103,10 +1230,16 @@ mem_cgroup_create(struct cgroup_subsys *
 {
 	struct mem_cgroup *mem;
 	int node;
+	bool radix_tree_info_allocated = false;
 
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
 		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
+		radix_tree_info_allocated = true;
+		for_each_node_state(node, N_POSSIBLE)
+			if (alloc_mem_cgroup_per_zone_radix_tree_info(node))
+				goto cleanup_radix_tree;
+		radix_tree_initialized = 1;
 	} else {
 		mem = mem_cgroup_alloc();
 		if (!mem)
@@ -1120,6 +1253,9 @@ mem_cgroup_create(struct cgroup_subsys *
 			goto free_out;
 
 	return &mem->css;
+cleanup_radix_tree:
+	free_mem_cgroup_per_zone_radix_tree_info(node);
+	return ERR_PTR(-ENOMEM);
 free_out:
 	for_each_node_state(node, N_POSSIBLE)
 		free_mem_cgroup_per_zone_info(mem, node);
diff -puN include/linux/memcontrol.h~memcg_move_to_radix_tree include/linux/memcontrol.h
--- linux-2.6.27-rc5/include/linux/memcontrol.h~memcg_move_to_radix_tree	2008-09-16 17:10:15.000000000 -0700
+++ linux-2.6.27-rc5-balbir/include/linux/memcontrol.h	2008-09-16 17:10:15.000000000 -0700
@@ -27,9 +27,28 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
+extern void page_reset_bad_cgroup(struct page *page);
+extern struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
+							bool trylock);
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, false);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_trylock(struct page *page)
+{
+	return __page_get_page_cgroup(page, false, true);
+}
+
+static __always_inline
+struct page_cgroup *page_get_page_cgroup_locked(struct page *page)
+{
+	return __page_get_page_cgroup(page, true, false);
+}
 
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
diff -puN include/linux/mm_types.h~memcg_move_to_radix_tree include/linux/mm_types.h
--- linux-2.6.27-rc5/include/linux/mm_types.h~memcg_move_to_radix_tree	2008-09-16 17:10:15.000000000 -0700
+++ linux-2.6.27-rc5-balbir/include/linux/mm_types.h	2008-09-16 17:10:15.000000000 -0700
@@ -92,10 +92,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
-
 #ifdef CONFIG_KMEMCHECK
 	void *shadow;
 #endif
diff -puN mm/page_alloc.c~memcg_move_to_radix_tree mm/page_alloc.c
diff -puN include/linux/mmzone.h~memcg_move_to_radix_tree include/linux/mmzone.h
_
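
For reviewers, here is a small caller-side sketch (illustrative only, not
part of the diff above) of how the new helpers are meant to be used: the old
lock_page_cgroup(page)/page->page_cgroup dereference becomes one locked
radix-tree lookup keyed by pfn. is_page_file_cache() is a made-up example
helper, not something this patch adds.

static int is_page_file_cache(struct page *page)
{
	struct page_cgroup *pc;
	int ret;

	pc = page_get_page_cgroup_locked(page);	/* lookup + lock, or NULL */
	if (!pc)
		return 0;
	ret = !!(pc->flags & PAGE_CGROUP_FLAG_FILE);
	unlock_page_cgroup(pc);			/* note: takes pc, not the page */
	return ret;
}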

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-17 23:28                                                     ` [RFC][PATCH] Remove cgroup member from struct page (v3) Balbir Singh
@ 2008-09-18  1:40                                                       ` Andrew Morton
  2008-09-18  3:57                                                         ` Balbir Singh
                                                                           ` (2 more replies)
  0 siblings, 3 replies; 72+ messages in thread
From: Andrew Morton @ 2008-09-18  1:40 UTC (permalink / raw)
  To: balbir
  Cc: KAMEZAWA Hiroyuki, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 17 Sep 2008 16:28:26 -0700 Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> 
> Before trying the sparsemem approach, I tried a radix tree per node,
> per zone and I seemed to actually get some performance
> improvement.(1.5% (noise maybe))
> 
> But please do see and review (tested on my x86_64 box with unixbench
> and some other simple tests)
> 
> v4..v3
> 1. Use a radix tree per node, per zone
> 
> v3...v2
> 1. Convert flags to unsigned long
> 2. Move page_cgroup->lock to a bit spin lock in flags
> 
> v2...v1
> 
> 1. Fix a small bug, don't call radix_tree_preload_end(), if preload fails
> 
> This is a rewrite of a patch I had written long back to remove struct page
> (I shared the patches with Kamezawa, but never posted them anywhere else).
> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> 
> I've tested the patches on an x86_64 box, I've run a simple test running
> under the memory control group and the same test running concurrently under
> two different groups (and creating pressure within their groups).
> 
> Advantages of the patch
> 
> 1. It removes the extra pointer in struct page
> 
> Disadvantages
> 
> 1. Radix tree lookup is not an O(1) operation, once the page is known
>    getting to the page_cgroup (pc) is a little more expensive now.

Why are we doing this?  I can guess, but I'd rather not have to.

a) It's slower.

b) It uses even more memory worst-case.

c) It uses less memory best-case.

someone somewhere decided that (Aa + Bb) / Cc < 1.0.  What are the values
of A, B and C and where did they come from? ;)

(IOW, your changelog is in the category "sucky", along with 90% of the others)

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  1:40                                                       ` Andrew Morton
@ 2008-09-18  3:57                                                         ` Balbir Singh
  2008-09-18  5:00                                                           ` KAMEZAWA Hiroyuki
  2008-09-18  4:26                                                         ` Hirokazu Takahashi
  2008-09-18  4:43                                                         ` KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 72+ messages in thread
From: Balbir Singh @ 2008-09-18  3:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KAMEZAWA Hiroyuki, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

Andrew Morton wrote:
> On Wed, 17 Sep 2008 16:28:26 -0700 Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> Before trying the sparsemem approach, I tried a radix tree per node,
>> per zone and I seemed to actually get some performance
>> improvement.(1.5% (noise maybe))
>>
>> But please do see and review (tested on my x86_64 box with unixbench
>> and some other simple tests)
>>
>> v4..v3
>> 1. Use a radix tree per node, per zone
>>
>> v3...v2
>> 1. Convert flags to unsigned long
>> 2. Move page_cgroup->lock to a bit spin lock in flags
>>
>> v2...v1
>>
>> 1. Fix a small bug, don't call radix_tree_preload_end(), if preload fails
>>
>> This is a rewrite of a patch I had written long back to remove struct page
>> (I shared the patches with Kamezawa, but never posted them anywhere else).
>> I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
>>
>> I've tested the patches on an x86_64 box, I've run a simple test running
>> under the memory control group and the same test running concurrently under
>> two different groups (and creating pressure within their groups).
>>
>> Advantages of the patch
>>
>> 1. It removes the extra pointer in struct page
>>
>> Disadvantages
>>
>> 1. Radix tree lookup is not an O(1) operation, once the page is known
>>    getting to the page_cgroup (pc) is a little more expensive now.
> 
> Why are we doing this?  I can guess, but I'd rather not have to.
> 
> a) It's slower.
> 
> b) It uses even more memory worst-case.
> 
> c) It uses less memory best-case.
> 
> someone somewhere decided that (Aa + Bb) / Cc < 1.0.  What are the values
> of A, B and C and where did they come from? ;)
> 
> (IOW, your changelog is in the category "sucky", along with 90% of the others)

Yes, I agree, to be honest we discussed the reasons on the mailing list and
those should go to the changelog. I'll do that in the next version of the
patches. These are early RFC patches, but the changelog does suck.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  1:40                                                       ` Andrew Morton
  2008-09-18  3:57                                                         ` Balbir Singh
@ 2008-09-18  4:26                                                         ` Hirokazu Takahashi
  2008-09-18  4:50                                                           ` KAMEZAWA Hiroyuki
  2008-09-18  4:43                                                         ` KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 72+ messages in thread
From: Hirokazu Takahashi @ 2008-09-18  4:26 UTC (permalink / raw)
  To: akpm
  Cc: balbir, kamezawa.hiroyu, dave, nickpiggin, hugh, menage, xemul,
	linux-kernel, linux-mm

Hi,

> > Before trying the sparsemem approach, I tried a radix tree per node,
> > per zone and I seemed to actually get some performance
> > improvement.(1.5% (noise maybe))
> > 
> > But please do see and review (tested on my x86_64 box with unixbench
> > and some other simple tests)
> > 
> > v4..v3
> > 1. Use a radix tree per node, per zone
> > 
> > v3...v2
> > 1. Convert flags to unsigned long
> > 2. Move page_cgroup->lock to a bit spin lock in flags
> > 
> > v2...v1
> > 
> > 1. Fix a small bug, don't call radix_tree_preload_end(), if preload fails
> > 
> > This is a rewrite of a patch I had written long back to remove struct page
> > (I shared the patches with Kamezawa, but never posted them anywhere else).
> > I spent the weekend, cleaning them up for 2.6.27-rc5-mmotm (29 Aug 2008).
> > 
> > I've tested the patches on an x86_64 box, I've run a simple test running
> > under the memory control group and the same test running concurrently under
> > two different groups (and creating pressure within their groups).
> > 
> > Advantages of the patch
> > 
> > 1. It removes the extra pointer in struct page
> > 
> > Disadvantages
> > 
> > 1. Radix tree lookup is not an O(1) operation, once the page is known
> >    getting to the page_cgroup (pc) is a little more expensive now.
> 
> Why are we doing this?  I can guess, but I'd rather not have to.

I think this design is just temporary and the goal is to pre-allocate
all page_cgroups at boot time if it isn't disabled.

But I think each memory model type should have its own way of managing
its page_cgroup arrays as doing for its struct page arrays.
It would be better rather than the sparsemem approach he said.

> a) It's slower.
> 
> b) It uses even more memory worst-case.
> 
> c) It uses less memory best-case.
> 
> someone somewhere decided that (Aa + Bb) / Cc < 1.0.  What are the values
> of A, B and C and where did they come from? ;)
> 
> (IOW, your changelog is in the category "sucky", along with 90% of the others)
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  1:40                                                       ` Andrew Morton
  2008-09-18  3:57                                                         ` Balbir Singh
  2008-09-18  4:26                                                         ` Hirokazu Takahashi
@ 2008-09-18  4:43                                                         ` KAMEZAWA Hiroyuki
  2008-09-18  4:58                                                           ` Balbir Singh
  2 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-18  4:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: balbir, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 17 Sep 2008 18:40:08 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:
> > Advantages of the patch
> > 
> > 1. It removes the extra pointer in struct page
> > 
> > Disadvantages
> > 
> > 1. Radix tree lookup is not an O(1) operation, once the page is known
> >    getting to the page_cgroup (pc) is a little more expensive now.
> 
> Why are we doing this?  I can guess, but I'd rather not have to.
> 
> a) It's slower.
> 
> b) It uses even more memory worst-case.
> 
> c) It uses less memory best-case.
> 
> someone somewhere decided that (Aa + Bb) / Cc < 1.0.  What are the values
> of A, B and C and where did they come from? ;)
> 

Balbir, don't you like pre-allocate-page-cgroup-at-boot at all ?
I don't like radix-tree for objects which can spread to very vast/sparse area ;)

BTW, I already have lazy-lru-by-pagevec protocol on my patch(hash version) and
seems to work well. I'm now testing it and will post today if I'm enough lucky.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  4:26                                                         ` Hirokazu Takahashi
@ 2008-09-18  4:50                                                           ` KAMEZAWA Hiroyuki
  2008-09-18  6:13                                                             ` Hirokazu Takahashi
  0 siblings, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-18  4:50 UTC (permalink / raw)
  To: Hirokazu Takahashi
  Cc: akpm, balbir, dave, nickpiggin, hugh, menage, xemul,
	linux-kernel, linux-mm

On Thu, 18 Sep 2008 13:26:13 +0900 (JST)
Hirokazu Takahashi <taka@valinux.co.jp> wrote:


> But I think each memory model type should have its own way of managing
> its page_cgroup arrays as doing for its struct page arrays.
> It would be better rather than the sparsemem approach he said.
> 
My patch adds an interface. Then...
FLATMEM support will be very easy.
I'll ignore DISCONTIGMEM and SPARSEMEM (they will use my 'hash')
SPARSEMEM_VMEMMAP support will take some amount of time. It will need
per-arch patches.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  4:43                                                         ` KAMEZAWA Hiroyuki
@ 2008-09-18  4:58                                                           ` Balbir Singh
  2008-09-18  5:15                                                             ` KAMEZAWA Hiroyuki
  2008-09-18 11:01                                                             ` KAMEZAWA Hiroyuki
  0 siblings, 2 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-18  4:58 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Wed, 17 Sep 2008 18:40:08 -0700
> Andrew Morton <akpm@linux-foundation.org> wrote:
>>> Advantages of the patch
>>>
>>> 1. It removes the extra pointer in struct page
>>>
>>> Disadvantages
>>>
>>> 1. Radix tree lookup is not an O(1) operation, once the page is known
>>>    getting to the page_cgroup (pc) is a little more expensive now.
>> Why are we doing this?  I can guess, but I'd rather not have to.
>>
>> a) It's slower.
>>
>> b) It uses even more memory worst-case.
>>
>> c) It uses less memory best-case.
>>
>> someone somewhere decided that (Aa + Bb) / Cc < 1.0.  What are the values
>> of A, B and C and where did they come from? ;)
>>
> 
> Balbir, don't you like pre-allocate-page-cgroup-at-boot at all ?
> I don't like radix-tree for objects which can spread to very vast/sparse area ;)
> 

I tried one version, but before trying a pre-allocation version, I wanted to
spread out the radix-tree and try it, and the results seemed quite impressive. We
can still do pre-allocation, but it gets more complicated as we start supporting
all memory models. I do have a design on paper, but it is much more complex than
this.

> BTW, I already have lazy-lru-by-pagevec protocol on my patch(hash version) and
> seems to work well. I'm now testing it and will post today if I'm enough lucky.

cool! Please do post what numbers you see as well. I would appreciate if you can
try this version and see what sort of performance issues you see.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  3:57                                                         ` Balbir Singh
@ 2008-09-18  5:00                                                           ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-18  5:00 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 17 Sep 2008 20:57:54 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> Andrew Morton wrote:

> > someone somewhere decided that (Aa + Bb) / Cc < 1.0.  What are the values
> > of A, B and C and where did they come from? ;)
> > 
> > (IOW, your changelog is in the category "sucky", along with 90% of the others)
> 
> Yes, I agree, to be honest we discussed the reasons on the mailing list and
> those should go to the changelog. I'll do that in the next version of the
> patches. These are early RFC patches, but the changelog does suck.
> 
IIRC, the (Aa + Bb) / Cc < 1.0 discussion was the following.

Because we have to maintain a pointer to page_cgroup in a radix-tree
(ZONE_NORMAL):

1. memory usage will increase when the memory cgroup is enabled.
   The amount of the increase just depends on the height of the radix-tree.

2. memory usage will decrease when the memory cgroup is disabled.
   This saves 4 bytes per 4096 bytes (on x86-32).
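
As a rough illustration (arithmetic only, not a measurement): 4GB of RAM is
about one million 4KB pages, so the removed 4-byte pointer is roughly 4MB of
struct page saved while the controller is compiled in but unused. With the
controller enabled, the radix-tree that now holds the page_cgroup pointers
costs at least that much back, plus internal-node overhead (the worst case
mentioned upthread).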

Thanks,
-Kame  


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  4:58                                                           ` Balbir Singh
@ 2008-09-18  5:15                                                             ` KAMEZAWA Hiroyuki
  2008-09-18 11:01                                                             ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-18  5:15 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 17 Sep 2008 21:58:08 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:

> > BTW, I already have lazy-lru-by-pagevec protocol on my patch(hash version) and
> > seems to work well. I'm now testing it and will post today if I'm enough lucky.
> 
> cool! Please do post what numbers you see as well. I would appreciate if you can
> try this version and see what sort of performance issues you see.
> 
I'll see what happens. Your patch is against rc6-mmotm?

BTW, my 8cpu/2socket/Xeon 3.16GHz host is back now. Maybe good for seeing performance.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  4:50                                                           ` KAMEZAWA Hiroyuki
@ 2008-09-18  6:13                                                             ` Hirokazu Takahashi
  0 siblings, 0 replies; 72+ messages in thread
From: Hirokazu Takahashi @ 2008-09-18  6:13 UTC (permalink / raw)
  To: kamezawa.hiroyu
  Cc: akpm, balbir, dave, nickpiggin, hugh, menage, xemul,
	linux-kernel, linux-mm

Hello,

> > But I think each memory model type should have its own way of managing
> > its page_cgroup arrays as doing for its struct page arrays.
> > It would be better rather than the sparsemem approach he said.
> > 
> My patch adds an interface. Then...
> FLATMEM support will be very easy.

Yes, that is true, it is really easy.

> I'll ignore DISCONTIGMEM and SPARSEMEM (they will use my 'hash')

What part of this do you think makes it a problem to implement in the
straightforward way for this model?
I think it won't be difficult to implement, since each pgdat can have
its own page_cgroup array, which can take care of holes in the node just
as is done for its struct page array.
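
Something like this rough sketch is what I have in mind (illustrative only;
node_page_cgroup is a hypothetical field such an approach would have to add
to pg_data_t, nothing allocates it here):

static inline struct page_cgroup *pfn_to_page_cgroup(unsigned long pfn)
{
	/* per-pgdat array parallel to the struct page array, indexed by
	 * pfn offset, much like DISCONTIGMEM's pfn_to_page() */
	pg_data_t *pgdat = NODE_DATA(pfn_to_nid(pfn));

	return pgdat->node_page_cgroup + (pfn - pgdat->node_start_pfn);
}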

> SPARSEMEM_VMEMMAP support will take some amount of time. It will need
> per-arch patches.

Yes, each of ia64, powerpc and x86_64 uses this memory model.

We should also care about the regular SPARSEMEM case as you mentioned.

> Thanks,
> -Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18  4:58                                                           ` Balbir Singh
  2008-09-18  5:15                                                             ` KAMEZAWA Hiroyuki
@ 2008-09-18 11:01                                                             ` KAMEZAWA Hiroyuki
  2008-09-18 23:56                                                               ` Balbir Singh
  1 sibling, 1 reply; 72+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-09-18 11:01 UTC (permalink / raw)
  To: balbir
  Cc: Andrew Morton, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

On Wed, 17 Sep 2008 21:58:08 -0700
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > BTW, I already have lazy-lru-by-pagevec protocol on my patch(hash version) and
> > seems to work well. I'm now testing it and will post today if I'm enough lucky.
> 
> cool! Please do post what numbers you see as well. I would appreciate if you can
> try this version and see what sort of performance issues you see.
> 

This is the result on 8cpu box. I think I have to reduce footprint of fastpath of
my patch ;)

Test result of your patch is (2).
==
Xeon 8cpu/2socket/1-node equips 48GB of memory.
run shell/exec benchmark 3 times just after boot.

lps ... loops per sec.
lpm ... loops per min.
(*) Shell tests sometimes fail because of division by zero, etc...

(1). rc6-mm1(2008/9/13 version)
==
Run                                       == 1st ==  == 2nd ==  ==3rd==
Execl Throughput                           2425.2     2534.5     2465.8  (lps)
C Compiler Throughput                      1438.3     1476.3     1459.1  (lpm)
Shell Scripts (1 concurrent)               9360.3     9368.3     9360.0  (lpm)
Shell Scripts (8 concurrent)               3868.0     3870.0     3868.0  (lpm)
Shell Scripts (16 concurrent)              2207.0     2204.0     2201.0  (lpm)
Dc: sqrt(2) to 99 decimal places         101644.3   102184.5   102118.5  (lpm)

(2). (1) +remove-page-cgroup-pointer-v3 (radix-tree + dynamic allocation)
==
Run                                       == 1st ==  == 2nd ==  == 3rd ==
Execl Throughput                           2514.1      2548.9    2648.7  (lps)
C Compiler Throughput                      1353.9      1324.6    1324.7  (lpm)
Shell Scripts (1 concurrent)               8866.7      8871.0    8856.0  (lpm)
Shell Scripts (8 concurrent)               3674.3      3680.0    3677.7  (lpm)
Shell Scripts (16 concurrent)              failed.     failed    2094.3  (lpm)
Dc: sqrt(2) to 99 decimal places          98837.0     98206.9   98250.6  (lpm)

(3). (1) + pre-allocation by "vmalloc" + hash + misc(atomic flags etc..)
==
Run                                       == 1st ==  == 2nd ==  == 3rd ==
Execl Throughput                           2385.4      2579.2    2361.5  (lps)
C Compiler Throughput                      1424.3      1436.3    1430.6  (lpm)
Shell Scripts (1 concurrent)               9222.0      9234.0    9246.7  (lpm)
Shell Scripts (8 concurrent)               3787.7      3799.3    failed  (lpm)
Shell Scripts (16 concurrent)              2165.7      2166.7    failed  (lpm)
Dc: sqrt(2) to 99 decimal places         102228.9    102658.5   104049.8 (lpm)

(4). (3) + get/put page charge/uncharge + lazy lru handling
Run                                       == 1st ==  == 2nd ==  == 3rd ==
Execl Throughput                           2349.4      2335.7    2338.9  (lps)
C Compiler Throughput                      1430.8      1445.0    1435.3  (lpm)
Shell Scripts (1 concurrent)               9250.3      9262.0    9265.0  (lpm)
Shell Scripts (8 concurrent)               3831.0      3834.4    3833.3  (lpm)
Shell Scripts (16 concurrent)              2193.3      2195.3    2196.0  (lpm)
Dc: sqrt(2) to 99 decimal places         102956.8    102886.9   101884.6 (lpm)


It seems the "execl" test is affected more by footprint and cache hit rate
than the other tests. I need some more effort to reduce the overhead in (4).

Note:
(1)'s struct page is 64 bytes.
(2)(3)(4)'s struct page is 56 bytes.
 

-Kame


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [RFC][PATCH] Remove cgroup member from struct page (v3)
  2008-09-18 11:01                                                             ` KAMEZAWA Hiroyuki
@ 2008-09-18 23:56                                                               ` Balbir Singh
  0 siblings, 0 replies; 72+ messages in thread
From: Balbir Singh @ 2008-09-18 23:56 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Dave Hansen, Nick Piggin, hugh, menage, xemul,
	linux-kernel, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Wed, 17 Sep 2008 21:58:08 -0700
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>> BTW, I already have lazy-lru-by-pagevec protocol on my patch(hash version) and
>>> seems to work well. I'm now testing it and will post today if I'm enough lucky.
>> cool! Please do post what numbers you see as well. I would appreciate if you can
>> try this version and see what sort of performance issues you see.
>>
> 
> This is the result on 8cpu box. I think I have to reduce footprint of fastpath of
> my patch ;)
> 
> Test result of your patch is (2).
> ==
> Xeon 8cpu/2socket/1-node equips 48GB of memory.
> run shell/exec benchmark 3 times just after boot.
> 
> lps ... loops per sec.
> lpm ... loops per min.
> (*) Shell tests sometimes fail because of division by zero, etc...
> 
> (1). rc6-mm1(2008/9/13 version)
> ==
> Run                                       == 1st ==  == 2nd ==  ==3rd==
> Execl Throughput                           2425.2     2534.5     2465.8  (lps)
> C Compiler Throughput                      1438.3     1476.3     1459.1  (lpm)
> Shell Scripts (1 concurrent)               9360.3     9368.3     9360.0  (lpm)
> Shell Scripts (8 concurrent)               3868.0     3870.0     3868.0  (lpm)
> Shell Scripts (16 concurrent)              2207.0     2204.0     2201.0  (lpm)
> Dc: sqrt(2) to 99 decimal places         101644.3   102184.5   102118.5  (lpm)
> 
> (2). (1) +remove-page-cgroup-pointer-v3 (radix-tree + dynamic allocation)
> ==
> Run                                       == 1st ==  == 2nd ==  == 3rd ==
> Execl Throughput                           2514.1      2548.9    2648.7  (lps)
> C Compiler Throughput                      1353.9      1324.6    1324.7  (lpm)
> Shell Scripts (1 concurrent)               8866.7      8871.0    8856.0  (lpm)
> Shell Scripts (8 concurrent)               3674.3      3680.0    3677.7  (lpm)
> Shell Scripts (16 concurrent)              failed.     failed    2094.3  (lpm)
> Dc: sqrt(2) to 99 decimal places          98837.0     98206.9   98250.6  (lpm)
> 
> (3). (1) + pre-allocation by "vmalloc" + hash + misc(atomic flags etc..)
> ==
> Run                                       == 1st ==  == 2nd ==  == 3rd ==
> Execl Throughput                           2385.4      2579.2    2361.5  (lps)
> C Compiler Throughput                      1424.3      1436.3    1430.6  (lpm)
> Shell Scripts (1 concurrent)               9222.0      9234.0    9246.7  (lpm)
> Shell Scripts (8 concurrent)               3787.7      3799.3    failed  (lpm)
> Shell Scripts (16 concurrent)              2165.7      2166.7    failed  (lpm)
> Dc: sqrt(2) to 99 decimal places         102228.9    102658.5   104049.8 (lpm)
> 
> (4). (3) + get/put page charge/uncharge + lazy lru handling
> Run                                       == 1st ==  == 2nd ==  == 3rd ==
> Execl Throughput                           2349.4      2335.7    2338.9  (lps)
> C Compiler Throughput                      1430.8      1445.0    1435.3  (lpm)
> Shell Scripts (1 concurrent)               9250.3      9262.0    9265.0  (lpm)
> Shell Scripts (8 concurrent)               3831.0      3834.4    3833.3  (lpm)
> Shell Scripts (16 concurrent)              2193.3      2195.3    2196.0  (lpm)
> Dc: sqrt(2) to 99 decimal places         102956.8    102886.9   101884.6 (lpm)
> 
> 
> It seems the "execl" test is affected more by footprint and cache hit rate
> than the other tests. I need some more effort to reduce the overhead in (4).
> 
> Note:
> (1)'s struct page is 64 bytes.
> (2)(3)(4)'s struct page is 56 bytes.

Thanks, Kame! I'll look at the lazy lru patches and see if I can find anything.
Do you have a unified patch anywhere? I seem to get confused with the patches;
I see 10/9, 11/9 and 12/9. I'll do some analysis when I find some free time;
I am currently at the Plumbers conference.

-- 
	Balbir

^ permalink raw reply	[flat|nested] 72+ messages in thread

end of thread, other threads:[~2008-09-18 23:56 UTC | newest]

Thread overview: 72+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-08-31 17:47 [RFC][PATCH] Remove cgroup member from struct page Balbir Singh
2008-09-01  0:01 ` KAMEZAWA Hiroyuki
2008-09-01  3:28   ` Balbir Singh
2008-09-01  4:03     ` KAMEZAWA Hiroyuki
2008-09-01  5:17       ` KAMEZAWA Hiroyuki
2008-09-01  6:16         ` Balbir Singh
2008-09-01  6:09       ` Balbir Singh
2008-09-01  6:24         ` KAMEZAWA Hiroyuki
2008-09-01  6:25           ` Balbir Singh
2008-09-01  6:59             ` KAMEZAWA Hiroyuki
2008-09-01  6:56   ` Nick Piggin
2008-09-01  7:17     ` Balbir Singh
2008-09-01  7:19     ` KAMEZAWA Hiroyuki
2008-09-01  7:43       ` Nick Piggin
2008-09-02  9:24         ` Balbir Singh
2008-09-02 10:02           ` KAMEZAWA Hiroyuki
2008-09-02  9:58             ` Balbir Singh
2008-09-02 10:07               ` KAMEZAWA Hiroyuki
2008-09-02 10:12                 ` Balbir Singh
2008-09-02 10:57                   ` KAMEZAWA Hiroyuki
2008-09-02 12:37                     ` Balbir Singh
2008-09-03  3:33                       ` KAMEZAWA Hiroyuki
2008-09-03  7:31                         ` Balbir Singh
2008-09-08 15:28                         ` Balbir Singh
2008-09-09  3:57                           ` KAMEZAWA Hiroyuki
2008-09-09  3:58                             ` Nick Piggin
2008-09-09  4:53                               ` KAMEZAWA Hiroyuki
2008-09-09  5:00                                 ` Nick Piggin
2008-09-09  5:12                                   ` KAMEZAWA Hiroyuki
2008-09-09 12:24                                     ` Balbir Singh
2008-09-09 12:28                                       ` Nick Piggin
2008-09-09 12:30                                     ` kamezawa.hiroyu
2008-09-09 12:34                                       ` Balbir Singh
2008-09-10  1:20                                       ` [Approach #2] " Balbir Singh
2008-09-10  1:49                                         ` KAMEZAWA Hiroyuki
2008-09-10  2:11                                           ` Balbir Singh
2008-09-10  2:35                                             ` KAMEZAWA Hiroyuki
2008-09-10 20:44                                           ` Nick Piggin
2008-09-10 11:03                                             ` KAMEZAWA Hiroyuki
2008-09-10 21:02                                               ` Nick Piggin
2008-09-10 11:27                                                 ` KAMEZAWA Hiroyuki
2008-09-10 14:34                                                   ` Balbir Singh
2008-09-10 22:21                                         ` Dave Hansen
2008-09-10 22:31                                           ` David Miller
2008-09-10 22:36                                           ` Balbir Singh
2008-09-10 22:56                                             ` Dave Hansen
2008-09-11  1:35                                               ` KAMEZAWA Hiroyuki
2008-09-11  1:47                                                 ` Balbir Singh
2008-09-11  1:56                                                   ` KAMEZAWA Hiroyuki
2008-09-17 23:28                                                     ` [RFC][PATCH] Remove cgroup member from struct page (v3) Balbir Singh
2008-09-18  1:40                                                       ` Andrew Morton
2008-09-18  3:57                                                         ` Balbir Singh
2008-09-18  5:00                                                           ` KAMEZAWA Hiroyuki
2008-09-18  4:26                                                         ` Hirokazu Takahashi
2008-09-18  4:50                                                           ` KAMEZAWA Hiroyuki
2008-09-18  6:13                                                             ` Hirokazu Takahashi
2008-09-18  4:43                                                         ` KAMEZAWA Hiroyuki
2008-09-18  4:58                                                           ` Balbir Singh
2008-09-18  5:15                                                             ` KAMEZAWA Hiroyuki
2008-09-18 11:01                                                             ` KAMEZAWA Hiroyuki
2008-09-18 23:56                                                               ` Balbir Singh
2008-09-10 22:38                                           ` [Approach #2] [RFC][PATCH] Remove cgroup member from struct page Nick Piggin
2008-09-09  4:18                             ` Balbir Singh
2008-09-09  4:55                               ` KAMEZAWA Hiroyuki
2008-09-09  7:37                               ` KAMEZAWA Hiroyuki
2008-09-01  2:39 ` KAMEZAWA Hiroyuki
2008-09-01  3:42   ` Balbir Singh
2008-09-01  9:03 ` Pavel Emelyanov
2008-09-01  9:17   ` Balbir Singh
2008-09-01  9:43     ` Pavel Emelyanov
2008-09-01 13:19     ` Peter Zijlstra
2008-09-02  7:35       ` Balbir Singh
