On Wed, Apr 13, 2011 at 1:25 AM, KAMEZAWA Hiroyuki <
kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Wed, 13 Apr 2011 00:03:02 -0700
> Ying Han <yinghan@google.com> wrote:
>
> > There are two watermarks added per-memcg including "high_wmark" and
> "low_wmark".
> > The per-memcg kswapd is invoked when the memcg's memory
> usage(usage_in_bytes)
> > is higher than the low_wmark. Then the kswapd thread starts to reclaim
> pages
> > until the usage is lower than the high_wmark.
> >
> > Each watermark is calculated based on the hard_limit(limit_in_bytes) for
> each
> > memcg. Each time the hard_limit is changed, the corresponding wmarks are
> > re-calculated. Since memory controller charges only user pages, there is
> > no need for a "min_wmark". The current calculation of wmarks is a
> function of
> > "wmark_ratio" which is set to 0 by default. When the value is 0, the
> watermarks
> > are equal to the hard_limit.
> >
> > changelog v3..v2:
> > 1. Add VM_BUG_ON() on couple of places.
> > 2. Remove the spinlock on the min_free_kbytes since the consequence of
> reading
> > stale data.
> > 3. Remove the "min_free_kbytes" API and replace it with wmark_ratio based
> on
> > hard_limit.
> >
> > changelog v2..v1:
> > 1. Remove the res_counter_charge on wmark due to performance concern.
> > 2. Move the new APIs min_free_kbytes, reclaim_wmarks into seperate
> commit.
> > 3. Calculate the min_free_kbytes automatically based on the
> limit_in_bytes.
> > 4. make the wmark to be consistant with core VM which checks the free
> pages
> > instead of usage.
> > 5. changed wmark to be boolean
> >
> > Signed-off-by: Ying Han <yinghan@google.com>
> > ---
> >  include/linux/memcontrol.h  |    1 +
> >  include/linux/res_counter.h |   80
> +++++++++++++++++++++++++++++++++++++++++++
> >  kernel/res_counter.c        |    6 +++
> >  mm/memcontrol.c             |   52 ++++++++++++++++++++++++++++
> >  4 files changed, 139 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 5a5ce70..3ece36d 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -82,6 +82,7 @@ int task_in_mem_cgroup(struct task_struct *task, const
> struct mem_cgroup *mem);
> >
> >  extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page
> *page);
> >  extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
> > +extern int mem_cgroup_watermark_ok(struct mem_cgroup *mem, int
> charge_flags);
> >
> >  static inline
> >  int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup
> *cgroup)
> > diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
> > index c9d625c..fa4181b 100644
> > --- a/include/linux/res_counter.h
> > +++ b/include/linux/res_counter.h
> > @@ -39,6 +39,16 @@ struct res_counter {
> >        */
> >       unsigned long long soft_limit;
> >       /*
> > +      * the limit that reclaim triggers. TODO: res_counter in mem
> > +      * or wmark_limit.
> > +      */
> > +     unsigned long long low_wmark_limit;
> > +     /*
> > +      * the limit that reclaim stops. TODO: res_counter in mem or
> > +      * wmark_limit.
> > +      */
>
> What does this TODO mean ?
>

Legacy comment. I will remove it.

>
>
> > +     unsigned long long high_wmark_limit;
> > +     /*
> >        * the number of unsuccessful attempts to consume the resource
> >        */
> >       unsigned long long failcnt;
> > @@ -55,6 +65,9 @@ struct res_counter {
> >
> >  #define RESOURCE_MAX (unsigned long long)LLONG_MAX
> >
> > +#define CHARGE_WMARK_LOW     0x01
> > +#define CHARGE_WMARK_HIGH    0x02
> > +
> >  /**
> >   * Helpers to interact with userspace
> >   * res_counter_read_u64() - returns the value of the specified member.
> > @@ -92,6 +105,8 @@ enum {
> >       RES_LIMIT,
> >       RES_FAILCNT,
> >       RES_SOFT_LIMIT,
> > +     RES_LOW_WMARK_LIMIT,
> > +     RES_HIGH_WMARK_LIMIT
> >  };
> >
> >  /*
> > @@ -147,6 +162,24 @@ static inline unsigned long long
> res_counter_margin(struct res_counter *cnt)
> >       return margin;
> >  }
> >
> > +static inline bool
> > +res_counter_high_wmark_limit_check_locked(struct res_counter *cnt)
> > +{
> > +     if (cnt->usage < cnt->high_wmark_limit)
> > +             return true;
> > +
> > +     return false;
> > +}
> > +
> > +static inline bool
> > +res_counter_low_wmark_limit_check_locked(struct res_counter *cnt)
> > +{
> > +     if (cnt->usage < cnt->low_wmark_limit)
> > +             return true;
> > +
> > +     return false;
> > +}
> > +
>
>
> >  /**
> >   * Get the difference between the usage and the soft limit
> >   * @cnt: The counter
> > @@ -169,6 +202,30 @@ res_counter_soft_limit_excess(struct res_counter
> *cnt)
> >       return excess;
> >  }
> >
> > +static inline bool
> > +res_counter_check_under_low_wmark_limit(struct res_counter *cnt)
> > +{
> > +     bool ret;
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&cnt->lock, flags);
> > +     ret = res_counter_low_wmark_limit_check_locked(cnt);
> > +     spin_unlock_irqrestore(&cnt->lock, flags);
> > +     return ret;
> > +}
> > +
> > +static inline bool
> > +res_counter_check_under_high_wmark_limit(struct res_counter *cnt)
> > +{
> > +     bool ret;
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&cnt->lock, flags);
> > +     ret = res_counter_high_wmark_limit_check_locked(cnt);
> > +     spin_unlock_irqrestore(&cnt->lock, flags);
> > +     return ret;
> > +}
> > +
>
> Why internal functions are named as _check_ ? I like _under_.
>

Changed and will be on next post.

>
>
> >  static inline void res_counter_reset_max(struct res_counter *cnt)
> >  {
> >       unsigned long flags;
> > @@ -214,4 +271,27 @@ res_counter_set_soft_limit(struct res_counter *cnt,
> >       return 0;
> >  }
> >
> > +static inline int
> > +res_counter_set_high_wmark_limit(struct res_counter *cnt,
> > +                             unsigned long long wmark_limit)
> > +{
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&cnt->lock, flags);
> > +     cnt->high_wmark_limit = wmark_limit;
> > +     spin_unlock_irqrestore(&cnt->lock, flags);
> > +     return 0;
> > +}
> > +
> > +static inline int
> > +res_counter_set_low_wmark_limit(struct res_counter *cnt,
> > +                             unsigned long long wmark_limit)
> > +{
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&cnt->lock, flags);
> > +     cnt->low_wmark_limit = wmark_limit;
> > +     spin_unlock_irqrestore(&cnt->lock, flags);
> > +     return 0;
> > +}
> >  #endif
> > diff --git a/kernel/res_counter.c b/kernel/res_counter.c
> > index 34683ef..206a724 100644
> > --- a/kernel/res_counter.c
> > +++ b/kernel/res_counter.c
> > @@ -19,6 +19,8 @@ void res_counter_init(struct res_counter *counter,
> struct res_counter *parent)
> >       spin_lock_init(&counter->lock);
> >       counter->limit = RESOURCE_MAX;
> >       counter->soft_limit = RESOURCE_MAX;
> > +     counter->low_wmark_limit = RESOURCE_MAX;
> > +     counter->high_wmark_limit = RESOURCE_MAX;
> >       counter->parent = parent;
> >  }
> >
> > @@ -103,6 +105,10 @@ res_counter_member(struct res_counter *counter, int
> member)
> >               return &counter->failcnt;
> >       case RES_SOFT_LIMIT:
> >               return &counter->soft_limit;
> > +     case RES_LOW_WMARK_LIMIT:
> > +             return &counter->low_wmark_limit;
> > +     case RES_HIGH_WMARK_LIMIT:
> > +             return &counter->high_wmark_limit;
> >       };
> >
> >       BUG();
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 4407dd0..664cdc5 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -272,6 +272,8 @@ struct mem_cgroup {
> >        */
> >       struct mem_cgroup_stat_cpu nocpu_base;
> >       spinlock_t pcp_counter_lock;
> > +
> > +     int wmark_ratio;
> >  };
> >
> >  /* Stuffs for move charges at task migration. */
> > @@ -353,6 +355,7 @@ static void mem_cgroup_get(struct mem_cgroup *mem);
> >  static void mem_cgroup_put(struct mem_cgroup *mem);
> >  static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> >  static void drain_all_stock_async(void);
> > +static unsigned long get_wmark_ratio(struct mem_cgroup *mem);
> >
> >  static struct mem_cgroup_per_zone *
> >  mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
> > @@ -813,6 +816,27 @@ static inline bool mem_cgroup_is_root(struct
> mem_cgroup *mem)
> >       return (mem == root_mem_cgroup);
> >  }
> >
> > +static void setup_per_memcg_wmarks(struct mem_cgroup *mem)
> > +{
> > +     u64 limit;
> > +     unsigned long wmark_ratio;
> > +
> > +     wmark_ratio = get_wmark_ratio(mem);
> > +     limit = mem_cgroup_get_limit(mem);
> > +     if (wmark_ratio == 0) {
> > +             res_counter_set_low_wmark_limit(&mem->res, limit);
> > +             res_counter_set_high_wmark_limit(&mem->res, limit);
> > +     } else {
> > +             unsigned long low_wmark, high_wmark;
> > +             unsigned long long tmp = (wmark_ratio * limit) / 100;
>
> could you make this ratio as /1000 ? percent is too big.
> And, considering misc. cases, I don't think having per-memcg "ratio" is
> good.
>
> How about following ?
>
>  - provides an automatic wmark without knob. 0 wmark is okay, for me.
>  - provides 2 intrerfaces as
>        memory.low_wmark_distance_in_bytes,  # == hard_limit - low_wmark.
>        memory.high_wmark_in_bytes,          # == hard_limit - high_wmark.
>   (need to add sanity check into set_limit.)
>
> Hmm. Making the wmarks tunable individually make sense to me. One problem I
do notice is that making the hard_limit as the bar might not working well on
over-committing system. Which means the per-cgroup background reclaim might
not be triggered before global memory pressure. Ideally, we would like to do
more per-cgroup reclaim before triggering global memory pressure.

How about adding the two APIs but make the calculation based on:

-- by default, the wmarks are equal to hard_limit. ( no background reclaim)
-- provides 2 intrerfaces as
       memory.low_wmark_distance_in_bytes,  # == min(hard_limit, soft_limit)
- low_wmark.
       memory.high_wmark_in_bytes,          # == min(hard_limit, soft_limit)
- high_wmark.


> > +
> > +             low_wmark = tmp;
> > +             high_wmark = tmp - (tmp >> 8);
> > +             res_counter_set_low_wmark_limit(&mem->res, low_wmark);
> > +             res_counter_set_high_wmark_limit(&mem->res, high_wmark);
> > +     }
> > +}
>
> Could you explan what low_wmark/high_wmark means somewhere ?
>

Will add comments.

>
> In this patch, kswapd runs while
>
>        high_wmark < usage < low_wmark
> ?
>
> Hmm, I like
>        low_wmark < usage < high_wmark.
>
> ;) because it's kswapd.
>
> I adopt the same concept of global kswapd where low_wmark triggers the
kswpd and hight_wmark stop it. And here, we have

(limit - high_wmark) < free < (limit - low_wmark)

--Ying

>
> > +
> >  /*
> >   * Following LRU functions are allowed to be used without PCG_LOCK.
> >   * Operations are called by routine of global LRU independently from
> memcg.
> > @@ -1195,6 +1219,16 @@ static unsigned int get_swappiness(struct
> mem_cgroup *memcg)
> >       return memcg->swappiness;
> >  }
> >
> > +static unsigned long get_wmark_ratio(struct mem_cgroup *memcg)
> > +{
> > +     struct cgroup *cgrp = memcg->css.cgroup;
> > +
> > +     VM_BUG_ON(!cgrp);
> > +     VM_BUG_ON(!cgrp->parent);
> > +
>
> Does this happen ?
>
> > +     return memcg->wmark_ratio;
> > +}
> > +
> >  static void mem_cgroup_start_move(struct mem_cgroup *mem)
> >  {
> >       int cpu;
> > @@ -3205,6 +3239,7 @@ static int mem_cgroup_resize_limit(struct
> mem_cgroup *memcg,
> >                       else
> >                               memcg->memsw_is_minimum = false;
> >               }
> > +             setup_per_memcg_wmarks(memcg);
> >               mutex_unlock(&set_limit_mutex);
> >
> >               if (!ret)
> > @@ -3264,6 +3299,7 @@ static int mem_cgroup_resize_memsw_limit(struct
> mem_cgroup *memcg,
> >                       else
> >                               memcg->memsw_is_minimum = false;
> >               }
> > +             setup_per_memcg_wmarks(memcg);
> >               mutex_unlock(&set_limit_mutex);
> >
> >               if (!ret)
> > @@ -4521,6 +4557,22 @@ static void __init enable_swap_cgroup(void)
> >  }
> >  #endif
> >
> > +int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
> > +                             int charge_flags)
> > +{
> > +     long ret = 0;
> > +     int flags = CHARGE_WMARK_LOW | CHARGE_WMARK_HIGH;
> > +
> > +     VM_BUG_ON((charge_flags & flags) == flags);
> > +
> > +     if (charge_flags & CHARGE_WMARK_LOW)
> > +             ret = res_counter_check_under_low_wmark_limit(&mem->res);
> > +     if (charge_flags & CHARGE_WMARK_HIGH)
> > +             ret = res_counter_check_under_high_wmark_limit(&mem->res);
> > +
> > +     return ret;
> > +}
>
> Hmm, do we need this unified function ?
>
> Thanks,
> -Kame
>
>