From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <vdavydov.dev@gmail.com>
ARC-Seal: i=1; a=rsa-sha256; t=1526230061; cv=none;
        d=google.com; s=arc-20160816;
        b=n0C+eOS1XlFfLVXKpWbBpKoXMcLO8WVdPJ8Wzgw/q+D+I9Zui6WDnTCwGAlu6tWRfo
         ekAzNAsO19UnYTP1ZWHZQ/Fwu79eVXlOtRE4LpNN7ciQNJH+RE0EvJnuh9KOpsI5YTFT
         H4anZFhgQaQAcpq3ASfYAcMepPS7GEFh/8ZsPQkz/WsVqU+5Sl4qBXP9dY9P+Yhst3KV
         57xDJ9md+jzxzc7C+iO90GjBIvYbvrGxDxnf2tOvsohqnVZhmCThCCFKE/pcsjypoUut
         bzxF8tWcnDODUz2BBWcivRM6TUkFQFEs9QcTpRBfd4ol4RjMPPtt/9gvj7v5eO9dkKoe
         IKNA==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816;
        h=in-reply-to:content-disposition:mime-version:references:message-id
         :subject:cc:to:from:date:dkim-signature:arc-authentication-results;
        bh=jHJW5wT3jnietXTNME/Q7kem1YjZSx+2k6FPVDr5eOg=;
        b=cKI3Jy1C8x836fNXEfMrOMWeKbspuK/dj+ppoyQcirxhzxlH3u7I8BiSWwztf+BZ9R
         8hHVKFPTeQ0ITZ955t4NQrYG5k2BdPXDdsuW2YV43UbyaFZbp/Z+LIcCx4kwZSQfWcUy
         a9uop5bR47rLaC2w/5GacM1wSTZiXygEP/rVU74okv3ElFcILlck13/Tq6/Lz3qqaLSm
         B0YUqac9U1LU3QwB4jCwyOHn6B2ZR77sBI22P4j0tB8xF0qbly7mkCKwkMgkrFhgCYqq
         rBlQ+vVf7X4nnOWrB2Iti7GwpkqK+QL5ocs1pKWXr1uYrg6r9L5EG+YpXyWOrBpIFWhC
         HUnw==
ARC-Authentication-Results: i=1; mx.google.com;
       dkim=pass header.i=@gmail.com header.s=20161025 header.b=mQppxKJg;
       spf=pass (google.com: domain of vdavydov.dev@gmail.com designates 209.85.220.65 as permitted sender) smtp.mailfrom=vdavydov.dev@gmail.com;
       dmarc=pass (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com
Authentication-Results: mx.google.com;
       dkim=pass header.i=@gmail.com header.s=20161025 header.b=mQppxKJg;
       spf=pass (google.com: domain of vdavydov.dev@gmail.com designates 209.85.220.65 as permitted sender) smtp.mailfrom=vdavydov.dev@gmail.com;
       dmarc=pass (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com
X-Google-Smtp-Source: AB8JxZpLbBUrGA40bG/SeoF6VWNbkdGwwcDoZ5sYGpYPeLAcwHI8DSIaQtqSif2ROqPozVXH0DJpzA==
Date: Sun, 13 May 2018 19:47:38 +0300
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: akpm@linux-foundation.org, shakeelb@google.com, viro@zeniv.linux.org.uk,
	hannes@cmpxchg.org, mhocko@kernel.org, tglx@linutronix.de,
	pombredanne@nexb.com, stummala@codeaurora.org,
	gregkh@linuxfoundation.org, sfr@canb.auug.org.au, guro@fb.com,
	mka@chromium.org, penguin-kernel@I-love.SAKURA.ne.jp,
	chris@chris-wilson.co.uk, longman@redhat.com, minchan@kernel.org,
	ying.huang@intel.com, mgorman@techsingularity.net, jbacik@fb.com,
	linux@roeck-us.net, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, willy@infradead.org, lirongqing@baidu.com,
	aryabinin@virtuozzo.com
Subject: Re: [PATCH v5 03/13] mm: Assign memcg-aware shrinkers bitmap to memcg
Message-ID: <20180513164738.tufhk5i7bnsxsq4l@esperanza>
References: <152594582808.22949.8353313986092337675.stgit@localhost.localdomain>
 <152594595644.22949.8473969450800431565.stgit@localhost.localdomain>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <152594595644.22949.8473969450800431565.stgit@localhost.localdomain>
X-getmail-retrieved-from-mailbox: INBOX
X-GMAIL-THRID: =?utf-8?q?1600070321367125899?=
X-GMAIL-MSGID: =?utf-8?q?1600368213142054239?=
X-Mailing-List: linux-kernel@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>

On Thu, May 10, 2018 at 12:52:36PM +0300, Kirill Tkhai wrote:
> Imagine a big node with many cpus, memory cgroups and containers.
> Let we have 200 containers, every container has 10 mounts,
> and 10 cgroups. All container tasks don't touch foreign
> containers mounts. If there is intensive pages write,
> and global reclaim happens, a writing task has to iterate
> over all memcgs to shrink slab, before it's able to go
> to shrink_page_list().
> 
> Iteration over all the memcg slabs is very expensive:
> the task has to visit 200 * 10 = 2000 shrinkers
> for every memcg, and since there are 2000 memcgs,
> the total calls are 2000 * 2000 = 4000000.
> 
> So, the shrinker makes 4 million do_shrink_slab() calls
> just to try to isolate SWAP_CLUSTER_MAX pages in one
> of the actively writing memcg via shrink_page_list().
> I've observed a node spending almost 100% in kernel,
> making useless iteration over already shrinked slab.
> 
> This patch adds bitmap of memcg-aware shrinkers to memcg.
> The size of the bitmap depends on bitmap_nr_ids, and during
> memcg life it's maintained to be enough to fit bitmap_nr_ids
> shrinkers. Every bit in the map is related to corresponding
> shrinker id.
> 
> Next patches will maintain set bit only for really charged
> memcg. This will allow shrink_slab() to increase its
> performance in significant way. See the last patch for
> the numbers.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  include/linux/memcontrol.h |   21 ++++++++
>  mm/memcontrol.c            |  116 ++++++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                |   16 ++++++
>  3 files changed, 152 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 6cbea2f25a87..e5e7e0fc7158 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -105,6 +105,17 @@ struct lruvec_stat {
>  	long count[NR_VM_NODE_STAT_ITEMS];
>  };
>  
> +#ifdef CONFIG_MEMCG_SHRINKER
> +/*
> + * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
> + * which have elements charged to this memcg.
> + */
> +struct memcg_shrinker_map {
> +	struct rcu_head rcu;
> +	unsigned long map[0];
> +};
> +#endif /* CONFIG_MEMCG_SHRINKER */
> +

AFAIR we don't normally ifdef structure definitions.

>  /*
>   * per-zone information in memory controller.
>   */
> @@ -118,6 +129,9 @@ struct mem_cgroup_per_node {
>  
>  	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
>  
> +#ifdef CONFIG_MEMCG_SHRINKER
> +	struct memcg_shrinker_map __rcu	*shrinker_map;
> +#endif
>  	struct rb_node		tree_node;	/* RB tree node */
>  	unsigned long		usage_in_excess;/* Set to the value by which */
>  						/* the soft limit is exceeded*/
> @@ -1255,4 +1269,11 @@ static inline void memcg_put_cache_ids(void)
>  
>  #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
>  
> +#ifdef CONFIG_MEMCG_SHRINKER

> +#define MEMCG_SHRINKER_MAP(memcg, nid) (memcg->nodeinfo[nid]->shrinker_map)

I don't really like this helper macro. Accessing shrinker_map directly
looks cleaner IMO.

> +
> +extern int memcg_shrinker_nr_max;

As I've mentioned before, the capacity of shrinker map should be a
private business of memcontrol.c IMHO. We shouldn't use it in vmscan.c
as max shrinker id, instead we should introduce another variable for
this, private to vmscan.c.

> +extern int memcg_expand_shrinker_maps(int old_id, int id);

... Then this function would take just one argument, max id, and would
update shrinker_map capacity if necessary in memcontrol.c under the
corresponding mutex, which would look much more readable IMHO as all
shrinker_map related manipulations would be isolated in memcontrol.c.

> +#endif /* CONFIG_MEMCG_SHRINKER */
> +
>  #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3df3efa7ff40..18e0fdf302a9 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -322,6 +322,116 @@ struct workqueue_struct *memcg_kmem_cache_wq;
>  
>  #endif /* !CONFIG_SLOB */
>  
> +#ifdef CONFIG_MEMCG_SHRINKER
> +int memcg_shrinker_nr_max;

memcg_shrinker_map_capacity, may be?

> +static DEFINE_MUTEX(shrinkers_nr_max_mutex);

memcg_shrinker_map_mutex?

> +
> +static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
> +{
> +	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
> +}
> +
> +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
> +					 int size, int old_size)

If you followed my advice and made the shrinker_map_capacity private to
memcontrol.c, you wouldn't need to pass old_size here either, just max
shrinker id.

> +{
> +	struct memcg_shrinker_map *new, *old;
> +	int nid;
> +
> +	lockdep_assert_held(&shrinkers_nr_max_mutex);
> +
> +	for_each_node(nid) {
> +		old = rcu_dereference_protected(MEMCG_SHRINKER_MAP(memcg, nid), true);
> +		/* Not yet online memcg */
> +		if (old_size && !old)
> +			return 0;
> +
> +		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
> +		if (!new)
> +			return -ENOMEM;
> +
> +		/* Set all old bits, clear all new bits */
> +		memset(new->map, (int)0xff, old_size);
> +		memset((void *)new->map + old_size, 0, size - old_size);
> +
> +		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
> +		if (old)
> +			call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
> +	}
> +
> +	return 0;
> +}
> +
> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup_per_node *pn;
> +	struct memcg_shrinker_map *map;
> +	int nid;
> +
> +	if (memcg == root_mem_cgroup)
> +		return;

Nit: there's mem_cgroup_is_root() helper.

> +
> +	mutex_lock(&shrinkers_nr_max_mutex);

Why do you need to take the mutex here? You don't access shrinker map
capacity here AFAICS.

> +	for_each_node(nid) {
> +		pn = mem_cgroup_nodeinfo(memcg, nid);
> +		map = rcu_dereference_protected(pn->shrinker_map, true);
> +		if (map)
> +			call_rcu(&map->rcu, memcg_free_shrinker_map_rcu);
> +		rcu_assign_pointer(pn->shrinker_map, NULL);
> +	}
> +	mutex_unlock(&shrinkers_nr_max_mutex);
> +}
> +
> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
> +{
> +	int ret, size = memcg_shrinker_nr_max/BITS_PER_BYTE;
> +
> +	if (memcg == root_mem_cgroup)
> +		return 0;

Nit: mem_cgroup_is_root().

> +
> +	mutex_lock(&shrinkers_nr_max_mutex);

> +	ret = memcg_expand_one_shrinker_map(memcg, size, 0);

I don't think it's worth reusing the function designed for reallocating
shrinker maps for initial allocation. Please just fold the code here -
it will make both 'alloc' and 'expand' easier to follow IMHO.

> +	mutex_unlock(&shrinkers_nr_max_mutex);
> +
> +	if (ret)
> +		memcg_free_shrinker_maps(memcg);
> +
> +	return ret;
> +}
> +

> +static struct idr mem_cgroup_idr;

Stray change.

> +
> +int memcg_expand_shrinker_maps(int old_nr, int nr)
> +{
> +	int size, old_size, ret = 0;
> +	struct mem_cgroup *memcg;
> +
> +	old_size = old_nr / BITS_PER_BYTE;
> +	size = nr / BITS_PER_BYTE;
> +
> +	mutex_lock(&shrinkers_nr_max_mutex);
> +

> +	if (!root_mem_cgroup)
> +		goto unlock;

This wants a comment.

> +
> +	for_each_mem_cgroup(memcg) {
> +		if (memcg == root_mem_cgroup)

Nit: mem_cgroup_is_root().

> +			continue;
> +		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
> +		if (ret)
> +			goto unlock;
> +	}
> +unlock:
> +	mutex_unlock(&shrinkers_nr_max_mutex);
> +	return ret;
> +}
> +#else /* CONFIG_MEMCG_SHRINKER */
> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
> +{
> +	return 0;
> +}
> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
> +#endif /* CONFIG_MEMCG_SHRINKER */
> +
>  /**
>   * mem_cgroup_css_from_page - css of the memcg associated with a page
>   * @page: page of interest
> @@ -4471,6 +4581,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>  
> +	if (memcg_alloc_shrinker_maps(memcg)) {
> +		mem_cgroup_id_remove(memcg);
> +		return -ENOMEM;
> +	}
> +
>  	/* Online state pins memcg ID, memcg ID pins CSS */
>  	atomic_set(&memcg->id.ref, 1);
>  	css_get(css);
> @@ -4522,6 +4637,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
>  	vmpressure_cleanup(&memcg->vmpressure);
>  	cancel_work_sync(&memcg->high_work);
>  	mem_cgroup_remove_from_trees(memcg);
> +	memcg_free_shrinker_maps(memcg);
>  	memcg_free_kmem(memcg);
>  	mem_cgroup_free(memcg);
>  }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index d691beac1048..d8a2870710e0 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -174,12 +174,26 @@ static DEFINE_IDR(shrinker_idr);
>  
>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>  {
> -	int id, ret;
> +	int id, nr, ret;
>  
>  	down_write(&shrinker_rwsem);
>  	ret = id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
>  	if (ret < 0)
>  		goto unlock;
> +
> +	if (id >= memcg_shrinker_nr_max) {
> +		nr = memcg_shrinker_nr_max * 2;
> +		if (nr == 0)
> +			nr = BITS_PER_BYTE;
> +		BUG_ON(id >= nr);

The logic defining shrinker map capacity growth should be private to
memcontrol.c IMHO.

> +
> +		if (memcg_expand_shrinker_maps(memcg_shrinker_nr_max, nr)) {
> +			idr_remove(&shrinker_idr, id);
> +			goto unlock;
> +		}
> +		memcg_shrinker_nr_max = nr;
> +	}
> +
>  	shrinker->id = id;
>  	ret = 0;
>  unlock:
>