linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: zhong jiang <zhongjiang-ali@linux.alibaba.com>
To: Michal Hocko <mhocko@suse.com>
Cc: hannes@cmpxchg.org, akpm@linux-foundation.org, linux-mm@kvack.org
Subject: Re: [PATCH] mm: Do not deactivate when the cgroup has plenty inactive page
Date: Thu, 15 Oct 2020 00:57:31 +0800	[thread overview]
Message-ID: <9212fc5a-35dd-1347-2b0e-ee73c58f1fd8@linux.alibaba.com> (raw)
In-Reply-To: <20201014144742.GH4440@dhcp22.suse.cz>

[-- Attachment #1: Type: text/plain, Size: 2767 bytes --]


On 2020/10/14 10:47 下午, Michal Hocko wrote:
> On Wed 14-10-20 22:21:58, zhong jiang wrote:
>> On 2020/9/30 12:02 上午, Michal Hocko wrote:
>>> On Sun 27-09-20 10:39:22, zhong jiang wrote:
>>>> On 2020/9/25 8:07 下午, Michal Hocko wrote:
>>>>> On Fri 25-09-20 19:49:12, zhongjiang-ali wrote:
>>>>>> After appling the series patches(mm: fix page aging across multiple cgroups),
>>>>>> cgroup memory reclaim strategy is based on reclaim root's inactive:active
>>>>>> ratio. if the target lruvec need to deactivate, its children cgroup also will
>>>>>> deactivate. That will result in hot page to be reclaimed and other cgroup's
>>>>>> cold page will be left, which is not expected.
>>>>>>
>>>>>> The patch will not force deactivate when inactive_is_low is not true unless
>>>>>> we has scanned the inactive list and memory is unable to reclaim.
>>>>> Do you have any data to present?
>>>> I write an testcase that cgroup B has a lot of hot pagecache and cgroup C
>>>> is full of cold pagecache.  and
>>>>
>>>> their parent cgroup A will trigger the reclaim due of it's limit has been
>>>> breached.
>>>>
>>>>
>>>> The testcase should assume that we should not reclaim the  hot pagecache in
>>>> cgroup B because C has
>>>>
>>>> plenty cold pagecache.   Unfortunately,  I can see cgroup B hot pagecache
>>>> has been decreased when
>>>>
>>>> cgroup A trigger the reclaim.
>>> Thank you, this is more or less what've expected from your initial
>>> description. An extended form would be preferable for the changelog to
>>> make the setup more clear. But you still haven't quantified the effect.
>>> It would be also good to describe what is the effect on the workload
>>> described by 53138cea7f39 ("mm: vmscan: move file exhaustion detection
>>> to the node level"). A more extended description on the implementation
>>> would be nice as well.
>> Hi,  Michal
>>
>> I'm sorry for lately reply due of a long vacation.  But that indeed breach
>> the initial purpose.
>>
>> Do you think the patch make sense, or any benchmark can be recommended to
>> get some data.
> To be honest I have really hard time to grasp what is the effect of this
> patch. Also memory reclaim tuning without any data doesn't really sound
> convincing. Do you see any real workload that is benefiting from this
> change or this is mostly a based on reading the code and a theoretical
> concern?

I write an testcase based on LTP to test the condition  and see active 
page will be deactived but

child cgroup still has a lot of cold memory, which is not unexpected.

The testcase is attached.

> Please understand that the existing balancing is quite complex and any
> changes should be carfully analyzed and described.
>

[-- Attachment #2: memcg_balance.c --]
[-- Type: text/plain, Size: 7122 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/sysinfo.h>
#include <sys/wait.h>

#include "tst_test.h"

#define HOT_CACHE	(1024 * 1024 * 500UL)
#define COLD_CACHE	(1024 * 1024 * 1024 * 2UL)
#define MEMORY_MAX	(COLD_CACHE / 2)
#define HOT_MAX		(1024 * 1024 * 400UL)
#define MEMCG_ROOT 	"/sys/fs/cgroup/memory"
#define MEMCG_CHILD_HOT		"memcg_test_0"
#define MEMCG_CHILD_COLD	"memcg_test_1"
#define tempfile_hot	"memcg_test.hot"
#define tempfile_cold	"memcg_test.cold"
#define TEST_PASS	0
#define TEST_FAIL	-1

static inline int values_close(long a, long b, int err)
{
    return abs(a - b) <= (a + b) / 100 * err;
}

static int cg_create(char *path, char *parent, char *memcg)
{
	sprintf(path, "%s/%s", parent, memcg);

	return mkdir(path, 0644);
}

static int cg_write(char *memcg, char *control, char *buf, ssize_t len)
{
	char path[PATH_MAX];
	int fd;

	sprintf(path, "%s/%s", memcg, control);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return fd;
	len = write(fd, buf, len);
	close(fd);

	return len;
}

static int cg_move(char *cgroup)
{
	char pid_name[10] = {0,};
	char buf[PATH_MAX];

	strcpy(buf, cgroup);
	sprintf(pid_name, "%d", getpid());

	return cg_write(dirname(buf), "cgroup.procs", pid_name, strlen(pid_name));
}

static int cg_destroy(char *parent, char *cgroup)
{
	char path[PATH_MAX];
	int ret;
	int loops = 1;

	if (!cgroup)
		sprintf(path, "%s", parent);
	else
		sprintf(path, "%s/%s", parent, cgroup);
retry:
	ret = rmdir(path);
	if (ret && errno == EBUSY && loops) {
		ret = cg_move(path);
		if (ret < 0)
			goto out;
		else {
			loops--;
			goto retry;
		}
	} else if (ret && errno == ENOENT)
		ret = 0;
out:
	return ret;
}

static int open_temp_file(char *name)
{
    return open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRWXU);
}

static void remove_temp_file(char *name)
{
    unlink(name);
}

static void cleanup(void)
{
	char parent[PATH_MAX];

	if (!access(tempfile_hot, F_OK))
		remove_temp_file(tempfile_hot);
	if (!access(tempfile_cold, F_OK))
		remove_temp_file(tempfile_cold);

	sprintf(parent, "%s/%s", MEMCG_ROOT, "memcg_test");
	if (!access(parent, F_OK)) {
		cg_destroy(parent, MEMCG_CHILD_HOT);
		cg_destroy(parent, MEMCG_CHILD_COLD);
		cg_destroy(parent, NULL);
	}
}

static int cg_write_max(char *memcg, unsigned long max)
{
	char buf[100];

	sprintf(buf, "%ld", max);
	return cg_write(memcg, "memory.limit_in_bytes", buf, 100);
}

static void remove_temp_files(void)
{
	remove_temp_file(tempfile_hot);
	remove_temp_file(tempfile_cold);
}

static int read_file(int fd, unsigned long size, char *buf, int pagesize)
{
	unsigned long i;
	int len = 0;

	lseek(fd, 0, SEEK_SET);
	for (i = 0; i < size; i += pagesize) {
		len = read(fd, buf, pagesize);
		if (len < 0)
			break;
	}

	return len;
}

static void memcg_active_file(char *path, int fd, unsigned long size)
{
	char *buf;
	char *value;
	int pagesize = getpagesize();

	value = malloc(pagesize);
	buf = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	memset(buf, 'A', size);

	/* trigger memory reclaim to putback inactive page to active list */
	cg_write_max(path, HOT_MAX);
	munmap(buf, size);
	/* mark_page_accessed will make the page to be activated */
	read_file(fd, size, value, pagesize);
	free(value);
}

static int touch_pagecache(int fd, unsigned long size)
{
	char *buf;
	int pagesize = getpagesize();
	int len = -1;

	buf = malloc(pagesize);
	if (ftruncate(fd, size))
		goto out;

	len = read_file(fd, size, buf, pagesize);
out:
	free(buf);
	return len;
}
 
static int cg_read_stat(char *memcg, char *control, ssize_t *value)
{
	char path[PATH_MAX];
	char str[1024], temp[20];
	int found = 0;
	FILE *fp;

	sprintf(path, "%s/%s", memcg, "memory.stat");
	fp = fopen(path, "r");
	if (!fp)
		goto out;

	while (fgets(str, 1024, fp) != NULL) {
		if (!strncmp(str, control, strlen(control))) {
			sscanf(str, "%s %ld", temp, value);
			found = 1;
			break;
		} else
			continue;
	}

	fclose(fp);
	if (found)
		return 0;
out:
	return -1;
}

static int cg_write_conf(char *path, char *parent, char *memcg)
{
	char pid[10];

	if (cg_create(path, parent, memcg))
		goto out;

	sprintf(pid, "%d", getpid());
	cg_write(path, "cgroup.procs", pid, strlen(pid));

	return 0;
out:
	return -1;
}

static int memcg_create_cold(char *parent, char *memcg)
{
	int fd, ret;
	char path[PATH_MAX] = {0};
	pid_t pid;

	pid = fork();
	if (pid > 0) {
		if (wait(NULL) < 0)
			return -1;
		else
			return 0;
	} else if (pid == 0) {
		ret = cg_write_conf(path, parent, memcg);
		if (ret < 0)
			goto out;

		fd = open_temp_file(tempfile_cold);
		if (fd < 0)
			goto out_destroy;

		if (touch_pagecache(fd, COLD_CACHE) < 0)
			goto out_fd;

		close(fd);
		exit(0);
	} else
		return -1;

out_fd:
	close(fd);
	remove_temp_file(tempfile_cold);
out_destroy:
	cg_destroy(path, NULL);
out:
	exit(-1);
}

static int memcg_create_hot(char *parent, char *memcg)
{
	int ret, fd;
	char path[PATH_MAX];
	pid_t pid;

	pid = fork();
	if (pid > 0) {
		if (wait(NULL) < 0)
			return -1;
		else
			return 0;
	} else if (pid == 0) {
		ret = cg_write_conf(path, parent, memcg);
		if (ret < 0)
			goto out;

		fd = open_temp_file(tempfile_hot);
		if (fd < 0)
			goto out_destroy;

		if (touch_pagecache(fd, HOT_CACHE) < 0)
			goto out_fd;
		memcg_active_file(path, fd, HOT_CACHE);

		close(fd);
		exit(0);
	} else
		return -1;

out_fd:
	close(fd);
	remove_temp_file(tempfile_hot);
out_destroy:
	cg_destroy(path, NULL);
out:
	exit(-1);
}

static void memcg_get_active_pages(char *parent, char *memcg, long *active)
{
	char path[PATH_MAX];

	sprintf(path, "%s/%s", parent, memcg);
	cg_read_stat(path, "active_file", active);
}

static int memcg_measure_compare(char *parent, char *memcg, ssize_t size)
{
	long active;

	memcg_get_active_pages(parent, memcg, &active);
	if (values_close(active, size, 3))
		return 0;
	else
		return -1;
}

static void memcg_balance_run(void)
{
	char parent[PATH_MAX];
	int ret;
	long hot_active;

	if (access(MEMCG_ROOT, F_OK))
		tst_brk(TCONF, "memory has not been mounted");

	ret = cg_write_conf(parent, MEMCG_ROOT, "memcg_test");
	if (ret)
		goto out;

	ret = memcg_create_hot(parent, MEMCG_CHILD_HOT);
	if (ret)
		goto out_parent;

	memcg_get_active_pages(parent, MEMCG_CHILD_HOT, &hot_active);
	ret = memcg_create_cold(parent, MEMCG_CHILD_COLD);
	if (ret)
		goto out_child_hot;

	/* set parent memcg's limit_in_bytes to trigger memory reclaim. */
	ret = cg_write_max(parent, MEMORY_MAX);
	if (ret < 0)
		goto out_parent;

	/* Assume that we should not less that the specified value within a certain deviation */
	ret = memcg_measure_compare(parent, MEMCG_CHILD_HOT, hot_active);
	if (ret)
		goto out_child_cold;

	remove_temp_files();

out_child_cold:
	cg_destroy(parent, MEMCG_CHILD_COLD);
out_child_hot:
	cg_destroy(parent, MEMCG_CHILD_HOT);
out_parent:
	cg_destroy(parent, NULL);
out:
	if (ret == TEST_FAIL)
		tst_res(TFAIL, "memcg balance");
	else
		tst_res(TPASS, "memcg balance");
}

static struct tst_test test = {
	.test_all = memcg_balance_run,
	.min_kver = "4.19",
	.needs_root = 1,
	.cleanup = cleanup,
	.timeout = 3600,
};

      reply	other threads:[~2020-10-14 16:57 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-25 11:49 [PATCH] mm: Do not deactivate when the cgroup has plenty inactive page zhongjiang-ali
2020-09-25 12:07 ` Michal Hocko
2020-09-27  2:39   ` zhong jiang
2020-09-29 16:02     ` Michal Hocko
2020-10-14 14:21       ` zhong jiang
2020-10-14 14:47         ` Michal Hocko
2020-10-14 16:57           ` zhong jiang [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9212fc5a-35dd-1347-2b0e-ee73c58f1fd8@linux.alibaba.com \
    --to=zhongjiang-ali@linux.alibaba.com \
    --cc=akpm@linux-foundation.org \
    --cc=hannes@cmpxchg.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).