linux-mm.kvack.org archive mirror
* [PATCH 0/7] thp/khugepaged improvements and CoW semantics
@ 2020-03-27 17:05 Kirill A. Shutemov
  2020-03-27 17:05 ` [PATCH 1/7] khugepaged: Add self test Kirill A. Shutemov
                   ` (6 more replies)
  0 siblings, 7 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:05 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

The patchset adds a khugepaged selftest (anon-THP only for now),
expands the set of cases khugepaged can handle, and switches anon-THP
copy-on-write handling to 4k pages.

Please review and consider applying.

Kirill A. Shutemov (7):
  khugepaged: Add self test
  khugepaged: Do not stop collapse if less than half PTEs are referenced
  khugepaged: Drain LRU add pagevec to get rid of extra pins
  khugepaged: Allow to collapse a page shared across fork
  khugepaged: Allow to collapse PTE-mapped compound pages
  thp: Change CoW semantics for anon-THP
  khugepaged: Introduce 'max_ptes_shared' tunable

 Documentation/admin-guide/mm/transhuge.rst |   7 +
 mm/huge_memory.c                           | 247 +-----
 mm/khugepaged.c                            | 124 ++-
 tools/testing/selftests/vm/Makefile        |   1 +
 tools/testing/selftests/vm/khugepaged.c    | 924 +++++++++++++++++++++
 5 files changed, 1057 insertions(+), 246 deletions(-)
 create mode 100644 tools/testing/selftests/vm/khugepaged.c

-- 
2.26.0




* [PATCH 1/7] khugepaged: Add self test
  2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
@ 2020-03-27 17:05 ` Kirill A. Shutemov
  2020-03-27 17:05 ` [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:05 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

The test checks if khugepaged is able to recover huge pages where we
expect it to do so. It only covers anon-THP for now.

Currently the test shows a few failures. They are going to be addressed
by the following patches.
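
For reference, here is a minimal user-space sketch of the pattern the test is
built around (illustrative only, not part of the patch; it assumes the
"madvise" THP policy, a 2M PMD size, a free PMD-aligned address at 1G, and
omits most error handling). The real test additionally tunes the sysfs knobs
and polls khugepaged/full_scans instead of sleeping:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>

	#define HPAGE_SIZE (2UL << 20)

	/* Crude indicator: sum AnonHugePages over all mappings of the process. */
	static long anon_huge_kb(void)
	{
		char buf[256];
		long kb = 0, v;
		FILE *f = fopen("/proc/self/smaps", "r");

		if (!f)
			return -1;
		while (fgets(buf, sizeof(buf), f))
			if (sscanf(buf, "AnonHugePages: %ld kB", &v) == 1)
				kb += v;
		fclose(f);
		return kb;
	}

	int main(void)
	{
		/* PMD-aligned hint, as in the selftest's BASE_ADDR. */
		char *p = mmap((void *)(1UL << 30), HPAGE_SIZE,
			       PROT_READ | PROT_WRITE,
			       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

		memset(p, 1, HPAGE_SIZE);		/* populate as 4k pages */
		madvise(p, HPAGE_SIZE, MADV_HUGEPAGE);	/* eligible for collapse */
		sleep(30);				/* give khugepaged a chance */
		printf("AnonHugePages: %ld kB\n", anon_huge_kb());
		return 0;
	}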

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 tools/testing/selftests/vm/Makefile     |   1 +
 tools/testing/selftests/vm/khugepaged.c | 841 ++++++++++++++++++++++++
 2 files changed, 842 insertions(+)
 create mode 100644 tools/testing/selftests/vm/khugepaged.c

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 7f9a8a8c31da..981d0dc21f9e 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -18,6 +18,7 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += khugepaged
 
 ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64))
 TEST_GEN_FILES += va_128TBswitch
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
new file mode 100644
index 000000000000..193bde6a1534
--- /dev/null
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -0,0 +1,841 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+
+#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
+
+enum thp_enabled {
+	THP_ALWAYS,
+	THP_MADVISE,
+	THP_NEVER,
+};
+
+static const char *thp_enabled_strings[] = {
+	"always",
+	"madvise",
+	"never",
+	NULL
+};
+
+enum thp_defrag {
+	THP_DEFRAG_ALWAYS,
+	THP_DEFRAG_DEFER,
+	THP_DEFRAG_DEFER_MADVISE,
+	THP_DEFRAG_MADVISE,
+	THP_DEFRAG_NEVER,
+};
+
+static const char *thp_defrag_strings[] = {
+	"always",
+	"defer",
+	"defer+madvise",
+	"madvise",
+	"never",
+	NULL
+};
+
+enum shmem_enabled {
+	SHMEM_ALWAYS,
+	SHMEM_WITHIN_SIZE,
+	SHMEM_ADVISE,
+	SHMEM_NEVER,
+	SHMEM_DENY,
+	SHMEM_FORCE,
+};
+
+static const char *shmem_enabled_strings[] = {
+	"always",
+	"within_size",
+	"advise",
+	"never",
+	"deny",
+	"force",
+	NULL
+};
+
+struct khugepaged_settings {
+	bool defrag;
+	unsigned int alloc_sleep_millisecs;
+	unsigned int scan_sleep_millisecs;
+	unsigned int max_ptes_none;
+	unsigned int max_ptes_swap;
+	unsigned long pages_to_scan;
+};
+
+struct settings {
+	enum thp_enabled thp_enabled;
+	enum thp_defrag thp_defrag;
+	enum shmem_enabled shmem_enabled;
+	bool debug_cow;
+	bool use_zero_page;
+	struct khugepaged_settings khugepaged;
+};
+
+static struct settings default_settings = {
+	.thp_enabled = THP_MADVISE,
+	.thp_defrag = THP_DEFRAG_ALWAYS,
+	.shmem_enabled = SHMEM_NEVER,
+	.debug_cow = 0,
+	.use_zero_page = 0,
+	.khugepaged = {
+		.defrag = 1,
+		.alloc_sleep_millisecs = 10,
+		.scan_sleep_millisecs = 10,
+	},
+};
+
+static struct settings saved_settings;
+static bool skip_settings_restore;
+
+static int exit_status;
+
+static void success(const char *msg)
+{
+	printf(" \e[32m%s\e[0m\n", msg);
+}
+
+static void fail(const char *msg)
+{
+	printf(" \e[31m%s\e[0m\n", msg);
+	exit_status++;
+}
+
+static int read_file(const char *path, char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numread;
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+		return 0;
+
+	numread = read(fd, buf, buflen - 1);
+	if (numread < 1) {
+		close(fd);
+		return 0;
+	}
+
+	buf[numread] = '\0';
+	close(fd);
+
+	return (unsigned int) numread;
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numwritten;
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1)
+		return 0;
+
+	numwritten = write(fd, buf, buflen - 1);
+	close(fd);
+	if (numwritten < 1)
+		return 0;
+
+	return (unsigned int) numwritten;
+}
+
+static int read_string(const char *name, const char *strings[])
+{
+	char path[PATH_MAX];
+	char buf[256];
+	char *c;
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!read_file(path, buf, sizeof(buf))) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+
+	c = strchr(buf, '[');
+	if (!c) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	c++;
+	memmove(buf, c, sizeof(buf) - (c - buf));
+
+	c = strchr(buf, ']');
+	if (!c) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	*c = '\0';
+
+	ret = 0;
+	while (strings[ret]) {
+		if (!strcmp(strings[ret], buf))
+			return ret;
+		ret++;
+	}
+
+	printf("Failed to parse %s\n", name);
+	exit(EXIT_FAILURE);
+}
+
+static void write_string(const char *name, const char *val)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!write_file(path, val, strlen(val) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static unsigned long read_num(const char *name)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	ret = read_file(path, buf, sizeof(buf));
+	if (!ret) {
+		perror("read_file(read_num)");
+		exit(EXIT_FAILURE);
+	}
+
+	return strtoul(buf, NULL, 10);
+}
+
+static void write_num(const char *name, unsigned long num)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	sprintf(buf, "%ld", num);
+	if (!write_file(path, buf, strlen(buf) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static void write_settings(struct settings *settings)
+{
+	struct khugepaged_settings *khugepaged = &settings->khugepaged;
+
+	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
+	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
+	write_string("shmem_enabled",
+			shmem_enabled_strings[settings->shmem_enabled]);
+	write_num("debug_cow", settings->debug_cow);
+	write_num("use_zero_page", settings->use_zero_page);
+
+	write_num("khugepaged/defrag", khugepaged->defrag);
+	write_num("khugepaged/alloc_sleep_millisecs",
+			khugepaged->alloc_sleep_millisecs);
+	write_num("khugepaged/scan_sleep_millisecs",
+			khugepaged->scan_sleep_millisecs);
+	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
+	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
+}
+
+static void restore_settings(int sig)
+{
+	if (skip_settings_restore)
+		goto out;
+
+	printf("Restore THP and khugepaged settings...");
+	write_settings(&saved_settings);
+	success("OK");
+	if (sig)
+		exit(EXIT_FAILURE);
+out:
+	exit(exit_status);
+}
+
+static void save_settings(void)
+{
+	printf("Save THP and khugepaged settings...");
+	saved_settings = (struct settings) {
+		.thp_enabled = read_string("enabled", thp_enabled_strings),
+		.thp_defrag = read_string("defrag", thp_defrag_strings),
+		.shmem_enabled =
+			read_string("shmem_enabled", shmem_enabled_strings),
+		.debug_cow = read_num("debug_cow"),
+		.use_zero_page = read_num("use_zero_page"),
+	};
+	saved_settings.khugepaged = (struct khugepaged_settings) {
+		.defrag = read_num("khugepaged/defrag"),
+		.alloc_sleep_millisecs =
+			read_num("khugepaged/alloc_sleep_millisecs"),
+		.scan_sleep_millisecs =
+			read_num("khugepaged/scan_sleep_millisecs"),
+		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
+		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
+	};
+	success("OK");
+
+	signal(SIGTERM, restore_settings);
+	signal(SIGINT, restore_settings);
+	signal(SIGHUP, restore_settings);
+	signal(SIGQUIT, restore_settings);
+}
+
+static void adjust_settings(void)
+{
+
+	printf("Adjust settings...");
+	write_settings(&default_settings);
+	success("OK");
+}
+
+#define CHECK_HUGE_FMT "sed -ne " \
+	"'/^%lx/,/^AnonHugePages/{/^AnonHugePages:\\s*%ld kB/ q1}' " \
+	"/proc/%d/smaps"
+
+static bool check_huge(void *p)
+{
+	char *cmd;
+	int ret;
+
+	ret = asprintf(&cmd, CHECK_HUGE_FMT,
+			(unsigned long)p, hpage_pmd_size >> 10, getpid());
+	if (ret < 0) {
+		perror("asprintf(CHECK_FMT)");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = system(cmd);
+	free(cmd);
+	if (ret < 0 || !WIFEXITED(ret)) {
+		perror("system(check_huge)");
+		exit(EXIT_FAILURE);
+	}
+
+	return WEXITSTATUS(ret);
+}
+
+#define CHECK_SWAP_FMT "sed -ne " \
+	"'/^%lx/,/^Swap:/{/^Swap:\\s*%ld kB/ q1}' " \
+	"/proc/%d/smaps"
+
+static bool check_swap(void *p, unsigned long size)
+{
+	char *cmd;
+	int ret;
+
+	ret = asprintf(&cmd, CHECK_SWAP_FMT,
+			(unsigned long)p, size >> 10, getpid());
+	if (ret < 0) {
+		perror("asprintf(CHECK_SWAP)");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = system(cmd);
+	free(cmd);
+	if (ret < 0 || !WIFEXITED(ret)) {
+		perror("system(check_swap)");
+		exit(EXIT_FAILURE);
+	}
+
+	return WEXITSTATUS(ret);
+}
+
+static void *alloc_mapping(void)
+{
+	void *p;
+
+	p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (p != BASE_ADDR) {
+		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+		exit(EXIT_FAILURE);
+	}
+
+	return p;
+}
+
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+	int i;
+
+	for (i = start / page_size; i < end / page_size; i++)
+		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
+}
+
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+	int i;
+
+	for (i = start / page_size; i < end / page_size; i++) {
+		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
+			printf("Page %d is corrupted: %#x\n",
+					i, p[i * page_size / sizeof(*p)]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+#define TICK 500000
+static bool wait_for_scan(const char *msg, char *p)
+{
+	int full_scans;
+	int timeout = 6; /* 3 seconds */
+
+	/* Sanity check */
+	if (check_huge(p)) {
+		printf("Unexpected huge page\n");
+		exit(EXIT_FAILURE);
+	}
+
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+
+	/* Wait until the second full_scan completed */
+	full_scans = read_num("khugepaged/full_scans") + 2;
+
+	printf("%s...", msg);
+	while (timeout--) {
+		if (check_huge(p))
+			break;
+		if (read_num("khugepaged/full_scans") >= full_scans)
+			break;
+		printf(".");
+		usleep(TICK);
+	}
+
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+	return !timeout;
+}
+
+static void alloc_at_fault(void)
+{
+	struct settings settings = default_settings;
+	char *p;
+
+	settings.thp_enabled = THP_ALWAYS;
+	write_settings(&settings);
+
+	p = alloc_mapping();
+	*p = 1;
+	printf("Allocate huge page on fault...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	write_settings(&default_settings);
+
+	madvise(p, page_size, MADV_DONTNEED);
+	printf("Split huge PMD on MADV_DONTNEED...");
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+	fill_memory(p, 0, hpage_pmd_size);
+	if (wait_for_scan("Collapse fully populated PTE table", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_empty(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+	if (wait_for_scan("Do not collapse empty PTE table", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		fail("Fail");
+	else
+		success("OK");
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+	fill_memory(p, 0, page_size);
+	if (wait_for_scan("Collapse PTE table with single PTE entry present", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_none(void)
+{
+	int max_ptes_none = hpage_pmd_nr / 2;
+	struct settings settings = default_settings;
+	void *p;
+
+	settings.khugepaged.max_ptes_none = max_ptes_none;
+	write_settings(&settings);
+
+	p = alloc_mapping();
+
+	fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+	if (wait_for_scan("Do not collapse with max_ptes_none exeeded", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		fail("Fail");
+	else
+		success("OK");
+	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+
+	fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+	if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+
+	munmap(p, hpage_pmd_size);
+	write_settings(&default_settings);
+}
+
+static void collapse_swapin_single_pte(void)
+{
+	void *p;
+	p = alloc_mapping();
+	fill_memory(p, 0, hpage_pmd_size);
+
+	printf("Swapout one page...");
+	if (madvise(p, page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	if (wait_for_scan("Collapse with swaping in single PTE entry", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+out:
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_swap(void)
+{
+	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
+	void *p;
+
+	p = alloc_mapping();
+
+	fill_memory(p, 0, hpage_pmd_size);
+	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	if (wait_for_scan("Do not collapse with max_ptes_swap exeeded", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		fail("Fail");
+	else
+		success("OK");
+	validate_memory(p, 0, hpage_pmd_size);
+
+	fill_memory(p, 0, hpage_pmd_size);
+	printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr);
+	if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, max_ptes_swap * page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+out:
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry_compound(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+	printf("Split huge page leaving single PTE mapping compount page...");
+	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	if (wait_for_scan("Collapse PTE table with single PTE mapping compount page", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full_of_compound(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Split huge page leaving single PTE page table full of compount pages...");
+	madvise(p, page_size, MADV_NOHUGEPAGE);
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	if (wait_for_scan("Collapse PTE table full of compound pages", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_fork(void)
+{
+	int wstatus;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate small page...");
+	fill_memory(p, 0, page_size);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share small page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		fill_memory(p, page_size, 2 * page_size);
+
+		if (wait_for_scan("Collapse PTE table with single page shared with parent process", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		validate_memory(p, 0, page_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has small page...");
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_fork_compound(void)
+{
+	int wstatus;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share huge page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Split huge page PMD in child process...");
+		madvise(p, page_size, MADV_NOHUGEPAGE);
+		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+		fill_memory(p, 0, page_size);
+
+		if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		validate_memory(p, 0, hpage_pmd_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+int main(void)
+{
+	setbuf(stdout, NULL);
+
+	page_size = getpagesize();
+	hpage_pmd_size = read_num("hpage_pmd_size");
+	hpage_pmd_nr = hpage_pmd_size / page_size;
+
+	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
+	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
+
+	save_settings();
+	adjust_settings();
+
+	alloc_at_fault();
+	collapse_full();
+	collapse_empty();
+	collapse_single_pte_entry();
+	collapse_max_ptes_none();
+	collapse_swapin_single_pte();
+	collapse_max_ptes_swap();
+	collapse_single_pte_entry_compound();
+	collapse_full_of_compound();
+	collapse_fork();
+	collapse_fork_compound();
+
+	restore_settings(0);
+}
-- 
2.26.0




* [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced
  2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
  2020-03-27 17:05 ` [PATCH 1/7] khugepaged: Add self test Kirill A. Shutemov
@ 2020-03-27 17:05 ` Kirill A. Shutemov
  2020-03-27 17:30   ` Zi Yan
  2020-03-27 17:46   ` Yang Shi
  2020-03-27 17:05 ` [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins Kirill A. Shutemov
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:05 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

__collapse_huge_page_swapin() check number of referenced PTE to decide
if the memory range is hot enough to justify swapin.

The problem is that it stops collapse altogether if there's not enough
refereced pages, not only swappingin.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Fixes: 0db501f7a34c ("mm, thp: convert from optimistic swapin collapsing to conservative")
---
 mm/khugepaged.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 99bab7e4d05b..14d7afc90786 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -905,7 +905,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 	/* we only decide to swapin, if there is enough young ptes */
 	if (referenced < HPAGE_PMD_NR/2) {
 		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
-		return false;
+		/* Do not block collapse, only skip swapping in */
+		return true;
 	}
 	vmf.pte = pte_offset_map(pmd, address);
 	for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
-- 
2.26.0




* [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins
  2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
  2020-03-27 17:05 ` [PATCH 1/7] khugepaged: Add self test Kirill A. Shutemov
  2020-03-27 17:05 ` [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
@ 2020-03-27 17:05 ` Kirill A. Shutemov
  2020-03-27 17:34   ` Zi Yan
  2020-03-27 18:10   ` Yang Shi
  2020-03-27 17:05 ` [PATCH 4/7] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:05 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

__collapse_huge_page_isolate() may fail due to an extra pin in the LRU
add pagevec. It's pretty common for the swapin case: we swap in pages
just to fail due to the extra pin.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 14d7afc90786..39e0994abeb8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -585,11 +585,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 * The page must only be referenced by the scanned process
 		 * and page swap cache.
 		 */
+		if (page_count(page) != 1 + PageSwapCache(page)) {
+			/*
+			 * Drain pagevec and retry just in case we can get rid
+			 * of the extra pin, like in swapin case.
+			 */
+			lru_add_drain();
+		}
 		if (page_count(page) != 1 + PageSwapCache(page)) {
 			unlock_page(page);
 			result = SCAN_PAGE_COUNT;
 			goto out;
 		}
+
 		if (pte_write(pteval)) {
 			writable = true;
 		} else {
-- 
2.26.0




* [PATCH 4/7] khugepaged: Allow to collapse a page shared across fork
  2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (2 preceding siblings ...)
  2020-03-27 17:05 ` [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins Kirill A. Shutemov
@ 2020-03-27 17:05 ` Kirill A. Shutemov
  2020-03-27 18:19   ` Zi Yan
  2020-03-27 17:05 ` [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:05 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

The page can be included in the collapse as long as it doesn't have
extra pins (from GUP or otherwise).
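
In other words, the check moves from "mapped exactly once" to "no references
beyond the page table mappings and the swap cache". A kernel-style sketch of
the invariant the new check relies on (illustrative helper, not part of the
patch):

	/*
	 * Each PTE that maps the page contributes one reference, and the
	 * swap cache (if any) contributes one more.  Anything on top of
	 * that must be an external pin: GUP, LRU-add pagevec, etc.
	 */
	static bool page_has_external_pins(struct page *page)
	{
		return page_count(page) !=
			total_mapcount(page) + PageSwapCache(page);
	}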

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 39e0994abeb8..b47edfe57f7b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -581,18 +581,26 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		}
 
 		/*
-		 * cannot use mapcount: can't collapse if there's a gup pin.
-		 * The page must only be referenced by the scanned process
-		 * and page swap cache.
+		 * Check if the page has any GUP (or other external) pins.
+		 *
+		 * The page table that maps the page has already been unlinked
+		 * from the page table tree and this process cannot get an
+		 * additional pin on the page.
+		 *
+		 * New pins can come later if the page is shared across fork,
+		 * but not for this process. That is fine: the other process
+		 * cannot write to the page, only trigger CoW.
 		 */
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		if (total_mapcount(page) + PageSwapCache(page) !=
+				page_count(page)) {
 			/*
 			 * Drain pagevec and retry just in case we can get rid
 			 * of the extra pin, like in swapin case.
 			 */
 			lru_add_drain();
 		}
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		if (total_mapcount(page) + PageSwapCache(page) !=
+				page_count(page)) {
 			unlock_page(page);
 			result = SCAN_PAGE_COUNT;
 			goto out;
@@ -680,7 +688,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 		} else {
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
-			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
 			release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to
@@ -1209,12 +1216,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
-		/*
-		 * cannot use mapcount: can't collapse if there's a gup pin.
-		 * The page must only be referenced by the scanned process
-		 * and page swap cache.
-		 */
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		/* Check if the page has any GUP (or other external) pins */
+		if (total_mapcount(page) + PageSwapCache(page) !=
+				page_count(page)) {
 			result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}
-- 
2.26.0




* [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (3 preceding siblings ...)
  2020-03-27 17:05 ` [PATCH 4/7] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
@ 2020-03-27 17:05 ` Kirill A. Shutemov
  2020-03-27 18:53   ` Yang Shi
                     ` (2 more replies)
  2020-03-27 17:06 ` [PATCH 6/7] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
  2020-03-27 17:06 ` [PATCH 7/7] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov
  6 siblings, 3 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:05 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

We can collapse PTE-mapped compound pages. We only need to avoid
handling them more than once: lock/unlock the page only once if it is
present in the PMD range multiple times, as it is handled at the
compound level. The same goes for LRU isolation and putback.
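
A kernel-style sketch of the bookkeeping (simplified and illustrative only,
not the hunk below): remember each compound head on a local list the first
time it is met, and skip the remaining PTEs that map into the same compound
page:

	/*
	 * Returns true the first time a given compound head is seen and
	 * queues it on the caller's local list; returns false on repeats
	 * so the caller can skip pages it has already locked/isolated.
	 */
	static bool first_encounter(struct page *page,
				    struct list_head *compound_pagelist)
	{
		struct page *head = compound_head(page), *p;

		list_for_each_entry(p, compound_pagelist, lru) {
			if (p == head)
				return false;
		}
		list_add_tail(&head->lru, compound_pagelist);
		return true;
	}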

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b47edfe57f7b..c8c2c463095c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
+	/*
+	 * We need to unlock and put compound page on LRU only once.
+	 * The rest of the pages have to be locked and not on LRU here.
+	 */
+	VM_BUG_ON_PAGE(!PageCompound(page) &&
+			(!PageLocked(page) && PageLRU(page)), page);
+
+	if (!PageLocked(page))
+		return;
+
+	page = compound_head(page);
 	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
 	unlock_page(page);
 	putback_lru_page(page);
@@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	pte_t *_pte;
 	int none_or_zero = 0, result = 0, referenced = 0;
 	bool writable = false;
+	LIST_HEAD(compound_pagelist);
 
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, address += PAGE_SIZE) {
@@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 
-		/* TODO: teach khugepaged to collapse THP mapped with pte */
+		VM_BUG_ON_PAGE(!PageAnon(page), page);
+
 		if (PageCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
-			goto out;
-		}
+			struct page *p;
+			page = compound_head(page);
 
-		VM_BUG_ON_PAGE(!PageAnon(page), page);
+			/*
+			 * Check if we have dealt with the compound page
+			 * already
+			 */
+			list_for_each_entry(p, &compound_pagelist, lru) {
+				if (page == p)
+					break;
+			}
+			if (page == p)
+				continue;
+		}
 
 		/*
 		 * We can do it before isolate_lru_page because the
@@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced++;
+
+		if (PageCompound(page))
+			list_add_tail(&page->lru, &compound_pagelist);
 	}
 	if (likely(writable)) {
 		if (likely(referenced)) {
@@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
-		/* TODO: teach khugepaged to collapse THP mapped with pte */
-		if (PageCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
-			goto out_unmap;
-		}
+		page = compound_head(page);
 
 		/*
 		 * Record which node the original page is from and save this
-- 
2.26.0




* [PATCH 6/7] thp: Change CoW semantics for anon-THP
  2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (4 preceding siblings ...)
  2020-03-27 17:05 ` [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
@ 2020-03-27 17:06 ` Kirill A. Shutemov
  2020-03-27 20:07   ` Yang Shi
  2020-03-27 17:06 ` [PATCH 7/7] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov
  6 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:06 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

Currently we have different copy-on-write semantics for anon- and
file-THP. For anon-THP we try to allocate a huge page on the write
fault, but for file-THP we split the PMD and allocate a 4k page.

Arguably, the file-THP semantics are more desirable: we don't
necessarily want to unshare the full PMD range from the parent on the
first access. This is the primary reason THP is unusable for some
workloads, like Redis.

The original THP refcounting didn't allow PTE-mapped compound pages, so
we had no option but to allocate a huge page on CoW (with fallback to
512 4k pages).

The current refcounting doesn't have such limitations and we can cut a
lot of complex code out of the fault path.

khugepaged is now able to recover THP from such ranges if the
configuration allows.
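
One way to observe the change from user space (an illustrative sketch, not
part of the patch; it assumes a 2M PMD size and a THP-backed mapping, and
omits error handling): after fork(), a single write in the child now splits
the PMD and copies one 4k page instead of unsharing the whole 2M range.

	#define _GNU_SOURCE
	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/wait.h>

	#define HPAGE_SIZE (2UL << 20)

	int main(void)
	{
		/* PMD-aligned hint so the first fault can use a huge page. */
		char *p = mmap((void *)(1UL << 30), HPAGE_SIZE,
			       PROT_READ | PROT_WRITE,
			       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

		madvise(p, HPAGE_SIZE, MADV_HUGEPAGE);
		memset(p, 1, HPAGE_SIZE);	/* ideally faults in a 2M THP */

		if (!fork()) {
			/*
			 * CoW: the old code tried to allocate a fresh 2M page
			 * here; with this patch the PMD is split and only one
			 * 4k page is copied.  AnonHugePages in the child's
			 * /proc/self/smaps drops to 0 for this range.
			 */
			p[0] = 2;
			_exit(0);
		}
		wait(NULL);
		return 0;
	}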

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/huge_memory.c | 247 +++++------------------------------------------
 1 file changed, 24 insertions(+), 223 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ef6a6bcb291f..15b7a9c86b7c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
 	spin_unlock(vmf->ptl);
 }
 
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
-			pmd_t orig_pmd, struct page *page)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	struct mem_cgroup *memcg;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-	vm_fault_t ret = 0;
-	struct page **pages;
-	struct mmu_notifier_range range;
-
-	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
-			      GFP_KERNEL);
-	if (unlikely(!pages)) {
-		ret |= VM_FAULT_OOM;
-		goto out;
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
-					       vmf->address, page_to_nid(page));
-		if (unlikely(!pages[i] ||
-			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
-				     GFP_KERNEL, &memcg, false))) {
-			if (pages[i])
-				put_page(pages[i]);
-			while (--i >= 0) {
-				memcg = (void *)page_private(pages[i]);
-				set_page_private(pages[i], 0);
-				mem_cgroup_cancel_charge(pages[i], memcg,
-						false);
-				put_page(pages[i]);
-			}
-			kfree(pages);
-			ret |= VM_FAULT_OOM;
-			goto out;
-		}
-		set_page_private(pages[i], (unsigned long)memcg);
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		copy_user_highpage(pages[i], page + i,
-				   haddr + PAGE_SIZE * i, vma);
-		__SetPageUptodate(pages[i]);
-		cond_resched();
-	}
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-
-	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_free_pages;
-	VM_BUG_ON_PAGE(!PageHead(page), page);
-
-	/*
-	 * Leave pmd empty until pte is filled note we must notify here as
-	 * concurrent CPU thread might write to new page before the call to
-	 * mmu_notifier_invalidate_range_end() happens which can lead to a
-	 * device seeing memory write in different order than CPU.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
-	pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t entry;
-		entry = mk_pte(pages[i], vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
-		mem_cgroup_commit_charge(pages[i], memcg, false, false);
-		lru_cache_add_active_or_unevictable(pages[i], vma);
-		vmf->pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*vmf->pte));
-		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
-		pte_unmap(vmf->pte);
-	}
-	kfree(pages);
-
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
-	page_remove_rmap(page, true);
-	spin_unlock(vmf->ptl);
-
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(&range);
-
-	ret |= VM_FAULT_WRITE;
-	put_page(page);
-
-out:
-	return ret;
-
-out_free_pages:
-	spin_unlock(vmf->ptl);
-	mmu_notifier_invalidate_range_end(&range);
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		mem_cgroup_cancel_charge(pages[i], memcg, false);
-		put_page(pages[i]);
-	}
-	kfree(pages);
-	goto out;
-}
-
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct page *page = NULL, *new_page;
-	struct mem_cgroup *memcg;
+	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	struct mmu_notifier_range range;
-	gfp_t huge_gfp;			/* for allocation and charge */
-	vm_fault_t ret = 0;
 
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
 	if (is_huge_zero_pmd(orig_pmd))
-		goto alloc;
+		goto fallback;
+
 	spin_lock(vmf->ptl);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_unlock;
+
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+		spin_unlock(vmf->ptl);
+		return 0;
+	}
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-	/*
-	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part.
-	 */
+
+	/* Lock page for reuse_swap_page() */
 	if (!trylock_page(page)) {
 		get_page(page);
 		spin_unlock(vmf->ptl);
 		lock_page(page);
 		spin_lock(vmf->ptl);
 		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+			spin_unlock(vmf->ptl);
 			unlock_page(page);
 			put_page(page);
-			goto out_unlock;
+			return 0;
 		}
 		put_page(page);
 	}
+
+	/*
+	 * We can only reuse the page if nobody else maps the huge page or it's
+	 * part.
+	 */
 	if (reuse_swap_page(page, NULL)) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		ret |= VM_FAULT_WRITE;
 		unlock_page(page);
-		goto out_unlock;
-	}
-	unlock_page(page);
-	get_page(page);
-	spin_unlock(vmf->ptl);
-alloc:
-	if (__transparent_hugepage_enabled(vma) &&
-	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
-	} else
-		new_page = NULL;
-
-	if (likely(new_page)) {
-		prep_transhuge_page(new_page);
-	} else {
-		if (!page) {
-			split_huge_pmd(vma, vmf->pmd, vmf->address);
-			ret |= VM_FAULT_FALLBACK;
-		} else {
-			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
-			if (ret & VM_FAULT_OOM) {
-				split_huge_pmd(vma, vmf->pmd, vmf->address);
-				ret |= VM_FAULT_FALLBACK;
-			}
-			put_page(page);
-		}
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
-					huge_gfp, &memcg, true))) {
-		put_page(new_page);
-		split_huge_pmd(vma, vmf->pmd, vmf->address);
-		if (page)
-			put_page(page);
-		ret |= VM_FAULT_FALLBACK;
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	count_vm_event(THP_FAULT_ALLOC);
-	count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
-	if (!page)
-		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
-	else
-		copy_user_huge_page(new_page, page, vmf->address,
-				    vma, HPAGE_PMD_NR);
-	__SetPageUptodate(new_page);
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-
-	spin_lock(vmf->ptl);
-	if (page)
-		put_page(page);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
 		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(new_page, memcg, true);
-		put_page(new_page);
-		goto out_mn;
-	} else {
-		pmd_t entry;
-		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-		page_add_new_anon_rmap(new_page, vma, haddr, true);
-		mem_cgroup_commit_charge(new_page, memcg, false, true);
-		lru_cache_add_active_or_unevictable(new_page, vma);
-		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		if (!page) {
-			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		} else {
-			VM_BUG_ON_PAGE(!PageHead(page), page);
-			page_remove_rmap(page, true);
-			put_page(page);
-		}
-		ret |= VM_FAULT_WRITE;
+		return VM_FAULT_WRITE;
 	}
+
+	unlock_page(page);
 	spin_unlock(vmf->ptl);
-out_mn:
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(&range);
-out:
-	return ret;
-out_unlock:
-	spin_unlock(vmf->ptl);
-	return ret;
+fallback:
+	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+	return VM_FAULT_FALLBACK;
 }
 
 /*
-- 
2.26.0




* [PATCH 7/7] khugepaged: Introduce 'max_ptes_shared' tunable
  2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (5 preceding siblings ...)
  2020-03-27 17:06 ` [PATCH 6/7] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
@ 2020-03-27 17:06 ` Kirill A. Shutemov
  6 siblings, 0 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-27 17:06 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli; +Cc: linux-mm, linux-kernel, Kirill A. Shutemov

``max_ptes_shared`` specifies how many pages can be shared across multiple
processes. Exceeding the number will block the collapse::

	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared

A higher value may increase memory footprint for some workloads.

By default, at least half of the pages must not be shared.
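
A usage sketch (illustrative only; the sysfs path is taken from the patch and
error handling is minimal): the tunable is a plain decimal value that can be
read and adjusted at run time, e.g. to make khugepaged more or less willing
to collapse ranges that are still shared after fork().

	#include <stdio.h>

	#define MAX_PTES_SHARED \
		"/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared"

	int main(void)
	{
		unsigned long val = 0;
		FILE *f = fopen(MAX_PTES_SHARED, "r");

		if (!f)
			return 1;	/* kernel without this patch */
		fscanf(f, "%lu", &val);	/* default: HPAGE_PMD_NR/2, 256 on x86-64 */
		fclose(f);
		printf("max_ptes_shared: %lu\n", val);

		/* Writing needs root; 0 forbids collapsing any shared page. */
		f = fopen(MAX_PTES_SHARED, "w");
		if (f) {
			fprintf(f, "%lu\n", val);	/* write the same value back */
			fclose(f);
		}
		return 0;
	}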

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 Documentation/admin-guide/mm/transhuge.rst |  7 ++
 mm/khugepaged.c                            | 52 ++++++++++++--
 tools/testing/selftests/vm/khugepaged.c    | 83 ++++++++++++++++++++++
 3 files changed, 138 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index bd5714547cee..d16e4f2bb70f 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being
 collapsed, resulting fewer pages being collapsed into
 THPs, and lower memory access performance.
 
+``max_ptes_shared`` specifies how many pages can be shared across multiple
+processes. Exceeding the number will block the collapse::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
+
+A higher value may increase memory footprint for some workloads.
+
 Boot parameter
 ==============
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c8c2c463095c..8e728a602491 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -28,6 +28,8 @@ enum scan_result {
 	SCAN_SUCCEED,
 	SCAN_PMD_NULL,
 	SCAN_EXCEED_NONE_PTE,
+	SCAN_EXCEED_SWAP_PTE,
+	SCAN_EXCEED_SHARED_PTE,
 	SCAN_PTE_NON_PRESENT,
 	SCAN_PAGE_RO,
 	SCAN_LACK_REFERENCED_PAGE,
@@ -46,7 +48,6 @@ enum scan_result {
 	SCAN_DEL_PAGE_LRU,
 	SCAN_ALLOC_HUGE_PAGE_FAIL,
 	SCAN_CGROUP_CHARGE_FAIL,
-	SCAN_EXCEED_SWAP_PTE,
 	SCAN_TRUNCATED,
 	SCAN_PAGE_HAS_PRIVATE,
 };
@@ -71,6 +72,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  */
 static unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
+static unsigned int khugepaged_max_ptes_shared __read_mostly;
 
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -290,15 +292,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
 	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
 	       khugepaged_max_ptes_swap_store);
 
+static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
+					     struct kobj_attribute *attr,
+					     char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+}
+
+static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
+					      struct kobj_attribute *attr,
+					      const char *buf, size_t count)
+{
+	int err;
+	unsigned long max_ptes_shared;
+
+	err  = kstrtoul(buf, 10, &max_ptes_shared);
+	if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+		return -EINVAL;
+
+	khugepaged_max_ptes_shared = max_ptes_shared;
+
+	return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_shared_attr =
+	__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
+	       khugepaged_max_ptes_shared_store);
+
 static struct attribute *khugepaged_attr[] = {
 	&khugepaged_defrag_attr.attr,
 	&khugepaged_max_ptes_none_attr.attr,
+	&khugepaged_max_ptes_swap_attr.attr,
+	&khugepaged_max_ptes_shared_attr.attr,
 	&pages_to_scan_attr.attr,
 	&pages_collapsed_attr.attr,
 	&full_scans_attr.attr,
 	&scan_sleep_millisecs_attr.attr,
 	&alloc_sleep_millisecs_attr.attr,
-	&khugepaged_max_ptes_swap_attr.attr,
 	NULL,
 };
 
@@ -360,6 +390,7 @@ int __init khugepaged_init(void)
 	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
 	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
 	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
 
 	return 0;
 }
@@ -546,7 +577,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
 	struct page *page = NULL;
 	pte_t *_pte;
-	int none_or_zero = 0, result = 0, referenced = 0;
+	int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
 	bool writable = false;
 	LIST_HEAD(compound_pagelist);
 
@@ -575,6 +606,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 
+		if (page_mapcount(page) > 1 &&
+				++shared > khugepaged_max_ptes_shared) {
+			result = SCAN_EXCEED_SHARED_PTE;
+			goto out;
+		}
+
 		if (PageCompound(page)) {
 			struct page *p;
 			page = compound_head(page);
@@ -1160,7 +1197,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
+	int ret = 0, result = 0, referenced = 0;
+	int none_or_zero = 0, shared = 0;
 	struct page *page = NULL;
 	unsigned long _address;
 	spinlock_t *ptl;
@@ -1210,6 +1248,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
+		if (page_mapcount(page) > 1 &&
+				++shared > khugepaged_max_ptes_shared) {
+			result = SCAN_EXCEED_SHARED_PTE;
+			goto out_unmap;
+		}
+
 		page = compound_head(page);
 
 		/*
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
index 193bde6a1534..3a98d5b2d6d8 100644
--- a/tools/testing/selftests/vm/khugepaged.c
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -77,6 +77,7 @@ struct khugepaged_settings {
 	unsigned int scan_sleep_millisecs;
 	unsigned int max_ptes_none;
 	unsigned int max_ptes_swap;
+	unsigned int max_ptes_shared;
 	unsigned long pages_to_scan;
 };
 
@@ -276,6 +277,7 @@ static void write_settings(struct settings *settings)
 			khugepaged->scan_sleep_millisecs);
 	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
 	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
 	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
 }
 
@@ -312,6 +314,7 @@ static void save_settings(void)
 			read_num("khugepaged/scan_sleep_millisecs"),
 		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
 		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
 		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
 	};
 	success("OK");
@@ -786,12 +789,90 @@ static void collapse_fork_compound(void)
 			fail("Fail");
 		fill_memory(p, 0, page_size);
 
+		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
 		if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
 			fail("Timeout");
 		else if (check_huge(p))
 			success("OK");
 		else
 			fail("Fail");
+		write_num("khugepaged/max_ptes_shared",
+				default_settings.khugepaged.max_ptes_shared);
+
+		validate_memory(p, 0, hpage_pmd_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_shared()
+{
+	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+	int wstatus;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share huge page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Trigger CoW in %d of %d...",
+				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		if (wait_for_scan("Do not collapse with max_ptes_shared exeeded", p))
+			fail("Timeout");
+		else if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Trigger CoW in %d of %d...",
+				hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+
+		if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
 
 		validate_memory(p, 0, hpage_pmd_size);
 		munmap(p, hpage_pmd_size);
@@ -820,6 +901,7 @@ int main(void)
 
 	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
 	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
 	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
 
 	save_settings();
@@ -836,6 +918,7 @@ int main(void)
 	collapse_full_of_compound();
 	collapse_fork();
 	collapse_fork_compound();
+	collapse_max_ptes_shared();
 
 	restore_settings(0);
 }
-- 
2.26.0




* Re: [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced
  2020-03-27 17:05 ` [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
@ 2020-03-27 17:30   ` Zi Yan
  2020-03-27 17:46   ` Yang Shi
  1 sibling, 0 replies; 38+ messages in thread
From: Zi Yan @ 2020-03-27 17:30 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov


On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:

> __collapse_huge_page_swapin() check number of referenced PTE to decide
> if the memory range is hot enough to justify swapin.
>
> The problem is that it stops collapse altogether if there's not enough
> refereced pages, not only swappingin.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Fixes: 0db501f7a34c ("mm, thp: convert from optimistic swapin collapsing to conservative")
> ---
>  mm/khugepaged.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 99bab7e4d05b..14d7afc90786 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -905,7 +905,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>  	/* we only decide to swapin, if there is enough young ptes */
>  	if (referenced < HPAGE_PMD_NR/2) {
>  		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> -		return false;
> +		/* Do not block collapse, only skip swapping in */
> +		return true;
>  	}
>  	vmf.pte = pte_offset_map(pmd, address);
>  	for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
> -- 
> 2.26.0

Makes sense.

Reviewed-by: Zi Yan <ziy@nvidia.com>

—
Best Regards,
Yan Zi



* Re: [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins
  2020-03-27 17:05 ` [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins Kirill A. Shutemov
@ 2020-03-27 17:34   ` Zi Yan
  2020-03-28  0:20     ` Kirill A. Shutemov
  2020-03-27 18:10   ` Yang Shi
  1 sibling, 1 reply; 38+ messages in thread
From: Zi Yan @ 2020-03-27 17:34 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov


On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:

> __collapse_huge_page_isolate() may fail due to an extra pin in the LRU
> add pagevec. It's pretty common for the swapin case: we swap in pages
> just to fail due to the extra pin.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 14d7afc90786..39e0994abeb8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -585,11 +585,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		 * The page must only be referenced by the scanned process
>  		 * and page swap cache.
>  		 */
> +		if (page_count(page) != 1 + PageSwapCache(page)) {
> +			/*
> +			 * Drain pagevec and retry just in case we can get rid
> +			 * of the extra pin, like in swapin case.
> +			 */
> +			lru_add_drain();
> +		}
>  		if (page_count(page) != 1 + PageSwapCache(page)) {
>  			unlock_page(page);
>  			result = SCAN_PAGE_COUNT;
>  			goto out;
>  		}
> +
>  		if (pte_write(pteval)) {
>  			writable = true;
>  		} else {
> -- 
> 2.26.0

Looks good to me. Is the added empty line intentional?

Reviewed-by: Zi Yan <ziy@nvidia.com>

—
Best Regards,
Yan Zi



* Re: [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced
  2020-03-27 17:05 ` [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
  2020-03-27 17:30   ` Zi Yan
@ 2020-03-27 17:46   ` Yang Shi
  1 sibling, 0 replies; 38+ messages in thread
From: Yang Shi @ 2020-03-27 17:46 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> __collapse_huge_page_swapin() check number of referenced PTE to decide
> if the memory range is hot enough to justify swapin.
>
> The problem is that it stops collapse altogether if there's not enough
> refereced pages, not only swappingin.

s/refereced/referenced

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Fixes: 0db501f7a34c ("mm, thp: convert from optimistic swapin collapsing to conservative")
> ---
>  mm/khugepaged.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 99bab7e4d05b..14d7afc90786 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -905,7 +905,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>         /* we only decide to swapin, if there is enough young ptes */
>         if (referenced < HPAGE_PMD_NR/2) {
>                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> -               return false;
> +               /* Do not block collapse, only skip swapping in */
> +               return true;
>         }
>         vmf.pte = pte_offset_map(pmd, address);
>         for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
> --
> 2.26.0
>
>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins
  2020-03-27 17:05 ` [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins Kirill A. Shutemov
  2020-03-27 17:34   ` Zi Yan
@ 2020-03-27 18:10   ` Yang Shi
  2020-03-28 12:18     ` Kirill A. Shutemov
  1 sibling, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-27 18:10 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> __collapse_huge_page_isolate() may fail due to extra pin in the LRU add
> pagevec. It's petty common for swapin case: we swap in pages just to
> fail due to the extra pin.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 14d7afc90786..39e0994abeb8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -585,11 +585,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                  * The page must only be referenced by the scanned process
>                  * and page swap cache.
>                  */
> +               if (page_count(page) != 1 + PageSwapCache(page)) {
> +                       /*
> +                        * Drain pagevec and retry just in case we can get rid
> +                        * of the extra pin, like in swapin case.
> +                        */
> +                       lru_add_drain();

This is definitely correct.

I'm wondering if we need one more lru_add_drain() before the PageLRU check
in khugepaged_scan_pmd() as well? The page might still be in the LRU add
pagevec and then get skipped. This would improve the success rate.
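
Roughly what I have in mind, as a sketch only (untested; the exact placement
around the existing PageLRU check in khugepaged_scan_pmd() is my assumption):

		if (!PageLRU(page)) {
			/*
			 * The page may still sit in the per-CPU LRU add
			 * pagevec right after fault or swapin. Drain once
			 * before giving up on it.
			 */
			lru_add_drain();
			if (!PageLRU(page)) {
				result = SCAN_PAGE_LRU;
				goto out_unmap;
			}
		}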

Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>

> +               }
>                 if (page_count(page) != 1 + PageSwapCache(page)) {
>                         unlock_page(page);
>                         result = SCAN_PAGE_COUNT;
>                         goto out;
>                 }
> +
>                 if (pte_write(pteval)) {
>                         writable = true;
>                 } else {
> --
> 2.26.0
>
>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/7] khugepaged: Allow to callapse a page shared across fork
  2020-03-27 17:05 ` [PATCH 4/7] khugepaged: Allow to callapse a page shared across fork Kirill A. Shutemov
@ 2020-03-27 18:19   ` Zi Yan
  2020-03-27 21:31     ` Yang Shi
  0 siblings, 1 reply; 38+ messages in thread
From: Zi Yan @ 2020-03-27 18:19 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov

[-- Attachment #1: Type: text/plain, Size: 2974 bytes --]

On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:

> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 28 ++++++++++++++++------------
>  1 file changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 39e0994abeb8..b47edfe57f7b 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -581,18 +581,26 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		}
>
>  		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> +		 * Check if the page has any GUP (or other external) pins.
> +		 *
> +		 * The page table that maps the page has been already unlinked
> +		 * from the page table tree and this process cannot get
> +		 * additinal pin on the page.
> +		 *
> +		 * New pins can come later if the page is shared across fork,
> +		 * but not for the this process. It is fine. The other process
> +		 * cannot write to the page, only trigger CoW.
>  		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {

Do you think having a helper function for this check would be better, since the check is used three times?
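
Something like the below, maybe (the name is just a placeholder):

static inline bool collapse_refcount_ok(struct page *page)
{
	/*
	 * Expect one reference per mapping (in any process) plus one
	 * from the swap cache. Anything beyond that is an extra pin,
	 * from GUP or otherwise, and blocks the collapse.
	 */
	return total_mapcount(page) + PageSwapCache(page) ==
		page_count(page);
}

Then all three call sites become "if (!collapse_refcount_ok(page))".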

>  			/*
>  			 * Drain pagevec and retry just in case we can get rid
>  			 * of the extra pin, like in swapin case.
>  			 */
>  			lru_add_drain();
>  		}
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {
>  			unlock_page(page);
>  			result = SCAN_PAGE_COUNT;
>  			goto out;
> @@ -680,7 +688,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>  		} else {
>  			src_page = pte_page(pteval);
>  			copy_user_highpage(page, src_page, address, vma);
> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);

Maybe replace it with this?

VM_BUG_ON_PAGE(page_mapcount(src_page) + PageSwapCache(src_page) != page_count(src_page), src_page);


>  			release_pte_page(src_page);
>  			/*
>  			 * ptl mostly unnecessary, but preempt has to
> @@ -1209,12 +1216,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  			goto out_unmap;
>  		}
>
> -		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> -		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		/* Check if the page has any GUP (or other external) pins */
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {
>  			result = SCAN_PAGE_COUNT;
>  			goto out_unmap;
>  		}
> -- 
> 2.26.0

Thanks.

—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 508 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-27 17:05 ` [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
@ 2020-03-27 18:53   ` Yang Shi
  2020-03-28  0:34     ` Kirill A. Shutemov
  2020-03-27 18:55   ` Zi Yan
  2020-03-27 20:45   ` Yang Shi
  2 siblings, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-27 18:53 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it handled on compound level. The
> same goes for LRU isolation and putpack.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b47edfe57f7b..c8c2c463095c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>
>  static void release_pte_page(struct page *page)
>  {
> +       /*
> +        * We need to unlock and put compound page on LRU only once.
> +        * The rest of the pages have to be locked and not on LRU here.
> +        */
> +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> +                       (!PageLocked(page) && PageLRU(page)), page);
> +
> +       if (!PageLocked(page))
> +               return;
> +
> +       page = compound_head(page);
>         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));

We need to count in base page units here. The same counter is
modified by vmscan in base page units. The inc path needs the same change.
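
Roughly something like this on the release side (sketch only, not tested):

	mod_node_page_state(page_pgdat(page),
			NR_ISOLATED_ANON + page_is_file_cache(page),
			-hpage_nr_pages(page));

with a matching +hpage_nr_pages(page) where the page is isolated.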

>         unlock_page(page);
>         putback_lru_page(page);
> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>         pte_t *_pte;
>         int none_or_zero = 0, result = 0, referenced = 0;
>         bool writable = false;
> +       LIST_HEAD(compound_pagelist);
>
>         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>              _pte++, address += PAGE_SIZE) {
> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                         goto out;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
>                 if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out;
> -               }
> +                       struct page *p;
> +                       page = compound_head(page);
>
> -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +                       /*
> +                        * Check if we have dealt with the compount page

s/compount/compound

> +                        * already
> +                        */
> +                       list_for_each_entry(p, &compound_pagelist, lru) {
> +                               if (page ==  p)
> +                                       break;
> +                       }
> +                       if (page ==  p)
> +                               continue;

I don't quite understand why we need the above check. My understanding
is that when we scan the PTEs, once the first PTE-mapped subpage is found
the THP is added to compound_pagelist, and later iterations would find the
same THP on the list and just break out of the list walk. Did I miss
anything?

> +               }
>
>                 /*
>                  * We can do it before isolate_lru_page because the
> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                     page_is_young(page) || PageReferenced(page) ||
>                     mmu_notifier_test_young(vma->vm_mm, address))
>                         referenced++;
> +
> +               if (PageCompound(page))
> +                       list_add_tail(&page->lru, &compound_pagelist);
>         }
>         if (likely(writable)) {
>                 if (likely(referenced)) {
> @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                         goto out_unmap;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> -               if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out_unmap;
> -               }
> +               page = compound_head(page);
>
>                 /*
>                  * Record which node the original page is from and save this
> --
> 2.26.0
>
>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-27 17:05 ` [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
  2020-03-27 18:53   ` Yang Shi
@ 2020-03-27 18:55   ` Zi Yan
  2020-03-28  0:39     ` Kirill A. Shutemov
  2020-03-27 20:45   ` Yang Shi
  2 siblings, 1 reply; 38+ messages in thread
From: Zi Yan @ 2020-03-27 18:55 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov

[-- Attachment #1: Type: text/plain, Size: 3626 bytes --]

On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:

> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it handled on compound level. The
> same goes for LRU isolation and putpack.

s/putpack/putback/

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b47edfe57f7b..c8c2c463095c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>
>  static void release_pte_page(struct page *page)
>  {
> +	/*
> +	 * We need to unlock and put compound page on LRU only once.
> +	 * The rest of the pages have to be locked and not on LRU here.
> +	 */
> +	VM_BUG_ON_PAGE(!PageCompound(page) &&
> +			(!PageLocked(page) && PageLRU(page)), page);
It only checks base pages.

Add

VM_BUG_ON_PAGE(PageCompound(page) && PageLocked(page) && PageLRU(page), page);

to check for compound pages.

> +
> +	if (!PageLocked(page))
> +		return;
> +
> +	page = compound_head(page);
>  	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
>  	unlock_page(page);
>  	putback_lru_page(page);
> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  	pte_t *_pte;
>  	int none_or_zero = 0, result = 0, referenced = 0;
>  	bool writable = false;
> +	LIST_HEAD(compound_pagelist);
>
>  	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>  	     _pte++, address += PAGE_SIZE) {
> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			goto out;
>  		}
>
> -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> +		VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
>  		if (PageCompound(page)) {
> -			result = SCAN_PAGE_COMPOUND;
> -			goto out;
> -		}
> +			struct page *p;
> +			page = compound_head(page);
>
> -		VM_BUG_ON_PAGE(!PageAnon(page), page);
> +			/*
> +			 * Check if we have dealt with the compount page

s/compount/compound/

> +			 * already
> +			 */
> +			list_for_each_entry(p, &compound_pagelist, lru) {
> +				if (page ==  p)
> +					break;
> +			}
> +			if (page ==  p)
> +				continue;
> +		}
>
>  		/*
>  		 * We can do it before isolate_lru_page because the
> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		    page_is_young(page) || PageReferenced(page) ||
>  		    mmu_notifier_test_young(vma->vm_mm, address))
>  			referenced++;
> +
> +		if (PageCompound(page))
> +			list_add_tail(&page->lru, &compound_pagelist);
>  	}
>  	if (likely(writable)) {
>  		if (likely(referenced)) {

Do we need a list here? There should be at most one compound page we will see here, right?

If a compound page is seen here, can we bail out of the loop early? I guess not,
because we can have a partially mapped compound page at the beginning or the end of a VMA, right?

> @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  			goto out_unmap;
>  		}
>
> -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> -		if (PageCompound(page)) {
> -			result = SCAN_PAGE_COMPOUND;
> -			goto out_unmap;
> -		}
> +		page = compound_head(page);
>
>  		/*
>  		 * Record which node the original page is from and save this
> -- 
> 2.26.0


—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 6/7] thp: Change CoW semantics for anon-THP
  2020-03-27 17:06 ` [PATCH 6/7] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
@ 2020-03-27 20:07   ` Yang Shi
  2020-03-28  0:43     ` Kirill A. Shutemov
  0 siblings, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-27 20:07 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> Currently we have different copy-on-write semantics for anon- and
> file-THP. For anon-THP we try to allocate huge page on the write fault,
> but on file-THP we split PMD and allocate 4k page.

I agree this seems confusing.

>
> Arguably, file-THP semantics is more desirable: we don't necessary want
> to unshare full PMD range from the parent on the first access. This is
> the primary reason THP is unusable for some workloads, like Redis.
>
> The original THP refcounting didn't allow to have PTE-mapped compound
> pages, so we had no options, but to allocate huge page on CoW (with
> fallback to 512 4k pages).
>
> The current refcounting doesn't have such limitations and we can cut a
> lot of complex code out of fault path.
>
> khugepaged is now able to recover THP from such ranges if the
> configuration allows.

It looks like this patch would just split the PMD then fall back to handling
it on the PTE level, which definitely simplifies the code a lot. However, it
makes the use of THP depend on the productivity of khugepaged. And the
success rate of THP allocation in khugepaged depends on defrag. But by
default khugepaged runs at very low priority, so khugepaged defrag may
result in priority inversion easily.

For example, we saw a THP split in the reclaim path, triggered by khugepaged,
which held the anon_vma write lock while another process that was also doing
reclaim was blocked on that anon_vma lock. The cond_resched() in the rmap
walk makes it easy for khugepaged to be preempted by all other processes,
so khugepaged may hold the anon_vma lock for an indefinite time. That is
when we saw hung tasks.

So we have khugepaged defrag disabled for some workloads to work around
the priority inversion problem. I'm concerned that some processes may
never get THP for some of our workloads with this patch applied.

The priority inversion caused by khugepaged was not very common. Just
my two cents.

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/huge_memory.c | 247 +++++------------------------------------------
>  1 file changed, 24 insertions(+), 223 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index ef6a6bcb291f..15b7a9c86b7c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
>         spin_unlock(vmf->ptl);
>  }
>
> -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> -                       pmd_t orig_pmd, struct page *page)
> -{
> -       struct vm_area_struct *vma = vmf->vma;
> -       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -       struct mem_cgroup *memcg;
> -       pgtable_t pgtable;
> -       pmd_t _pmd;
> -       int i;
> -       vm_fault_t ret = 0;
> -       struct page **pages;
> -       struct mmu_notifier_range range;
> -
> -       pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
> -                             GFP_KERNEL);
> -       if (unlikely(!pages)) {
> -               ret |= VM_FAULT_OOM;
> -               goto out;
> -       }
> -
> -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> -               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
> -                                              vmf->address, page_to_nid(page));
> -               if (unlikely(!pages[i] ||
> -                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
> -                                    GFP_KERNEL, &memcg, false))) {
> -                       if (pages[i])
> -                               put_page(pages[i]);
> -                       while (--i >= 0) {
> -                               memcg = (void *)page_private(pages[i]);
> -                               set_page_private(pages[i], 0);
> -                               mem_cgroup_cancel_charge(pages[i], memcg,
> -                                               false);
> -                               put_page(pages[i]);
> -                       }
> -                       kfree(pages);
> -                       ret |= VM_FAULT_OOM;
> -                       goto out;
> -               }
> -               set_page_private(pages[i], (unsigned long)memcg);
> -       }
> -
> -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> -               copy_user_highpage(pages[i], page + i,
> -                                  haddr + PAGE_SIZE * i, vma);
> -               __SetPageUptodate(pages[i]);
> -               cond_resched();
> -       }
> -
> -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> -                               haddr, haddr + HPAGE_PMD_SIZE);
> -       mmu_notifier_invalidate_range_start(&range);
> -
> -       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> -               goto out_free_pages;
> -       VM_BUG_ON_PAGE(!PageHead(page), page);
> -
> -       /*
> -        * Leave pmd empty until pte is filled note we must notify here as
> -        * concurrent CPU thread might write to new page before the call to
> -        * mmu_notifier_invalidate_range_end() happens which can lead to a
> -        * device seeing memory write in different order than CPU.
> -        *
> -        * See Documentation/vm/mmu_notifier.rst
> -        */
> -       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> -
> -       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
> -       pmd_populate(vma->vm_mm, &_pmd, pgtable);
> -
> -       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> -               pte_t entry;
> -               entry = mk_pte(pages[i], vma->vm_page_prot);
> -               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -               memcg = (void *)page_private(pages[i]);
> -               set_page_private(pages[i], 0);
> -               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> -               mem_cgroup_commit_charge(pages[i], memcg, false, false);
> -               lru_cache_add_active_or_unevictable(pages[i], vma);
> -               vmf->pte = pte_offset_map(&_pmd, haddr);
> -               VM_BUG_ON(!pte_none(*vmf->pte));
> -               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
> -               pte_unmap(vmf->pte);
> -       }
> -       kfree(pages);
> -
> -       smp_wmb(); /* make pte visible before pmd */
> -       pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
> -       page_remove_rmap(page, true);
> -       spin_unlock(vmf->ptl);
> -
> -       /*
> -        * No need to double call mmu_notifier->invalidate_range() callback as
> -        * the above pmdp_huge_clear_flush_notify() did already call it.
> -        */
> -       mmu_notifier_invalidate_range_only_end(&range);
> -
> -       ret |= VM_FAULT_WRITE;
> -       put_page(page);
> -
> -out:
> -       return ret;
> -
> -out_free_pages:
> -       spin_unlock(vmf->ptl);
> -       mmu_notifier_invalidate_range_end(&range);
> -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> -               memcg = (void *)page_private(pages[i]);
> -               set_page_private(pages[i], 0);
> -               mem_cgroup_cancel_charge(pages[i], memcg, false);
> -               put_page(pages[i]);
> -       }
> -       kfree(pages);
> -       goto out;
> -}
> -
>  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
>  {
>         struct vm_area_struct *vma = vmf->vma;
> -       struct page *page = NULL, *new_page;
> -       struct mem_cgroup *memcg;
> +       struct page *page;
>         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -       struct mmu_notifier_range range;
> -       gfp_t huge_gfp;                 /* for allocation and charge */
> -       vm_fault_t ret = 0;
>
>         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>         VM_BUG_ON_VMA(!vma->anon_vma, vma);
> +
>         if (is_huge_zero_pmd(orig_pmd))
> -               goto alloc;
> +               goto fallback;
> +
>         spin_lock(vmf->ptl);
> -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> -               goto out_unlock;
> +
> +       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> +               spin_unlock(vmf->ptl);
> +               return 0;
> +       }
>
>         page = pmd_page(orig_pmd);
>         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> -       /*
> -        * We can only reuse the page if nobody else maps the huge page or it's
> -        * part.
> -        */
> +
> +       /* Lock page for reuse_swap_page() */
>         if (!trylock_page(page)) {
>                 get_page(page);
>                 spin_unlock(vmf->ptl);
>                 lock_page(page);
>                 spin_lock(vmf->ptl);
>                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> +                       spin_unlock(vmf->ptl);
>                         unlock_page(page);
>                         put_page(page);
> -                       goto out_unlock;
> +                       return 0;
>                 }
>                 put_page(page);
>         }
> +
> +       /*
> +        * We can only reuse the page if nobody else maps the huge page or it's
> +        * part.
> +        */
>         if (reuse_swap_page(page, NULL)) {
>                 pmd_t entry;
>                 entry = pmd_mkyoung(orig_pmd);
>                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
>                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -               ret |= VM_FAULT_WRITE;
>                 unlock_page(page);
> -               goto out_unlock;
> -       }
> -       unlock_page(page);
> -       get_page(page);
> -       spin_unlock(vmf->ptl);
> -alloc:
> -       if (__transparent_hugepage_enabled(vma) &&
> -           !transparent_hugepage_debug_cow()) {
> -               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> -               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> -       } else
> -               new_page = NULL;
> -
> -       if (likely(new_page)) {
> -               prep_transhuge_page(new_page);
> -       } else {
> -               if (!page) {
> -                       split_huge_pmd(vma, vmf->pmd, vmf->address);
> -                       ret |= VM_FAULT_FALLBACK;
> -               } else {
> -                       ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
> -                       if (ret & VM_FAULT_OOM) {
> -                               split_huge_pmd(vma, vmf->pmd, vmf->address);
> -                               ret |= VM_FAULT_FALLBACK;
> -                       }
> -                       put_page(page);
> -               }
> -               count_vm_event(THP_FAULT_FALLBACK);
> -               goto out;
> -       }
> -
> -       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
> -                                       huge_gfp, &memcg, true))) {
> -               put_page(new_page);
> -               split_huge_pmd(vma, vmf->pmd, vmf->address);
> -               if (page)
> -                       put_page(page);
> -               ret |= VM_FAULT_FALLBACK;
> -               count_vm_event(THP_FAULT_FALLBACK);
> -               goto out;
> -       }
> -
> -       count_vm_event(THP_FAULT_ALLOC);
> -       count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
> -
> -       if (!page)
> -               clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
> -       else
> -               copy_user_huge_page(new_page, page, vmf->address,
> -                                   vma, HPAGE_PMD_NR);
> -       __SetPageUptodate(new_page);
> -
> -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> -                               haddr, haddr + HPAGE_PMD_SIZE);
> -       mmu_notifier_invalidate_range_start(&range);
> -
> -       spin_lock(vmf->ptl);
> -       if (page)
> -               put_page(page);
> -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
>                 spin_unlock(vmf->ptl);
> -               mem_cgroup_cancel_charge(new_page, memcg, true);
> -               put_page(new_page);
> -               goto out_mn;
> -       } else {
> -               pmd_t entry;
> -               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> -               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -               pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> -               page_add_new_anon_rmap(new_page, vma, haddr, true);
> -               mem_cgroup_commit_charge(new_page, memcg, false, true);
> -               lru_cache_add_active_or_unevictable(new_page, vma);
> -               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> -               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -               if (!page) {
> -                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -               } else {
> -                       VM_BUG_ON_PAGE(!PageHead(page), page);
> -                       page_remove_rmap(page, true);
> -                       put_page(page);
> -               }
> -               ret |= VM_FAULT_WRITE;
> +               return VM_FAULT_WRITE;
>         }
> +
> +       unlock_page(page);
>         spin_unlock(vmf->ptl);
> -out_mn:
> -       /*
> -        * No need to double call mmu_notifier->invalidate_range() callback as
> -        * the above pmdp_huge_clear_flush_notify() did already call it.
> -        */
> -       mmu_notifier_invalidate_range_only_end(&range);
> -out:
> -       return ret;
> -out_unlock:
> -       spin_unlock(vmf->ptl);
> -       return ret;
> +fallback:
> +       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
> +       return VM_FAULT_FALLBACK;
>  }
>
>  /*
> --
> 2.26.0
>
>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-27 17:05 ` [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
  2020-03-27 18:53   ` Yang Shi
  2020-03-27 18:55   ` Zi Yan
@ 2020-03-27 20:45   ` Yang Shi
  2020-03-28  0:40     ` Kirill A. Shutemov
  2 siblings, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-27 20:45 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it handled on compound level. The
> same goes for LRU isolation and putpack.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b47edfe57f7b..c8c2c463095c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>
>  static void release_pte_page(struct page *page)
>  {
> +       /*
> +        * We need to unlock and put compound page on LRU only once.
> +        * The rest of the pages have to be locked and not on LRU here.
> +        */
> +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> +                       (!PageLocked(page) && PageLRU(page)), page);
> +
> +       if (!PageLocked(page))
> +               return;
> +
> +       page = compound_head(page);
>         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
>         unlock_page(page);
>         putback_lru_page(page);

BTW, wouldn't this unlock the whole THP and put it back on the LRU? Then
we may copy the following PTE-mapped pages with the page unlocked and on
the LRU. I don't see a critical problem, just that the pages might be taken
on and off the LRU by others, i.e. vmscan, compaction, migration, etc. But
no one could take the page away since try_to_unmap() would fail; it's just
not very productive.


> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>         pte_t *_pte;
>         int none_or_zero = 0, result = 0, referenced = 0;
>         bool writable = false;
> +       LIST_HEAD(compound_pagelist);
>
>         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>              _pte++, address += PAGE_SIZE) {
> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                         goto out;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
>                 if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out;
> -               }
> +                       struct page *p;
> +                       page = compound_head(page);
>
> -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +                       /*
> +                        * Check if we have dealt with the compount page
> +                        * already
> +                        */
> +                       list_for_each_entry(p, &compound_pagelist, lru) {
> +                               if (page ==  p)
> +                                       break;
> +                       }
> +                       if (page ==  p)
> +                               continue;
> +               }
>
>                 /*
>                  * We can do it before isolate_lru_page because the
> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                     page_is_young(page) || PageReferenced(page) ||
>                     mmu_notifier_test_young(vma->vm_mm, address))
>                         referenced++;
> +
> +               if (PageCompound(page))
> +                       list_add_tail(&page->lru, &compound_pagelist);
>         }
>         if (likely(writable)) {
>                 if (likely(referenced)) {
> @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                         goto out_unmap;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> -               if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out_unmap;
> -               }
> +               page = compound_head(page);
>
>                 /*
>                  * Record which node the original page is from and save this
> --
> 2.26.0
>
>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/7] khugepaged: Allow to callapse a page shared across fork
  2020-03-27 18:19   ` Zi Yan
@ 2020-03-27 21:31     ` Yang Shi
  2020-03-27 21:44       ` Zi Yan
  0 siblings, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-27 21:31 UTC (permalink / raw)
  To: Zi Yan
  Cc: Kirill A. Shutemov, Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 11:20 AM Zi Yan <zi.yan@sent.com> wrote:
>
> On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:
>
> > The page can be included into collapse as long as it doesn't have extra
> > pins (from GUP or otherwise).
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 28 ++++++++++++++++------------
> >  1 file changed, 16 insertions(+), 12 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 39e0994abeb8..b47edfe57f7b 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -581,18 +581,26 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >               }
> >
> >               /*
> > -              * cannot use mapcount: can't collapse if there's a gup pin.
> > -              * The page must only be referenced by the scanned process
> > -              * and page swap cache.
> > +              * Check if the page has any GUP (or other external) pins.
> > +              *
> > +              * The page table that maps the page has been already unlinked
> > +              * from the page table tree and this process cannot get
> > +              * additinal pin on the page.
> > +              *
> > +              * New pins can come later if the page is shared across fork,
> > +              * but not for the this process. It is fine. The other process
> > +              * cannot write to the page, only trigger CoW.
> >                */
> > -             if (page_count(page) != 1 + PageSwapCache(page)) {
> > +             if (total_mapcount(page) + PageSwapCache(page) !=
> > +                             page_count(page)) {
>
> Do you think having a helper function for this check would be better, since the check is used three times?
>
> >                       /*
> >                        * Drain pagevec and retry just in case we can get rid
> >                        * of the extra pin, like in swapin case.
> >                        */
> >                       lru_add_drain();
> >               }
> > -             if (page_count(page) != 1 + PageSwapCache(page)) {
> > +             if (total_mapcount(page) + PageSwapCache(page) !=
> > +                             page_count(page)) {
> >                       unlock_page(page);
> >                       result = SCAN_PAGE_COUNT;
> >                       goto out;
> > @@ -680,7 +688,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> >               } else {
> >                       src_page = pte_page(pteval);
> >                       copy_user_highpage(page, src_page, address, vma);
> > -                     VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>
> Maybe replace it with this?
>
> VM_BUG_ON_PAGE(page_mapcount(src_page) + PageSwapCache(src_page) != page_count(src_page), src_page);

I don't think this is correct either. If a THP is PTE mapped its
refcount would be bumped by the number of PTE mapped subpages. But
page_mapcount() would just return the mapcount of that specific
subpage. So, total_mapcount() should be used, but the same check has
been done before reaching here.
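
A concrete example, ignoring transient pins: after a PMD split, a THP that
is PTE-mapped by a single process has roughly

	page_count(head)       == HPAGE_PMD_NR (512)  one reference per PTE mapping
	page_mapcount(subpage) == 1                   mapcount of that subpage only
	total_mapcount(head)   == HPAGE_PMD_NR (512)  sum over all subpages

so a page_mapcount()-based check fires spuriously, while
total_mapcount(page) + PageSwapCache(page) == page_count(page) still holds.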

>
>
> >                       release_pte_page(src_page);
> >                       /*
> >                        * ptl mostly unnecessary, but preempt has to
> > @@ -1209,12 +1216,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >                       goto out_unmap;
> >               }
> >
> > -             /*
> > -              * cannot use mapcount: can't collapse if there's a gup pin.
> > -              * The page must only be referenced by the scanned process
> > -              * and page swap cache.
> > -              */
> > -             if (page_count(page) != 1 + PageSwapCache(page)) {
> > +             /* Check if the page has any GUP (or other external) pins */
> > +             if (total_mapcount(page) + PageSwapCache(page) !=
> > +                             page_count(page)) {
> >                       result = SCAN_PAGE_COUNT;
> >                       goto out_unmap;
> >               }
> > --
> > 2.26.0
>
> Thanks.
>
> —
> Best Regards,
> Yan Zi


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/7] khugepaged: Allow to callapse a page shared across fork
  2020-03-27 21:31     ` Yang Shi
@ 2020-03-27 21:44       ` Zi Yan
  0 siblings, 0 replies; 38+ messages in thread
From: Zi Yan @ 2020-03-27 21:44 UTC (permalink / raw)
  To: Yang Shi
  Cc: Kirill A. Shutemov, Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

[-- Attachment #1: Type: text/plain, Size: 1508 bytes --]

<snip>
>>>                       /*
>>>                        * Drain pagevec and retry just in case we can get rid
>>>                        * of the extra pin, like in swapin case.
>>>                        */
>>>                       lru_add_drain();
>>>               }
>>> -             if (page_count(page) != 1 + PageSwapCache(page)) {
>>> +             if (total_mapcount(page) + PageSwapCache(page) !=
>>> +                             page_count(page)) {
>>>                       unlock_page(page);
>>>                       result = SCAN_PAGE_COUNT;
>>>                       goto out;
>>> @@ -680,7 +688,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>>>               } else {
>>>                       src_page = pte_page(pteval);
>>>                       copy_user_highpage(page, src_page, address, vma);
>>> -                     VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>>
>> Maybe replace it with this?
>>
>> VM_BUG_ON_PAGE(page_mapcount(src_page) + PageSwapCache(src_page) != page_count(src_page), src_page);
>
> I don't think this is correct either. If a THP is PTE mapped its
> refcount would be bumped by the number of PTE mapped subpages. But
> page_mapcount() would just return the mapcount of that specific
> subpage. So, total_mapcount() should be used, but the same check has
> been done before reaching here.

Yes, you are right. Thanks.

Please disregard this comment.

—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins
  2020-03-27 17:34   ` Zi Yan
@ 2020-03-28  0:20     ` Kirill A. Shutemov
  0 siblings, 0 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28  0:20 UTC (permalink / raw)
  To: Zi Yan; +Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 01:34:20PM -0400, Zi Yan wrote:
> On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:
> 
> > __collapse_huge_page_isolate() may fail due to extra pin in the LRU add
> > pagevec. It's petty common for swapin case: we swap in pages just to
> > fail due to the extra pin.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 14d7afc90786..39e0994abeb8 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -585,11 +585,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  		 * The page must only be referenced by the scanned process
> >  		 * and page swap cache.
> >  		 */
> > +		if (page_count(page) != 1 + PageSwapCache(page)) {
> > +			/*
> > +			 * Drain pagevec and retry just in case we can get rid
> > +			 * of the extra pin, like in swapin case.
> > +			 */
> > +			lru_add_drain();
> > +		}
> >  		if (page_count(page) != 1 + PageSwapCache(page)) {
> >  			unlock_page(page);
> >  			result = SCAN_PAGE_COUNT;
> >  			goto out;
> >  		}
> > +
> >  		if (pte_write(pteval)) {
> >  			writable = true;
> >  		} else {
> > -- 
> > 2.26.0
> 
> Looks good to me. Is the added empty line intentional?

Yes. It groups try and retry together.

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-27 18:53   ` Yang Shi
@ 2020-03-28  0:34     ` Kirill A. Shutemov
  2020-03-28  1:09       ` Yang Shi
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28  0:34 UTC (permalink / raw)
  To: Yang Shi
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> >
> > We can collapse PTE-mapped compound pages. We only need to avoid
> > handling them more than once: lock/unlock page only once if it's present
> > in the PMD range multiple times as it handled on compound level. The
> > same goes for LRU isolation and putpack.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> >  1 file changed, 31 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index b47edfe57f7b..c8c2c463095c 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> >
> >  static void release_pte_page(struct page *page)
> >  {
> > +       /*
> > +        * We need to unlock and put compound page on LRU only once.
> > +        * The rest of the pages have to be locked and not on LRU here.
> > +        */
> > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > +                       (!PageLocked(page) && PageLRU(page)), page);
> > +
> > +       if (!PageLocked(page))
> > +               return;
> > +
> > +       page = compound_head(page);
> >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> 
> We need to count in base page units here. The same counter is
> modified by vmscan in base page units.

Is it though? Where?

> The inc path needs the same change.

Done already.

> >         unlock_page(page);
> >         putback_lru_page(page);
> > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >         pte_t *_pte;
> >         int none_or_zero = 0, result = 0, referenced = 0;
> >         bool writable = false;
> > +       LIST_HEAD(compound_pagelist);
> >
> >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> >              _pte++, address += PAGE_SIZE) {
> > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                         goto out;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +
> >                 if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out;
> > -               }
> > +                       struct page *p;
> > +                       page = compound_head(page);
> >
> > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +                       /*
> > +                        * Check if we have dealt with the compount page
> 
> s/compount/compound

Thanks.

> > +                        * already
> > +                        */
> > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > +                               if (page ==  p)
> > +                                       break;
> > +                       }
> > +                       if (page ==  p)
> > +                               continue;
> 
> I don't quite understand why we need the above check. My understanding
> is that when we scan the PTEs, once the first PTE-mapped subpage is found
> the THP is added to compound_pagelist, and later iterations would find the
> same THP on the list and just break out of the list walk. Did I miss
> anything?

We skip the iteration and look at the next pte. We've already isolated the
page. Nothing to do here.

> > +               }
> >
> >                 /*
> >                  * We can do it before isolate_lru_page because the
> > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                     page_is_young(page) || PageReferenced(page) ||
> >                     mmu_notifier_test_young(vma->vm_mm, address))
> >                         referenced++;
> > +
> > +               if (PageCompound(page))
> > +                       list_add_tail(&page->lru, &compound_pagelist);
> >         }
> >         if (likely(writable)) {
> >                 if (likely(referenced)) {
> > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >                         goto out_unmap;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > -               if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out_unmap;
> > -               }
> > +               page = compound_head(page);
> >
> >                 /*
> >                  * Record which node the original page is from and save this
> > --
> > 2.26.0
> >
> >

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-27 18:55   ` Zi Yan
@ 2020-03-28  0:39     ` Kirill A. Shutemov
  2020-03-28  1:17       ` Zi Yan
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28  0:39 UTC (permalink / raw)
  To: Zi Yan; +Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 02:55:06PM -0400, Zi Yan wrote:
> On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:
> 
> > We can collapse PTE-mapped compound pages. We only need to avoid
> > handling them more than once: lock/unlock page only once if it's present
> > in the PMD range multiple times as it handled on compound level. The
> > same goes for LRU isolation and putpack.
> 
> s/putpack/putback/
> 
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> >  1 file changed, 31 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index b47edfe57f7b..c8c2c463095c 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> >
> >  static void release_pte_page(struct page *page)
> >  {
> > +	/*
> > +	 * We need to unlock and put compound page on LRU only once.
> > +	 * The rest of the pages have to be locked and not on LRU here.
> > +	 */
> > +	VM_BUG_ON_PAGE(!PageCompound(page) &&
> > +			(!PageLocked(page) && PageLRU(page)), page);
> It only checks base pages.

That's the point.

> Add
> 
> VM_BUG_ON_PAGE(PageCompound(page) && PageLocked(page) && PageLRU(page), page);
> 
> to check for compound pages.

The compound page may be locked here if the function is called for the
first time for the page, and not locked after that (because we unlocked it
when we saw it the first time). The same goes for LRU.

> > +
> > +	if (!PageLocked(page))
> > +		return;
> > +
> > +	page = compound_head(page);
> >  	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> >  	unlock_page(page);
> >  	putback_lru_page(page);
> > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  	pte_t *_pte;
> >  	int none_or_zero = 0, result = 0, referenced = 0;
> >  	bool writable = false;
> > +	LIST_HEAD(compound_pagelist);
> >
> >  	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> >  	     _pte++, address += PAGE_SIZE) {
> > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  			goto out;
> >  		}
> >
> > -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> > +		VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +
> >  		if (PageCompound(page)) {
> > -			result = SCAN_PAGE_COMPOUND;
> > -			goto out;
> > -		}
> > +			struct page *p;
> > +			page = compound_head(page);
> >
> > -		VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +			/*
> > +			 * Check if we have dealt with the compount page
> 
> s/compount/compound/
> 

Thanks.

> > +			 * already
> > +			 */
> > +			list_for_each_entry(p, &compound_pagelist, lru) {
> > +				if (page ==  p)
> > +					break;
> > +			}
> > +			if (page ==  p)
> > +				continue;
> > +		}
> >
> >  		/*
> >  		 * We can do it before isolate_lru_page because the
> > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  		    page_is_young(page) || PageReferenced(page) ||
> >  		    mmu_notifier_test_young(vma->vm_mm, address))
> >  			referenced++;
> > +
> > +		if (PageCompound(page))
> > +			list_add_tail(&page->lru, &compound_pagelist);
> >  	}
> >  	if (likely(writable)) {
> >  		if (likely(referenced)) {
> 
> Do we need a list here? There should be at most one compound page we will see here, right?

Um? It's outside the pte loop. We get here once per PMD range.

The 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
it's just the last page handled in the loop.

> 
> If a compound page is seen here, can we bail out of the loop early? I guess not,
> because we can have a partially mapped compound page at the beginning or the end of a VMA, right?
> 
> > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >  			goto out_unmap;
> >  		}
> >
> > -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> > -		if (PageCompound(page)) {
> > -			result = SCAN_PAGE_COMPOUND;
> > -			goto out_unmap;
> > -		}
> > +		page = compound_head(page);
> >
> >  		/*
> >  		 * Record which node the original page is from and save this
> > -- 
> > 2.26.0
> 
> 
> —
> Best Regards,
> Yan Zi



-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-27 20:45   ` Yang Shi
@ 2020-03-28  0:40     ` Kirill A. Shutemov
  2020-03-28  1:12       ` Yang Shi
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28  0:40 UTC (permalink / raw)
  To: Yang Shi
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 01:45:55PM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> >
> > We can collapse PTE-mapped compound pages. We only need to avoid
> > handling them more than once: lock/unlock page only once if it's present
> > in the PMD range multiple times as it handled on compound level. The
> > same goes for LRU isolation and putpack.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> >  1 file changed, 31 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index b47edfe57f7b..c8c2c463095c 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> >
> >  static void release_pte_page(struct page *page)
> >  {
> > +       /*
> > +        * We need to unlock and put compound page on LRU only once.
> > +        * The rest of the pages have to be locked and not on LRU here.
> > +        */
> > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > +                       (!PageLocked(page) && PageLRU(page)), page);
> > +
> > +       if (!PageLocked(page))
> > +               return;
> > +
> > +       page = compound_head(page);
> >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> >         unlock_page(page);
> >         putback_lru_page(page);
> 
> BTW, wouldn't this unlock the whole THP and put it back on the LRU?

It is the intention.

> Then we may copy the following PTE-mapped pages with the page unlocked
> and on the LRU. I don't see a critical problem, just that the pages might be
> taken on and off the LRU by others, i.e. vmscan, compaction, migration, etc.
> But no one could take the page away since try_to_unmap() would fail; it's
> just not very productive.
> 
> 
> > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >         pte_t *_pte;
> >         int none_or_zero = 0, result = 0, referenced = 0;
> >         bool writable = false;
> > +       LIST_HEAD(compound_pagelist);
> >
> >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> >              _pte++, address += PAGE_SIZE) {
> > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                         goto out;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +
> >                 if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out;
> > -               }
> > +                       struct page *p;
> > +                       page = compound_head(page);
> >
> > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +                       /*
> > +                        * Check if we have dealt with the compount page
> > +                        * already
> > +                        */
> > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > +                               if (page ==  p)
> > +                                       break;
> > +                       }
> > +                       if (page ==  p)
> > +                               continue;
> > +               }
> >
> >                 /*
> >                  * We can do it before isolate_lru_page because the
> > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                     page_is_young(page) || PageReferenced(page) ||
> >                     mmu_notifier_test_young(vma->vm_mm, address))
> >                         referenced++;
> > +
> > +               if (PageCompound(page))
> > +                       list_add_tail(&page->lru, &compound_pagelist);
> >         }
> >         if (likely(writable)) {
> >                 if (likely(referenced)) {
> > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >                         goto out_unmap;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > -               if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out_unmap;
> > -               }
> > +               page = compound_head(page);
> >
> >                 /*
> >                  * Record which node the original page is from and save this
> > --
> > 2.26.0
> >
> >

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 6/7] thp: Change CoW semantics for anon-THP
  2020-03-27 20:07   ` Yang Shi
@ 2020-03-28  0:43     ` Kirill A. Shutemov
  2020-03-28  1:30       ` Yang Shi
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28  0:43 UTC (permalink / raw)
  To: Yang Shi
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 01:07:34PM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> >
> > Currently we have different copy-on-write semantics for anon- and
> > file-THP. For anon-THP we try to allocate huge page on the write fault,
> > but on file-THP we split PMD and allocate 4k page.
> 
> I agree this seems confusing.
> 
> >
> > Arguably, file-THP semantics is more desirable: we don't necessary want
> > to unshare full PMD range from the parent on the first access. This is
> > the primary reason THP is unusable for some workloads, like Redis.
> >
> > The original THP refcounting didn't allow to have PTE-mapped compound
> > pages, so we had no options, but to allocate huge page on CoW (with
> > fallback to 512 4k pages).
> >
> > The current refcounting doesn't have such limitations and we can cut a
> > lot of complex code out of fault path.
> >
> > khugepaged is now able to recover THP from such ranges if the
> > configuration allows.
> 
> It looks this patch would just split the PMD then fallback to handle
> it on PTE level, it definitely simplify the code a lot. However it
> makes the use of THP depend on the productivity of khugepaged. And the
> success rate of THP allocation in khugepaged depends on defrag. But by
> default khugepaged runs at very low priority, so khugepaged defrag may
> result in priority inversion easily.

If you have a workload that may be hurt by such a change, please get some
numbers. It would be interesting to see.

> For example we saw THP split in reclaim path triggered by khugepaged
> held write anon_vma lock, but the other process which was doing
> reclaim too was blocked by anon_vma lock. Then the cond_resched() in
> rmap walk would make khugepaged preempted by all other processes
> easily so khugepaged may hold the anon_vma lock for indefinite time.
> Then we saw hung tasks.

Any chance you have a reproducer? I'm not sure I follow the problem, but
it's almost 4AM, so I'm slow.

> So we have khugepaged defrag disabled for some workloads to workaround
> the priority inversion problem. So, I'm concerned some processes may
> never get THP for some our workloads with this patch applied.
> 
> The priority inversion caused by khugepaged was not very usual. Just
> my two cents.
> 
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/huge_memory.c | 247 +++++------------------------------------------
> >  1 file changed, 24 insertions(+), 223 deletions(-)
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index ef6a6bcb291f..15b7a9c86b7c 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
> >         spin_unlock(vmf->ptl);
> >  }
> >
> > -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> > -                       pmd_t orig_pmd, struct page *page)
> > -{
> > -       struct vm_area_struct *vma = vmf->vma;
> > -       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > -       struct mem_cgroup *memcg;
> > -       pgtable_t pgtable;
> > -       pmd_t _pmd;
> > -       int i;
> > -       vm_fault_t ret = 0;
> > -       struct page **pages;
> > -       struct mmu_notifier_range range;
> > -
> > -       pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
> > -                             GFP_KERNEL);
> > -       if (unlikely(!pages)) {
> > -               ret |= VM_FAULT_OOM;
> > -               goto out;
> > -       }
> > -
> > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > -               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
> > -                                              vmf->address, page_to_nid(page));
> > -               if (unlikely(!pages[i] ||
> > -                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
> > -                                    GFP_KERNEL, &memcg, false))) {
> > -                       if (pages[i])
> > -                               put_page(pages[i]);
> > -                       while (--i >= 0) {
> > -                               memcg = (void *)page_private(pages[i]);
> > -                               set_page_private(pages[i], 0);
> > -                               mem_cgroup_cancel_charge(pages[i], memcg,
> > -                                               false);
> > -                               put_page(pages[i]);
> > -                       }
> > -                       kfree(pages);
> > -                       ret |= VM_FAULT_OOM;
> > -                       goto out;
> > -               }
> > -               set_page_private(pages[i], (unsigned long)memcg);
> > -       }
> > -
> > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > -               copy_user_highpage(pages[i], page + i,
> > -                                  haddr + PAGE_SIZE * i, vma);
> > -               __SetPageUptodate(pages[i]);
> > -               cond_resched();
> > -       }
> > -
> > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > -       mmu_notifier_invalidate_range_start(&range);
> > -
> > -       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > -               goto out_free_pages;
> > -       VM_BUG_ON_PAGE(!PageHead(page), page);
> > -
> > -       /*
> > -        * Leave pmd empty until pte is filled note we must notify here as
> > -        * concurrent CPU thread might write to new page before the call to
> > -        * mmu_notifier_invalidate_range_end() happens which can lead to a
> > -        * device seeing memory write in different order than CPU.
> > -        *
> > -        * See Documentation/vm/mmu_notifier.rst
> > -        */
> > -       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > -
> > -       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
> > -       pmd_populate(vma->vm_mm, &_pmd, pgtable);
> > -
> > -       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> > -               pte_t entry;
> > -               entry = mk_pte(pages[i], vma->vm_page_prot);
> > -               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> > -               memcg = (void *)page_private(pages[i]);
> > -               set_page_private(pages[i], 0);
> > -               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> > -               mem_cgroup_commit_charge(pages[i], memcg, false, false);
> > -               lru_cache_add_active_or_unevictable(pages[i], vma);
> > -               vmf->pte = pte_offset_map(&_pmd, haddr);
> > -               VM_BUG_ON(!pte_none(*vmf->pte));
> > -               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
> > -               pte_unmap(vmf->pte);
> > -       }
> > -       kfree(pages);
> > -
> > -       smp_wmb(); /* make pte visible before pmd */
> > -       pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
> > -       page_remove_rmap(page, true);
> > -       spin_unlock(vmf->ptl);
> > -
> > -       /*
> > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > -        */
> > -       mmu_notifier_invalidate_range_only_end(&range);
> > -
> > -       ret |= VM_FAULT_WRITE;
> > -       put_page(page);
> > -
> > -out:
> > -       return ret;
> > -
> > -out_free_pages:
> > -       spin_unlock(vmf->ptl);
> > -       mmu_notifier_invalidate_range_end(&range);
> > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > -               memcg = (void *)page_private(pages[i]);
> > -               set_page_private(pages[i], 0);
> > -               mem_cgroup_cancel_charge(pages[i], memcg, false);
> > -               put_page(pages[i]);
> > -       }
> > -       kfree(pages);
> > -       goto out;
> > -}
> > -
> >  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
> >  {
> >         struct vm_area_struct *vma = vmf->vma;
> > -       struct page *page = NULL, *new_page;
> > -       struct mem_cgroup *memcg;
> > +       struct page *page;
> >         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > -       struct mmu_notifier_range range;
> > -       gfp_t huge_gfp;                 /* for allocation and charge */
> > -       vm_fault_t ret = 0;
> >
> >         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
> >         VM_BUG_ON_VMA(!vma->anon_vma, vma);
> > +
> >         if (is_huge_zero_pmd(orig_pmd))
> > -               goto alloc;
> > +               goto fallback;
> > +
> >         spin_lock(vmf->ptl);
> > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > -               goto out_unlock;
> > +
> > +       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > +               spin_unlock(vmf->ptl);
> > +               return 0;
> > +       }
> >
> >         page = pmd_page(orig_pmd);
> >         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> > -       /*
> > -        * We can only reuse the page if nobody else maps the huge page or it's
> > -        * part.
> > -        */
> > +
> > +       /* Lock page for reuse_swap_page() */
> >         if (!trylock_page(page)) {
> >                 get_page(page);
> >                 spin_unlock(vmf->ptl);
> >                 lock_page(page);
> >                 spin_lock(vmf->ptl);
> >                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > +                       spin_unlock(vmf->ptl);
> >                         unlock_page(page);
> >                         put_page(page);
> > -                       goto out_unlock;
> > +                       return 0;
> >                 }
> >                 put_page(page);
> >         }
> > +
> > +       /*
> > +        * We can only reuse the page if nobody else maps the huge page or it's
> > +        * part.
> > +        */
> >         if (reuse_swap_page(page, NULL)) {
> >                 pmd_t entry;
> >                 entry = pmd_mkyoung(orig_pmd);
> >                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> >                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
> >                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > -               ret |= VM_FAULT_WRITE;
> >                 unlock_page(page);
> > -               goto out_unlock;
> > -       }
> > -       unlock_page(page);
> > -       get_page(page);
> > -       spin_unlock(vmf->ptl);
> > -alloc:
> > -       if (__transparent_hugepage_enabled(vma) &&
> > -           !transparent_hugepage_debug_cow()) {
> > -               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> > -               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> > -       } else
> > -               new_page = NULL;
> > -
> > -       if (likely(new_page)) {
> > -               prep_transhuge_page(new_page);
> > -       } else {
> > -               if (!page) {
> > -                       split_huge_pmd(vma, vmf->pmd, vmf->address);
> > -                       ret |= VM_FAULT_FALLBACK;
> > -               } else {
> > -                       ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
> > -                       if (ret & VM_FAULT_OOM) {
> > -                               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > -                               ret |= VM_FAULT_FALLBACK;
> > -                       }
> > -                       put_page(page);
> > -               }
> > -               count_vm_event(THP_FAULT_FALLBACK);
> > -               goto out;
> > -       }
> > -
> > -       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
> > -                                       huge_gfp, &memcg, true))) {
> > -               put_page(new_page);
> > -               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > -               if (page)
> > -                       put_page(page);
> > -               ret |= VM_FAULT_FALLBACK;
> > -               count_vm_event(THP_FAULT_FALLBACK);
> > -               goto out;
> > -       }
> > -
> > -       count_vm_event(THP_FAULT_ALLOC);
> > -       count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
> > -
> > -       if (!page)
> > -               clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
> > -       else
> > -               copy_user_huge_page(new_page, page, vmf->address,
> > -                                   vma, HPAGE_PMD_NR);
> > -       __SetPageUptodate(new_page);
> > -
> > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > -       mmu_notifier_invalidate_range_start(&range);
> > -
> > -       spin_lock(vmf->ptl);
> > -       if (page)
> > -               put_page(page);
> > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> >                 spin_unlock(vmf->ptl);
> > -               mem_cgroup_cancel_charge(new_page, memcg, true);
> > -               put_page(new_page);
> > -               goto out_mn;
> > -       } else {
> > -               pmd_t entry;
> > -               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> > -               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > -               pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > -               page_add_new_anon_rmap(new_page, vma, haddr, true);
> > -               mem_cgroup_commit_charge(new_page, memcg, false, true);
> > -               lru_cache_add_active_or_unevictable(new_page, vma);
> > -               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> > -               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > -               if (!page) {
> > -                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > -               } else {
> > -                       VM_BUG_ON_PAGE(!PageHead(page), page);
> > -                       page_remove_rmap(page, true);
> > -                       put_page(page);
> > -               }
> > -               ret |= VM_FAULT_WRITE;
> > +               return VM_FAULT_WRITE;
> >         }
> > +
> > +       unlock_page(page);
> >         spin_unlock(vmf->ptl);
> > -out_mn:
> > -       /*
> > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > -        */
> > -       mmu_notifier_invalidate_range_only_end(&range);
> > -out:
> > -       return ret;
> > -out_unlock:
> > -       spin_unlock(vmf->ptl);
> > -       return ret;
> > +fallback:
> > +       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
> > +       return VM_FAULT_FALLBACK;
> >  }
> >
> >  /*
> > --
> > 2.26.0
> >
> >

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28  0:34     ` Kirill A. Shutemov
@ 2020-03-28  1:09       ` Yang Shi
  2020-03-28 12:27         ` Kirill A. Shutemov
  0 siblings, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-28  1:09 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 5:34 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > >
> > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > handling them more than once: lock/unlock page only once if it's present
> > > in the PMD range multiple times as it handled on compound level. The
> > > same goes for LRU isolation and putpack.
> > >
> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > ---
> > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index b47edfe57f7b..c8c2c463095c 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > >
> > >  static void release_pte_page(struct page *page)
> > >  {
> > > +       /*
> > > +        * We need to unlock and put compound page on LRU only once.
> > > +        * The rest of the pages have to be locked and not on LRU here.
> > > +        */
> > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > +
> > > +       if (!PageLocked(page))
> > > +               return;
> > > +
> > > +       page = compound_head(page);
> > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> >
> > We need count in the number of base pages. The same counter is
> > modified by vmscan in base page unit.
>
> Is it though? Where?

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken) in
vmscan.c, where nr_taken is compound_nr(page), so for a THP the
number would be 512.

So in both the inc and dec paths of collapsing a PTE-mapped THP, we
should mod by compound_nr(page) too.
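
A minimal sketch of what the dec side would look like if it accounted in
base-page units (an illustration of the suggestion only, not code from this
series; the patch's VM_BUG_ON is omitted for brevity, and it is assumed that
mod_node_page_state() and compound_nr() are usable here):

	static void release_pte_page(struct page *page)
	{
		if (!PageLocked(page))
			return;		/* compound page already released once */

		page = compound_head(page);
		/*
		 * Subtract all base pages of the (possibly compound) page,
		 * matching the base-page units vmscan uses for NR_ISOLATED_*.
		 */
		mod_node_page_state(page_pgdat(page),
				    NR_ISOLATED_ANON + page_is_file_cache(page),
				    -(long)compound_nr(page));
		unlock_page(page);
		putback_lru_page(page);
	}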

>
> > Also need modify the inc path.
>
> Done already.
>
> > >         unlock_page(page);
> > >         putback_lru_page(page);
> > > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >         pte_t *_pte;
> > >         int none_or_zero = 0, result = 0, referenced = 0;
> > >         bool writable = false;
> > > +       LIST_HEAD(compound_pagelist);
> > >
> > >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> > >              _pte++, address += PAGE_SIZE) {
> > > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                         goto out;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +
> > >                 if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out;
> > > -               }
> > > +                       struct page *p;
> > > +                       page = compound_head(page);
> > >
> > > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +                       /*
> > > +                        * Check if we have dealt with the compount page
> >
> > s/compount/compound
>
> Thanks.
>
> > > +                        * already
> > > +                        */
> > > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > > +                               if (page ==  p)
> > > +                                       break;
> > > +                       }
> > > +                       if (page ==  p)
> > > +                               continue;
> >
> > I don't quite understand why we need the above check. My understanding
> > is when we scan the ptes, once the first PTE mapped subpage is found,
> > then the THP would be added into compound_pagelist, then the later
> > loop would find the same THP on the list then just break the loop. Did
> > I miss anything?
>
> We skip the iteration and look at the next pte. We've already isolated the
> page. Nothing to do here.

I got your point. Thanks.

>
> > > +               }
> > >
> > >                 /*
> > >                  * We can do it before isolate_lru_page because the
> > > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                     page_is_young(page) || PageReferenced(page) ||
> > >                     mmu_notifier_test_young(vma->vm_mm, address))
> > >                         referenced++;
> > > +
> > > +               if (PageCompound(page))
> > > +                       list_add_tail(&page->lru, &compound_pagelist);
> > >         }
> > >         if (likely(writable)) {
> > >                 if (likely(referenced)) {
> > > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> > >                         goto out_unmap;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > -               if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out_unmap;
> > > -               }
> > > +               page = compound_head(page);
> > >
> > >                 /*
> > >                  * Record which node the original page is from and save this
> > > --
> > > 2.26.0
> > >
> > >
>
> --
>  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28  0:40     ` Kirill A. Shutemov
@ 2020-03-28  1:12       ` Yang Shi
  0 siblings, 0 replies; 38+ messages in thread
From: Yang Shi @ 2020-03-28  1:12 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 5:40 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 01:45:55PM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > >
> > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > handling them more than once: lock/unlock page only once if it's present
> > > in the PMD range multiple times as it handled on compound level. The
> > > same goes for LRU isolation and putpack.
> > >
> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > ---
> > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index b47edfe57f7b..c8c2c463095c 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > >
> > >  static void release_pte_page(struct page *page)
> > >  {
> > > +       /*
> > > +        * We need to unlock and put compound page on LRU only once.
> > > +        * The rest of the pages have to be locked and not on LRU here.
> > > +        */
> > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > +
> > > +       if (!PageLocked(page))
> > > +               return;
> > > +
> > > +       page = compound_head(page);
> > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > >         unlock_page(page);
> > >         putback_lru_page(page);
> >
> > BTW, wouldn't this unlock the whole THP and put it back to LRU?
>
> It is the intention.

Yes, understood. Consider the case below:

Subpages 0, 1, 2 and 3 are PTE-mapped. Once subpage 0 is copied,
release_pte_page() would be called and the whole THP would be unlocked
and put back on the LRU. The loop would then iterate over subpages 1, 2
and 3, but by that point the page is already unlocked and on the LRU.
Is that intentional?

>
> > Then we may copy the following PTE mapped pages with page unlocked and
> > on LRU. I don't see critical problem, just the pages might be on and off
> > LRU by others, i.e. vmscan, compaction, migration, etc. But no one could
> > take the page away since try_to_unmap() would fail, but not very
> > productive.
> >
> >
> > > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >         pte_t *_pte;
> > >         int none_or_zero = 0, result = 0, referenced = 0;
> > >         bool writable = false;
> > > +       LIST_HEAD(compound_pagelist);
> > >
> > >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> > >              _pte++, address += PAGE_SIZE) {
> > > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                         goto out;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +
> > >                 if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out;
> > > -               }
> > > +                       struct page *p;
> > > +                       page = compound_head(page);
> > >
> > > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +                       /*
> > > +                        * Check if we have dealt with the compount page
> > > +                        * already
> > > +                        */
> > > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > > +                               if (page ==  p)
> > > +                                       break;
> > > +                       }
> > > +                       if (page ==  p)
> > > +                               continue;
> > > +               }
> > >
> > >                 /*
> > >                  * We can do it before isolate_lru_page because the
> > > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                     page_is_young(page) || PageReferenced(page) ||
> > >                     mmu_notifier_test_young(vma->vm_mm, address))
> > >                         referenced++;
> > > +
> > > +               if (PageCompound(page))
> > > +                       list_add_tail(&page->lru, &compound_pagelist);
> > >         }
> > >         if (likely(writable)) {
> > >                 if (likely(referenced)) {
> > > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> > >                         goto out_unmap;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > -               if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out_unmap;
> > > -               }
> > > +               page = compound_head(page);
> > >
> > >                 /*
> > >                  * Record which node the original page is from and save this
> > > --
> > > 2.26.0
> > >
> > >
>
> --
>  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28  0:39     ` Kirill A. Shutemov
@ 2020-03-28  1:17       ` Zi Yan
  2020-03-28 12:33         ` Kirill A. Shutemov
  0 siblings, 1 reply; 38+ messages in thread
From: Zi Yan @ 2020-03-28  1:17 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov

On 27 Mar 2020, at 20:39, Kirill A. Shutemov wrote:

> On Fri, Mar 27, 2020 at 02:55:06PM -0400, Zi Yan wrote:
>> On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:
>>
>>> We can collapse PTE-mapped compound pages. We only need to avoid
>>> handling them more than once: lock/unlock page only once if it's present
>>> in the PMD range multiple times as it handled on compound level. The
>>> same goes for LRU isolation and putpack.
>>
>> s/putpack/putback/
>>
>>>
>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>> ---
>>>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>>>  1 file changed, 31 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index b47edfe57f7b..c8c2c463095c 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>>>
>>>  static void release_pte_page(struct page *page)
>>>  {
>>> +   /*
>>> +    * We need to unlock and put compound page on LRU only once.
>>> +    * The rest of the pages have to be locked and not on LRU here.
>>> +    */
>>> +   VM_BUG_ON_PAGE(!PageCompound(page) &&
>>> +                   (!PageLocked(page) && PageLRU(page)), page);
>> It only checks base pages.
>
> That's the point.
>
>> Add
>>
>> VM_BUG_ON_PAGE(PageCompound(page) && PageLocked(page) && PageLRU(page), page);
>>
>> to check for compound pages.
>
> The compound page may be locked here if the function is called for the
> first time for the page, and not locked after that (because we unlocked
> it when we saw it the first time). The same goes for LRU.
>

For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
For the second time and so on, the compound page is unlocked and on the LRU,
so this VM_BUG_ON still passes.

For base page, VM_BUG_ON passes.

The other, unexpected situation (a compound page that is locked and on LRU) triggers that VM_BUG_ON,
but your VM_BUG_ON will not detect this situation, right?



>>> +
>>> +   if (!PageLocked(page))
>>> +           return;
>>> +
>>> +   page = compound_head(page);
>>>     dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
>>>     unlock_page(page);
>>>     putback_lru_page(page);
>>> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>     pte_t *_pte;
>>>     int none_or_zero = 0, result = 0, referenced = 0;
>>>     bool writable = false;
>>> +   LIST_HEAD(compound_pagelist);
>>>
>>>     for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>>>          _pte++, address += PAGE_SIZE) {
>>> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>                     goto out;
>>>             }
>>>
>>> -           /* TODO: teach khugepaged to collapse THP mapped with pte */
>>> +           VM_BUG_ON_PAGE(!PageAnon(page), page);
>>> +
>>>             if (PageCompound(page)) {
>>> -                   result = SCAN_PAGE_COMPOUND;
>>> -                   goto out;
>>> -           }
>>> +                   struct page *p;
>>> +                   page = compound_head(page);
>>>
>>> -           VM_BUG_ON_PAGE(!PageAnon(page), page);
>>> +                   /*
>>> +                    * Check if we have dealt with the compount page
>>
>> s/compount/compound/
>>
>
> Thanks.
>
>>> +                    * already
>>> +                    */
>>> +                   list_for_each_entry(p, &compound_pagelist, lru) {
>>> +                           if (page ==  p)
>>> +                                   break;
>>> +                   }
>>> +                   if (page ==  p)
>>> +                           continue;
>>> +           }
>>>
>>>             /*
>>>              * We can do it before isolate_lru_page because the
>>> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>                 page_is_young(page) || PageReferenced(page) ||
>>>                 mmu_notifier_test_young(vma->vm_mm, address))
>>>                     referenced++;
>>> +
>>> +           if (PageCompound(page))
>>> +                   list_add_tail(&page->lru, &compound_pagelist);
>>>     }
>>>     if (likely(writable)) {
>>>             if (likely(referenced)) {
>>
>> Do we need a list here? There should be at most one compound page we will see here, right?
>
> Um? It's outside the pte loop. We get here once per PMD range.
>
> 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> it's just the last page handled in the loop.
>

Throughout the pte loop, we should only see at most one compound page, right?

If that is the case, we do not need a list to store that single compound page
but can use a struct page pointer that is initialized to NULL and later assigned
to the discovered compound page, right? I am not saying I want to change the code
in this way; I am just trying to make sure I understand the code correctly.
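
For concreteness, a sketch of that alternative (an illustration only, not
from the posted patch; 'seen_compound' is a hypothetical local, and the
whole thing is valid only if at most one compound page can appear in the
PMD range, which is exactly the open question):

	/*
	 * Hypothetical variant of the pte loop in
	 * __collapse_huge_page_isolate(): track a single compound head
	 * instead of keeping compound_pagelist.
	 */
	struct page *seen_compound = NULL;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		struct page *page = vm_normal_page(vma, address, *_pte);

		if (page && PageCompound(page)) {
			page = compound_head(page);
			if (page == seen_compound)
				continue;	/* head already isolated */
			seen_compound = page;
		}
		/* ... lock, isolate and reference-check the page as before ... */
	}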

Thanks.

—
Best Regards,
Yan Zi


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 6/7] thp: Change CoW semantics for anon-THP
  2020-03-28  0:43     ` Kirill A. Shutemov
@ 2020-03-28  1:30       ` Yang Shi
  0 siblings, 0 replies; 38+ messages in thread
From: Yang Shi @ 2020-03-28  1:30 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 5:43 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 01:07:34PM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > >
> > > Currently we have different copy-on-write semantics for anon- and
> > > file-THP. For anon-THP we try to allocate huge page on the write fault,
> > > but on file-THP we split PMD and allocate 4k page.
> >
> > I agree this seems confusing.
> >
> > >
> > > Arguably, file-THP semantics is more desirable: we don't necessary want
> > > to unshare full PMD range from the parent on the first access. This is
> > > the primary reason THP is unusable for some workloads, like Redis.
> > >
> > > The original THP refcounting didn't allow to have PTE-mapped compound
> > > pages, so we had no options, but to allocate huge page on CoW (with
> > > fallback to 512 4k pages).
> > >
> > > The current refcounting doesn't have such limitations and we can cut a
> > > lot of complex code out of fault path.
> > >
> > > khugepaged is now able to recover THP from such ranges if the
> > > configuration allows.
> >
> > It looks this patch would just split the PMD then fallback to handle
> > it on PTE level, it definitely simplify the code a lot. However it
> > makes the use of THP depend on the productivity of khugepaged. And the
> > success rate of THP allocation in khugepaged depends on defrag. But by
> > default khugepaged runs at very low priority, so khugepaged defrag may
> > result in priority inversion easily.
>
> If you have a workload that may be hurt by such a change, please get some
> numbers. It would be interesting to see.

Please see the below splat:

[ 7417.871422] INFO: task /home/staragent:9088 blocked for more than
1200 seconds.
[ 7417.880501]       Tainted: G        W         4.19.91 #1
[ 7417.889015] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[ 7417.897373] /home/staragent D    0  9088      1 0x00000000
[ 7417.905017] Call Trace:
[ 7417.907898]  ? __schedule+0x3cf/0x660
[ 7417.911988]  ? core_sys_select+0x1d2/0x270
[ 7417.916562]  schedule+0x33/0xc0
[ 7417.920205]  rwsem_down_read_failed+0x130/0x190
[ 7417.925153]  ? call_rwsem_down_read_failed+0x14/0x30
[ 7417.930648]  call_rwsem_down_read_failed+0x14/0x30
[ 7417.935894]  down_read+0x1c/0x30
[ 7417.939603]  __do_page_fault+0x435/0x4d0
[ 7417.943984]  do_page_fault+0x32/0x110
[ 7417.949315]  ? page_fault+0x8/0x30
[ 7417.953189]  page_fault+0x1e/0x30
[ 7417.956974] RIP: 0033:0x8063b0
[ 7417.960582] Code: Bad RIP value.

***  the task got current->mm->mmap_sem: 0xffff88a65ed02f80
crash> bt ffff88a65ed02f80
PID: 102341  TASK: ffff88a65ed02f80  CPU: 64  COMMAND: "/home/staragent"
 #0 [ffffc90021bd73e8] __schedule at ffffffff818436af
 #1 [ffffc90021bd7480] schedule at ffffffff81843973
 #2 [ffffc90021bd7498] rwsem_down_read_failed at ffffffff818472b0
 #3 [ffffc90021bd7540] call_rwsem_down_read_failed at ffffffff818393d4
 #4 [ffffc90021bd7588] down_read at ffffffff8184662c
 #5 [ffffc90021bd7598] page_lock_anon_vma_read at ffffffff81242d9c
 #6 [ffffc90021bd75c0] rmap_walk_anon at ffffffff8124154f
 #7 [ffffc90021bd7620] page_referenced at ffffffff8124350e
 #8 [ffffc90021bd7690] shrink_page_list at ffffffff8120b2f4
 #9 [ffffc90021bd7760] shrink_inactive_list at ffffffff8120c4c6
#10 [ffffc90021bd7808] shrink_node_memcg at ffffffff8120cffa
#11 [ffffc90021bd7910] shrink_node at ffffffff8120d534
#12 [ffffc90021bd79a8] do_try_to_free_pages at ffffffff8120dac4
#13 [ffffc90021bd7a10] try_to_free_pages at ffffffff8120dec4
#14 [ffffc90021bd7aa8] __alloc_pages_slowpath at ffffffff811fa367
#15 [ffffc90021bd7bd0] __alloc_pages_nodemask at ffffffff811fb116
#16 [ffffc90021bd7c30] __do_page_cache_readahead at ffffffff812013c4
#17 [ffffc90021bd7cc8] filemap_fault at ffffffff811ef652
#18 [ffffc90021bd7d98] ext4_filemap_fault at ffffffffc030c48c [ext4]
#19 [ffffc90021bd7db0] __do_fault at ffffffff81234cf0
#20 [ffffc90021bd7dd0] __handle_mm_fault at ffffffff812337d8
#21 [ffffc90021bd7e80] handle_mm_fault at ffffffff81233adc
#22 [ffffc90021bd7eb0] __do_page_fault at ffffffff8106f084
#23 [ffffc90021bd7f20] do_page_fault at ffffffff8106f342
#24 [ffffc90021bd7f50] page_fault at ffffffff81a0112e
    RIP: 00007f67de084e72  RSP: 00007f67677fddb8  RFLAGS: 00010206
    RAX: 00007f67de084e72  RBX: 0000000002802310  RCX: 0000000000080000
    RDX: 00007f67680008c0  RSI: 0000000000080000  RDI: 00007f67680008c0
    RBP: 00007f67677fddf0   R8: 0000000000000000   R9: 00007f6768086460
    R10: 00007f67677fdde0  R11: 00000000009b3ca8  R12: 0000000002802120
    R13: 0000000002802768  R14: 0000000000000003  R15: 00007f67677fe700
    ORIG_RAX: ffffffffffffffff  CS: 0033  SS: 002b

*** staragent held mmap_sem and was doing memory reclaim, and was
trying to acquire anon_vma lock.

*** the anon_vma->root->rwsem is held by khugepaged
crash> struct task_struct 0xffff88fe6f5dc740
comm = “khugepaged\000\000\000\000\000”
crash> bt 0xffff88fe6f5dc740
PID: 504    TASK: ffff88fe6f5dc740  CPU: 34  COMMAND: "khugepaged"
 #0 [ffffc9000da2f630] __schedule at ffffffff818436af
 #1 [ffffc9000da2f6c8] _cond_resched at ffffffff81843c0d
 #2 [ffffc9000da2f6d8] rmap_walk_anon at ffffffff81241465
 #3 [ffffc9000da2f738] remove_migration_ptes at ffffffff8127129d
 #4 [ffffc9000da2f770] remap_page at ffffffff8127504f
 #5 [ffffc9000da2f788] split_huge_page_to_list at ffffffff8127a82a
 #6 [ffffc9000da2f810] shrink_page_list at ffffffff8120b3fc
 #7 [ffffc9000da2f8e0] shrink_inactive_list at ffffffff8120c4c6
 #8 [ffffc9000da2f988] shrink_node_memcg at ffffffff8120cffa
 #9 [ffffc9000da2fa90] shrink_node at ffffffff8120d534
#10 [ffffc9000da2fb28] do_try_to_free_pages at ffffffff8120dac4
#11 [ffffc9000da2fb90] try_to_free_pages at ffffffff8120dec4
#12 [ffffc9000da2fc28] __alloc_pages_slowpath at ffffffff811fa367
#13 [ffffc9000da2fd50] __alloc_pages_nodemask at ffffffff811fb116
#14 [ffffc9000da2fdb0] khugepaged at ffffffff8127e478
#15 [ffffc9000da2ff10] kthread at ffffffff810bcb75
#16 [ffffc9000da2ff50] ret_from_fork at ffffffff81a001cf

The system was overloaded and experiencing memory pressure. Releasing
mmap_sem before doing any blocking operations could mitigate the
problem, but the underlying priority inversion caused by khugepaged
still exists.

> > For example we saw THP split in reclaim path triggered by khugepaged
> > held write anon_vma lock, but the other process which was doing
> > reclaim too was blocked by anon_vma lock. Then the cond_resched() in
> > rmap walk would make khugepaged preempted by all other processes
> > easily so khugepaged may hold the anon_vma lock for indefinite time.
> > Then we saw hung tasks.
>
> Any chance you have a reproducer? I'm not sure I follow the problem, but
> it's almost 4AM, so I'm slow.

I don't have a stable reproducer.

>
> > So we have khugepaged defrag disabled for some workloads to workaround
> > the priority inversion problem. So, I'm concerned some processes may
> > never get THP for some our workloads with this patch applied.
> >
> > The priority inversion caused by khugepaged was not very usual. Just
> > my two cents.
> >
> > >
> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > ---
> > >  mm/huge_memory.c | 247 +++++------------------------------------------
> > >  1 file changed, 24 insertions(+), 223 deletions(-)
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index ef6a6bcb291f..15b7a9c86b7c 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
> > >         spin_unlock(vmf->ptl);
> > >  }
> > >
> > > -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> > > -                       pmd_t orig_pmd, struct page *page)
> > > -{
> > > -       struct vm_area_struct *vma = vmf->vma;
> > > -       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > > -       struct mem_cgroup *memcg;
> > > -       pgtable_t pgtable;
> > > -       pmd_t _pmd;
> > > -       int i;
> > > -       vm_fault_t ret = 0;
> > > -       struct page **pages;
> > > -       struct mmu_notifier_range range;
> > > -
> > > -       pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
> > > -                             GFP_KERNEL);
> > > -       if (unlikely(!pages)) {
> > > -               ret |= VM_FAULT_OOM;
> > > -               goto out;
> > > -       }
> > > -
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > -               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
> > > -                                              vmf->address, page_to_nid(page));
> > > -               if (unlikely(!pages[i] ||
> > > -                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
> > > -                                    GFP_KERNEL, &memcg, false))) {
> > > -                       if (pages[i])
> > > -                               put_page(pages[i]);
> > > -                       while (--i >= 0) {
> > > -                               memcg = (void *)page_private(pages[i]);
> > > -                               set_page_private(pages[i], 0);
> > > -                               mem_cgroup_cancel_charge(pages[i], memcg,
> > > -                                               false);
> > > -                               put_page(pages[i]);
> > > -                       }
> > > -                       kfree(pages);
> > > -                       ret |= VM_FAULT_OOM;
> > > -                       goto out;
> > > -               }
> > > -               set_page_private(pages[i], (unsigned long)memcg);
> > > -       }
> > > -
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > -               copy_user_highpage(pages[i], page + i,
> > > -                                  haddr + PAGE_SIZE * i, vma);
> > > -               __SetPageUptodate(pages[i]);
> > > -               cond_resched();
> > > -       }
> > > -
> > > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > > -       mmu_notifier_invalidate_range_start(&range);
> > > -
> > > -       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> > > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > > -               goto out_free_pages;
> > > -       VM_BUG_ON_PAGE(!PageHead(page), page);
> > > -
> > > -       /*
> > > -        * Leave pmd empty until pte is filled note we must notify here as
> > > -        * concurrent CPU thread might write to new page before the call to
> > > -        * mmu_notifier_invalidate_range_end() happens which can lead to a
> > > -        * device seeing memory write in different order than CPU.
> > > -        *
> > > -        * See Documentation/vm/mmu_notifier.rst
> > > -        */
> > > -       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > > -
> > > -       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
> > > -       pmd_populate(vma->vm_mm, &_pmd, pgtable);
> > > -
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> > > -               pte_t entry;
> > > -               entry = mk_pte(pages[i], vma->vm_page_prot);
> > > -               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> > > -               memcg = (void *)page_private(pages[i]);
> > > -               set_page_private(pages[i], 0);
> > > -               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> > > -               mem_cgroup_commit_charge(pages[i], memcg, false, false);
> > > -               lru_cache_add_active_or_unevictable(pages[i], vma);
> > > -               vmf->pte = pte_offset_map(&_pmd, haddr);
> > > -               VM_BUG_ON(!pte_none(*vmf->pte));
> > > -               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
> > > -               pte_unmap(vmf->pte);
> > > -       }
> > > -       kfree(pages);
> > > -
> > > -       smp_wmb(); /* make pte visible before pmd */
> > > -       pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
> > > -       page_remove_rmap(page, true);
> > > -       spin_unlock(vmf->ptl);
> > > -
> > > -       /*
> > > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > > -        */
> > > -       mmu_notifier_invalidate_range_only_end(&range);
> > > -
> > > -       ret |= VM_FAULT_WRITE;
> > > -       put_page(page);
> > > -
> > > -out:
> > > -       return ret;
> > > -
> > > -out_free_pages:
> > > -       spin_unlock(vmf->ptl);
> > > -       mmu_notifier_invalidate_range_end(&range);
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > -               memcg = (void *)page_private(pages[i]);
> > > -               set_page_private(pages[i], 0);
> > > -               mem_cgroup_cancel_charge(pages[i], memcg, false);
> > > -               put_page(pages[i]);
> > > -       }
> > > -       kfree(pages);
> > > -       goto out;
> > > -}
> > > -
> > >  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
> > >  {
> > >         struct vm_area_struct *vma = vmf->vma;
> > > -       struct page *page = NULL, *new_page;
> > > -       struct mem_cgroup *memcg;
> > > +       struct page *page;
> > >         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > > -       struct mmu_notifier_range range;
> > > -       gfp_t huge_gfp;                 /* for allocation and charge */
> > > -       vm_fault_t ret = 0;
> > >
> > >         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
> > >         VM_BUG_ON_VMA(!vma->anon_vma, vma);
> > > +
> > >         if (is_huge_zero_pmd(orig_pmd))
> > > -               goto alloc;
> > > +               goto fallback;
> > > +
> > >         spin_lock(vmf->ptl);
> > > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > > -               goto out_unlock;
> > > +
> > > +       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > > +               spin_unlock(vmf->ptl);
> > > +               return 0;
> > > +       }
> > >
> > >         page = pmd_page(orig_pmd);
> > >         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> > > -       /*
> > > -        * We can only reuse the page if nobody else maps the huge page or it's
> > > -        * part.
> > > -        */
> > > +
> > > +       /* Lock page for reuse_swap_page() */
> > >         if (!trylock_page(page)) {
> > >                 get_page(page);
> > >                 spin_unlock(vmf->ptl);
> > >                 lock_page(page);
> > >                 spin_lock(vmf->ptl);
> > >                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > > +                       spin_unlock(vmf->ptl);
> > >                         unlock_page(page);
> > >                         put_page(page);
> > > -                       goto out_unlock;
> > > +                       return 0;
> > >                 }
> > >                 put_page(page);
> > >         }
> > > +
> > > +       /*
> > > +        * We can only reuse the page if nobody else maps the huge page or it's
> > > +        * part.
> > > +        */
> > >         if (reuse_swap_page(page, NULL)) {
> > >                 pmd_t entry;
> > >                 entry = pmd_mkyoung(orig_pmd);
> > >                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > >                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
> > >                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > > -               ret |= VM_FAULT_WRITE;
> > >                 unlock_page(page);
> > > -               goto out_unlock;
> > > -       }
> > > -       unlock_page(page);
> > > -       get_page(page);
> > > -       spin_unlock(vmf->ptl);
> > > -alloc:
> > > -       if (__transparent_hugepage_enabled(vma) &&
> > > -           !transparent_hugepage_debug_cow()) {
> > > -               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> > > -               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> > > -       } else
> > > -               new_page = NULL;
> > > -
> > > -       if (likely(new_page)) {
> > > -               prep_transhuge_page(new_page);
> > > -       } else {
> > > -               if (!page) {
> > > -                       split_huge_pmd(vma, vmf->pmd, vmf->address);
> > > -                       ret |= VM_FAULT_FALLBACK;
> > > -               } else {
> > > -                       ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
> > > -                       if (ret & VM_FAULT_OOM) {
> > > -                               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > > -                               ret |= VM_FAULT_FALLBACK;
> > > -                       }
> > > -                       put_page(page);
> > > -               }
> > > -               count_vm_event(THP_FAULT_FALLBACK);
> > > -               goto out;
> > > -       }
> > > -
> > > -       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
> > > -                                       huge_gfp, &memcg, true))) {
> > > -               put_page(new_page);
> > > -               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > > -               if (page)
> > > -                       put_page(page);
> > > -               ret |= VM_FAULT_FALLBACK;
> > > -               count_vm_event(THP_FAULT_FALLBACK);
> > > -               goto out;
> > > -       }
> > > -
> > > -       count_vm_event(THP_FAULT_ALLOC);
> > > -       count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
> > > -
> > > -       if (!page)
> > > -               clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
> > > -       else
> > > -               copy_user_huge_page(new_page, page, vmf->address,
> > > -                                   vma, HPAGE_PMD_NR);
> > > -       __SetPageUptodate(new_page);
> > > -
> > > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > > -       mmu_notifier_invalidate_range_start(&range);
> > > -
> > > -       spin_lock(vmf->ptl);
> > > -       if (page)
> > > -               put_page(page);
> > > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > >                 spin_unlock(vmf->ptl);
> > > -               mem_cgroup_cancel_charge(new_page, memcg, true);
> > > -               put_page(new_page);
> > > -               goto out_mn;
> > > -       } else {
> > > -               pmd_t entry;
> > > -               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> > > -               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > > -               pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > > -               page_add_new_anon_rmap(new_page, vma, haddr, true);
> > > -               mem_cgroup_commit_charge(new_page, memcg, false, true);
> > > -               lru_cache_add_active_or_unevictable(new_page, vma);
> > > -               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> > > -               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > > -               if (!page) {
> > > -                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > > -               } else {
> > > -                       VM_BUG_ON_PAGE(!PageHead(page), page);
> > > -                       page_remove_rmap(page, true);
> > > -                       put_page(page);
> > > -               }
> > > -               ret |= VM_FAULT_WRITE;
> > > +               return VM_FAULT_WRITE;
> > >         }
> > > +
> > > +       unlock_page(page);
> > >         spin_unlock(vmf->ptl);
> > > -out_mn:
> > > -       /*
> > > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > > -        */
> > > -       mmu_notifier_invalidate_range_only_end(&range);
> > > -out:
> > > -       return ret;
> > > -out_unlock:
> > > -       spin_unlock(vmf->ptl);
> > > -       return ret;
> > > +fallback:
> > > +       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
> > > +       return VM_FAULT_FALLBACK;
> > >  }
> > >
> > >  /*
> > > --
> > > 2.26.0
> > >
> > >
>
> --
>  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins
  2020-03-27 18:10   ` Yang Shi
@ 2020-03-28 12:18     ` Kirill A. Shutemov
  2020-03-30 18:30       ` Yang Shi
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28 12:18 UTC (permalink / raw)
  To: Yang Shi
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 11:10:40AM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> >
> > __collapse_huge_page_isolate() may fail due to extra pin in the LRU add
> > pagevec. It's petty common for swapin case: we swap in pages just to
> > fail due to the extra pin.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 14d7afc90786..39e0994abeb8 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -585,11 +585,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                  * The page must only be referenced by the scanned process
> >                  * and page swap cache.
> >                  */
> > +               if (page_count(page) != 1 + PageSwapCache(page)) {
> > +                       /*
> > +                        * Drain pagevec and retry just in case we can get rid
> > +                        * of the extra pin, like in swapin case.
> > +                        */
> > +                       lru_add_drain();
> 
> This is definitely correct.
> 
> I'm wondering if we need one more lru_add_drain() before PageLRU check
> in khugepaged_scan_pmd() or not? The page might be in lru cache then
> get skipped. This would improve the success rate.
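
A rough sketch of that idea, as an assumption about the intent rather than
code from the posted series, would be to drain before taking the pte lock
in khugepaged_scan_pmd():

	/*
	 * Drain this CPU's LRU add pagevec before scanning, so recently
	 * faulted or swapped-in pages are already on the LRU when the
	 * !PageLRU() check below runs.  (Illustration only.)
	 */
	lru_add_drain();

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, _address += PAGE_SIZE) {
		struct page *page = vm_normal_page(vma, _address, *_pte);

		/* ... existing pte_none()/swap/NULL-page checks ... */

		if (!PageLRU(page)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		/* ... */
	}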

Could you elaborate on the scenario? I don't follow.


-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28  1:09       ` Yang Shi
@ 2020-03-28 12:27         ` Kirill A. Shutemov
  2020-03-30 18:38           ` Yang Shi
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28 12:27 UTC (permalink / raw)
  To: Yang Shi
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 06:09:38PM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 5:34 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> > > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > > <kirill@shutemov.name> wrote:
> > > >
> > > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > > handling them more than once: lock/unlock page only once if it's present
> > > > in the PMD range multiple times, as it is handled at the compound level. The
> > > > same goes for LRU isolation and putback.
> > > >
> > > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > > ---
> > > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index b47edfe57f7b..c8c2c463095c 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > > >
> > > >  static void release_pte_page(struct page *page)
> > > >  {
> > > > +       /*
> > > > +        * We need to unlock and put compound page on LRU only once.
> > > > +        * The rest of the pages have to be locked and not on LRU here.
> > > > +        */
> > > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > > +
> > > > +       if (!PageLocked(page))
> > > > +               return;
> > > > +
> > > > +       page = compound_head(page);
> > > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > >
> > > We need count in the number of base pages. The same counter is
> > > modified by vmscan in base page unit.
> >
> > Is it though? Where?
> 
> __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken) in
> vmscan.c, here nr_taken is nr_compound(page), so if it is THP the
> number would be 512.

Could you point a particular codepath?

> So in both inc and dec path of collapse PTE mapped THP, we should mod
> nr_compound(page) too.

I disagree. Compound page is represented by single entry on LRU, so it has
to be counted once.

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28  1:17       ` Zi Yan
@ 2020-03-28 12:33         ` Kirill A. Shutemov
  2020-03-30 18:41           ` Yang Shi
  2020-03-30 18:50           ` Yang Shi
  0 siblings, 2 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-28 12:33 UTC (permalink / raw)
  To: Zi Yan; +Cc: akpm, Andrea Arcangeli, linux-mm, linux-kernel, Kirill A. Shutemov

On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > The compound page may be locked here if the function called for the first
> > time for the page and not locked after that (because we've unlocked it when we
> > saw it the first time). The same with LRU.
> >
> 
> For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> For the second time and so on, the compound page is unlocked and on the LRU,
> so this VM_BUG_ON still passes.
> 
> For base page, VM_BUG_ON passes.
> 
> Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BUG_ON,
> but your VM_BUG_ON will not detect this situation, right?

Right. I will rework this code. I've just realized it is racy: after
unlock and putback on LRU the page can be locked by somebody else and this
code can unlock it, which is completely broken.

I'll pass down compound_pagelist to release_pte_pages() and handle the
situation there.
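
Roughly along these lines (untested sketch, assuming we collect the compound
pages on a separate compound_pagelist during isolation; details will likely
change in the real patch):

static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct page *page, *tmp;

	/* Sketch: release small pages per-PTE, compound pages once via the list */
	while (--_pte >= pte) {
		pte_t pteval = *_pte;

		page = pte_page(pteval);
		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
				!PageCompound(page))
			release_pte_page(page);
	}

	/* Compound pages were isolated once, so release them once too */
	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
		list_del(&page->lru);
		release_pte_page(page);
	}
}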

> >>>     if (likely(writable)) {
> >>>             if (likely(referenced)) {
> >>
> >> Do we need a list here? There should be at most one compound page we will see here, right?
> >
> > Um? It's outside the pte loop. We get here once per PMD range.
> >
> > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > it's just the last page handled in the loop.
> >
> 
> Throughout the pte loop, we should only see at most one compound page, right?

No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
single PMD range.


-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins
  2020-03-28 12:18     ` Kirill A. Shutemov
@ 2020-03-30 18:30       ` Yang Shi
  2020-03-30 21:38         ` Kirill A. Shutemov
  0 siblings, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-30 18:30 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Sat, Mar 28, 2020 at 5:18 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 11:10:40AM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > >
> > > __collapse_huge_page_isolate() may fail due to extra pin in the LRU add
> > > pagevec. It's pretty common for the swapin case: we swap in pages just to
> > > fail due to the extra pin.
> > >
> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > ---
> > >  mm/khugepaged.c | 8 ++++++++
> > >  1 file changed, 8 insertions(+)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 14d7afc90786..39e0994abeb8 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -585,11 +585,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                  * The page must only be referenced by the scanned process
> > >                  * and page swap cache.
> > >                  */
> > > +               if (page_count(page) != 1 + PageSwapCache(page)) {
> > > +                       /*
> > > +                        * Drain pagevec and retry just in case we can get rid
> > > +                        * of the extra pin, like in swapin case.
> > > +                        */
> > > +                       lru_add_drain();
> >
> > This is definitely correct.
> >
> > I'm wondering if we need one more lru_add_drain() before PageLRU check
> > in khugepaged_scan_pmd() or not? The page might be in lru cache then
> > get skipped. This would improve the success rate.
>
> Could you elaborate on the scenario, I don't follow.

I mean the below change:

--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1195,6 +1195,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        goto out_unmap;
                }
                khugepaged_node_load[node]++;
+               if (!PageLRU(page))
+                       /* Flush the page out of lru cache */
+                       lru_add_drain();
                if (!PageLRU(page)) {
                        result = SCAN_PAGE_LRU;
                        goto out_unmap;

If the page is not on the LRU, we can't even reach collapse_huge_page(), right?

>
>
> --
>  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28 12:27         ` Kirill A. Shutemov
@ 2020-03-30 18:38           ` Yang Shi
  0 siblings, 0 replies; 38+ messages in thread
From: Yang Shi @ 2020-03-30 18:38 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Sat, Mar 28, 2020 at 5:27 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 06:09:38PM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 5:34 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > >
> > > On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> > > > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > > > <kirill@shutemov.name> wrote:
> > > > >
> > > > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > > > handling them more than once: lock/unlock page only once if it's present
> > > > > in the PMD range multiple times, as it is handled at the compound level. The
> > > > > same goes for LRU isolation and putback.
> > > > >
> > > > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > > > ---
> > > > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > > > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > > > >
> > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > > index b47edfe57f7b..c8c2c463095c 100644
> > > > > --- a/mm/khugepaged.c
> > > > > +++ b/mm/khugepaged.c
> > > > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > > > >
> > > > >  static void release_pte_page(struct page *page)
> > > > >  {
> > > > > +       /*
> > > > > +        * We need to unlock and put compound page on LRU only once.
> > > > > +        * The rest of the pages have to be locked and not on LRU here.
> > > > > +        */
> > > > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > > > +
> > > > > +       if (!PageLocked(page))
> > > > > +               return;
> > > > > +
> > > > > +       page = compound_head(page);
> > > > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > > >
> > > > We need count in the number of base pages. The same counter is
> > > > modified by vmscan in base page unit.
> > >
> > > Is it though? Where?
> >
> > __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken) in
> > vmscan.c, here nr_taken is nr_compound(page), so if it is THP the
> > number would be 512.
>
> Could you point a particular codepath?

shrink_inactive_list ->
        nr_taken = isolate_lru_pages()
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);

Then in isolate_lru_pages() :
        nr_pages = compound_nr(page);
        ...
        switch (__isolate_lru_page(page, mode)) {
                case 0:
                        nr_taken += nr_pages;

>
> > So in both inc and dec path of collapse PTE mapped THP, we should mod
> > nr_compound(page) too.
>
> I disagree. Compound page is represented by single entry on LRU, so it has
> to be counted once.

It was not a problem without THP swap. But with THP swap we saw
pgsteal_{kswapd|direct} may be greater than pgscan_{kswapd|direct} if
we count THP as one page.

Please refer to the below commit:

commit 98879b3b9edc1604f2d1a6686576ef4d08ed3310
Author: Yang Shi <yang.shi@linux.alibaba.com>
Date:   Thu Jul 11 20:59:30 2019 -0700

    mm: vmscan: correct some vmscan counters for THP swapout

    Commit bd4c82c22c36 ("mm, THP, swap: delay splitting THP after swapped
    out"), THP can be swapped out in a whole.  But, nr_reclaimed and some
    other vm counters still get inc'ed by one even though a whole THP (512
    pages) gets swapped out.

    This doesn't make too much sense to memory reclaim.

    For example, direct reclaim may just need reclaim SWAP_CLUSTER_MAX
    pages, reclaiming one THP could fulfill it.  But, if nr_reclaimed is not
    increased correctly, direct reclaim may just waste time to reclaim more
    pages, SWAP_CLUSTER_MAX * 512 pages in worst case.

    And, it may cause pgsteal_{kswapd|direct} is greater than
    pgscan_{kswapd|direct}, like the below:

    pgsteal_kswapd 122933
    pgsteal_direct 26600225
    pgscan_kswapd 174153
    pgscan_direct 14678312

    nr_reclaimed and nr_scanned must be fixed in parallel otherwise it would
    break some page reclaim logic, e.g.

    vmpressure: this looks at the scanned/reclaimed ratio so it won't change
    semantics as long as scanned & reclaimed are fixed in parallel.

    compaction/reclaim: compaction wants a certain number of physical pages
    freed up before going back to compacting.

    kswapd priority raising: kswapd raises priority if we scan fewer pages
    than the reclaim target (which itself is obviously expressed in order-0
    pages).  As a result, kswapd can falsely raise its aggressiveness even
    when it's making great progress.

    Other than nr_scanned and nr_reclaimed, some other counters, e.g.
    pgactivate, nr_skipped, nr_ref_keep and nr_unmap_fail need to be fixed too
    since they are user visible via cgroup, /proc/vmstat or trace points,
    otherwise they would be underreported.

    When isolating pages from LRUs, nr_taken has been accounted in base page,
    but nr_scanned and nr_skipped are still accounted in THP.  It doesn't make
    too much sense too since this may cause trace point underreport the
    numbers as well.

    So accounting those counters in base page instead of accounting THP as one
    page.

    nr_dirty, nr_unqueued_dirty, nr_congested and nr_writeback are used by
    file cache, so they are not impacted by THP swap.

    This change may result in lower steal/scan ratio in some cases since THP
    may get split during page reclaim, then a part of tail pages get reclaimed
    instead of the whole 512 pages, but nr_scanned is accounted by 512,
    particularly for direct reclaim.  But, this should be not a significant
    issue.


So, since we count THP in base page units in the vmscan path, the same
counter should be updated in base page units in the other paths as well,
IMHO.
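
Something like the below is what I have in mind (illustrative only, not
tested), with the matching inc side in __collapse_huge_page_isolate()
switched to +compound_nr(page) as well:

static void release_pte_page(struct page *page)
{
	page = compound_head(page);
	/* Sketch: account isolation in base pages, matching what vmscan does */
	mod_node_page_state(page_pgdat(page),
			NR_ISOLATED_ANON + page_is_file_cache(page),
			-compound_nr(page));
	unlock_page(page);
	putback_lru_page(page);
}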


>
> --
>  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28 12:33         ` Kirill A. Shutemov
@ 2020-03-30 18:41           ` Yang Shi
  2020-03-30 18:50           ` Yang Shi
  1 sibling, 0 replies; 38+ messages in thread
From: Yang Shi @ 2020-03-30 18:41 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Zi Yan, Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > The compound page may be locked here if the function called for the first
> > > time for the page and not locked after that (because we've unlocked it when we
> > > saw it the first time). The same with LRU.
> > >
> >
> > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > For the second time and so on, the compound page is unlocked and on the LRU,
> > so this VM_BUG_ON still passes.
> >
> > For base page, VM_BUG_ON passes.
> >
> > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BUG_ON,
> > but your VM_BUG_ON will not detect this situation, right?
>
> Right. I will rework this code. I've just realized it is racy: after
> unlock and putback on LRU the page can be locked by somebody else and this
> code can unlock it, which is completely broken.

I'm wondering if we should skip putting the page back on the LRU. Since
the page is about to be freed soon, as I mentioned in the other patch, it
doesn't sound very productive to put it back on the LRU again.

>
> I'll pass down compound_pagelist to release_pte_pages() and handle the
> situation there.
>
> > >>>     if (likely(writable)) {
> > >>>             if (likely(referenced)) {
> > >>
> > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > >
> > > Um? It's outside the pte loop. We get here once per PMD range.
> > >
> > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > it's just the last page handled in the loop.
> > >
> >
> > Throughout the pte loop, we should only see at most one compound page, right?
>
> No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> single PMD range.
>
>
> --
>  Kirill A. Shutemov
>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-28 12:33         ` Kirill A. Shutemov
  2020-03-30 18:41           ` Yang Shi
@ 2020-03-30 18:50           ` Yang Shi
  2020-03-31 14:08             ` Kirill A. Shutemov
  1 sibling, 1 reply; 38+ messages in thread
From: Yang Shi @ 2020-03-30 18:50 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Zi Yan, Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > The compound page may be locked here if the function called for the first
> > > time for the page and not locked after that (because we've unlocked it when we
> > > saw it the first time). The same with LRU.
> > >
> >
> > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > For the second time and so on, the compound page is unlocked and on the LRU,
> > so this VM_BUG_ON still passes.
> >
> > For base page, VM_BUG_ON passes.
> >
> > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BUG_ON,
> > but your VM_BUG_ON will not detect this situation, right?
>
> Right. I will rework this code. I've just realized it is racy: after
> unlock and putback on LRU the page can be locked by somebody else and this
> code can unlock it, which is completely broken.
>
> I'll pass down compound_pagelist to release_pte_pages() and handle the
> situation there.
>
> > >>>     if (likely(writable)) {
> > >>>             if (likely(referenced)) {
> > >>
> > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > >
> > > Um? It's outside the pte loop. We get here once per PMD range.
> > >
> > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > it's just the last page handled in the loop.
> > >
> >
> > Throughout the pte loop, we should only see at most one compound page, right?
>
> No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> single PMD range.

Do you mean every PTE in the PMD is mapped by a sub page from different THPs?

>
>
> --
>  Kirill A. Shutemov
>


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins
  2020-03-30 18:30       ` Yang Shi
@ 2020-03-30 21:38         ` Kirill A. Shutemov
  0 siblings, 0 replies; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-30 21:38 UTC (permalink / raw)
  To: Yang Shi
  Cc: Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Mon, Mar 30, 2020 at 11:30:14AM -0700, Yang Shi wrote:
> On Sat, Mar 28, 2020 at 5:18 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Fri, Mar 27, 2020 at 11:10:40AM -0700, Yang Shi wrote:
> > > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > > <kirill@shutemov.name> wrote:
> > > >
> > > > __collapse_huge_page_isolate() may fail due to extra pin in the LRU add
> > > > pagevec. It's pretty common for the swapin case: we swap in pages just to
> > > > fail due to the extra pin.
> > > >
> > > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > > ---
> > > >  mm/khugepaged.c | 8 ++++++++
> > > >  1 file changed, 8 insertions(+)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index 14d7afc90786..39e0994abeb8 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -585,11 +585,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > >                  * The page must only be referenced by the scanned process
> > > >                  * and page swap cache.
> > > >                  */
> > > > +               if (page_count(page) != 1 + PageSwapCache(page)) {
> > > > +                       /*
> > > > +                        * Drain pagevec and retry just in case we can get rid
> > > > +                        * of the extra pin, like in swapin case.
> > > > +                        */
> > > > +                       lru_add_drain();
> > >
> > > This is definitely correct.
> > >
> > > I'm wondering if we need one more lru_add_drain() before PageLRU check
> > > in khugepaged_scan_pmd() or not? The page might be in lru cache then
> > > get skipped. This would improve the success rate.
> >
> > Could you elaborate on the scenario, I don't follow.
> 
> I mean the below change:
> 
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1195,6 +1195,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                         goto out_unmap;
>                 }
>                 khugepaged_node_load[node]++;
> +               if (!PageLRU(page))
> +                       /* Flush the page out of lru cache */
> +                       lru_add_drain();
>                 if (!PageLRU(page)) {
>                         result = SCAN_PAGE_LRU;
>                         goto out_unmap;
> 
> If the page is not on LRU we even can't reach collapse_huge_page(), right?

Yeah, I've achieved the same by doing lru_add_drain_all() once per
khugepaged_do_scan(). It is more effective than lru_add_drain() inside
khugepaged_scan_pmd() and shouldn't have too much overhead.

The lru_add_drain() from this patch has moved into the swapin routine and
is called only on success.
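
Roughly, the two call sites look like this now (sketch, not the actual
diff; it reuses the existing swapped_in counter in the swapin routine):

	/* in khugepaged_do_scan(), before walking the mm list (sketch): */
	lru_add_drain_all();

	/* in __collapse_huge_page_swapin(), after the last page is swapped in: */
	if (swapped_in) {
		/* Drain LRU add pagevec to get rid of the extra pins */
		lru_add_drain();
	}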

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-30 18:50           ` Yang Shi
@ 2020-03-31 14:08             ` Kirill A. Shutemov
  2020-04-01 19:45               ` Yang Shi
  0 siblings, 1 reply; 38+ messages in thread
From: Kirill A. Shutemov @ 2020-03-31 14:08 UTC (permalink / raw)
  To: Yang Shi
  Cc: Zi Yan, Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Mon, Mar 30, 2020 at 11:50:41AM -0700, Yang Shi wrote:
> On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > > The compound page may be locked here if the function called for the first
> > > > time for the page and not locked after that (because we've unlocked it when we
> > > > saw it the first time). The same with LRU.
> > > >
> > >
> > > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > > For the second time and so on, the compound page is unlocked and on the LRU,
> > > so this VM_BUG_ON still passes.
> > >
> > > For base page, VM_BUG_ON passes.
> > >
> > > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BUG_ON,
> > > but your VM_BUG_ON will not detect this situation, right?
> >
> > Right. I will rework this code. I've just realized it is racy: after
> > unlock and putback on LRU the page can be locked by somebody else and this
> > code can unlock it, which is completely broken.
> >
> > I'll pass down compound_pagelist to release_pte_pages() and handle the
> > situation there.
> >
> > > >>>     if (likely(writable)) {
> > > >>>             if (likely(referenced)) {
> > > >>
> > > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > > >
> > > > Um? It's outside the pte loop. We get here once per PMD range.
> > > >
> > > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > > it's just the last page handled in the loop.
> > > >
> > >
> > > Throughout the pte loop, we should only see at most one compound page, right?
> >
> > No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> > single PMD range.
> 
> Do you mean every PTE in the PMD is mapped by a sub page from different THPs?

Yes.

Well, it was harder to achieve than I expected, but below is a test case
I've come up with. It maps 512 head pages within a single PMD range.

diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
index 3a98d5b2d6d8..9ae119234a39 100644
--- a/tools/testing/selftests/vm/khugepaged.c
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -703,6 +703,63 @@ static void collapse_full_of_compound(void)
 	munmap(p, hpage_pmd_size);
 }
 
+static void collapse_compound_extreme(void)
+{
+	void *p;
+	int i;
+
+	p = alloc_mapping();
+	for (i = 0; i < hpage_pmd_nr; i++) {
+		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+				i + 1, hpage_pmd_nr);
+
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+		fill_memory(BASE_ADDR, 0, hpage_pmd_size);
+		if (!check_huge(BASE_ADDR)) {
+			printf("Failed to allocate huge page\n");
+			exit(EXIT_FAILURE);
+		}
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+		p = mremap(BASE_ADDR - i * page_size,
+				i * page_size + hpage_pmd_size,
+				(i + 1) * page_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR + 2 * hpage_pmd_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+unmap");
+			exit(EXIT_FAILURE);
+		}
+
+		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+				(i + 1) * page_size,
+				(i + 1) * page_size + hpage_pmd_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR - (i + 1) * page_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+alloc");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	munmap(BASE_ADDR, hpage_pmd_size);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	if (wait_for_scan("Collapse PTE table full of different compound pages", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
 static void collapse_fork(void)
 {
 	int wstatus;
@@ -916,6 +973,7 @@ int main(void)
 	collapse_max_ptes_swap();
 	collapse_signle_pte_entry_compound();
 	collapse_full_of_compound();
+	collapse_compound_extreme();
 	collapse_fork();
 	collapse_fork_compound();
 	collapse_max_ptes_shared();
-- 
 Kirill A. Shutemov


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-03-31 14:08             ` Kirill A. Shutemov
@ 2020-04-01 19:45               ` Yang Shi
  0 siblings, 0 replies; 38+ messages in thread
From: Yang Shi @ 2020-04-01 19:45 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Zi Yan, Andrew Morton, Andrea Arcangeli, Linux MM,
	Linux Kernel Mailing List, Kirill A. Shutemov

On Tue, Mar 31, 2020 at 7:08 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Mon, Mar 30, 2020 at 11:50:41AM -0700, Yang Shi wrote:
> > On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > >
> > > On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > > > The compound page may be locked here if the function called for the first
> > > > > time for the page and not locked after that (because we've unlocked it when we
> > > > > saw it the first time). The same with LRU.
> > > > >
> > > >
> > > > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > > > For the second time and so on, the compound page is unlocked and on the LRU,
> > > > so this VM_BUG_ON still passes.
> > > >
> > > > For base page, VM_BUG_ON passes.
> > > >
> > > > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BUG_ON,
> > > > but your VM_BUG_ON will not detect this situation, right?
> > >
> > > Right. I will rework this code. I've just realized it is racy: after
> > > unlock and putback on LRU the page can be locked by somebody else and this
> > > code can unlock it, which is completely broken.
> > >
> > > I'll pass down compound_pagelist to release_pte_pages() and handle the
> > > situation there.
> > >
> > > > >>>     if (likely(writable)) {
> > > > >>>             if (likely(referenced)) {
> > > > >>
> > > > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > > > >
> > > > > Um? It's outside the pte loop. We get here once per PMD range.
> > > > >
> > > > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > > > it's just the last page handled in the loop.
> > > > >
> > > >
> > > > Throughout the pte loop, we should only see at most one compound page, right?
> > >
> > > No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> > > single PMD range.
> >
> > Do you mean every PTE in the PMD is mapped by a sub page from different THPs?
>
> Yes.
>
> Well, it was harder to achieve than I expected, but below is a test case
> I've come up with. It maps 512 head pages within a single PMD range.

Thanks, this is very helpful.

>
> diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
> index 3a98d5b2d6d8..9ae119234a39 100644
> --- a/tools/testing/selftests/vm/khugepaged.c
> +++ b/tools/testing/selftests/vm/khugepaged.c
> @@ -703,6 +703,63 @@ static void collapse_full_of_compound(void)
>         munmap(p, hpage_pmd_size);
>  }
>
> +static void collapse_compound_extreme(void)
> +{
> +       void *p;
> +       int i;
> +
> +       p = alloc_mapping();
> +       for (i = 0; i < hpage_pmd_nr; i++) {
> +               printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
> +                               i + 1, hpage_pmd_nr);
> +
> +               madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
> +               fill_memory(BASE_ADDR, 0, hpage_pmd_size);
> +               if (!check_huge(BASE_ADDR)) {
> +                       printf("Failed to allocate huge page\n");
> +                       exit(EXIT_FAILURE);
> +               }
> +               madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
> +
> +               p = mremap(BASE_ADDR - i * page_size,
> +                               i * page_size + hpage_pmd_size,
> +                               (i + 1) * page_size,
> +                               MREMAP_MAYMOVE | MREMAP_FIXED,
> +                               BASE_ADDR + 2 * hpage_pmd_size);
> +               if (p == MAP_FAILED) {
> +                       perror("mremap+unmap");
> +                       exit(EXIT_FAILURE);
> +               }
> +
> +               p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
> +                               (i + 1) * page_size,
> +                               (i + 1) * page_size + hpage_pmd_size,
> +                               MREMAP_MAYMOVE | MREMAP_FIXED,
> +                               BASE_ADDR - (i + 1) * page_size);
> +               if (p == MAP_FAILED) {
> +                       perror("mremap+alloc");
> +                       exit(EXIT_FAILURE);
> +               }
> +       }
> +
> +       munmap(BASE_ADDR, hpage_pmd_size);
> +       fill_memory(p, 0, hpage_pmd_size);
> +       if (!check_huge(p))
> +               success("OK");
> +       else
> +               fail("Fail");
> +
> +       if (wait_for_scan("Collapse PTE table full of different compound pages", p))
> +               fail("Timeout");
> +       else if (check_huge(p))
> +               success("OK");
> +       else
> +               fail("Fail");
> +
> +       validate_memory(p, 0, hpage_pmd_size);
> +       munmap(p, hpage_pmd_size);
> +}
> +
>  static void collapse_fork(void)
>  {
>         int wstatus;
> @@ -916,6 +973,7 @@ int main(void)
>         collapse_max_ptes_swap();
>         collapse_signle_pte_entry_compound();
>         collapse_full_of_compound();
> +       collapse_compound_extreme();
>         collapse_fork();
>         collapse_fork_compound();
>         collapse_max_ptes_shared();
> --
>  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2020-04-01 19:46 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-27 17:05 [PATCH 0/7] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
2020-03-27 17:05 ` [PATCH 1/7] khugepaged: Add self test Kirill A. Shutemov
2020-03-27 17:05 ` [PATCH 2/7] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
2020-03-27 17:30   ` Zi Yan
2020-03-27 17:46   ` Yang Shi
2020-03-27 17:05 ` [PATCH 3/7] khugepaged: Drain LRU add pagevec to get rid of extra pins Kirill A. Shutemov
2020-03-27 17:34   ` Zi Yan
2020-03-28  0:20     ` Kirill A. Shutemov
2020-03-27 18:10   ` Yang Shi
2020-03-28 12:18     ` Kirill A. Shutemov
2020-03-30 18:30       ` Yang Shi
2020-03-30 21:38         ` Kirill A. Shutemov
2020-03-27 17:05 ` [PATCH 4/7] khugepaged: Allow to callapse a page shared across fork Kirill A. Shutemov
2020-03-27 18:19   ` Zi Yan
2020-03-27 21:31     ` Yang Shi
2020-03-27 21:44       ` Zi Yan
2020-03-27 17:05 ` [PATCH 5/7] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
2020-03-27 18:53   ` Yang Shi
2020-03-28  0:34     ` Kirill A. Shutemov
2020-03-28  1:09       ` Yang Shi
2020-03-28 12:27         ` Kirill A. Shutemov
2020-03-30 18:38           ` Yang Shi
2020-03-27 18:55   ` Zi Yan
2020-03-28  0:39     ` Kirill A. Shutemov
2020-03-28  1:17       ` Zi Yan
2020-03-28 12:33         ` Kirill A. Shutemov
2020-03-30 18:41           ` Yang Shi
2020-03-30 18:50           ` Yang Shi
2020-03-31 14:08             ` Kirill A. Shutemov
2020-04-01 19:45               ` Yang Shi
2020-03-27 20:45   ` Yang Shi
2020-03-28  0:40     ` Kirill A. Shutemov
2020-03-28  1:12       ` Yang Shi
2020-03-27 17:06 ` [PATCH 6/7] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
2020-03-27 20:07   ` Yang Shi
2020-03-28  0:43     ` Kirill A. Shutemov
2020-03-28  1:30       ` Yang Shi
2020-03-27 17:06 ` [PATCH 7/7] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).