* [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics
@ 2020-04-13 12:52 Kirill A. Shutemov
  2020-04-13 12:52 ` [PATCHv3, RESEND 1/8] khugepaged: Add self test Kirill A. Shutemov
                   ` (7 more replies)
  0 siblings, 8 replies; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

The patchset adds a khugepaged selftest (anon-THP only for now), expands
the set of cases khugepaged can handle, and switches anon-THP
copy-on-write handling to 4k.

Please review and consider applying.

v3:
 - Fix handling compound pages in swap cache;
 - Rework swapped vs. referenced check;
 - Fix refcounting for compound pages;
 - Drop Reviewed-by/Acked-by as patchset changed non-trivially;
 - Typos;
v2:
 - Fix race in compound page handling;
 - Add one more test-case for compound page case;
 - Rework LRU add cache draining;
 - Typos;

Kirill A. Shutemov (8):
  khugepaged: Add self test
  khugepaged: Do not stop collapse if less than half PTEs are referenced
  khugepaged: Drain all LRU caches before scanning pages
  khugepaged: Drain LRU add pagevec after swapin
  khugepaged: Allow to collapse a page shared across fork
  khugepaged: Allow to collapse PTE-mapped compound pages
  thp: Change CoW semantics for anon-THP
  khugepaged: Introduce 'max_ptes_shared' tunable

 Documentation/admin-guide/mm/transhuge.rst |   7 +
 include/trace/events/huge_memory.h         |   3 +-
 mm/huge_memory.c                           | 249 +-----
 mm/khugepaged.c                            | 218 +++--
 tools/testing/selftests/vm/Makefile        |   1 +
 tools/testing/selftests/vm/khugepaged.c    | 982 +++++++++++++++++++++
 6 files changed, 1174 insertions(+), 286 deletions(-)
 create mode 100644 tools/testing/selftests/vm/khugepaged.c

-- 
2.26.0



* [PATCHv3, RESEND 1/8] khugepaged: Add self test
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-15 15:39   ` Zi Yan
  2020-04-13 12:52 ` [PATCHv3, RESEND 2/8] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

The test checks if khugepaged is able to recover a huge page where we
expect it to do so. It only covers anon-THP for now.

Currently the test shows a few failures. They are going to be addressed
by the following patches.
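
The test lives in tools/testing/selftests/vm/ and is built together with
the rest of the vm selftests (for instance, via
make -C tools/testing/selftests/vm). It has to run as root: it saves the
current THP and khugepaged sysfs settings, adjusts them for the run and
restores them on exit or on a signal.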

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 tools/testing/selftests/vm/Makefile     |   1 +
 tools/testing/selftests/vm/khugepaged.c | 899 ++++++++++++++++++++++++
 2 files changed, 900 insertions(+)
 create mode 100644 tools/testing/selftests/vm/khugepaged.c

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 7f9a8a8c31da..981d0dc21f9e 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -18,6 +18,7 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += khugepaged
 
 ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64))
 TEST_GEN_FILES += va_128TBswitch
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
new file mode 100644
index 000000000000..34d945e71e2e
--- /dev/null
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -0,0 +1,899 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+
+#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
+
+enum thp_enabled {
+	THP_ALWAYS,
+	THP_MADVISE,
+	THP_NEVER,
+};
+
+static const char *thp_enabled_strings[] = {
+	"always",
+	"madvise",
+	"never",
+	NULL
+};
+
+enum thp_defrag {
+	THP_DEFRAG_ALWAYS,
+	THP_DEFRAG_DEFER,
+	THP_DEFRAG_DEFER_MADVISE,
+	THP_DEFRAG_MADVISE,
+	THP_DEFRAG_NEVER,
+};
+
+static const char *thp_defrag_strings[] = {
+	"always",
+	"defer",
+	"defer+madvise",
+	"madvise",
+	"never",
+	NULL
+};
+
+enum shmem_enabled {
+	SHMEM_ALWAYS,
+	SHMEM_WITHIN_SIZE,
+	SHMEM_ADVISE,
+	SHMEM_NEVER,
+	SHMEM_DENY,
+	SHMEM_FORCE,
+};
+
+static const char *shmem_enabled_strings[] = {
+	"always",
+	"within_size",
+	"advise",
+	"never",
+	"deny",
+	"force",
+	NULL
+};
+
+struct khugepaged_settings {
+	bool defrag;
+	unsigned int alloc_sleep_millisecs;
+	unsigned int scan_sleep_millisecs;
+	unsigned int max_ptes_none;
+	unsigned int max_ptes_swap;
+	unsigned long pages_to_scan;
+};
+
+struct settings {
+	enum thp_enabled thp_enabled;
+	enum thp_defrag thp_defrag;
+	enum shmem_enabled shmem_enabled;
+	bool debug_cow;
+	bool use_zero_page;
+	struct khugepaged_settings khugepaged;
+};
+
+static struct settings default_settings = {
+	.thp_enabled = THP_MADVISE,
+	.thp_defrag = THP_DEFRAG_ALWAYS,
+	.shmem_enabled = SHMEM_NEVER,
+	.debug_cow = 0,
+	.use_zero_page = 0,
+	.khugepaged = {
+		.defrag = 1,
+		.alloc_sleep_millisecs = 10,
+		.scan_sleep_millisecs = 10,
+	},
+};
+
+static struct settings saved_settings;
+static bool skip_settings_restore;
+
+static int exit_status;
+
+static void success(const char *msg)
+{
+	printf(" \e[32m%s\e[0m\n", msg);
+}
+
+static void fail(const char *msg)
+{
+	printf(" \e[31m%s\e[0m\n", msg);
+	exit_status++;
+}
+
+static int read_file(const char *path, char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numread;
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+		return 0;
+
+	numread = read(fd, buf, buflen - 1);
+	if (numread < 1) {
+		close(fd);
+		return 0;
+	}
+
+	buf[numread] = '\0';
+	close(fd);
+
+	return (unsigned int) numread;
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numwritten;
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1)
+		return 0;
+
+	numwritten = write(fd, buf, buflen - 1);
+	close(fd);
+	if (numwritten < 1)
+		return 0;
+
+	return (unsigned int) numwritten;
+}
+
+static int read_string(const char *name, const char *strings[])
+{
+	char path[PATH_MAX];
+	char buf[256];
+	char *c;
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!read_file(path, buf, sizeof(buf))) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+
+	c = strchr(buf, '[');
+	if (!c) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	c++;
+	memmove(buf, c, sizeof(buf) - (c - buf));
+
+	c = strchr(buf, ']');
+	if (!c) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	*c = '\0';
+
+	ret = 0;
+	while (strings[ret]) {
+		if (!strcmp(strings[ret], buf))
+			return ret;
+		ret++;
+	}
+
+	printf("Failed to parse %s\n", name);
+	exit(EXIT_FAILURE);
+}
+
+static void write_string(const char *name, const char *val)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!write_file(path, val, strlen(val) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static const unsigned long read_num(const char *name)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	ret = read_file(path, buf, sizeof(buf));
+	if (!ret) {
+		perror("read_file(read_num)");
+		exit(EXIT_FAILURE);
+	}
+
+	return strtoul(buf, NULL, 10);
+}
+
+static void write_num(const char *name, unsigned long num)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	sprintf(buf, "%ld", num);
+	if (!write_file(path, buf, strlen(buf) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static void write_settings(struct settings *settings)
+{
+	struct khugepaged_settings *khugepaged = &settings->khugepaged;
+
+	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
+	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
+	write_string("shmem_enabled",
+			shmem_enabled_strings[settings->shmem_enabled]);
+	write_num("debug_cow", settings->debug_cow);
+	write_num("use_zero_page", settings->use_zero_page);
+
+	write_num("khugepaged/defrag", khugepaged->defrag);
+	write_num("khugepaged/alloc_sleep_millisecs",
+			khugepaged->alloc_sleep_millisecs);
+	write_num("khugepaged/scan_sleep_millisecs",
+			khugepaged->scan_sleep_millisecs);
+	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
+	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
+}
+
+static void restore_settings(int sig)
+{
+	if (skip_settings_restore)
+		goto out;
+
+	printf("Restore THP and khugepaged settings...");
+	write_settings(&saved_settings);
+	success("OK");
+	if (sig)
+		exit(EXIT_FAILURE);
+out:
+	exit(exit_status);
+}
+
+static void save_settings(void)
+{
+	printf("Save THP and khugepaged settings...");
+	saved_settings = (struct settings) {
+		.thp_enabled = read_string("enabled", thp_enabled_strings),
+		.thp_defrag = read_string("defrag", thp_defrag_strings),
+		.shmem_enabled =
+			read_string("shmem_enabled", shmem_enabled_strings),
+		.debug_cow = read_num("debug_cow"),
+		.use_zero_page = read_num("use_zero_page"),
+	};
+	saved_settings.khugepaged = (struct khugepaged_settings) {
+		.defrag = read_num("khugepaged/defrag"),
+		.alloc_sleep_millisecs =
+			read_num("khugepaged/alloc_sleep_millisecs"),
+		.scan_sleep_millisecs =
+			read_num("khugepaged/scan_sleep_millisecs"),
+		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
+		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
+	};
+	success("OK");
+
+	signal(SIGTERM, restore_settings);
+	signal(SIGINT, restore_settings);
+	signal(SIGHUP, restore_settings);
+	signal(SIGQUIT, restore_settings);
+}
+
+static void adjust_settings(void)
+{
+
+	printf("Adjust settings...");
+	write_settings(&default_settings);
+	success("OK");
+}
+
+#define CHECK_HUGE_FMT "sed -ne " \
+	"'/^%lx/,/^AnonHugePages/{/^AnonHugePages:\\s*%ld kB/ q1}' " \
+	"/proc/%d/smaps"
+
+static bool check_huge(void *p)
+{
+	char *cmd;
+	int ret;
+
+	ret = asprintf(&cmd, CHECK_HUGE_FMT,
+			(unsigned long)p, hpage_pmd_size >> 10, getpid());
+	if (ret < 0) {
+		perror("asprintf(CHECK_FMT)");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = system(cmd);
+	free(cmd);
+	if (ret < 0 || !WIFEXITED(ret)) {
+		perror("system(check_huge)");
+		exit(EXIT_FAILURE);
+	}
+
+	return WEXITSTATUS(ret);
+}
+
+#define CHECK_SWAP_FMT "sed -ne " \
+	"'/^%lx/,/^Swap:/{/^Swap:\\s*%ld kB/ q1}' " \
+	"/proc/%d/smaps"
+
+static bool check_swap(void *p, unsigned long size)
+{
+	char *cmd;
+	int ret;
+
+	ret = asprintf(&cmd, CHECK_SWAP_FMT,
+			(unsigned long)p, size >> 10, getpid());
+	if (ret < 0) {
+		perror("asprintf(CHECK_SWAP)");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = system(cmd);
+	free(cmd);
+	if (ret < 0 || !WIFEXITED(ret)) {
+		perror("system(check_swap)");
+		exit(EXIT_FAILURE);
+	}
+
+	return WEXITSTATUS(ret);
+}
+
+static void *alloc_mapping(void)
+{
+	void *p;
+
+	p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (p != BASE_ADDR) {
+		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+		exit(EXIT_FAILURE);
+	}
+
+	return p;
+}
+
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+	int i;
+
+	for (i = start / page_size; i < end / page_size; i++)
+		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
+}
+
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+	int i;
+
+	for (i = start / page_size; i < end / page_size; i++) {
+		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
+			printf("Page %d is corrupted: %#x\n",
+					i, p[i * page_size / sizeof(*p)]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+#define TICK 500000
+static bool wait_for_scan(const char *msg, char *p)
+{
+	int full_scans;
+	int timeout = 6; /* 3 seconds */
+
+	/* Sanity check */
+	if (check_huge(p)) {
+		printf("Unexpected huge page\n");
+		exit(EXIT_FAILURE);
+	}
+
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+
+	/* Wait until the second full_scan completed */
+	full_scans = read_num("khugepaged/full_scans") + 2;
+
+	printf("%s...", msg);
+	while (timeout--) {
+		if (check_huge(p))
+			break;
+		if (read_num("khugepaged/full_scans") >= full_scans)
+			break;
+		printf(".");
+		usleep(TICK);
+	}
+
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+	return timeout == -1;
+}
+
+static void alloc_at_fault(void)
+{
+	struct settings settings = default_settings;
+	char *p;
+
+	settings.thp_enabled = THP_ALWAYS;
+	write_settings(&settings);
+
+	p = alloc_mapping();
+	*p = 1;
+	printf("Allocate huge page on fault...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	write_settings(&default_settings);
+
+	madvise(p, page_size, MADV_DONTNEED);
+	printf("Split huge PMD on MADV_DONTNEED...");
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+	fill_memory(p, 0, hpage_pmd_size);
+	if (wait_for_scan("Collapse fully populated PTE table", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_empty(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+	if (wait_for_scan("Do not collapse empty PTE table", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		fail("Fail");
+	else
+		success("OK");
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+	fill_memory(p, 0, page_size);
+	if (wait_for_scan("Collapse PTE table with single PTE entry present", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_none(void)
+{
+	int max_ptes_none = hpage_pmd_nr / 2;
+	struct settings settings = default_settings;
+	void *p;
+
+	settings.khugepaged.max_ptes_none = max_ptes_none;
+	write_settings(&settings);
+
+	p = alloc_mapping();
+
+	fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+	if (wait_for_scan("Do not collapse with max_ptes_none exeeded", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		fail("Fail");
+	else
+		success("OK");
+	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+
+	fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+	if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+
+	munmap(p, hpage_pmd_size);
+	write_settings(&default_settings);
+}
+
+static void collapse_swapin_single_pte(void)
+{
+	void *p;
+	p = alloc_mapping();
+	fill_memory(p, 0, hpage_pmd_size);
+
+	printf("Swapout one page...");
+	if (madvise(p, page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	if (wait_for_scan("Collapse with swaping in single PTE entry", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+out:
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_swap(void)
+{
+	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
+	void *p;
+
+	p = alloc_mapping();
+
+	fill_memory(p, 0, hpage_pmd_size);
+	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	if (wait_for_scan("Do not collapse with max_ptes_swap exeeded", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		fail("Fail");
+	else
+		success("OK");
+	validate_memory(p, 0, hpage_pmd_size);
+
+	fill_memory(p, 0, hpage_pmd_size);
+	printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr);
+	if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, max_ptes_swap * page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+out:
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry_compound(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+	printf("Split huge page leaving single PTE mapping compount page...");
+	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	if (wait_for_scan("Collapse PTE table with single PTE mapping compount page", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full_of_compound(void)
+{
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Split huge page leaving single PTE page table full of compount pages...");
+	madvise(p, page_size, MADV_NOHUGEPAGE);
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	if (wait_for_scan("Collapse PTE table full of compound pages", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_compound_extreme(void)
+{
+	void *p;
+	int i;
+
+	p = alloc_mapping();
+	for (i = 0; i < hpage_pmd_nr; i++) {
+		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+				i + 1, hpage_pmd_nr);
+
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+		fill_memory(BASE_ADDR, 0, hpage_pmd_size);
+		if (!check_huge(BASE_ADDR)) {
+			printf("Failed to allocate huge page\n");
+			exit(EXIT_FAILURE);
+		}
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+		p = mremap(BASE_ADDR - i * page_size,
+				i * page_size + hpage_pmd_size,
+				(i + 1) * page_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR + 2 * hpage_pmd_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+unmap");
+			exit(EXIT_FAILURE);
+		}
+
+		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+				(i + 1) * page_size,
+				(i + 1) * page_size + hpage_pmd_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR - (i + 1) * page_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+alloc");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	munmap(BASE_ADDR, hpage_pmd_size);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	if (wait_for_scan("Collapse PTE table full of different compound pages", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_fork(void)
+{
+	int wstatus;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate small page...");
+	fill_memory(p, 0, page_size);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share small page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		fill_memory(p, page_size, 2 * page_size);
+
+		if (wait_for_scan("Collapse PTE table with single page shared with parent process", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		validate_memory(p, 0, page_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has small page...");
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_fork_compound(void)
+{
+	int wstatus;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share huge page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Split huge page PMD in child process...");
+		madvise(p, page_size, MADV_NOHUGEPAGE);
+		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+		fill_memory(p, 0, page_size);
+
+		if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		validate_memory(p, 0, hpage_pmd_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+int main(void)
+{
+	setbuf(stdout, NULL);
+
+	page_size = getpagesize();
+	hpage_pmd_size = read_num("hpage_pmd_size");
+	hpage_pmd_nr = hpage_pmd_size / page_size;
+
+	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
+	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
+
+	save_settings();
+	adjust_settings();
+
+	alloc_at_fault();
+	collapse_full();
+	collapse_empty();
+	collapse_single_pte_entry();
+	collapse_max_ptes_none();
+	collapse_swapin_single_pte();
+	collapse_max_ptes_swap();
+	collapse_single_pte_entry_compound();
+	collapse_full_of_compound();
+	collapse_compound_extreme();
+	collapse_fork();
+	collapse_fork_compound();
+
+	restore_settings(0);
+}
-- 
2.26.0



* [PATCHv3, RESEND 2/8] khugepaged: Do not stop collapse if less than half PTEs are referenced
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
  2020-04-13 12:52 ` [PATCHv3, RESEND 1/8] khugepaged: Add self test Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-15 20:31   ` Yang Shi
  2020-04-13 12:52 ` [PATCHv3, RESEND 3/8] khugepaged: Drain all LRU caches before scanning pages Kirill A. Shutemov
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

__collapse_huge_page_swapin() checks the number of referenced PTEs to
decide if the memory range is hot enough to justify swapin.

We have a few problems with this approach:

 - It is way too late: we can do the check much earlier and save time.
   khugepaged_scan_pmd() already knows if we have any pages to swap in
   and the number of referenced pages.

 - It stops the collapse altogether if there are not enough referenced
   pages, not just the swap-in.

Fix it by making the right check early. We can also avoid the additional
page table scan if khugepaged_scan_pmd() hasn't found any swap entries.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Fixes: 0db501f7a34c ("mm, thp: convert from optimistic swapin collapsing to conservative")
---
 mm/khugepaged.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 99bab7e4d05b..5968ec5ddd6b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -902,11 +902,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		.pgoff = linear_page_index(vma, address),
 	};
 
-	/* we only decide to swapin, if there is enough young ptes */
-	if (referenced < HPAGE_PMD_NR/2) {
-		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
-		return false;
-	}
 	vmf.pte = pte_offset_map(pmd, address);
 	for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
 			vmf.pte++, vmf.address += PAGE_SIZE) {
@@ -946,7 +941,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 static void collapse_huge_page(struct mm_struct *mm,
 				   unsigned long address,
 				   struct page **hpage,
-				   int node, int referenced)
+				   int node, int referenced, int unmapped)
 {
 	pmd_t *pmd, _pmd;
 	pte_t *pte;
@@ -1003,7 +998,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * If it fails, we release mmap_sem and jump out_nolock.
 	 * Continuing to collapse causes inconsistency.
 	 */
-	if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
+	if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+				pmd, referenced)) {
 		mem_cgroup_cancel_charge(new_page, memcg, true);
 		up_read(&mm->mmap_sem);
 		goto out_nolock;
@@ -1214,22 +1210,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced++;
 	}
-	if (writable) {
-		if (referenced) {
+	if (!writable) {
+		result = SCAN_PAGE_RO;
+	} else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+		result = SCAN_LACK_REFERENCED_PAGE;
+	} else {
 			result = SCAN_SUCCEED;
 			ret = 1;
-		} else {
-			result = SCAN_LACK_REFERENCED_PAGE;
-		}
-	} else {
-		result = SCAN_PAGE_RO;
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret) {
 		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, node, referenced);
+		collapse_huge_page(mm, address, hpage, node,
+				referenced, unmapped);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
-- 
2.26.0



* [PATCHv3, RESEND 3/8] khugepaged: Drain all LRU caches before scanning pages
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
  2020-04-13 12:52 ` [PATCHv3, RESEND 1/8] khugepaged: Add self test Kirill A. Shutemov
  2020-04-13 12:52 ` [PATCHv3, RESEND 2/8] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-15 20:31   ` Yang Shi
  2020-04-13 12:52 ` [PATCHv3, RESEND 4/8] khugepaged: Drain LRU add pagevec after swapin Kirill A. Shutemov
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

Having a page in the LRU add cache offsets the page refcount and gives a
false negative on PageLRU(). It reduces the collapse success rate.

Drain all LRU add caches before scanning. It happens relatively rarely
and should not disturb the system too much.
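
For illustration, these are roughly the checks in khugepaged_scan_pmd()
that a page still sitting in a per-CPU LRU-add pagevec trips (a sketch of
the existing code, not part of this patch; __collapse_huge_page_isolate()
has equivalent ones):

	if (!PageLRU(page)) {		/* not yet drained to the LRU */
		result = SCAN_PAGE_LRU;
		goto out_unmap;
	}

	/* the pagevec's reference makes the page look externally pinned */
	if (page_count(page) != 1 + PageSwapCache(page)) {
		result = SCAN_PAGE_COUNT;
		goto out_unmap;
	}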

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5968ec5ddd6b..ee66c140c2d6 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2059,6 +2059,8 @@ static void khugepaged_do_scan(void)
 
 	barrier(); /* write khugepaged_pages_to_scan to local stack */
 
+	lru_add_drain_all();
+
 	while (progress < pages) {
 		if (!khugepaged_prealloc_page(&hpage, &wait))
 			break;
-- 
2.26.0



* [PATCHv3, RESEND 4/8] khugepaged: Drain LRU add pagevec after swapin
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (2 preceding siblings ...)
  2020-04-13 12:52 ` [PATCHv3, RESEND 3/8] khugepaged: Drain all LRU caches before scanning pages Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-15 20:32   ` Yang Shi
  2020-04-13 12:52 ` [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

__collapse_huge_page_isolate() may fail due to an extra pin in the LRU
add pagevec. It's pretty common for the swapin case: we swap pages in
just to fail the collapse due to the extra pin.

Drain the LRU add pagevec on successful swapin.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ee66c140c2d6..e3e41c2768d8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -934,6 +934,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 	}
 	vmf.pte--;
 	pte_unmap(vmf.pte);
+
+	/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
+	if (swapped_in)
+		lru_add_drain();
+
 	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
 	return true;
 }
-- 
2.26.0



* [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (3 preceding siblings ...)
  2020-04-13 12:52 ` [PATCHv3, RESEND 4/8] khugepaged: Drain LRU add pagevec after swapin Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-13 20:48   ` John Hubbard
                     ` (2 more replies)
  2020-04-13 12:52 ` [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
                   ` (2 subsequent siblings)
  7 siblings, 3 replies; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

The page can be included in the collapse as long as it doesn't have extra
pins (from GUP or otherwise).

The logic that checks the refcount is moved to a separate function. Note
that the function is ready to deal with compound pages. This is
preparation for the following patch.
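
For example (the order-0 case), an anon page mapped by the parent and one
child after fork() and sitting in the swap cache has:

	total_mapcount(page) == 2	/* two PTE mappings */
	PageSwapCache(page)  -> +1	/* compound_nr() == 1 for order-0 */
	expected_refcount    == 3

Any page_count() above that indicates a GUP or other external pin and the
page is skipped with SCAN_PAGE_COUNT.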

VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
invariant it checks is no longer valid: the source can be mapped
multiple times now.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e3e41c2768d8..f9864644c3b7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
 	}
 }
 
+static bool is_refcount_suitable(struct page *page)
+{
+	int expected_refcount, refcount;
+
+	refcount = page_count(page);
+	expected_refcount = total_mapcount(page);
+	if (PageSwapCache(page))
+		expected_refcount += compound_nr(page);
+
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
+		pr_err("expected_refcount: %d, refcount: %d\n",
+				expected_refcount, refcount);
+		dump_page(page, "Unexpected refcount");
+	}
+
+	return page_count(page) == expected_refcount;
+}
+
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte)
@@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		}
 
 		/*
-		 * cannot use mapcount: can't collapse if there's a gup pin.
-		 * The page must only be referenced by the scanned process
-		 * and page swap cache.
+		 * Check if the page has any GUP (or other external) pins.
+		 *
+		 * The page table that maps the page has been already unlinked
+		 * from the page table tree and this process cannot get
+		 * an additional pin on the page.
+		 *
+		 * New pins can come later if the page is shared across fork,
+		 * but not for this process. It is fine. The other process
+		 * cannot write to the page, only trigger CoW.
 		 */
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		if (!is_refcount_suitable(page)) {
 			unlock_page(page);
 			result = SCAN_PAGE_COUNT;
 			goto out;
@@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 		} else {
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
-			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
 			release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to
@@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
-		/*
-		 * cannot use mapcount: can't collapse if there's a gup pin.
-		 * The page must only be referenced by the scanned process
-		 * and page swap cache.
-		 */
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		/* Check if the page has any GUP (or other external) pins */
+		if (!is_refcount_suitable(page)) {
 			result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}
-- 
2.26.0



* [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (4 preceding siblings ...)
  2020-04-13 12:52 ` [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-15 20:46   ` Yang Shi
  2020-04-15 21:44   ` Andrew Morton
  2020-04-13 12:52 ` [PATCHv3, RESEND 7/8] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
  2020-04-13 12:52 ` [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov
  7 siblings, 2 replies; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

We can collapse PTE-mapped compound pages. We only need to avoid handling
them more than once: lock/unlock the page only once if it's present in
the PMD range multiple times, as it is handled at the compound level. The
same goes for LRU isolation and putback.
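
For example, after a partial MADV_DONTNEED splits the PMD of an anon THP,
every remaining PTE in the range points into the same compound page: the
head page gets locked and isolated from the LRU once, is remembered on
the local compound_pagelist, and later PTEs that resolve to the same head
skip straight to the per-PTE referenced/writable accounting.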

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 99 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 65 insertions(+), 34 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f9864644c3b7..11d500396d85 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -515,17 +515,30 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
-	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
+	mod_node_page_state(page_pgdat(page),
+			NR_ISOLATED_ANON + page_is_file_cache(page),
+			-compound_nr(page));
 	unlock_page(page);
 	putback_lru_page(page);
 }
 
-static void release_pte_pages(pte_t *pte, pte_t *_pte)
+static void release_pte_pages(pte_t *pte, pte_t *_pte,
+		struct list_head *compound_pagelist)
 {
+	struct page *page, *tmp;
+
 	while (--_pte >= pte) {
 		pte_t pteval = *_pte;
-		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
-			release_pte_page(pte_page(pteval));
+
+		page = pte_page(pteval);
+		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
+				!PageCompound(page))
+			release_pte_page(page);
+	}
+
+	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
+		list_del(&page->lru);
+		release_pte_page(page);
 	}
 }
 
@@ -549,7 +562,8 @@ static bool is_refcount_suitable(struct page *page)
 
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
-					pte_t *pte)
+					pte_t *pte,
+					struct list_head *compound_pagelist)
 {
 	struct page *page = NULL;
 	pte_t *_pte;
@@ -579,13 +593,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 
-		/* TODO: teach khugepaged to collapse THP mapped with pte */
+		VM_BUG_ON_PAGE(!PageAnon(page), page);
+
 		if (PageCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
-			goto out;
-		}
+			struct page *p;
+			page = compound_head(page);
 
-		VM_BUG_ON_PAGE(!PageAnon(page), page);
+			/*
+			 * Check if we have dealt with the compound page
+			 * already
+			 */
+			list_for_each_entry(p, compound_pagelist, lru) {
+				if (page == p)
+					goto next;
+			}
+		}
 
 		/*
 		 * We can do it before isolate_lru_page because the
@@ -614,19 +636,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_PAGE_COUNT;
 			goto out;
 		}
-		if (pte_write(pteval)) {
-			writable = true;
-		} else {
-			if (PageSwapCache(page) &&
-			    !reuse_swap_page(page, NULL)) {
-				unlock_page(page);
-				result = SCAN_SWAP_CACHE_PAGE;
-				goto out;
-			}
+		if (!pte_write(pteval) && PageSwapCache(page) &&
+				!reuse_swap_page(page, NULL)) {
 			/*
-			 * Page is not in the swap cache. It can be collapsed
-			 * into a THP.
+			 * Page is in the swap cache and cannot be re-used.
+			 * It cannot be collapsed into a THP.
 			 */
+			unlock_page(page);
+			result = SCAN_SWAP_CACHE_PAGE;
+			goto out;
 		}
 
 		/*
@@ -638,16 +656,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_DEL_PAGE_LRU;
 			goto out;
 		}
-		inc_node_page_state(page,
-				NR_ISOLATED_ANON + page_is_file_cache(page));
+		mod_node_page_state(page_pgdat(page),
+				NR_ISOLATED_ANON + page_is_file_cache(page),
+				compound_nr(page));
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
+		if (PageCompound(page))
+			list_add_tail(&page->lru, compound_pagelist);
+next:
 		/* There should be enough young pte to collapse the page */
 		if (pte_young(pteval) ||
 		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced++;
+
+		if (pte_write(pteval))
+			writable = true;
 	}
 	if (likely(writable)) {
 		if (likely(referenced)) {
@@ -661,7 +686,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	}
 
 out:
-	release_pte_pages(pte, _pte);
+	release_pte_pages(pte, _pte, compound_pagelist);
 	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
 					    referenced, writable, result);
 	return 0;
@@ -670,13 +695,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 				      struct vm_area_struct *vma,
 				      unsigned long address,
-				      spinlock_t *ptl)
+				      spinlock_t *ptl,
+				      struct list_head *compound_pagelist)
 {
+	struct page *src_page, *tmp;
 	pte_t *_pte;
 	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
 				_pte++, page++, address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
-		struct page *src_page;
 
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			clear_user_highpage(page, address);
@@ -696,7 +722,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 		} else {
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
-			release_pte_page(src_page);
+			if (!PageCompound(src_page))
+				release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to
 			 * be disabled to update the per-cpu stats
@@ -713,6 +740,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 			free_page_and_swap_cache(src_page);
 		}
 	}
+
+	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
+		list_del(&src_page->lru);
+		release_pte_page(src_page);
+	}
 }
 
 static void khugepaged_alloc_sleep(void)
@@ -971,6 +1003,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 				   struct page **hpage,
 				   int node, int referenced, int unmapped)
 {
+	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
 	pte_t *pte;
 	pgtable_t pgtable;
@@ -1071,7 +1104,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	mmu_notifier_invalidate_range_end(&range);
 
 	spin_lock(pte_ptl);
-	isolated = __collapse_huge_page_isolate(vma, address, pte);
+	isolated = __collapse_huge_page_isolate(vma, address, pte,
+			&compound_pagelist);
 	spin_unlock(pte_ptl);
 
 	if (unlikely(!isolated)) {
@@ -1096,7 +1130,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 */
 	anon_vma_unlock_write(vma->anon_vma);
 
-	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+			&compound_pagelist);
 	pte_unmap(pte);
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);
@@ -1193,11 +1228,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
-		/* TODO: teach khugepaged to collapse THP mapped with pte */
-		if (PageCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
-			goto out_unmap;
-		}
+		page = compound_head(page);
 
 		/*
 		 * Record which node the original page is from and save this
-- 
2.26.0



* [PATCHv3, RESEND 7/8] thp: Change CoW semantics for anon-THP
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (5 preceding siblings ...)
  2020-04-13 12:52 ` [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-15 21:19   ` Yang Shi
  2020-04-13 12:52 ` [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov
  7 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

Currently we have different copy-on-write semantics for anon- and
file-THP. For anon-THP we try to allocate a huge page on the write fault,
while for file-THP we split the PMD and allocate a 4k page.

Arguably, the file-THP semantics are more desirable: we don't necessarily
want to unshare the full PMD range from the parent on the first access.
This is the primary reason THP is unusable for some workloads, like Redis.

The original THP refcounting didn't allow PTE-mapped compound pages, so
we had no option but to allocate a huge page on CoW (with fallback to
512 4k pages).

The current refcounting doesn't have such limitations and we can cut a
lot of complex code out of the fault path.

khugepaged is now able to recover THP from such ranges if the
configuration allows.
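
In practice, a write fault on a PMD-mapped THP that cannot be reused now
simply splits the PMD (__split_huge_pmd()) and returns VM_FAULT_FALLBACK;
the fault is retried at the PTE level, where the regular CoW path copies
only the single 4k page that was actually written.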

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/huge_memory.c | 249 +++++------------------------------------------
 1 file changed, 25 insertions(+), 224 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 24ad53b4dfc0..25b84cc0f17d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
 	spin_unlock(vmf->ptl);
 }
 
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
-			pmd_t orig_pmd, struct page *page)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	struct mem_cgroup *memcg;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-	vm_fault_t ret = 0;
-	struct page **pages;
-	struct mmu_notifier_range range;
-
-	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
-			      GFP_KERNEL);
-	if (unlikely(!pages)) {
-		ret |= VM_FAULT_OOM;
-		goto out;
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
-					       vmf->address, page_to_nid(page));
-		if (unlikely(!pages[i] ||
-			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
-				     GFP_KERNEL, &memcg, false))) {
-			if (pages[i])
-				put_page(pages[i]);
-			while (--i >= 0) {
-				memcg = (void *)page_private(pages[i]);
-				set_page_private(pages[i], 0);
-				mem_cgroup_cancel_charge(pages[i], memcg,
-						false);
-				put_page(pages[i]);
-			}
-			kfree(pages);
-			ret |= VM_FAULT_OOM;
-			goto out;
-		}
-		set_page_private(pages[i], (unsigned long)memcg);
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		copy_user_highpage(pages[i], page + i,
-				   haddr + PAGE_SIZE * i, vma);
-		__SetPageUptodate(pages[i]);
-		cond_resched();
-	}
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-
-	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_free_pages;
-	VM_BUG_ON_PAGE(!PageHead(page), page);
-
-	/*
-	 * Leave pmd empty until pte is filled note we must notify here as
-	 * concurrent CPU thread might write to new page before the call to
-	 * mmu_notifier_invalidate_range_end() happens which can lead to a
-	 * device seeing memory write in different order than CPU.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
-	pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t entry;
-		entry = mk_pte(pages[i], vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
-		mem_cgroup_commit_charge(pages[i], memcg, false, false);
-		lru_cache_add_active_or_unevictable(pages[i], vma);
-		vmf->pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*vmf->pte));
-		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
-		pte_unmap(vmf->pte);
-	}
-	kfree(pages);
-
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
-	page_remove_rmap(page, true);
-	spin_unlock(vmf->ptl);
-
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(&range);
-
-	ret |= VM_FAULT_WRITE;
-	put_page(page);
-
-out:
-	return ret;
-
-out_free_pages:
-	spin_unlock(vmf->ptl);
-	mmu_notifier_invalidate_range_end(&range);
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		mem_cgroup_cancel_charge(pages[i], memcg, false);
-		put_page(pages[i]);
-	}
-	kfree(pages);
-	goto out;
-}
-
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct page *page = NULL, *new_page;
-	struct mem_cgroup *memcg;
+	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	struct mmu_notifier_range range;
-	gfp_t huge_gfp;			/* for allocation and charge */
-	vm_fault_t ret = 0;
 
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
 	if (is_huge_zero_pmd(orig_pmd))
-		goto alloc;
+		goto fallback;
+
 	spin_lock(vmf->ptl);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_unlock;
+
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+		spin_unlock(vmf->ptl);
+		return 0;
+	}
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-	/*
-	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part.
-	 */
+
+	/* Lock page for reuse_swap_page() */
 	if (!trylock_page(page)) {
 		get_page(page);
 		spin_unlock(vmf->ptl);
 		lock_page(page);
 		spin_lock(vmf->ptl);
 		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+			spin_unlock(vmf->ptl);
 			unlock_page(page);
 			put_page(page);
-			goto out_unlock;
+			return 0;
 		}
 		put_page(page);
 	}
+
+	/*
+	 * We can only reuse the page if nobody else maps the huge page or it's
+	 * part.
+	 */
 	if (reuse_swap_page(page, NULL)) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
+		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		ret |= VM_FAULT_WRITE;
 		unlock_page(page);
-		goto out_unlock;
-	}
-	unlock_page(page);
-	get_page(page);
-	spin_unlock(vmf->ptl);
-alloc:
-	if (__transparent_hugepage_enabled(vma) &&
-	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
-	} else
-		new_page = NULL;
-
-	if (likely(new_page)) {
-		prep_transhuge_page(new_page);
-	} else {
-		if (!page) {
-			split_huge_pmd(vma, vmf->pmd, vmf->address);
-			ret |= VM_FAULT_FALLBACK;
-		} else {
-			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
-			if (ret & VM_FAULT_OOM) {
-				split_huge_pmd(vma, vmf->pmd, vmf->address);
-				ret |= VM_FAULT_FALLBACK;
-			}
-			put_page(page);
-		}
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
-					huge_gfp, &memcg, true))) {
-		put_page(new_page);
-		split_huge_pmd(vma, vmf->pmd, vmf->address);
-		if (page)
-			put_page(page);
-		ret |= VM_FAULT_FALLBACK;
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	count_vm_event(THP_FAULT_ALLOC);
-	count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
-	if (!page)
-		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
-	else
-		copy_user_huge_page(new_page, page, vmf->address,
-				    vma, HPAGE_PMD_NR);
-	__SetPageUptodate(new_page);
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-
-	spin_lock(vmf->ptl);
-	if (page)
-		put_page(page);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
 		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(new_page, memcg, true);
-		put_page(new_page);
-		goto out_mn;
-	} else {
-		pmd_t entry;
-		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-		page_add_new_anon_rmap(new_page, vma, haddr, true);
-		mem_cgroup_commit_charge(new_page, memcg, false, true);
-		lru_cache_add_active_or_unevictable(new_page, vma);
-		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		if (!page) {
-			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		} else {
-			VM_BUG_ON_PAGE(!PageHead(page), page);
-			page_remove_rmap(page, true);
-			put_page(page);
-		}
-		ret |= VM_FAULT_WRITE;
+		return VM_FAULT_WRITE;
 	}
+
+	unlock_page(page);
 	spin_unlock(vmf->ptl);
-out_mn:
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(&range);
-out:
-	return ret;
-out_unlock:
-	spin_unlock(vmf->ptl);
-	return ret;
+fallback:
+	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+	return VM_FAULT_FALLBACK;
 }
 
 /*
-- 
2.26.0



* [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable
  2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
                   ` (6 preceding siblings ...)
  2020-04-13 12:52 ` [PATCHv3, RESEND 7/8] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
@ 2020-04-13 12:52 ` Kirill A. Shutemov
  2020-04-15 21:25   ` Yang Shi
  7 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-13 12:52 UTC (permalink / raw)
  To: akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel, Kirill A. Shutemov

'max_ptes_shared' specifies how many pages can be shared across multiple
processes. Exceeding the number would block the collapse::

	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared

A higher value may increase memory footprint for some workloads.

By default, at least half of the pages have to be unshared.
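
The default is HPAGE_PMD_NR / 2; with 4k base pages and 2M THP (512 PTEs
per PMD) that is 256, so a candidate range where more than 256 of the
pages are also mapped by another process will not be collapsed.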

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 Documentation/admin-guide/mm/transhuge.rst |  7 ++
 include/trace/events/huge_memory.h         |  3 +-
 mm/khugepaged.c                            | 52 ++++++++++++--
 tools/testing/selftests/vm/khugepaged.c    | 83 ++++++++++++++++++++++
 4 files changed, 140 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index bd5714547cee..a10f6a50d48f 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being
 collapsed, resulting fewer pages being collapsed into
 THPs, and lower memory access performance.
 
+``max_ptes_shared`` specifies how many pages can be shared across multiple
+processes. Exceeding the number would block the collapse::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
+
+A higher value may increase memory footprint for some workloads.
+
 Boot parameter
 ==============
 
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index d82a0f4e824d..53532f5925c3 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -12,6 +12,8 @@
 	EM( SCAN_SUCCEED,		"succeeded")			\
 	EM( SCAN_PMD_NULL,		"pmd_null")			\
 	EM( SCAN_EXCEED_NONE_PTE,	"exceed_none_pte")		\
+	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
+	EM( SCAN_EXCEED_SHARED_PTE,	"exceed_shared_pte")		\
 	EM( SCAN_PTE_NON_PRESENT,	"pte_non_present")		\
 	EM( SCAN_PAGE_RO,		"no_writable_page")		\
 	EM( SCAN_LACK_REFERENCED_PAGE,	"lack_referenced_page")		\
@@ -30,7 +32,6 @@
 	EM( SCAN_DEL_PAGE_LRU,		"could_not_delete_page_from_lru")\
 	EM( SCAN_ALLOC_HUGE_PAGE_FAIL,	"alloc_huge_page_failed")	\
 	EM( SCAN_CGROUP_CHARGE_FAIL,	"ccgroup_charge_failed")	\
-	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
 	EM( SCAN_TRUNCATED,		"truncated")			\
 	EMe(SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 11d500396d85..0730014620e2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -28,6 +28,8 @@ enum scan_result {
 	SCAN_SUCCEED,
 	SCAN_PMD_NULL,
 	SCAN_EXCEED_NONE_PTE,
+	SCAN_EXCEED_SWAP_PTE,
+	SCAN_EXCEED_SHARED_PTE,
 	SCAN_PTE_NON_PRESENT,
 	SCAN_PAGE_RO,
 	SCAN_LACK_REFERENCED_PAGE,
@@ -46,7 +48,6 @@ enum scan_result {
 	SCAN_DEL_PAGE_LRU,
 	SCAN_ALLOC_HUGE_PAGE_FAIL,
 	SCAN_CGROUP_CHARGE_FAIL,
-	SCAN_EXCEED_SWAP_PTE,
 	SCAN_TRUNCATED,
 	SCAN_PAGE_HAS_PRIVATE,
 };
@@ -71,6 +72,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  */
 static unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
+static unsigned int khugepaged_max_ptes_shared __read_mostly;
 
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -290,15 +292,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
 	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
 	       khugepaged_max_ptes_swap_store);
 
+static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
+					     struct kobj_attribute *attr,
+					     char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+}
+
+static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
+					      struct kobj_attribute *attr,
+					      const char *buf, size_t count)
+{
+	int err;
+	unsigned long max_ptes_shared;
+
+	err  = kstrtoul(buf, 10, &max_ptes_shared);
+	if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+		return -EINVAL;
+
+	khugepaged_max_ptes_shared = max_ptes_shared;
+
+	return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_shared_attr =
+	__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
+	       khugepaged_max_ptes_shared_store);
+
 static struct attribute *khugepaged_attr[] = {
 	&khugepaged_defrag_attr.attr,
 	&khugepaged_max_ptes_none_attr.attr,
+	&khugepaged_max_ptes_swap_attr.attr,
+	&khugepaged_max_ptes_shared_attr.attr,
 	&pages_to_scan_attr.attr,
 	&pages_collapsed_attr.attr,
 	&full_scans_attr.attr,
 	&scan_sleep_millisecs_attr.attr,
 	&alloc_sleep_millisecs_attr.attr,
-	&khugepaged_max_ptes_swap_attr.attr,
 	NULL,
 };
 
@@ -360,6 +390,7 @@ int __init khugepaged_init(void)
 	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
 	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
 	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
 
 	return 0;
 }
@@ -567,7 +598,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
 	struct page *page = NULL;
 	pte_t *_pte;
-	int none_or_zero = 0, result = 0, referenced = 0;
+	int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
 	bool writable = false;
 
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -595,6 +626,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 
+		if (page_mapcount(page) > 1 &&
+				++shared > khugepaged_max_ptes_shared) {
+			result = SCAN_EXCEED_SHARED_PTE;
+			goto out;
+		}
+
 		if (PageCompound(page)) {
 			struct page *p;
 			page = compound_head(page);
@@ -1178,7 +1215,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
+	int ret = 0, result = 0, referenced = 0;
+	int none_or_zero = 0, shared = 0;
 	struct page *page = NULL;
 	unsigned long _address;
 	spinlock_t *ptl;
@@ -1228,6 +1266,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
+		if (page_mapcount(page) > 1 &&
+				++shared > khugepaged_max_ptes_shared) {
+			result = SCAN_EXCEED_SHARED_PTE;
+			goto out_unmap;
+		}
+
 		page = compound_head(page);
 
 		/*
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
index 34d945e71e2e..7c7283cc7dcc 100644
--- a/tools/testing/selftests/vm/khugepaged.c
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -77,6 +77,7 @@ struct khugepaged_settings {
 	unsigned int scan_sleep_millisecs;
 	unsigned int max_ptes_none;
 	unsigned int max_ptes_swap;
+	unsigned int max_ptes_shared;
 	unsigned long pages_to_scan;
 };
 
@@ -276,6 +277,7 @@ static void write_settings(struct settings *settings)
 			khugepaged->scan_sleep_millisecs);
 	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
 	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
 	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
 }
 
@@ -312,6 +314,7 @@ static void save_settings(void)
 			read_num("khugepaged/scan_sleep_millisecs"),
 		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
 		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
 		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
 	};
 	success("OK");
@@ -843,12 +846,90 @@ static void collapse_fork_compound(void)
 			fail("Fail");
 		fill_memory(p, 0, page_size);
 
+		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
 		if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
 			fail("Timeout");
 		else if (check_huge(p))
 			success("OK");
 		else
 			fail("Fail");
+		write_num("khugepaged/max_ptes_shared",
+				default_settings.khugepaged.max_ptes_shared);
+
+		validate_memory(p, 0, hpage_pmd_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_shared()
+{
+	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+	int wstatus;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share huge page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Trigger CoW in %d of %d...",
+				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		if (wait_for_scan("Do not collapse with max_ptes_shared exeeded", p))
+			fail("Timeout");
+		else if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Trigger CoW in %d of %d...",
+				hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+
+		if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
 
 		validate_memory(p, 0, hpage_pmd_size);
 		munmap(p, hpage_pmd_size);
@@ -877,6 +958,7 @@ int main(void)
 
 	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
 	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
 	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
 
 	save_settings();
@@ -894,6 +976,7 @@ int main(void)
 	collapse_compound_extreme();
 	collapse_fork();
 	collapse_fork_compound();
+	collapse_max_ptes_shared();
 
 	restore_settings(0);
 }
-- 
2.26.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork
  2020-04-13 12:52 ` [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
@ 2020-04-13 20:48   ` John Hubbard
  2020-04-14 21:35     ` Kirill A. Shutemov
  2020-04-15  2:43   ` John Hubbard
  2020-04-15 20:39   ` Yang Shi
  2 siblings, 1 reply; 28+ messages in thread
From: John Hubbard @ 2020-04-13 20:48 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, William Kucharski, linux-mm,
	linux-kernel

On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
> 
> Logic to check the refcound is moved to a separate function.

"refcount"

> Note that the function is ready to deal with compound pages. It's
> preparation for the following patch.

Maybe:

"Added compound_nr(page) to the expected refcount, in order to handle
the compound page case. This is in preparation for the following patch."

(Just to make it clear that this is not just refactoring, but also a
change.)
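
A concrete illustration of the new accounting (my own numbers, not
anything from the patch): a THP whose 512 subpages are all PTE-mapped by
a single process and which also sits in the swap cache has
total_mapcount() == 512 and compound_nr() == 512, so expected_refcount
== 1024; any extra GUP pin pushes page_count() above that and
is_refcount_suitable() returns false. Spelling that out would make the
behaviour change obvious.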

> 
> VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
> invariant it checks is no longer valid: the source can be mapped
> multiple times now.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
>   1 file changed, 30 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e3e41c2768d8..f9864644c3b7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
>   	}
>   }
>   
> +static bool is_refcount_suitable(struct page *page)
> +{


This looks nice.


> +	int expected_refcount, refcount;
> +
> +	refcount = page_count(page);
> +	expected_refcount = total_mapcount(page);
> +	if (PageSwapCache(page))
> +		expected_refcount += compound_nr(page);
> +
> +	if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> +		pr_err("expected_refcount: %d, refcount: %d\n",
> +				expected_refcount, refcount);
> +		dump_page(page, "Unexpected refcount");


I see two issues with the pr_err() and the dump_page() call:

1. You probably want to rate limit this, otherwise you'll have a big
problem if lots of pages are pinned! (a ratelimited form is sketched below)

2. Actually, I don't think you'd want to print anything at all here, even with
rate limiting, because doing so presumes that "unexpected" means "wrong". And I
think this patch doesn't expect to have GUP pins (or pin_user_pages() pins, ha),
but that doesn't mean that they're wrong to have.
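
On point 1, if the message were kept at all, the ratelimited form is a
small change; just a sketch of what I mean, not code from this patch:

	if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
		/* ratelimit so a storm of hits cannot flood the log */
		pr_err_ratelimited("expected_refcount: %d, refcount: %d\n",
				expected_refcount, refcount);
		dump_page(page, "Unexpected refcount");
	}

(dump_page() itself would still want its own throttling, of course.)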


> +	}
> +
> +	return page_count(page) == expected_refcount;
> +}
> +
>   static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   					unsigned long address,
>   					pte_t *pte)
> @@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		}
>   
>   		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> +		 * Check if the page has any GUP (or other external) pins.
> +		 *
> +		 * The page table that maps the page has been already unlinked
> +		 * from the page table tree and this process cannot get
> +		 * an additinal pin on the page.
> +		 *
> +		 * New pins can come later if the page is shared across fork,
> +		 * but not for the this process. It is fine. The other process

"but not from this process. The other process"

(It's very hard to figure out what "it is fine" means in this context, so
better to leave it out, imho.)

> +		 * cannot write to the page, only trigger CoW.
>   		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (!is_refcount_suitable(page)) {
>   			unlock_page(page);
>   			result = SCAN_PAGE_COUNT;
>   			goto out;
> @@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   		} else {
>   			src_page = pte_page(pteval);
>   			copy_user_highpage(page, src_page, address, vma);
> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>   			release_pte_page(src_page);
>   			/*
>   			 * ptl mostly unnecessary, but preempt has to
> @@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> -		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> -		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		/* Check if the page has any GUP (or other external) pins */
> +		if (!is_refcount_suitable(page)) {
>   			result = SCAN_PAGE_COUNT;
>   			goto out_unmap;
>   		}
> 

thanks,
-- 
John Hubbard
NVIDIA

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork
  2020-04-13 20:48   ` John Hubbard
@ 2020-04-14 21:35     ` Kirill A. Shutemov
  2020-04-15  2:42       ` John Hubbard
  0 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-14 21:35 UTC (permalink / raw)
  To: John Hubbard
  Cc: Kirill A. Shutemov, akpm, Andrea Arcangeli, Zi Yan, Yang Shi,
	Ralph Campbell, William Kucharski, linux-mm, linux-kernel

On Mon, Apr 13, 2020 at 01:48:22PM -0700, John Hubbard wrote:

[Thanks for all your suggestions and corrections]

> > +	if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> > +		pr_err("expected_refcount: %d, refcount: %d\n",
> > +				expected_refcount, refcount);
> > +		dump_page(page, "Unexpected refcount");
> 
> 
> I see two issues with the pr_err() and the dump_page() call:
> 
> 1. You probably want to rate limit this, otherwise you'll have a big
> problem if lots of pages are pinned!

Nope. Only if kernel is buggy. See below.

> 2. Actually, I don't think you'd want to print anything at all here, even with
> rate limiting, because doing so presumes that "unexpected" means "wrong". And I
> think this patch doesn't expect to have GUP pins (or pin_user_pages() pins, ha),
> but that doesn't mean that they're wrong to have.

See condition. We only do it if refcount is *below* expected refcount. It
should never happen. Pinned page would have refcount above expected.
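
Worked example (my numbers): an order-0 anon page mapped by a single
process and not in the swap cache has expected_refcount == 1. A GUP pin
makes page_count() == 2, i.e. above expected, so the collapse is simply
skipped with no message. A page_count() below the expected value can
only mean broken refcounting, and that is what the DEBUG_VM print is
there to catch.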

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork
  2020-04-14 21:35     ` Kirill A. Shutemov
@ 2020-04-15  2:42       ` John Hubbard
  0 siblings, 0 replies; 28+ messages in thread
From: John Hubbard @ 2020-04-15  2:42 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Kirill A. Shutemov, akpm, Andrea Arcangeli, Zi Yan, Yang Shi,
	Ralph Campbell, William Kucharski, linux-mm, linux-kernel

On 2020-04-14 14:35, Kirill A. Shutemov wrote:
> On Mon, Apr 13, 2020 at 01:48:22PM -0700, John Hubbard wrote:
> 
> [Thanks for all your suggestions and corrections]
> 
>>> +	if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
>>> +		pr_err("expected_refcount: %d, refcount: %d\n",
>>> +				expected_refcount, refcount);
>>> +		dump_page(page, "Unexpected refcount");
>>
>>
>> I see two issues with the pr_err() and the dump_page() call:
>>
>> 1. You probably want to rate limit this, otherwise you'll have a big
>> problem if lots of pages are pinned!
> 
> Nope. Only if kernel is buggy. See below.
> 
>> 2. Actually, I don't think you'd want to print anything at all here, even with
>> rate limiting, because doing so presumes that "unexpected" means "wrong". And I
>> think this patch doesn't expect to have GUP pins (or pin_user_pages() pins, ha),
>> but that doesn't mean that they're wrong to have.
> 
> See condition. We only do it if refcount is *below* expected refcount. It
> should never happen. Pinned page would have refcount above expected.
> 

Yes, you are right. I misread the condition. This actually is just right. :)

thanks,
-- 
John Hubbard
NVIDIA

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork
  2020-04-13 12:52 ` [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
  2020-04-13 20:48   ` John Hubbard
@ 2020-04-15  2:43   ` John Hubbard
  2020-04-15 20:39   ` Yang Shi
  2 siblings, 0 replies; 28+ messages in thread
From: John Hubbard @ 2020-04-15  2:43 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Yang Shi, Ralph Campbell, William Kucharski, linux-mm,
	linux-kernel

On 2020-04-13 05:52, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
> 
> Logic to check the refcound is moved to a separate function.
> Note that the function is ready to deal with compound pages. It's
> preparation for the following patch.
> 
> VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
> invariant it checks is no longer valid: the source can be mapped
> multiple times now.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
>   1 file changed, 30 insertions(+), 11 deletions(-)


Reviewed-by: John Hubbard <jhubbard@nvidia.com>

thanks,
-- 
John Hubbard
NVIDIA

> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e3e41c2768d8..f9864644c3b7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
>   	}
>   }
>   
> +static bool is_refcount_suitable(struct page *page)
> +{
> +	int expected_refcount, refcount;
> +
> +	refcount = page_count(page);
> +	expected_refcount = total_mapcount(page);
> +	if (PageSwapCache(page))
> +		expected_refcount += compound_nr(page);
> +
> +	if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> +		pr_err("expected_refcount: %d, refcount: %d\n",
> +				expected_refcount, refcount);
> +		dump_page(page, "Unexpected refcount");
> +	}
> +
> +	return page_count(page) == expected_refcount;
> +}
> +
>   static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   					unsigned long address,
>   					pte_t *pte)
> @@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		}
>   
>   		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> +		 * Check if the page has any GUP (or other external) pins.
> +		 *
> +		 * The page table that maps the page has been already unlinked
> +		 * from the page table tree and this process cannot get
> +		 * an additinal pin on the page.
> +		 *
> +		 * New pins can come later if the page is shared across fork,
> +		 * but not for the this process. It is fine. The other process
> +		 * cannot write to the page, only trigger CoW.
>   		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (!is_refcount_suitable(page)) {
>   			unlock_page(page);
>   			result = SCAN_PAGE_COUNT;
>   			goto out;
> @@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   		} else {
>   			src_page = pte_page(pteval);
>   			copy_user_highpage(page, src_page, address, vma);
> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>   			release_pte_page(src_page);
>   			/*
>   			 * ptl mostly unnecessary, but preempt has to
> @@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> -		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> -		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		/* Check if the page has any GUP (or other external) pins */
> +		if (!is_refcount_suitable(page)) {
>   			result = SCAN_PAGE_COUNT;
>   			goto out_unmap;
>   		}
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 1/8] khugepaged: Add self test
  2020-04-13 12:52 ` [PATCHv3, RESEND 1/8] khugepaged: Add self test Kirill A. Shutemov
@ 2020-04-15 15:39   ` Zi Yan
  2020-04-15 20:11     ` Yang Shi
  0 siblings, 1 reply; 28+ messages in thread
From: Zi Yan @ 2020-04-15 15:39 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, Andrea Arcangeli, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 636 bytes --]

On 13 Apr 2020, at 8:52, Kirill A. Shutemov wrote:

>
> The test checks if khugepaged is able to recover huge page where we
> expect to do so. It only covers anon-THP for now.
>
> Currently the test shows few failures. They are going to be addressed by
> the following patches.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

Reviewed-by: Zi Yan <ziy@nvidia.com>
Tested-by: Zi Yan <ziy@nvidia.com>

All tests passed with the whole series applied on v5.6 and Linus’ tree (commit: 8632e9b5).

Thanks.

—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 1/8] khugepaged: Add self test
  2020-04-15 15:39   ` Zi Yan
@ 2020-04-15 20:11     ` Yang Shi
  0 siblings, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-15 20:11 UTC (permalink / raw)
  To: Zi Yan, Kirill A. Shutemov
  Cc: akpm, Andrea Arcangeli, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel



On 4/15/20 8:39 AM, Zi Yan wrote:
> On 13 Apr 2020, at 8:52, Kirill A. Shutemov wrote:
>
>>
>> The test checks if khugepaged is able to recover huge page where we
>> expect to do so. It only covers anon-THP for now.
>>
>> Currently the test shows few failures. They are going to be addressed by
>> the following patches.
>>
>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Tested-by: Zi Yan <ziy@nvidia.com>
>
> All tests passed with the whole series applied on v5.6 and Linus’ tree (commit: 8632e9b5).

I didn't run the tests myself; I chose to trust the results from Zi Yan.

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

>
> Thanks.
>
> —
> Best Regards,
> Yan Zi


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 2/8] khugepaged: Do not stop collapse if less than half PTEs are referenced
  2020-04-13 12:52 ` [PATCHv3, RESEND 2/8] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
@ 2020-04-15 20:31   ` Yang Shi
  0 siblings, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-15 20:31 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Ralph Campbell, John Hubbard, William Kucharski,
	linux-mm, linux-kernel



On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> __collapse_huge_page_swapin() checks the number of referenced PTE to
> decide if the memory range is hot enough to justify swapin.
>
> We have a few problems with the approach:
>
>   - It is way too late: we can do the check much earlier and save time.
>     khugepaged_scan_pmd() already knows if we have any pages to swap in
>     and the number of referenced pages.
>
>   - It stops collapse altogether if there are not enough referenced pages,
>     not only the swap-in.
>
> Fix it by making the right check early. We also can avoid additional
> page table scanning if khugepaged_scan_pmd() haven't found any swap
> entries.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Fixes: 0db501f7a34c ("mm, thp: convert from optimistic swapin collapsing to conservative")
> ---
>   mm/khugepaged.c | 25 ++++++++++---------------
>   1 file changed, 10 insertions(+), 15 deletions(-)

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

Just a nit below.

>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 99bab7e4d05b..5968ec5ddd6b 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -902,11 +902,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>   		.pgoff = linear_page_index(vma, address),
>   	};
>   
> -	/* we only decide to swapin, if there is enough young ptes */
> -	if (referenced < HPAGE_PMD_NR/2) {
> -		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
> -		return false;
> -	}
>   	vmf.pte = pte_offset_map(pmd, address);
>   	for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
>   			vmf.pte++, vmf.address += PAGE_SIZE) {
> @@ -946,7 +941,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>   static void collapse_huge_page(struct mm_struct *mm,
>   				   unsigned long address,
>   				   struct page **hpage,
> -				   int node, int referenced)
> +				   int node, int referenced, int unmapped)
>   {
>   	pmd_t *pmd, _pmd;
>   	pte_t *pte;
> @@ -1003,7 +998,8 @@ static void collapse_huge_page(struct mm_struct *mm,
>   	 * If it fails, we release mmap_sem and jump out_nolock.
>   	 * Continuing to collapse causes inconsistency.
>   	 */
> -	if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
> +	if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
> +				pmd, referenced)) {
>   		mem_cgroup_cancel_charge(new_page, memcg, true);
>   		up_read(&mm->mmap_sem);
>   		goto out_nolock;
> @@ -1214,22 +1210,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   		    mmu_notifier_test_young(vma->vm_mm, address))
>   			referenced++;
>   	}
> -	if (writable) {
> -		if (referenced) {
> +	if (!writable) {
> +		result = SCAN_PAGE_RO;
> +	} else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
> +		result = SCAN_LACK_REFERENCED_PAGE;
> +	} else {
>   			result = SCAN_SUCCEED;
>   			ret = 1;

Shall we fix the indentation of the above two statements?

> -		} else {
> -			result = SCAN_LACK_REFERENCED_PAGE;
> -		}
> -	} else {
> -		result = SCAN_PAGE_RO;
>   	}
>   out_unmap:
>   	pte_unmap_unlock(pte, ptl);
>   	if (ret) {
>   		node = khugepaged_find_target_node();
>   		/* collapse_huge_page will return with the mmap_sem released */
> -		collapse_huge_page(mm, address, hpage, node, referenced);
> +		collapse_huge_page(mm, address, hpage, node,
> +				referenced, unmapped);
>   	}
>   out:
>   	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 3/8] khugepaged: Drain all LRU caches before scanning pages
  2020-04-13 12:52 ` [PATCHv3, RESEND 3/8] khugepaged: Drain all LRU caches before scanning pages Kirill A. Shutemov
@ 2020-04-15 20:31   ` Yang Shi
  0 siblings, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-15 20:31 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Ralph Campbell, John Hubbard, William Kucharski,
	linux-mm, linux-kernel



On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> Having a page in LRU add cache offsets page refcount and gives
> false-negative on PageLRU(). It reduces collapse success rate.
>
> Drain all LRU add caches before scanning. It happens relatively
> rare and should not disturb the system too much.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 2 ++
>   1 file changed, 2 insertions(+)

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 5968ec5ddd6b..ee66c140c2d6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2059,6 +2059,8 @@ static void khugepaged_do_scan(void)
>   
>   	barrier(); /* write khugepaged_pages_to_scan to local stack */
>   
> +	lru_add_drain_all();
> +
>   	while (progress < pages) {
>   		if (!khugepaged_prealloc_page(&hpage, &wait))
>   			break;


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 4/8] khugepaged: Drain LRU add pagevec after swapin
  2020-04-13 12:52 ` [PATCHv3, RESEND 4/8] khugepaged: Drain LRU add pagevec after swapin Kirill A. Shutemov
@ 2020-04-15 20:32   ` Yang Shi
  0 siblings, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-15 20:32 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Ralph Campbell, John Hubbard, William Kucharski,
	linux-mm, linux-kernel



On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> __collapse_huge_page_isolate() may fail due to extra pin in the LRU add
> pagevec. It's pretty common for swapin case: we swap in pages just to
> fail due to the extra pin.
>
> Drain LRU add pagevec on successful swapin.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 5 +++++
>   1 file changed, 5 insertions(+)

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index ee66c140c2d6..e3e41c2768d8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -934,6 +934,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>   	}
>   	vmf.pte--;
>   	pte_unmap(vmf.pte);
> +
> +	/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
> +	if (swapped_in)
> +		lru_add_drain();
> +
>   	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
>   	return true;
>   }


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork
  2020-04-13 12:52 ` [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
  2020-04-13 20:48   ` John Hubbard
  2020-04-15  2:43   ` John Hubbard
@ 2020-04-15 20:39   ` Yang Shi
  2 siblings, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-15 20:39 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Ralph Campbell, John Hubbard, William Kucharski,
	linux-mm, linux-kernel



On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
>
> Logic to check the refcound is moved to a separate function.

s/refcound/refcount

> Note that the function is ready to deal with compound pages. It's
> preparation for the following patch.
>
> VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
> invariant it checks is no longer valid: the source can be mapped
> multiple times now.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
>   1 file changed, 30 insertions(+), 11 deletions(-)

Just a minor typo problem.

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e3e41c2768d8..f9864644c3b7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
>   	}
>   }
>   
> +static bool is_refcount_suitable(struct page *page)
> +{
> +	int expected_refcount, refcount;
> +
> +	refcount = page_count(page);
> +	expected_refcount = total_mapcount(page);
> +	if (PageSwapCache(page))
> +		expected_refcount += compound_nr(page);
> +
> +	if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> +		pr_err("expected_refcount: %d, refcount: %d\n",
> +				expected_refcount, refcount);
> +		dump_page(page, "Unexpected refcount");
> +	}
> +
> +	return page_count(page) == expected_refcount;
> +}
> +
>   static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   					unsigned long address,
>   					pte_t *pte)
> @@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		}
>   
>   		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> +		 * Check if the page has any GUP (or other external) pins.
> +		 *
> +		 * The page table that maps the page has been already unlinked
> +		 * from the page table tree and this process cannot get
> +		 * an additinal pin on the page.
> +		 *
> +		 * New pins can come later if the page is shared across fork,
> +		 * but not for the this process. It is fine. The other process
> +		 * cannot write to the page, only trigger CoW.
>   		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (!is_refcount_suitable(page)) {
>   			unlock_page(page);
>   			result = SCAN_PAGE_COUNT;
>   			goto out;
> @@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   		} else {
>   			src_page = pte_page(pteval);
>   			copy_user_highpage(page, src_page, address, vma);
> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>   			release_pte_page(src_page);
>   			/*
>   			 * ptl mostly unnecessary, but preempt has to
> @@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> -		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> -		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		/* Check if the page has any GUP (or other external) pins */
> +		if (!is_refcount_suitable(page)) {
>   			result = SCAN_PAGE_COUNT;
>   			goto out_unmap;
>   		}


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-04-13 12:52 ` [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
@ 2020-04-15 20:46   ` Yang Shi
  2020-04-15 21:44   ` Andrew Morton
  1 sibling, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-15 20:46 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Ralph Campbell, John Hubbard, William Kucharski,
	linux-mm, linux-kernel



On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it handled on compound level. The
> same goes for LRU isolation and putback.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 99 ++++++++++++++++++++++++++++++++-----------------
>   1 file changed, 65 insertions(+), 34 deletions(-)

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f9864644c3b7..11d500396d85 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,17 +515,30 @@ void __khugepaged_exit(struct mm_struct *mm)
>   
>   static void release_pte_page(struct page *page)
>   {
> -	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> +	mod_node_page_state(page_pgdat(page),
> +			NR_ISOLATED_ANON + page_is_file_cache(page),
> +			-compound_nr(page));
>   	unlock_page(page);
>   	putback_lru_page(page);
>   }
>   
> -static void release_pte_pages(pte_t *pte, pte_t *_pte)
> +static void release_pte_pages(pte_t *pte, pte_t *_pte,
> +		struct list_head *compound_pagelist)
>   {
> +	struct page *page, *tmp;
> +
>   	while (--_pte >= pte) {
>   		pte_t pteval = *_pte;
> -		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
> -			release_pte_page(pte_page(pteval));
> +
> +		page = pte_page(pteval);
> +		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
> +				!PageCompound(page))
> +			release_pte_page(page);
> +	}
> +
> +	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
> +		list_del(&page->lru);
> +		release_pte_page(page);
>   	}
>   }
>   
> @@ -549,7 +562,8 @@ static bool is_refcount_suitable(struct page *page)
>   
>   static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   					unsigned long address,
> -					pte_t *pte)
> +					pte_t *pte,
> +					struct list_head *compound_pagelist)
>   {
>   	struct page *page = NULL;
>   	pte_t *_pte;
> @@ -579,13 +593,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   			goto out;
>   		}
>   
> -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> +		VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
>   		if (PageCompound(page)) {
> -			result = SCAN_PAGE_COMPOUND;
> -			goto out;
> -		}
> +			struct page *p;
> +			page = compound_head(page);
>   
> -		VM_BUG_ON_PAGE(!PageAnon(page), page);
> +			/*
> +			 * Check if we have dealt with the compound page
> +			 * already
> +			 */
> +			list_for_each_entry(p, compound_pagelist, lru) {
> +				if (page == p)
> +					goto next;
> +			}
> +		}
>   
>   		/*
>   		 * We can do it before isolate_lru_page because the
> @@ -614,19 +636,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   			result = SCAN_PAGE_COUNT;
>   			goto out;
>   		}
> -		if (pte_write(pteval)) {
> -			writable = true;
> -		} else {
> -			if (PageSwapCache(page) &&
> -			    !reuse_swap_page(page, NULL)) {
> -				unlock_page(page);
> -				result = SCAN_SWAP_CACHE_PAGE;
> -				goto out;
> -			}
> +		if (!pte_write(pteval) && PageSwapCache(page) &&
> +				!reuse_swap_page(page, NULL)) {
>   			/*
> -			 * Page is not in the swap cache. It can be collapsed
> -			 * into a THP.
> +			 * Page is in the swap cache and cannot be re-used.
> +			 * It cannot be collapsed into a THP.
>   			 */
> +			unlock_page(page);
> +			result = SCAN_SWAP_CACHE_PAGE;
> +			goto out;
>   		}
>   
>   		/*
> @@ -638,16 +656,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   			result = SCAN_DEL_PAGE_LRU;
>   			goto out;
>   		}
> -		inc_node_page_state(page,
> -				NR_ISOLATED_ANON + page_is_file_cache(page));
> +		mod_node_page_state(page_pgdat(page),
> +				NR_ISOLATED_ANON + page_is_file_cache(page),
> +				compound_nr(page));
>   		VM_BUG_ON_PAGE(!PageLocked(page), page);
>   		VM_BUG_ON_PAGE(PageLRU(page), page);
>   
> +		if (PageCompound(page))
> +			list_add_tail(&page->lru, compound_pagelist);
> +next:
>   		/* There should be enough young pte to collapse the page */
>   		if (pte_young(pteval) ||
>   		    page_is_young(page) || PageReferenced(page) ||
>   		    mmu_notifier_test_young(vma->vm_mm, address))
>   			referenced++;
> +
> +		if (pte_write(pteval))
> +			writable = true;
>   	}
>   	if (likely(writable)) {
>   		if (likely(referenced)) {
> @@ -661,7 +686,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   	}
>   
>   out:
> -	release_pte_pages(pte, _pte);
> +	release_pte_pages(pte, _pte, compound_pagelist);
>   	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
>   					    referenced, writable, result);
>   	return 0;
> @@ -670,13 +695,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   				      struct vm_area_struct *vma,
>   				      unsigned long address,
> -				      spinlock_t *ptl)
> +				      spinlock_t *ptl,
> +				      struct list_head *compound_pagelist)
>   {
> +	struct page *src_page, *tmp;
>   	pte_t *_pte;
>   	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>   				_pte++, page++, address += PAGE_SIZE) {
>   		pte_t pteval = *_pte;
> -		struct page *src_page;
>   
>   		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
>   			clear_user_highpage(page, address);
> @@ -696,7 +722,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   		} else {
>   			src_page = pte_page(pteval);
>   			copy_user_highpage(page, src_page, address, vma);
> -			release_pte_page(src_page);
> +			if (!PageCompound(src_page))
> +				release_pte_page(src_page);
>   			/*
>   			 * ptl mostly unnecessary, but preempt has to
>   			 * be disabled to update the per-cpu stats
> @@ -713,6 +740,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   			free_page_and_swap_cache(src_page);
>   		}
>   	}
> +
> +	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
> +		list_del(&src_page->lru);
> +		release_pte_page(src_page);
> +	}
>   }
>   
>   static void khugepaged_alloc_sleep(void)
> @@ -971,6 +1003,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>   				   struct page **hpage,
>   				   int node, int referenced, int unmapped)
>   {
> +	LIST_HEAD(compound_pagelist);
>   	pmd_t *pmd, _pmd;
>   	pte_t *pte;
>   	pgtable_t pgtable;
> @@ -1071,7 +1104,8 @@ static void collapse_huge_page(struct mm_struct *mm,
>   	mmu_notifier_invalidate_range_end(&range);
>   
>   	spin_lock(pte_ptl);
> -	isolated = __collapse_huge_page_isolate(vma, address, pte);
> +	isolated = __collapse_huge_page_isolate(vma, address, pte,
> +			&compound_pagelist);
>   	spin_unlock(pte_ptl);
>   
>   	if (unlikely(!isolated)) {
> @@ -1096,7 +1130,8 @@ static void collapse_huge_page(struct mm_struct *mm,
>   	 */
>   	anon_vma_unlock_write(vma->anon_vma);
>   
> -	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
> +	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
> +			&compound_pagelist);
>   	pte_unmap(pte);
>   	__SetPageUptodate(new_page);
>   	pgtable = pmd_pgtable(_pmd);
> @@ -1193,11 +1228,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> -		if (PageCompound(page)) {
> -			result = SCAN_PAGE_COMPOUND;
> -			goto out_unmap;
> -		}
> +		page = compound_head(page);
>   
>   		/*
>   		 * Record which node the original page is from and save this


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 7/8] thp: Change CoW semantics for anon-THP
  2020-04-13 12:52 ` [PATCHv3, RESEND 7/8] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
@ 2020-04-15 21:19   ` Yang Shi
  0 siblings, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-15 21:19 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Ralph Campbell, John Hubbard, William Kucharski,
	linux-mm, linux-kernel



On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> Currently we have different copy-on-write semantics for anon- and
> file-THP. For anon-THP we try to allocate huge page on the write fault,
> but on file-THP we split PMD and allocate 4k page.
>
> Arguably, file-THP semantics is more desirable: we don't necessarily want
> to unshare the full PMD range from the parent on the first access. This is
> the primary reason THP is unusable for some workloads, like Redis.
>
> The original THP refcounting didn't allow to have PTE-mapped compound
> pages, so we had no options, but to allocate huge page on CoW (with
> fallback to 512 4k pages).
>
> The current refcounting doesn't have such limitations and we can cut a
> lot of complex code out of fault path.
>
> khugepaged is now able to recover THP from such ranges if the
> configuration allows.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/huge_memory.c | 249 +++++------------------------------------------
>   1 file changed, 25 insertions(+), 224 deletions(-)

The change looks reasonable and the complexity is reduced significantly.
Basically I'm fine with this change, although the potential for increased
priority inversion might be a concern in some corner cases; that should
be unusual, though.

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 24ad53b4dfc0..25b84cc0f17d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
>   	spin_unlock(vmf->ptl);
>   }
>   
> -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> -			pmd_t orig_pmd, struct page *page)
> -{
> -	struct vm_area_struct *vma = vmf->vma;
> -	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -	struct mem_cgroup *memcg;
> -	pgtable_t pgtable;
> -	pmd_t _pmd;
> -	int i;
> -	vm_fault_t ret = 0;
> -	struct page **pages;
> -	struct mmu_notifier_range range;
> -
> -	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
> -			      GFP_KERNEL);
> -	if (unlikely(!pages)) {
> -		ret |= VM_FAULT_OOM;
> -		goto out;
> -	}
> -
> -	for (i = 0; i < HPAGE_PMD_NR; i++) {
> -		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
> -					       vmf->address, page_to_nid(page));
> -		if (unlikely(!pages[i] ||
> -			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
> -				     GFP_KERNEL, &memcg, false))) {
> -			if (pages[i])
> -				put_page(pages[i]);
> -			while (--i >= 0) {
> -				memcg = (void *)page_private(pages[i]);
> -				set_page_private(pages[i], 0);
> -				mem_cgroup_cancel_charge(pages[i], memcg,
> -						false);
> -				put_page(pages[i]);
> -			}
> -			kfree(pages);
> -			ret |= VM_FAULT_OOM;
> -			goto out;
> -		}
> -		set_page_private(pages[i], (unsigned long)memcg);
> -	}
> -
> -	for (i = 0; i < HPAGE_PMD_NR; i++) {
> -		copy_user_highpage(pages[i], page + i,
> -				   haddr + PAGE_SIZE * i, vma);
> -		__SetPageUptodate(pages[i]);
> -		cond_resched();
> -	}
> -
> -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> -				haddr, haddr + HPAGE_PMD_SIZE);
> -	mmu_notifier_invalidate_range_start(&range);
> -
> -	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> -	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> -		goto out_free_pages;
> -	VM_BUG_ON_PAGE(!PageHead(page), page);
> -
> -	/*
> -	 * Leave pmd empty until pte is filled note we must notify here as
> -	 * concurrent CPU thread might write to new page before the call to
> -	 * mmu_notifier_invalidate_range_end() happens which can lead to a
> -	 * device seeing memory write in different order than CPU.
> -	 *
> -	 * See Documentation/vm/mmu_notifier.rst
> -	 */
> -	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> -
> -	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
> -	pmd_populate(vma->vm_mm, &_pmd, pgtable);
> -
> -	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> -		pte_t entry;
> -		entry = mk_pte(pages[i], vma->vm_page_prot);
> -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -		memcg = (void *)page_private(pages[i]);
> -		set_page_private(pages[i], 0);
> -		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> -		mem_cgroup_commit_charge(pages[i], memcg, false, false);
> -		lru_cache_add_active_or_unevictable(pages[i], vma);
> -		vmf->pte = pte_offset_map(&_pmd, haddr);
> -		VM_BUG_ON(!pte_none(*vmf->pte));
> -		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
> -		pte_unmap(vmf->pte);
> -	}
> -	kfree(pages);
> -
> -	smp_wmb(); /* make pte visible before pmd */
> -	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
> -	page_remove_rmap(page, true);
> -	spin_unlock(vmf->ptl);
> -
> -	/*
> -	 * No need to double call mmu_notifier->invalidate_range() callback as
> -	 * the above pmdp_huge_clear_flush_notify() did already call it.
> -	 */
> -	mmu_notifier_invalidate_range_only_end(&range);
> -
> -	ret |= VM_FAULT_WRITE;
> -	put_page(page);
> -
> -out:
> -	return ret;
> -
> -out_free_pages:
> -	spin_unlock(vmf->ptl);
> -	mmu_notifier_invalidate_range_end(&range);
> -	for (i = 0; i < HPAGE_PMD_NR; i++) {
> -		memcg = (void *)page_private(pages[i]);
> -		set_page_private(pages[i], 0);
> -		mem_cgroup_cancel_charge(pages[i], memcg, false);
> -		put_page(pages[i]);
> -	}
> -	kfree(pages);
> -	goto out;
> -}
> -
>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
>   {
>   	struct vm_area_struct *vma = vmf->vma;
> -	struct page *page = NULL, *new_page;
> -	struct mem_cgroup *memcg;
> +	struct page *page;
>   	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -	struct mmu_notifier_range range;
> -	gfp_t huge_gfp;			/* for allocation and charge */
> -	vm_fault_t ret = 0;
>   
>   	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
> +
>   	if (is_huge_zero_pmd(orig_pmd))
> -		goto alloc;
> +		goto fallback;
> +
>   	spin_lock(vmf->ptl);
> -	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> -		goto out_unlock;
> +
> +	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> +		spin_unlock(vmf->ptl);
> +		return 0;
> +	}
>   
>   	page = pmd_page(orig_pmd);
>   	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> -	/*
> -	 * We can only reuse the page if nobody else maps the huge page or it's
> -	 * part.
> -	 */
> +
> +	/* Lock page for reuse_swap_page() */
>   	if (!trylock_page(page)) {
>   		get_page(page);
>   		spin_unlock(vmf->ptl);
>   		lock_page(page);
>   		spin_lock(vmf->ptl);
>   		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> +			spin_unlock(vmf->ptl);
>   			unlock_page(page);
>   			put_page(page);
> -			goto out_unlock;
> +			return 0;
>   		}
>   		put_page(page);
>   	}
> +
> +	/*
> +	 * We can only reuse the page if nobody else maps the huge page or it's
> +	 * part.
> +	 */
>   	if (reuse_swap_page(page, NULL)) {
>   		pmd_t entry;
>   		entry = pmd_mkyoung(orig_pmd);
>   		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
> +		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
>   			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -		ret |= VM_FAULT_WRITE;
>   		unlock_page(page);
> -		goto out_unlock;
> -	}
> -	unlock_page(page);
> -	get_page(page);
> -	spin_unlock(vmf->ptl);
> -alloc:
> -	if (__transparent_hugepage_enabled(vma) &&
> -	    !transparent_hugepage_debug_cow()) {
> -		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> -		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> -	} else
> -		new_page = NULL;
> -
> -	if (likely(new_page)) {
> -		prep_transhuge_page(new_page);
> -	} else {
> -		if (!page) {
> -			split_huge_pmd(vma, vmf->pmd, vmf->address);
> -			ret |= VM_FAULT_FALLBACK;
> -		} else {
> -			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
> -			if (ret & VM_FAULT_OOM) {
> -				split_huge_pmd(vma, vmf->pmd, vmf->address);
> -				ret |= VM_FAULT_FALLBACK;
> -			}
> -			put_page(page);
> -		}
> -		count_vm_event(THP_FAULT_FALLBACK);
> -		goto out;
> -	}
> -
> -	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
> -					huge_gfp, &memcg, true))) {
> -		put_page(new_page);
> -		split_huge_pmd(vma, vmf->pmd, vmf->address);
> -		if (page)
> -			put_page(page);
> -		ret |= VM_FAULT_FALLBACK;
> -		count_vm_event(THP_FAULT_FALLBACK);
> -		goto out;
> -	}
> -
> -	count_vm_event(THP_FAULT_ALLOC);
> -	count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
> -
> -	if (!page)
> -		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
> -	else
> -		copy_user_huge_page(new_page, page, vmf->address,
> -				    vma, HPAGE_PMD_NR);
> -	__SetPageUptodate(new_page);
> -
> -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> -				haddr, haddr + HPAGE_PMD_SIZE);
> -	mmu_notifier_invalidate_range_start(&range);
> -
> -	spin_lock(vmf->ptl);
> -	if (page)
> -		put_page(page);
> -	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
>   		spin_unlock(vmf->ptl);
> -		mem_cgroup_cancel_charge(new_page, memcg, true);
> -		put_page(new_page);
> -		goto out_mn;
> -	} else {
> -		pmd_t entry;
> -		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> -		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> -		page_add_new_anon_rmap(new_page, vma, haddr, true);
> -		mem_cgroup_commit_charge(new_page, memcg, false, true);
> -		lru_cache_add_active_or_unevictable(new_page, vma);
> -		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> -		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -		if (!page) {
> -			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -		} else {
> -			VM_BUG_ON_PAGE(!PageHead(page), page);
> -			page_remove_rmap(page, true);
> -			put_page(page);
> -		}
> -		ret |= VM_FAULT_WRITE;
> +		return VM_FAULT_WRITE;
>   	}
> +
> +	unlock_page(page);
>   	spin_unlock(vmf->ptl);
> -out_mn:
> -	/*
> -	 * No need to double call mmu_notifier->invalidate_range() callback as
> -	 * the above pmdp_huge_clear_flush_notify() did already call it.
> -	 */
> -	mmu_notifier_invalidate_range_only_end(&range);
> -out:
> -	return ret;
> -out_unlock:
> -	spin_unlock(vmf->ptl);
> -	return ret;
> +fallback:
> +	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
> +	return VM_FAULT_FALLBACK;
>   }
>   
>   /*


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable
  2020-04-13 12:52 ` [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov
@ 2020-04-15 21:25   ` Yang Shi
  2020-04-16 15:52     ` Kirill A. Shutemov
  0 siblings, 1 reply; 28+ messages in thread
From: Yang Shi @ 2020-04-15 21:25 UTC (permalink / raw)
  To: Kirill A. Shutemov, akpm, Andrea Arcangeli
  Cc: Zi Yan, Ralph Campbell, John Hubbard, William Kucharski,
	linux-mm, linux-kernel



On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> 'max_ptes_shared' specifies how many pages can be shared across multiple
> processes. Exceeding the number would block the collapse::
>
> 	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
>
> A higher value may increase memory footprint for some workloads.
>
> By default, at least half of pages has to be not shared.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

Acked-by: Yang Shi <yang.shi@linux.alibaba.com>

One nit below.

> ---
>   Documentation/admin-guide/mm/transhuge.rst |  7 ++
>   include/trace/events/huge_memory.h         |  3 +-
>   mm/khugepaged.c                            | 52 ++++++++++++--
>   tools/testing/selftests/vm/khugepaged.c    | 83 ++++++++++++++++++++++
>   4 files changed, 140 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index bd5714547cee..a10f6a50d48f 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being
>   collapsed, resulting fewer pages being collapsed into
>   THPs, and lower memory access performance.
>   
> +``max_ptes_shared`` specifies how many pages can be shared across multiple
> +processes. Exceeding the number would block the collapse::
> +
> +	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
> +
> +A higher value may increase memory footprint for some workloads.
> +
>   Boot parameter
>   ==============
>   
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> index d82a0f4e824d..53532f5925c3 100644
> --- a/include/trace/events/huge_memory.h
> +++ b/include/trace/events/huge_memory.h
> @@ -12,6 +12,8 @@
>   	EM( SCAN_SUCCEED,		"succeeded")			\
>   	EM( SCAN_PMD_NULL,		"pmd_null")			\
>   	EM( SCAN_EXCEED_NONE_PTE,	"exceed_none_pte")		\
> +	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
> +	EM( SCAN_EXCEED_SHARED_PTE,	"exceed_shared_pte")		\
>   	EM( SCAN_PTE_NON_PRESENT,	"pte_non_present")		\
>   	EM( SCAN_PAGE_RO,		"no_writable_page")		\
>   	EM( SCAN_LACK_REFERENCED_PAGE,	"lack_referenced_page")		\
> @@ -30,7 +32,6 @@
>   	EM( SCAN_DEL_PAGE_LRU,		"could_not_delete_page_from_lru")\
>   	EM( SCAN_ALLOC_HUGE_PAGE_FAIL,	"alloc_huge_page_failed")	\
>   	EM( SCAN_CGROUP_CHARGE_FAIL,	"ccgroup_charge_failed")	\
> -	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
>   	EM( SCAN_TRUNCATED,		"truncated")			\
>   	EMe(SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\
>   
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 11d500396d85..0730014620e2 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -28,6 +28,8 @@ enum scan_result {
>   	SCAN_SUCCEED,
>   	SCAN_PMD_NULL,
>   	SCAN_EXCEED_NONE_PTE,
> +	SCAN_EXCEED_SWAP_PTE,
> +	SCAN_EXCEED_SHARED_PTE,
>   	SCAN_PTE_NON_PRESENT,
>   	SCAN_PAGE_RO,
>   	SCAN_LACK_REFERENCED_PAGE,
> @@ -46,7 +48,6 @@ enum scan_result {
>   	SCAN_DEL_PAGE_LRU,
>   	SCAN_ALLOC_HUGE_PAGE_FAIL,
>   	SCAN_CGROUP_CHARGE_FAIL,
> -	SCAN_EXCEED_SWAP_PTE,
>   	SCAN_TRUNCATED,
>   	SCAN_PAGE_HAS_PRIVATE,
>   };
> @@ -71,6 +72,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
>    */
>   static unsigned int khugepaged_max_ptes_none __read_mostly;
>   static unsigned int khugepaged_max_ptes_swap __read_mostly;
> +static unsigned int khugepaged_max_ptes_shared __read_mostly;
>   
>   #define MM_SLOTS_HASH_BITS 10
>   static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> @@ -290,15 +292,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
>   	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
>   	       khugepaged_max_ptes_swap_store);
>   
> +static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
> +					     struct kobj_attribute *attr,
> +					     char *buf)
> +{
> +	return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
> +}
> +
> +static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
> +					      struct kobj_attribute *attr,
> +					      const char *buf, size_t count)
> +{
> +	int err;
> +	unsigned long max_ptes_shared;
> +
> +	err  = kstrtoul(buf, 10, &max_ptes_shared);
> +	if (err || max_ptes_shared > HPAGE_PMD_NR-1)
> +		return -EINVAL;
> +
> +	khugepaged_max_ptes_shared = max_ptes_shared;
> +
> +	return count;
> +}
> +
> +static struct kobj_attribute khugepaged_max_ptes_shared_attr =
> +	__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
> +	       khugepaged_max_ptes_shared_store);
> +
>   static struct attribute *khugepaged_attr[] = {
>   	&khugepaged_defrag_attr.attr,
>   	&khugepaged_max_ptes_none_attr.attr,
> +	&khugepaged_max_ptes_swap_attr.attr,
> +	&khugepaged_max_ptes_shared_attr.attr,
>   	&pages_to_scan_attr.attr,
>   	&pages_collapsed_attr.attr,
>   	&full_scans_attr.attr,
>   	&scan_sleep_millisecs_attr.attr,
>   	&alloc_sleep_millisecs_attr.attr,
> -	&khugepaged_max_ptes_swap_attr.attr,
>   	NULL,
>   };
>   
> @@ -360,6 +390,7 @@ int __init khugepaged_init(void)
>   	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
>   	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
>   	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
> +	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
>   
>   	return 0;
>   }
> @@ -567,7 +598,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   {
>   	struct page *page = NULL;
>   	pte_t *_pte;
> -	int none_or_zero = 0, result = 0, referenced = 0;
> +	int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
>   	bool writable = false;
>   
>   	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> @@ -595,6 +626,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   
>   		VM_BUG_ON_PAGE(!PageAnon(page), page);
>   
> +		if (page_mapcount(page) > 1 &&
> +				++shared > khugepaged_max_ptes_shared) {
> +			result = SCAN_EXCEED_SHARED_PTE;

It may be better to extract the check into a helper? For example, bool 
exceed_max_ptes_shared(struct page)?

> +			goto out;
> +		}
> +
>   		if (PageCompound(page)) {
>   			struct page *p;
>   			page = compound_head(page);
> @@ -1178,7 +1215,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   {
>   	pmd_t *pmd;
>   	pte_t *pte, *_pte;
> -	int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
> +	int ret = 0, result = 0, referenced = 0;
> +	int none_or_zero = 0, shared = 0;
>   	struct page *page = NULL;
>   	unsigned long _address;
>   	spinlock_t *ptl;
> @@ -1228,6 +1266,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> +		if (page_mapcount(page) > 1 &&
> +				++shared > khugepaged_max_ptes_shared) {
> +			result = SCAN_EXCEED_SHARED_PTE;
> +			goto out_unmap;
> +		}
> +
>   		page = compound_head(page);
>   
>   		/*
> diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
> index 34d945e71e2e..7c7283cc7dcc 100644
> --- a/tools/testing/selftests/vm/khugepaged.c
> +++ b/tools/testing/selftests/vm/khugepaged.c
> @@ -77,6 +77,7 @@ struct khugepaged_settings {
>   	unsigned int scan_sleep_millisecs;
>   	unsigned int max_ptes_none;
>   	unsigned int max_ptes_swap;
> +	unsigned int max_ptes_shared;
>   	unsigned long pages_to_scan;
>   };
>   
> @@ -276,6 +277,7 @@ static void write_settings(struct settings *settings)
>   			khugepaged->scan_sleep_millisecs);
>   	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
>   	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
> +	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
>   	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
>   }
>   
> @@ -312,6 +314,7 @@ static void save_settings(void)
>   			read_num("khugepaged/scan_sleep_millisecs"),
>   		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
>   		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
> +		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
>   		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
>   	};
>   	success("OK");
> @@ -843,12 +846,90 @@ static void collapse_fork_compound(void)
>   			fail("Fail");
>   		fill_memory(p, 0, page_size);
>   
> +		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
>   		if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
>   			fail("Timeout");
>   		else if (check_huge(p))
>   			success("OK");
>   		else
>   			fail("Fail");
> +		write_num("khugepaged/max_ptes_shared",
> +				default_settings.khugepaged.max_ptes_shared);
> +
> +		validate_memory(p, 0, hpage_pmd_size);
> +		munmap(p, hpage_pmd_size);
> +		exit(exit_status);
> +	}
> +
> +	wait(&wstatus);
> +	exit_status += WEXITSTATUS(wstatus);
> +
> +	printf("Check if parent still has huge page...");
> +	if (check_huge(p))
> +		success("OK");
> +	else
> +		fail("Fail");
> +	validate_memory(p, 0, hpage_pmd_size);
> +	munmap(p, hpage_pmd_size);
> +}
> +
> +static void collapse_max_ptes_shared()
> +{
> +	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
> +	int wstatus;
> +	void *p;
> +
> +	p = alloc_mapping();
> +
> +	printf("Allocate huge page...");
> +	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
> +	fill_memory(p, 0, hpage_pmd_size);
> +	if (check_huge(p))
> +		success("OK");
> +	else
> +		fail("Fail");
> +
> +	printf("Share huge page over fork()...");
> +	if (!fork()) {
> +		/* Do not touch settings on child exit */
> +		skip_settings_restore = true;
> +		exit_status = 0;
> +
> +		if (check_huge(p))
> +			success("OK");
> +		else
> +			fail("Fail");
> +
> +		printf("Trigger CoW in %d of %d...",
> +				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
> +		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
> +		if (!check_huge(p))
> +			success("OK");
> +		else
> +			fail("Fail");
> +
> +		if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
> +			fail("Timeout");
> +		else if (!check_huge(p))
> +			success("OK");
> +		else
> +			fail("Fail");
> +
> +		printf("Trigger CoW in %d of %d...",
> +				hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
> +		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
> +		if (!check_huge(p))
> +			success("OK");
> +		else
> +			fail("Fail");
> +
> +
> +		if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
> +			fail("Timeout");
> +		else if (check_huge(p))
> +			success("OK");
> +		else
> +			fail("Fail");
>   
>   		validate_memory(p, 0, hpage_pmd_size);
>   		munmap(p, hpage_pmd_size);
> @@ -877,6 +958,7 @@ int main(void)
>   
>   	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
>   	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
> +	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
>   	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
>   
>   	save_settings();
> @@ -894,6 +976,7 @@ int main(void)
>   	collapse_compound_extreme();
>   	collapse_fork();
>   	collapse_fork_compound();
> +	collapse_max_ptes_shared();
>   
>   	restore_settings(0);
>   }
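
For concreteness (this is just the arithmetic implied by the test above, not
part of the posted mail): assume a 2M PMD built from 4K base pages
(hpage_pmd_nr = 512, as on x86-64) and the default max_ptes_shared =
hpage_pmd_nr / 2 = 256. The first fill_memory() in the child CoWs
512 - 256 - 1 = 255 pages, leaving 257 PTEs still shared with the parent, so
the scan has to bail out with SCAN_EXCEED_SHARED_PTE. The second
fill_memory() CoWs one more page, leaving exactly 256 shared PTEs, which no
longer exceeds the limit, and the collapse is expected to succeed.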


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-04-13 12:52 ` [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
  2020-04-15 20:46   ` Yang Shi
@ 2020-04-15 21:44   ` Andrew Morton
  2020-04-15 21:52     ` Kirill A. Shutemov
  1 sibling, 1 reply; 28+ messages in thread
From: Andrew Morton @ 2020-04-15 21:44 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrea Arcangeli, Zi Yan, Yang Shi, Ralph Campbell, John Hubbard,
	William Kucharski, linux-mm, linux-kernel

On Mon, 13 Apr 2020 15:52:18 +0300 "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:

> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it is handled on the compound level. The
> same goes for LRU isolation and putback.
> 
> ...
>
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,17 +515,30 @@ void __khugepaged_exit(struct mm_struct *mm)
>  
>  static void release_pte_page(struct page *page)
>  {
> -	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));

I have

	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));

here.  Is there some prerequisite which I wasn't able to find?



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-04-15 21:44   ` Andrew Morton
@ 2020-04-15 21:52     ` Kirill A. Shutemov
  2020-04-15 22:52       ` Andrew Morton
  0 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-15 21:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Kirill A. Shutemov, Andrea Arcangeli, Zi Yan, Yang Shi,
	Ralph Campbell, John Hubbard, William Kucharski, linux-mm,
	linux-kernel

On Wed, Apr 15, 2020 at 02:44:26PM -0700, Andrew Morton wrote:
> On Mon, 13 Apr 2020 15:52:18 +0300 "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> 
> > We can collapse PTE-mapped compound pages. We only need to avoid
> > handling them more than once: lock/unlock page only once if it's present
> > in the PMD range multiple times as it is handled on the compound level. The
> > same goes for LRU isolation and putback.
> > 
> > ...
> >
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -515,17 +515,30 @@ void __khugepaged_exit(struct mm_struct *mm)
> >  
> >  static void release_pte_page(struct page *page)
> >  {
> > -	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> 
> I have
> 
> 	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
> 
> here.  Is there some prerequisite which I wasn't able to find?

The patchset is on top of v5.6 and the code has changed since then. See
9de4f22a60f7 ("mm: code cleanup for MADV_FREE").

Looks like a trivial fixup is required.
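
For illustration, the conflict should only be that rename; a sketch of the
pre-patchset helper as it reads after 9de4f22a60f7 (the unlock/putback lines
are reproduced from the pre-patchset function just for context, so the "-"
line in the quoted hunk simply becomes the page_is_file_lru() variant):

	static void release_pte_page(struct page *page)
	{
		/* page_is_file_cache() was renamed to page_is_file_lru() */
		dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
		unlock_page(page);
		putback_lru_page(page);
	}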

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-04-15 21:52     ` Kirill A. Shutemov
@ 2020-04-15 22:52       ` Andrew Morton
  2020-04-15 22:59         ` Kirill A. Shutemov
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Morton @ 2020-04-15 22:52 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Kirill A. Shutemov, Andrea Arcangeli, Zi Yan, Yang Shi,
	Ralph Campbell, John Hubbard, William Kucharski, linux-mm,
	linux-kernel

On Thu, 16 Apr 2020 00:52:05 +0300 "Kirill A. Shutemov" <kirill@shutemov.name> wrote:

> On Wed, Apr 15, 2020 at 02:44:26PM -0700, Andrew Morton wrote:
> > On Mon, 13 Apr 2020 15:52:18 +0300 "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> > 
> > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > handling them more than once: lock/unlock page only once if it's present
> > > in the PMD range multiple times as it is handled on the compound level. The
> > > same goes for LRU isolation and putback.
> > > 
> > > ...
> > >
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -515,17 +515,30 @@ void __khugepaged_exit(struct mm_struct *mm)
> > >  
> > >  static void release_pte_page(struct page *page)
> > >  {
> > > -	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > 
> > I have
> > 
> > 	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
> > 
> > here.  Is there some prerequisite which I wasn't able to find?
> 
> The patchset is on top of v5.6 and the code has changed since then. See
> 9de4f22a60f7 ("mm: code cleanup for MADV_FREE").
> 
> Looks like a trivial fixup is required.

[7/8] makes a big mess.  Can we please have a v4?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages
  2020-04-15 22:52       ` Andrew Morton
@ 2020-04-15 22:59         ` Kirill A. Shutemov
  0 siblings, 0 replies; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-15 22:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Kirill A. Shutemov, Andrea Arcangeli, Zi Yan, Yang Shi,
	Ralph Campbell, John Hubbard, William Kucharski, linux-mm,
	linux-kernel

On Wed, Apr 15, 2020 at 03:52:59PM -0700, Andrew Morton wrote:
> On Thu, 16 Apr 2020 00:52:05 +0300 "Kirill A. Shutemov" <kirill@shutemov.name> wrote:
> 
> > On Wed, Apr 15, 2020 at 02:44:26PM -0700, Andrew Morton wrote:
> > > On Mon, 13 Apr 2020 15:52:18 +0300 "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> > > 
> > > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > > handling them more than once: lock/unlock page only once if it's present
> > > > in the PMD range multiple times as it is handled on the compound level. The
> > > > same goes for LRU isolation and putback.
> > > > 
> > > > ...
> > > >
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -515,17 +515,30 @@ void __khugepaged_exit(struct mm_struct *mm)
> > > >  
> > > >  static void release_pte_page(struct page *page)
> > > >  {
> > > > -	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > > 
> > > I have
> > > 
> > > 	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
> > > 
> > > here.  Is there some prerequisite which I wasn't able to find?
> > 
> > The patchset is on top of v5.6 and the code has changed since then. See
> > 9de4f22a60f7 ("mm: code cleanup for MADV_FREE").
> > 
> > Looks like a trivial fixup is required.
> 
> [7/8] makes a big mess.  Can we please have a v4?

Sure. Give me a day.

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable
  2020-04-15 21:25   ` Yang Shi
@ 2020-04-16 15:52     ` Kirill A. Shutemov
  2020-04-16 18:25       ` Yang Shi
  0 siblings, 1 reply; 28+ messages in thread
From: Kirill A. Shutemov @ 2020-04-16 15:52 UTC (permalink / raw)
  To: Yang Shi
  Cc: Kirill A. Shutemov, akpm, Andrea Arcangeli, Zi Yan,
	Ralph Campbell, John Hubbard, William Kucharski, linux-mm,
	linux-kernel

On Wed, Apr 15, 2020 at 02:25:30PM -0700, Yang Shi wrote:
> > @@ -595,6 +626,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >   		VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +		if (page_mapcount(page) > 1 &&
> > +				++shared > khugepaged_max_ptes_shared) {
> > +			result = SCAN_EXCEED_SHARED_PTE;
> 
> It may be better to extract the check into a helper? For example, bool
> exceed_max_ptes_shared(struct page)?

It might be reasonable as part of a wider cleanup: the same has to be done
to khugepaged_max_ptes_none and khugepaged_max_ptes_swap.

Volunteers? :P
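
A minimal sketch of such a helper (illustrative only, not part of the posted
series; it also needs the caller's running counter, so the signature differs
slightly from the one proposed above):

	static inline bool exceed_max_ptes_shared(struct page *page, int *shared)
	{
		/* Count PTEs that map a page also mapped elsewhere */
		return page_mapcount(page) > 1 &&
		       ++*shared > khugepaged_max_ptes_shared;
	}

Both call sites would then reduce to

	if (exceed_max_ptes_shared(page, &shared)) {
		result = SCAN_EXCEED_SHARED_PTE;
		goto out;	/* or out_unmap in khugepaged_scan_pmd() */
	}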

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable
  2020-04-16 15:52     ` Kirill A. Shutemov
@ 2020-04-16 18:25       ` Yang Shi
  0 siblings, 0 replies; 28+ messages in thread
From: Yang Shi @ 2020-04-16 18:25 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Kirill A. Shutemov, akpm, Andrea Arcangeli, Zi Yan,
	Ralph Campbell, John Hubbard, William Kucharski, linux-mm,
	linux-kernel



On 4/16/20 8:52 AM, Kirill A. Shutemov wrote:
> On Wed, Apr 15, 2020 at 02:25:30PM -0700, Yang Shi wrote:
>>> @@ -595,6 +626,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>    		VM_BUG_ON_PAGE(!PageAnon(page), page);
>>> +		if (page_mapcount(page) > 1 &&
>>> +				++shared > khugepaged_max_ptes_shared) {
>>> +			result = SCAN_EXCEED_SHARED_PTE;
>> It may be better to extract the check into a helper? For example, bool
>> exceed_max_ptes_shared(struct page)?
> It might be reasonable as part of a wider cleanup: the same has to be done
> to khugepaged_max_ptes_none and khugepaged_max_ptes_swap.
>
> Volunteers? :P

Fine, I will take it. :-)

>


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2020-04-16 18:26 UTC | newest]

Thread overview: 28+ messages
2020-04-13 12:52 [PATCHv3, RESEND 0/8] thp/khugepaged improvements and CoW semantics Kirill A. Shutemov
2020-04-13 12:52 ` [PATCHv3, RESEND 1/8] khugepaged: Add self test Kirill A. Shutemov
2020-04-15 15:39   ` Zi Yan
2020-04-15 20:11     ` Yang Shi
2020-04-13 12:52 ` [PATCHv3, RESEND 2/8] khugepaged: Do not stop collapse if less than half PTEs are referenced Kirill A. Shutemov
2020-04-15 20:31   ` Yang Shi
2020-04-13 12:52 ` [PATCHv3, RESEND 3/8] khugepaged: Drain all LRU caches before scanning pages Kirill A. Shutemov
2020-04-15 20:31   ` Yang Shi
2020-04-13 12:52 ` [PATCHv3, RESEND 4/8] khugepaged: Drain LRU add pagevec after swapin Kirill A. Shutemov
2020-04-15 20:32   ` Yang Shi
2020-04-13 12:52 ` [PATCHv3, RESEND 5/8] khugepaged: Allow to collapse a page shared across fork Kirill A. Shutemov
2020-04-13 20:48   ` John Hubbard
2020-04-14 21:35     ` Kirill A. Shutemov
2020-04-15  2:42       ` John Hubbard
2020-04-15  2:43   ` John Hubbard
2020-04-15 20:39   ` Yang Shi
2020-04-13 12:52 ` [PATCHv3, RESEND 6/8] khugepaged: Allow to collapse PTE-mapped compound pages Kirill A. Shutemov
2020-04-15 20:46   ` Yang Shi
2020-04-15 21:44   ` Andrew Morton
2020-04-15 21:52     ` Kirill A. Shutemov
2020-04-15 22:52       ` Andrew Morton
2020-04-15 22:59         ` Kirill A. Shutemov
2020-04-13 12:52 ` [PATCHv3, RESEND 7/8] thp: Change CoW semantics for anon-THP Kirill A. Shutemov
2020-04-15 21:19   ` Yang Shi
2020-04-13 12:52 ` [PATCHv3, RESEND 8/8] khugepaged: Introduce 'max_ptes_shared' tunable Kirill A. Shutemov
2020-04-15 21:25   ` Yang Shi
2020-04-16 15:52     ` Kirill A. Shutemov
2020-04-16 18:25       ` Yang Shi
