linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.
@ 2021-03-15 20:33 Zi Yan
  2021-03-15 20:33 ` [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split Zi Yan
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Zi Yan @ 2021-03-15 20:33 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, linux-kselftest, Kirill A . Shutemov,
	Andrew Morton, Shuah Khan, John Hubbard, Sandipan Das,
	David Hildenbrand, Yang Shi, Mika Penttila, Zi Yan

From: Zi Yan <ziy@nvidia.com>

We did not have a direct user interface of splitting the compound page
backing a THP and there is no need unless we want to expose the THP
implementation details to users. Make <debugfs>/split_huge_pages accept
a new command to do that.

By writing "<pid>,<vaddr_start>,<vaddr_end>" to
<debugfs>/split_huge_pages, THPs within the given virtual address range
from the process with the given pid are split. It is used to test
split_huge_page function. In addition, a selftest program is added to
tools/testing/selftests/vm to utilize the interface by splitting
PMD THPs and PTE-mapped THPs.

This does not change the old behavior, i.e., writing 1 to the interface
to split all THPs in the system.

Changelog:

From v3:
1. Factored out split huge pages in the given pid code to a separate
   function.
2. Added the missing put_page for not split pages.
3. pr_debug -> pr_info, make reading results simpler.

From v2:

1. Reused existing <debugfs>/split_huge_pages interface. (suggested by
   Yang Shi)

From v1:

1. Removed unnecessary calling to vma_migratable, spotted by kernel test
   robot <lkp@intel.com>.
2. Dropped the use of find_mm_struct and code it directly, since there
   is no need for the permission check in that function and the function
   is only available when migration is on.
3. Added some comments in the selftest program to clarify how PTE-mapped
   THPs are formed.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/huge_memory.c                              | 136 +++++++-
 tools/testing/selftests/vm/.gitignore         |   1 +
 tools/testing/selftests/vm/Makefile           |   1 +
 .../selftests/vm/split_huge_page_test.c       | 313 ++++++++++++++++++
 4 files changed, 444 insertions(+), 7 deletions(-)
 create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bff92dea5ab3..3bfee54e2cd0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/highmem.h>
@@ -2922,16 +2923,14 @@ static struct shrinker deferred_split_shrinker = {
 };
 
 #ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
 {
 	struct zone *zone;
 	struct page *page;
 	unsigned long pfn, max_zone_pfn;
 	unsigned long total = 0, split = 0;
 
-	if (val != 1)
-		return -EINVAL;
-
+	pr_info("Split all THPs\n");
 	for_each_populated_zone(zone) {
 		max_zone_pfn = zone_end_pfn(zone);
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
@@ -2959,11 +2958,134 @@ static int split_huge_pages_set(void *data, u64 val)
 	}
 
 	pr_info("%lu of %lu THP split\n", split, total);
+}
 
-	return 0;
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+				unsigned long vaddr_end)
+{
+	int ret = 0;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned long total = 0, split = 0;
+	unsigned long addr;
+
+	vaddr_start &= PAGE_MASK;
+	vaddr_end &= PAGE_MASK;
+
+	/* Find the task_struct from pid */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (!task) {
+		rcu_read_unlock();
+		ret = -ESRCH;
+		goto out;
+	}
+	get_task_struct(task);
+	rcu_read_unlock();
+
+	/* Find the mm_struct */
+	mm = get_task_mm(task);
+	put_task_struct(task);
+
+	if (!mm) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pr_info("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+		 pid, vaddr_start, vaddr_end);
+
+	mmap_read_lock(mm);
+	/*
+	 * always increase addr by PAGE_SIZE, since we could have a PTE page
+	 * table filled with PTE-mapped THPs, each of which is distinct.
+	 */
+	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+		struct vm_area_struct *vma = find_vma(mm, addr);
+		unsigned int follflags;
+		struct page *page;
+
+		if (!vma || addr < vma->vm_start)
+			break;
+
+		/* FOLL_DUMP to ignore special (like zero) pages */
+		follflags = FOLL_GET | FOLL_DUMP;
+		page = follow_page(vma, addr, follflags);
+
+		if (IS_ERR(page))
+			break;
+		if (!page)
+			break;
+
+		if (!is_transparent_hugepage(page))
+			goto next;
+
+		total++;
+		if (!can_split_huge_page(compound_head(page), NULL))
+			goto next;
+
+		if (!trylock_page(page))
+			goto next;
+
+		if (!split_huge_page(page))
+			split++;
+
+		unlock_page(page);
+next:
+		put_page(page);
+	}
+	mmap_read_unlock(mm);
+	mmput(mm);
+
+	pr_info("%lu of %lu THP split\n", split, total);
+
+out:
+	return ret;
 }
-DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
-		"%llu\n");
+
+static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppops)
+{
+	static DEFINE_MUTEX(mutex);
+	ssize_t ret;
+	char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
+	int pid;
+	unsigned long vaddr_start, vaddr_end;
+
+	ret = mutex_lock_interruptible(&mutex);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+
+	memset(input_buf, 0, 80);
+	if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
+		goto out;
+
+	input_buf[79] = '\0';
+	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+	if (ret == 1 && pid == 1) {
+		split_huge_pages_all();
+		ret = strlen(input_buf);
+		goto out;
+	} else if (ret != 3) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!split_huge_pages_pid(pid, vaddr_start, vaddr_end))
+		ret = strlen(input_buf);
+out:
+	mutex_unlock(&mutex);
+	return ret;
+
+}
+
+static const struct file_operations split_huge_pages_fops = {
+	.owner	 = THIS_MODULE,
+	.write	 = split_huge_pages_write,
+	.llseek  = no_llseek,
+};
 
 static int __init split_huge_pages_debugfs(void)
 {
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 9a35c3f6a557..1f651e85ed60 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -22,3 +22,4 @@ map_fixed_noreplace
 write_to_hugetlbfs
 hmm-tests
 local_config.*
+split_huge_page_test
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index d42115e4284d..4cbc91d6869f 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += split_huge_page_test
 
 ifeq ($(MACHINE),x86_64)
 CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
new file mode 100644
index 000000000000..9f33ddbb3182
--- /dev/null
+++ b/tools/testing/selftests/vm/split_huge_page_test.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via <debugfs>/split_huge_pages interface.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <malloc.h>
+#include <stdbool.h>
+
+uint64_t pagesize;
+unsigned int pageshift;
+uint64_t pmd_pagesize;
+
+#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
+#define SMAP_PATH "/proc/self/smaps"
+#define INPUT_MAX 80
+
+#define PFN_MASK     ((1UL<<55)-1)
+#define KPF_THP      (1UL<<22)
+
+int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+{
+	uint64_t paddr;
+	uint64_t page_flags;
+
+	if (pagemap_file) {
+		pread(pagemap_file, &paddr, sizeof(paddr),
+			((long)vaddr >> pageshift) * sizeof(paddr));
+
+		if (kpageflags_file) {
+			pread(kpageflags_file, &page_flags, sizeof(page_flags),
+				(paddr & PFN_MASK) * sizeof(page_flags));
+
+			return !!(page_flags & KPF_THP);
+		}
+	}
+	return 0;
+}
+
+
+static uint64_t read_pmd_pagesize(void)
+{
+	int fd;
+	char buf[20];
+	ssize_t num_read;
+
+	fd = open(PMD_SIZE_PATH, O_RDONLY);
+	if (fd == -1) {
+		perror("Open hpage_pmd_size failed");
+		exit(EXIT_FAILURE);
+	}
+	num_read = read(fd, buf, 19);
+	if (num_read < 1) {
+		close(fd);
+		perror("Read hpage_pmd_size failed");
+		exit(EXIT_FAILURE);
+	}
+	buf[num_read] = '\0';
+	close(fd);
+
+	return strtoul(buf, NULL, 10);
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numwritten;
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1)
+		return 0;
+
+	numwritten = write(fd, buf, buflen - 1);
+	close(fd);
+	if (numwritten < 1)
+		return 0;
+
+	return (unsigned int) numwritten;
+}
+
+static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
+{
+	char input[INPUT_MAX];
+	int ret;
+
+	ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
+			vaddr_end);
+	if (ret >= INPUT_MAX) {
+		printf("%s: Debugfs input is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
+		perror(SPLIT_DEBUGFS);
+		exit(EXIT_FAILURE);
+	}
+}
+
+#define MAX_LINE_LENGTH 500
+
+static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
+{
+	while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
+		if (!strncmp(buf, pattern, strlen(pattern)))
+			return true;
+	}
+	return false;
+}
+
+static uint64_t check_huge(void *addr)
+{
+	uint64_t thp = 0;
+	int ret;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+	char addr_pattern[MAX_LINE_LENGTH];
+
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+		       (unsigned long) addr);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+
+	fp = fopen(SMAP_PATH, "r");
+	if (!fp) {
+		printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
+		exit(EXIT_FAILURE);
+	}
+	if (!check_for_pattern(fp, addr_pattern, buffer))
+		goto err_out;
+
+	/*
+	 * Fetch the AnonHugePages: in the same block and check the number of
+	 * hugepages.
+	 */
+	if (!check_for_pattern(fp, "AnonHugePages:", buffer))
+		goto err_out;
+
+	if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
+		printf("Reading smap error\n");
+		exit(EXIT_FAILURE);
+	}
+
+err_out:
+	fclose(fp);
+	return thp;
+}
+
+void split_pmd_thp(void)
+{
+	char *one_page;
+	size_t len = 4 * pmd_pagesize;
+	uint64_t thp_size;
+	size_t i;
+
+	one_page = memalign(pmd_pagesize, len);
+
+	madvise(one_page, len, MADV_HUGEPAGE);
+
+	for (i = 0; i < len; i++)
+		one_page[i] = (char)i;
+
+	thp_size = check_huge(one_page);
+	if (!thp_size) {
+		printf("No THP is allocatd");
+		exit(EXIT_FAILURE);
+	}
+
+	/* split all THPs */
+	write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
+
+	for (i = 0; i < len; i++)
+		if (one_page[i] != (char)i) {
+			printf("%ld byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+
+
+	thp_size = check_huge(one_page);
+	if (thp_size) {
+		printf("Still %ld kB AnonHugePages not split\n", thp_size);
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Split huge pages successful\n");
+	free(one_page);
+}
+
+void split_pte_mapped_thp(void)
+{
+	char *one_page, *pte_mapped, *pte_mapped2;
+	size_t len = 4 * pmd_pagesize;
+	uint64_t thp_size;
+	size_t i;
+	const char *pagemap_template = "/proc/%d/pagemap";
+	const char *kpageflags_proc = "/proc/kpageflags";
+	char pagemap_proc[255];
+	int pagemap_fd;
+	int kpageflags_fd;
+
+	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
+		perror("get pagemap proc error");
+		exit(EXIT_FAILURE);
+	}
+	pagemap_fd = open(pagemap_proc, O_RDONLY);
+
+	if (pagemap_fd == -1) {
+		perror("read pagemap:");
+		exit(EXIT_FAILURE);
+	}
+
+	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+
+	if (kpageflags_fd == -1) {
+		perror("read kpageflags:");
+		exit(EXIT_FAILURE);
+	}
+
+	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	madvise(one_page, len, MADV_HUGEPAGE);
+
+	for (i = 0; i < len; i++)
+		one_page[i] = (char)i;
+
+	thp_size = check_huge(one_page);
+	if (!thp_size) {
+		printf("No THP is allocatd");
+		exit(EXIT_FAILURE);
+	}
+
+	/* remap the first pagesize of first THP */
+	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
+
+	/* remap the Nth pagesize of Nth THP */
+	for (i = 1; i < 4; i++) {
+		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
+				     pagesize, pagesize,
+				     MREMAP_MAYMOVE|MREMAP_FIXED,
+				     pte_mapped + pagesize * i);
+		if (pte_mapped2 == (char *)-1) {
+			perror("mremap failed");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/* smap does not show THPs after mremap, use kpageflags instead */
+	thp_size = 0;
+	for (i = 0; i < pagesize * 4; i++)
+		if (i % pagesize == 0 &&
+		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+			thp_size++;
+
+	if (thp_size != 4) {
+		printf("Some THPs are missing during mremap\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* split all remapped THPs */
+	write_debugfs(getpid(), (uint64_t)pte_mapped,
+		      (uint64_t)pte_mapped + pagesize * 4);
+
+	/* smap does not show THPs after mremap, use kpageflags instead */
+	thp_size = 0;
+	for (i = 0; i < pagesize * 4; i++) {
+		if (pte_mapped[i] != (char)i) {
+			printf("%ld byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+		if (i % pagesize == 0 &&
+		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+			thp_size++;
+	}
+
+	if (thp_size) {
+		printf("Still %ld THPs not split\n", thp_size);
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Split PTE-mapped huge pages successful\n");
+	munmap(one_page, len);
+	close(pagemap_fd);
+	close(kpageflags_fd);
+}
+
+int main(int argc, char **argv)
+{
+	if (geteuid() != 0) {
+		printf("Please run the benchmark as root\n");
+		exit(EXIT_FAILURE);
+	}
+
+	pagesize = getpagesize();
+	pageshift = ffs(pagesize) - 1;
+	pmd_pagesize = read_pmd_pagesize();
+
+	split_pmd_thp();
+	split_pte_mapped_thp();
+
+	return 0;
+}
-- 
2.30.1



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split.
  2021-03-15 20:33 [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests Zi Yan
@ 2021-03-15 20:33 ` Zi Yan
  2021-03-16 23:18   ` Yang Shi
  2021-03-16  0:36 ` [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests kernel test robot
  2021-03-16 22:23 ` Yang Shi
  2 siblings, 1 reply; 9+ messages in thread
From: Zi Yan @ 2021-03-15 20:33 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, linux-kselftest, Kirill A . Shutemov,
	Andrew Morton, Shuah Khan, John Hubbard, Sandipan Das,
	David Hildenbrand, Yang Shi, Mika Penttila, Zi Yan

From: Zi Yan <ziy@nvidia.com>

Further extend <debugfs>/split_huge_pages to accept
"<path>,<off_start>,<off_end>" for file-backed THP split tests since
tmpfs may have file backed by THP that mapped nowhere.

Update selftest program to test file-backed THP split too.

Suggested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/huge_memory.c                              | 95 ++++++++++++++++++-
 .../selftests/vm/split_huge_page_test.c       | 79 ++++++++++++++-
 2 files changed, 166 insertions(+), 8 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3bfee54e2cd0..da91ee97d944 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3043,12 +3043,72 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 	return ret;
 }
 
+static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
+				pgoff_t off_end)
+{
+	struct filename *file;
+	struct file *candidate;
+	struct address_space *mapping;
+	int ret = -EINVAL;
+	pgoff_t off_cur;
+	unsigned long total = 0, split = 0;
+
+	file = getname_kernel(file_path);
+	if (IS_ERR(file))
+		return ret;
+
+	candidate = file_open_name(file, O_RDONLY, 0);
+	if (IS_ERR(candidate))
+		goto out;
+
+	pr_info("split file-backed THPs in file: %s, offset: [0x%lx - 0x%lx]\n",
+		 file_path, off_start, off_end);
+
+	mapping = candidate->f_mapping;
+
+	for (off_cur = off_start; off_cur < off_end;) {
+		struct page *fpage = pagecache_get_page(mapping, off_cur,
+						FGP_ENTRY | FGP_HEAD, 0);
+
+		if (xa_is_value(fpage) || !fpage) {
+			off_cur += PAGE_SIZE;
+			continue;
+		}
+
+		if (!is_transparent_hugepage(fpage)) {
+			off_cur += PAGE_SIZE;
+			goto next;
+		}
+		total++;
+		off_cur = fpage->index + thp_size(fpage);
+
+		if (!trylock_page(fpage))
+			goto next;
+
+		if (!split_huge_page(fpage))
+			split++;
+
+		unlock_page(fpage);
+next:
+		put_page(fpage);
+	}
+
+	filp_close(candidate, NULL);
+	ret = 0;
+
+	pr_info("%lu of %lu file-backed THP split\n", split, total);
+out:
+	putname(file);
+	return ret;
+}
+
 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppops)
 {
 	static DEFINE_MUTEX(mutex);
 	ssize_t ret;
-	char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
+	/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+	char input_buf[MAX_INPUT];
 	int pid;
 	unsigned long vaddr_start, vaddr_end;
 
@@ -3058,11 +3118,40 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
 
 	ret = -EFAULT;
 
-	memset(input_buf, 0, 80);
+	memset(input_buf, 0, MAX_INPUT);
 	if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
 		goto out;
 
-	input_buf[79] = '\0';
+	input_buf[MAX_INPUT - 1] = '\0';
+
+	if (input_buf[0] == '/') {
+		char *tok;
+		char *buf = input_buf;
+		char file_path[MAX_INPUT];
+		pgoff_t off_start = 0, off_end = 0;
+		size_t input_len = strlen(input_buf);
+
+		tok = strsep(&buf, ",");
+		if (tok) {
+			strncpy(file_path, tok, MAX_INPUT);
+		} else {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
+		if (ret != 2) {
+			pr_info("ret: %ld\n", ret);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = split_huge_pages_in_file(file_path, off_start, off_end);
+		if (!ret)
+			ret = input_len;
+
+		goto out;
+	}
+
 	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
 	if (ret == 1 && pid == 1) {
 		split_huge_pages_all();
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
index 9f33ddbb3182..0202702f7eda 100644
--- a/tools/testing/selftests/vm/split_huge_page_test.c
+++ b/tools/testing/selftests/vm/split_huge_page_test.c
@@ -7,11 +7,13 @@
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdarg.h>
 #include <unistd.h>
 #include <inttypes.h>
 #include <string.h>
 #include <fcntl.h>
 #include <sys/mman.h>
+#include <sys/mount.h>
 #include <malloc.h>
 #include <stdbool.h>
 
@@ -24,6 +26,9 @@ uint64_t pmd_pagesize;
 #define SMAP_PATH "/proc/self/smaps"
 #define INPUT_MAX 80
 
+#define PID_FMT "%d,0x%lx,0x%lx"
+#define PATH_FMT "%s,0x%lx,0x%lx"
+
 #define PFN_MASK     ((1UL<<55)-1)
 #define KPF_THP      (1UL<<22)
 
@@ -87,13 +92,16 @@ static int write_file(const char *path, const char *buf, size_t buflen)
 	return (unsigned int) numwritten;
 }
 
-static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
+static void write_debugfs(const char *fmt, ...)
 {
 	char input[INPUT_MAX];
 	int ret;
+	va_list argp;
+
+	va_start(argp, fmt);
+	ret = vsnprintf(input, INPUT_MAX, fmt, argp);
+	va_end(argp);
 
-	ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
-			vaddr_end);
 	if (ret >= INPUT_MAX) {
 		printf("%s: Debugfs input is too long\n", __func__);
 		exit(EXIT_FAILURE);
@@ -178,7 +186,8 @@ void split_pmd_thp(void)
 	}
 
 	/* split all THPs */
-	write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
+	write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+		(uint64_t)one_page + len);
 
 	for (i = 0; i < len; i++)
 		if (one_page[i] != (char)i) {
@@ -269,7 +278,7 @@ void split_pte_mapped_thp(void)
 	}
 
 	/* split all remapped THPs */
-	write_debugfs(getpid(), (uint64_t)pte_mapped,
+	write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
 		      (uint64_t)pte_mapped + pagesize * 4);
 
 	/* smap does not show THPs after mremap, use kpageflags instead */
@@ -295,6 +304,65 @@ void split_pte_mapped_thp(void)
 	close(kpageflags_fd);
 }
 
+void split_file_backed_thp(void)
+{
+	int status;
+	int fd;
+	ssize_t num_written;
+	char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
+	const char *tmpfs_loc = mkdtemp(tmpfs_template);
+	char testfile[INPUT_MAX];
+
+	status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
+
+	if (status) {
+		printf("Unable to create a tmpfs for testing\n");
+		exit(EXIT_FAILURE);
+	}
+
+	status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
+	if (status >= INPUT_MAX) {
+		printf("Fail to create file-backed THP split testing file\n");
+		goto cleanup;
+	}
+
+	fd = open(testfile, O_CREAT|O_WRONLY);
+	if (fd == -1) {
+		perror("Cannot open testing file\n");
+		goto cleanup;
+	}
+
+	/* write something to the file, so a file-backed THP can be allocated */
+	num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc));
+	close(fd);
+
+	if (num_written < 1) {
+		printf("Fail to write data to testing file\n");
+		goto cleanup;
+	}
+
+	/* split the file-backed THP */
+	write_debugfs(PATH_FMT, testfile, 0, 1024);
+
+	status = unlink(testfile);
+	if (status)
+		perror("Cannot remove testing file\n");
+
+cleanup:
+	status = umount(tmpfs_loc);
+	if (status) {
+		printf("Unable to umount %s\n", tmpfs_loc);
+		exit(EXIT_FAILURE);
+	}
+	status = rmdir(tmpfs_loc);
+	if (status) {
+		perror("cannot remove tmp dir");
+		exit(EXIT_FAILURE);
+	}
+
+	printf("file-backed THP split test done, please check dmesg for more information\n");
+}
+
 int main(int argc, char **argv)
 {
 	if (geteuid() != 0) {
@@ -308,6 +376,7 @@ int main(int argc, char **argv)
 
 	split_pmd_thp();
 	split_pte_mapped_thp();
+	split_file_backed_thp();
 
 	return 0;
 }
-- 
2.30.1



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.
  2021-03-15 20:33 [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests Zi Yan
  2021-03-15 20:33 ` [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split Zi Yan
@ 2021-03-16  0:36 ` kernel test robot
  2021-03-16  1:10   ` Zi Yan
  2021-03-16 22:23 ` Yang Shi
  2 siblings, 1 reply; 9+ messages in thread
From: kernel test robot @ 2021-03-16  0:36 UTC (permalink / raw)
  To: Zi Yan, linux-mm
  Cc: kbuild-all, linux-kernel, linux-kselftest, Kirill A . Shutemov,
	Andrew Morton, Linux Memory Management List, Shuah Khan,
	John Hubbard, Sandipan Das, David Hildenbrand

[-- Attachment #1: Type: text/plain, Size: 2184 bytes --]

Hi Zi,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on kselftest/next]
[also build test WARNING on linux/master linus/master v5.12-rc3]
[cannot apply to hnaz-linux-mm/master next-20210315]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Zi-Yan/mm-huge_memory-a-new-debugfs-interface-for-splitting-THP-tests/20210316-043528
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
config: i386-randconfig-m021-20210315 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

smatch warnings:
mm/huge_memory.c:3086 split_huge_pages_write() warn: sscanf doesn't return error codes

vim +3086 mm/huge_memory.c

  3051	
  3052	static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
  3053					size_t count, loff_t *ppops)
  3054	{
  3055		static DEFINE_MUTEX(mutex);
  3056		ssize_t ret;
  3057		char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
  3058		int pid;
  3059		unsigned long vaddr_start, vaddr_end;
  3060	
  3061		ret = mutex_lock_interruptible(&mutex);
  3062		if (ret)
  3063			return ret;
  3064	
  3065		ret = -EFAULT;
  3066	
  3067		memset(input_buf, 0, 80);
  3068		if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
  3069			goto out;
  3070	
  3071		input_buf[79] = '\0';
  3072		ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
  3073		if (ret == 1 && pid == 1) {
  3074			split_huge_pages_all();
  3075			ret = strlen(input_buf);
  3076			goto out;
  3077		} else if (ret != 3) {
  3078			ret = -EINVAL;
  3079			goto out;
  3080		}
  3081	
  3082		if (!split_huge_pages_pid(pid, vaddr_start, vaddr_end))
  3083			ret = strlen(input_buf);
  3084	out:
  3085		mutex_unlock(&mutex);
> 3086		return ret;
  3087	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 32434 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.
  2021-03-16  0:36 ` [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests kernel test robot
@ 2021-03-16  1:10   ` Zi Yan
  0 siblings, 0 replies; 9+ messages in thread
From: Zi Yan @ 2021-03-16  1:10 UTC (permalink / raw)
  To: kernel test robot
  Cc: Zi Yan, linux-mm, kbuild-all, linux-kernel, linux-kselftest,
	Kirill A . Shutemov, Andrew Morton, Shuah Khan, John Hubbard,
	Sandipan Das, David Hildenbrand

[-- Attachment #1: Type: text/plain, Size: 2537 bytes --]

On 15 Mar 2021, at 20:36, kernel test robot wrote:

> Hi Zi,
>
> Thank you for the patch! Perhaps something to improve:
>
> [auto build test WARNING on kselftest/next]
> [also build test WARNING on linux/master linus/master v5.12-rc3]
> [cannot apply to hnaz-linux-mm/master next-20210315]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
>
> url:    https://github.com/0day-ci/linux/commits/Zi-Yan/mm-huge_memory-a-new-debugfs-interface-for-splitting-THP-tests/20210316-043528
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
> config: i386-randconfig-m021-20210315 (attached as .config)
> compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
>
> smatch warnings:
> mm/huge_memory.c:3086 split_huge_pages_write() warn: sscanf doesn't return error codes
>
> vim +3086 mm/huge_memory.c
>
>   3051	
>   3052	static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
>   3053					size_t count, loff_t *ppops)
>   3054	{
>   3055		static DEFINE_MUTEX(mutex);
>   3056		ssize_t ret;
>   3057		char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
>   3058		int pid;
>   3059		unsigned long vaddr_start, vaddr_end;
>   3060	
>   3061		ret = mutex_lock_interruptible(&mutex);
>   3062		if (ret)
>   3063			return ret;
>   3064	
>   3065		ret = -EFAULT;
>   3066	
>   3067		memset(input_buf, 0, 80);
>   3068		if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>   3069			goto out;
>   3070	
>   3071		input_buf[79] = '\0';
>   3072		ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>   3073		if (ret == 1 && pid == 1) {
>   3074			split_huge_pages_all();
>   3075			ret = strlen(input_buf);
>   3076			goto out;
>   3077		} else if (ret != 3) {
>   3078			ret = -EINVAL;
>   3079			goto out;
>   3080		}
>   3081	
>   3082		if (!split_huge_pages_pid(pid, vaddr_start, vaddr_end))
>   3083			ret = strlen(input_buf);

Change this to:

ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
if (!ret)
	ret = strlen(input_buf);

should fix the warning. I will resend after I get feedback for the patches.


>   3084	out:
>   3085		mutex_unlock(&mutex);
>> 3086		return ret;
>   3087	





—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.
  2021-03-15 20:33 [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests Zi Yan
  2021-03-15 20:33 ` [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split Zi Yan
  2021-03-16  0:36 ` [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests kernel test robot
@ 2021-03-16 22:23 ` Yang Shi
  2021-03-17 14:51   ` Zi Yan
  2 siblings, 1 reply; 9+ messages in thread
From: Yang Shi @ 2021-03-16 22:23 UTC (permalink / raw)
  To: Zi Yan
  Cc: Linux MM, Linux Kernel Mailing List, linux-kselftest,
	Kirill A . Shutemov, Andrew Morton, Shuah Khan, John Hubbard,
	Sandipan Das, David Hildenbrand, Mika Penttila

On Mon, Mar 15, 2021 at 1:34 PM Zi Yan <zi.yan@sent.com> wrote:
>
> From: Zi Yan <ziy@nvidia.com>
>
> We did not have a direct user interface of splitting the compound page
> backing a THP and there is no need unless we want to expose the THP
> implementation details to users. Make <debugfs>/split_huge_pages accept
> a new command to do that.
>
> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
> <debugfs>/split_huge_pages, THPs within the given virtual address range
> from the process with the given pid are split. It is used to test
> split_huge_page function. In addition, a selftest program is added to
> tools/testing/selftests/vm to utilize the interface by splitting
> PMD THPs and PTE-mapped THPs.
>
> This does not change the old behavior, i.e., writing 1 to the interface
> to split all THPs in the system.
>
> Changelog:
>
> From v3:
> 1. Factored out split huge pages in the given pid code to a separate
>    function.
> 2. Added the missing put_page for not split pages.
> 3. pr_debug -> pr_info, make reading results simpler.
>
> From v2:
>
> 1. Reused existing <debugfs>/split_huge_pages interface. (suggested by
>    Yang Shi)
>
> From v1:
>
> 1. Removed unnecessary calling to vma_migratable, spotted by kernel test
>    robot <lkp@intel.com>.
> 2. Dropped the use of find_mm_struct and code it directly, since there
>    is no need for the permission check in that function and the function
>    is only available when migration is on.
> 3. Added some comments in the selftest program to clarify how PTE-mapped
>    THPs are formed.
>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>  mm/huge_memory.c                              | 136 +++++++-
>  tools/testing/selftests/vm/.gitignore         |   1 +
>  tools/testing/selftests/vm/Makefile           |   1 +
>  .../selftests/vm/split_huge_page_test.c       | 313 ++++++++++++++++++
>  4 files changed, 444 insertions(+), 7 deletions(-)
>  create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index bff92dea5ab3..3bfee54e2cd0 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -7,6 +7,7 @@
>
>  #include <linux/mm.h>
>  #include <linux/sched.h>
> +#include <linux/sched/mm.h>
>  #include <linux/sched/coredump.h>
>  #include <linux/sched/numa_balancing.h>
>  #include <linux/highmem.h>
> @@ -2922,16 +2923,14 @@ static struct shrinker deferred_split_shrinker = {
>  };
>
>  #ifdef CONFIG_DEBUG_FS
> -static int split_huge_pages_set(void *data, u64 val)
> +static void split_huge_pages_all(void)
>  {
>         struct zone *zone;
>         struct page *page;
>         unsigned long pfn, max_zone_pfn;
>         unsigned long total = 0, split = 0;
>
> -       if (val != 1)
> -               return -EINVAL;
> -
> +       pr_info("Split all THPs\n");
>         for_each_populated_zone(zone) {
>                 max_zone_pfn = zone_end_pfn(zone);
>                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
> @@ -2959,11 +2958,134 @@ static int split_huge_pages_set(void *data, u64 val)
>         }
>
>         pr_info("%lu of %lu THP split\n", split, total);
> +}
>
> -       return 0;
> +static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> +                               unsigned long vaddr_end)
> +{
> +       int ret = 0;
> +       struct task_struct *task;
> +       struct mm_struct *mm;
> +       unsigned long total = 0, split = 0;
> +       unsigned long addr;
> +
> +       vaddr_start &= PAGE_MASK;
> +       vaddr_end &= PAGE_MASK;
> +
> +       /* Find the task_struct from pid */
> +       rcu_read_lock();
> +       task = find_task_by_vpid(pid);
> +       if (!task) {
> +               rcu_read_unlock();
> +               ret = -ESRCH;
> +               goto out;
> +       }
> +       get_task_struct(task);
> +       rcu_read_unlock();
> +
> +       /* Find the mm_struct */
> +       mm = get_task_mm(task);
> +       put_task_struct(task);
> +
> +       if (!mm) {
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +
> +       pr_info("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
> +                pid, vaddr_start, vaddr_end);
> +
> +       mmap_read_lock(mm);
> +       /*
> +        * always increase addr by PAGE_SIZE, since we could have a PTE page
> +        * table filled with PTE-mapped THPs, each of which is distinct.
> +        */
> +       for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
> +               struct vm_area_struct *vma = find_vma(mm, addr);
> +               unsigned int follflags;
> +               struct page *page;
> +
> +               if (!vma || addr < vma->vm_start)
> +                       break;

I think you could skip some special vmas, i.e. VM_HUGETLB, VM_PFNMAP,
VM_MIXEDMAP and VM_IO, this should save some time for some cases.

> +
> +               /* FOLL_DUMP to ignore special (like zero) pages */
> +               follflags = FOLL_GET | FOLL_DUMP;
> +               page = follow_page(vma, addr, follflags);
> +
> +               if (IS_ERR(page))
> +                       break;
> +               if (!page)
> +                       break;

Shouldn't use continue here and in the above IS_ERR branch? The addr
might be not mapped, but the next one might be mapped.

> +
> +               if (!is_transparent_hugepage(page))
> +                       goto next;
> +
> +               total++;
> +               if (!can_split_huge_page(compound_head(page), NULL))
> +                       goto next;
> +
> +               if (!trylock_page(page))
> +                       goto next;
> +
> +               if (!split_huge_page(page))
> +                       split++;
> +
> +               unlock_page(page);
> +next:
> +               put_page(page);
> +       }
> +       mmap_read_unlock(mm);
> +       mmput(mm);
> +
> +       pr_info("%lu of %lu THP split\n", split, total);
> +
> +out:
> +       return ret;
>  }
> -DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
> -               "%llu\n");
> +
> +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
> +                               size_t count, loff_t *ppops)
> +{
> +       static DEFINE_MUTEX(mutex);

May have a more descriptive name for the mutex?

> +       ssize_t ret;
> +       char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
> +       int pid;
> +       unsigned long vaddr_start, vaddr_end;
> +
> +       ret = mutex_lock_interruptible(&mutex);
> +       if (ret)
> +               return ret;
> +
> +       ret = -EFAULT;
> +
> +       memset(input_buf, 0, 80);
> +       if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
> +               goto out;
> +
> +       input_buf[79] = '\0';
> +       ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
> +       if (ret == 1 && pid == 1) {
> +               split_huge_pages_all();
> +               ret = strlen(input_buf);
> +               goto out;
> +       } else if (ret != 3) {
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +
> +       if (!split_huge_pages_pid(pid, vaddr_start, vaddr_end))
> +               ret = strlen(input_buf);

Why not return the errno from split_huge_pages_pid()? The
split_huge_pages_pid() may return -ESRCH or -EINVAL.

> +out:
> +       mutex_unlock(&mutex);
> +       return ret;
> +
> +}
> +
> +static const struct file_operations split_huge_pages_fops = {
> +       .owner   = THIS_MODULE,
> +       .write   = split_huge_pages_write,
> +       .llseek  = no_llseek,
> +};
>
>  static int __init split_huge_pages_debugfs(void)
>  {
> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
> index 9a35c3f6a557..1f651e85ed60 100644
> --- a/tools/testing/selftests/vm/.gitignore
> +++ b/tools/testing/selftests/vm/.gitignore
> @@ -22,3 +22,4 @@ map_fixed_noreplace
>  write_to_hugetlbfs
>  hmm-tests
>  local_config.*
> +split_huge_page_test
> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
> index d42115e4284d..4cbc91d6869f 100644
> --- a/tools/testing/selftests/vm/Makefile
> +++ b/tools/testing/selftests/vm/Makefile
> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
>  TEST_GEN_FILES += thuge-gen
>  TEST_GEN_FILES += transhuge-stress
>  TEST_GEN_FILES += userfaultfd
> +TEST_GEN_FILES += split_huge_page_test
>
>  ifeq ($(MACHINE),x86_64)
>  CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
> new file mode 100644
> index 000000000000..9f33ddbb3182
> --- /dev/null
> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
> @@ -0,0 +1,313 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
> + * address range in a process via <debugfs>/split_huge_pages interface.
> + */
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <inttypes.h>
> +#include <string.h>
> +#include <fcntl.h>
> +#include <sys/mman.h>
> +#include <malloc.h>
> +#include <stdbool.h>
> +
> +uint64_t pagesize;
> +unsigned int pageshift;
> +uint64_t pmd_pagesize;
> +
> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
> +#define SMAP_PATH "/proc/self/smaps"
> +#define INPUT_MAX 80
> +
> +#define PFN_MASK     ((1UL<<55)-1)
> +#define KPF_THP      (1UL<<22)
> +
> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
> +{
> +       uint64_t paddr;
> +       uint64_t page_flags;
> +
> +       if (pagemap_file) {
> +               pread(pagemap_file, &paddr, sizeof(paddr),
> +                       ((long)vaddr >> pageshift) * sizeof(paddr));
> +
> +               if (kpageflags_file) {
> +                       pread(kpageflags_file, &page_flags, sizeof(page_flags),
> +                               (paddr & PFN_MASK) * sizeof(page_flags));
> +
> +                       return !!(page_flags & KPF_THP);
> +               }
> +       }
> +       return 0;
> +}
> +
> +
> +static uint64_t read_pmd_pagesize(void)
> +{
> +       int fd;
> +       char buf[20];
> +       ssize_t num_read;
> +
> +       fd = open(PMD_SIZE_PATH, O_RDONLY);
> +       if (fd == -1) {
> +               perror("Open hpage_pmd_size failed");
> +               exit(EXIT_FAILURE);
> +       }
> +       num_read = read(fd, buf, 19);
> +       if (num_read < 1) {
> +               close(fd);
> +               perror("Read hpage_pmd_size failed");
> +               exit(EXIT_FAILURE);
> +       }
> +       buf[num_read] = '\0';
> +       close(fd);
> +
> +       return strtoul(buf, NULL, 10);
> +}
> +
> +static int write_file(const char *path, const char *buf, size_t buflen)
> +{
> +       int fd;
> +       ssize_t numwritten;
> +
> +       fd = open(path, O_WRONLY);
> +       if (fd == -1)
> +               return 0;
> +
> +       numwritten = write(fd, buf, buflen - 1);
> +       close(fd);
> +       if (numwritten < 1)
> +               return 0;
> +
> +       return (unsigned int) numwritten;
> +}
> +
> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
> +{
> +       char input[INPUT_MAX];
> +       int ret;
> +
> +       ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
> +                       vaddr_end);
> +       if (ret >= INPUT_MAX) {
> +               printf("%s: Debugfs input is too long\n", __func__);
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
> +               perror(SPLIT_DEBUGFS);
> +               exit(EXIT_FAILURE);
> +       }
> +}
> +
> +#define MAX_LINE_LENGTH 500
> +
> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
> +{
> +       while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
> +               if (!strncmp(buf, pattern, strlen(pattern)))
> +                       return true;
> +       }
> +       return false;
> +}
> +
> +static uint64_t check_huge(void *addr)
> +{
> +       uint64_t thp = 0;
> +       int ret;
> +       FILE *fp;
> +       char buffer[MAX_LINE_LENGTH];
> +       char addr_pattern[MAX_LINE_LENGTH];
> +
> +       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
> +                      (unsigned long) addr);
> +       if (ret >= MAX_LINE_LENGTH) {
> +               printf("%s: Pattern is too long\n", __func__);
> +               exit(EXIT_FAILURE);
> +       }
> +
> +
> +       fp = fopen(SMAP_PATH, "r");
> +       if (!fp) {
> +               printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
> +               exit(EXIT_FAILURE);
> +       }
> +       if (!check_for_pattern(fp, addr_pattern, buffer))
> +               goto err_out;
> +
> +       /*
> +        * Fetch the AnonHugePages: in the same block and check the number of
> +        * hugepages.
> +        */
> +       if (!check_for_pattern(fp, "AnonHugePages:", buffer))
> +               goto err_out;
> +
> +       if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
> +               printf("Reading smap error\n");
> +               exit(EXIT_FAILURE);
> +       }
> +
> +err_out:
> +       fclose(fp);
> +       return thp;
> +}
> +
> +void split_pmd_thp(void)
> +{
> +       char *one_page;
> +       size_t len = 4 * pmd_pagesize;
> +       uint64_t thp_size;
> +       size_t i;
> +
> +       one_page = memalign(pmd_pagesize, len);

Need to check if one_page is NULL.

> +
> +       madvise(one_page, len, MADV_HUGEPAGE);
> +
> +       for (i = 0; i < len; i++)
> +               one_page[i] = (char)i;
> +
> +       thp_size = check_huge(one_page);
> +       if (!thp_size) {
> +               printf("No THP is allocatd");

s/allocted/allocated

> +               exit(EXIT_FAILURE);
> +       }
> +
> +       /* split all THPs */
> +       write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
> +
> +       for (i = 0; i < len; i++)
> +               if (one_page[i] != (char)i) {
> +                       printf("%ld byte corrupted\n", i);
> +                       exit(EXIT_FAILURE);
> +               }
> +
> +
> +       thp_size = check_huge(one_page);
> +       if (thp_size) {
> +               printf("Still %ld kB AnonHugePages not split\n", thp_size);
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       printf("Split huge pages successful\n");
> +       free(one_page);
> +}
> +
> +void split_pte_mapped_thp(void)
> +{
> +       char *one_page, *pte_mapped, *pte_mapped2;
> +       size_t len = 4 * pmd_pagesize;
> +       uint64_t thp_size;
> +       size_t i;
> +       const char *pagemap_template = "/proc/%d/pagemap";
> +       const char *kpageflags_proc = "/proc/kpageflags";
> +       char pagemap_proc[255];
> +       int pagemap_fd;
> +       int kpageflags_fd;
> +
> +       if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
> +               perror("get pagemap proc error");
> +               exit(EXIT_FAILURE);
> +       }
> +       pagemap_fd = open(pagemap_proc, O_RDONLY);
> +
> +       if (pagemap_fd == -1) {
> +               perror("read pagemap:");
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       kpageflags_fd = open(kpageflags_proc, O_RDONLY);
> +
> +       if (kpageflags_fd == -1) {
> +               perror("read kpageflags:");
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
> +                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> +
> +       madvise(one_page, len, MADV_HUGEPAGE);
> +
> +       for (i = 0; i < len; i++)
> +               one_page[i] = (char)i;
> +
> +       thp_size = check_huge(one_page);
> +       if (!thp_size) {
> +               printf("No THP is allocatd");

s/allocatd/allocated

> +               exit(EXIT_FAILURE);
> +       }
> +
> +       /* remap the first pagesize of first THP */
> +       pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
> +
> +       /* remap the Nth pagesize of Nth THP */
> +       for (i = 1; i < 4; i++) {
> +               pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
> +                                    pagesize, pagesize,
> +                                    MREMAP_MAYMOVE|MREMAP_FIXED,
> +                                    pte_mapped + pagesize * i);
> +               if (pte_mapped2 == (char *)-1) {
> +                       perror("mremap failed");
> +                       exit(EXIT_FAILURE);
> +               }
> +       }
> +
> +       /* smap does not show THPs after mremap, use kpageflags instead */
> +       thp_size = 0;
> +       for (i = 0; i < pagesize * 4; i++)
> +               if (i % pagesize == 0 &&
> +                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
> +                       thp_size++;
> +
> +       if (thp_size != 4) {
> +               printf("Some THPs are missing during mremap\n");
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       /* split all remapped THPs */
> +       write_debugfs(getpid(), (uint64_t)pte_mapped,
> +                     (uint64_t)pte_mapped + pagesize * 4);
> +
> +       /* smap does not show THPs after mremap, use kpageflags instead */
> +       thp_size = 0;
> +       for (i = 0; i < pagesize * 4; i++) {
> +               if (pte_mapped[i] != (char)i) {
> +                       printf("%ld byte corrupted\n", i);
> +                       exit(EXIT_FAILURE);
> +               }
> +               if (i % pagesize == 0 &&
> +                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
> +                       thp_size++;
> +       }
> +
> +       if (thp_size) {
> +               printf("Still %ld THPs not split\n", thp_size);
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       printf("Split PTE-mapped huge pages successful\n");
> +       munmap(one_page, len);
> +       close(pagemap_fd);
> +       close(kpageflags_fd);
> +}
> +
> +int main(int argc, char **argv)
> +{
> +       if (geteuid() != 0) {
> +               printf("Please run the benchmark as root\n");
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       pagesize = getpagesize();
> +       pageshift = ffs(pagesize) - 1;
> +       pmd_pagesize = read_pmd_pagesize();
> +
> +       split_pmd_thp();
> +       split_pte_mapped_thp();
> +
> +       return 0;
> +}
> --
> 2.30.1
>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split.
  2021-03-15 20:33 ` [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split Zi Yan
@ 2021-03-16 23:18   ` Yang Shi
  2021-03-17 15:00     ` Zi Yan
  0 siblings, 1 reply; 9+ messages in thread
From: Yang Shi @ 2021-03-16 23:18 UTC (permalink / raw)
  To: Zi Yan
  Cc: Linux MM, Linux Kernel Mailing List, linux-kselftest,
	Kirill A . Shutemov, Andrew Morton, Shuah Khan, John Hubbard,
	Sandipan Das, David Hildenbrand, Mika Penttila

On Mon, Mar 15, 2021 at 1:34 PM Zi Yan <zi.yan@sent.com> wrote:
>
> From: Zi Yan <ziy@nvidia.com>
>
> Further extend <debugfs>/split_huge_pages to accept
> "<path>,<off_start>,<off_end>" for file-backed THP split tests since
> tmpfs may have file backed by THP that mapped nowhere.
>
> Update selftest program to test file-backed THP split too.
>
> Suggested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>  mm/huge_memory.c                              | 95 ++++++++++++++++++-
>  .../selftests/vm/split_huge_page_test.c       | 79 ++++++++++++++-
>  2 files changed, 166 insertions(+), 8 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 3bfee54e2cd0..da91ee97d944 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3043,12 +3043,72 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>         return ret;
>  }
>
> +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
> +                               pgoff_t off_end)
> +{
> +       struct filename *file;
> +       struct file *candidate;
> +       struct address_space *mapping;
> +       int ret = -EINVAL;
> +       pgoff_t off_cur;
> +       unsigned long total = 0, split = 0;
> +
> +       file = getname_kernel(file_path);
> +       if (IS_ERR(file))
> +               return ret;
> +
> +       candidate = file_open_name(file, O_RDONLY, 0);
> +       if (IS_ERR(candidate))
> +               goto out;
> +
> +       pr_info("split file-backed THPs in file: %s, offset: [0x%lx - 0x%lx]\n",
> +                file_path, off_start, off_end);
> +
> +       mapping = candidate->f_mapping;
> +
> +       for (off_cur = off_start; off_cur < off_end;) {
> +               struct page *fpage = pagecache_get_page(mapping, off_cur,
> +                                               FGP_ENTRY | FGP_HEAD, 0);
> +
> +               if (xa_is_value(fpage) || !fpage) {

Why do you have FGP_ENTRY? It seems it would return page instead of
NULL if page is value. So I think you could remove FGP_ENTRY and
xa_is_value() check as well.


> +                       off_cur += PAGE_SIZE;
> +                       continue;
> +               }
> +
> +               if (!is_transparent_hugepage(fpage)) {
> +                       off_cur += PAGE_SIZE;
> +                       goto next;
> +               }
> +               total++;
> +               off_cur = fpage->index + thp_size(fpage);
> +
> +               if (!trylock_page(fpage))
> +                       goto next;
> +
> +               if (!split_huge_page(fpage))
> +                       split++;
> +
> +               unlock_page(fpage);
> +next:
> +               put_page(fpage);
> +       }
> +
> +       filp_close(candidate, NULL);
> +       ret = 0;
> +
> +       pr_info("%lu of %lu file-backed THP split\n", split, total);
> +out:
> +       putname(file);
> +       return ret;
> +}
> +
>  static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
>                                 size_t count, loff_t *ppops)
>  {
>         static DEFINE_MUTEX(mutex);
>         ssize_t ret;
> -       char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
> +       /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
> +       char input_buf[MAX_INPUT];

I didn't find where MAX_INPUT is defined in your patch. Just saw
include/uapi/linux/limits.h have it defined. Is it the one you really
refer to?

>         int pid;
>         unsigned long vaddr_start, vaddr_end;
>
> @@ -3058,11 +3118,40 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
>
>         ret = -EFAULT;
>
> -       memset(input_buf, 0, 80);
> +       memset(input_buf, 0, MAX_INPUT);
>         if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>                 goto out;
>
> -       input_buf[79] = '\0';
> +       input_buf[MAX_INPUT - 1] = '\0';
> +
> +       if (input_buf[0] == '/') {
> +               char *tok;
> +               char *buf = input_buf;
> +               char file_path[MAX_INPUT];
> +               pgoff_t off_start = 0, off_end = 0;
> +               size_t input_len = strlen(input_buf);
> +
> +               tok = strsep(&buf, ",");
> +               if (tok) {
> +                       strncpy(file_path, tok, MAX_INPUT);
> +               } else {
> +                       ret = -EINVAL;
> +                       goto out;
> +               }
> +
> +               ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
> +               if (ret != 2) {
> +                       pr_info("ret: %ld\n", ret);
> +                       ret = -EINVAL;
> +                       goto out;
> +               }
> +               ret = split_huge_pages_in_file(file_path, off_start, off_end);
> +               if (!ret)
> +                       ret = input_len;
> +
> +               goto out;
> +       }
> +
>         ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>         if (ret == 1 && pid == 1) {
>                 split_huge_pages_all();
> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
> index 9f33ddbb3182..0202702f7eda 100644
> --- a/tools/testing/selftests/vm/split_huge_page_test.c
> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
> @@ -7,11 +7,13 @@
>  #define _GNU_SOURCE
>  #include <stdio.h>
>  #include <stdlib.h>
> +#include <stdarg.h>
>  #include <unistd.h>
>  #include <inttypes.h>
>  #include <string.h>
>  #include <fcntl.h>
>  #include <sys/mman.h>
> +#include <sys/mount.h>
>  #include <malloc.h>
>  #include <stdbool.h>
>
> @@ -24,6 +26,9 @@ uint64_t pmd_pagesize;
>  #define SMAP_PATH "/proc/self/smaps"
>  #define INPUT_MAX 80
>
> +#define PID_FMT "%d,0x%lx,0x%lx"
> +#define PATH_FMT "%s,0x%lx,0x%lx"
> +
>  #define PFN_MASK     ((1UL<<55)-1)
>  #define KPF_THP      (1UL<<22)
>
> @@ -87,13 +92,16 @@ static int write_file(const char *path, const char *buf, size_t buflen)
>         return (unsigned int) numwritten;
>  }
>
> -static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
> +static void write_debugfs(const char *fmt, ...)
>  {
>         char input[INPUT_MAX];
>         int ret;
> +       va_list argp;
> +
> +       va_start(argp, fmt);
> +       ret = vsnprintf(input, INPUT_MAX, fmt, argp);
> +       va_end(argp);
>
> -       ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
> -                       vaddr_end);
>         if (ret >= INPUT_MAX) {
>                 printf("%s: Debugfs input is too long\n", __func__);
>                 exit(EXIT_FAILURE);
> @@ -178,7 +186,8 @@ void split_pmd_thp(void)
>         }
>
>         /* split all THPs */
> -       write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
> +       write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
> +               (uint64_t)one_page + len);
>
>         for (i = 0; i < len; i++)
>                 if (one_page[i] != (char)i) {
> @@ -269,7 +278,7 @@ void split_pte_mapped_thp(void)
>         }
>
>         /* split all remapped THPs */
> -       write_debugfs(getpid(), (uint64_t)pte_mapped,
> +       write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
>                       (uint64_t)pte_mapped + pagesize * 4);
>
>         /* smap does not show THPs after mremap, use kpageflags instead */
> @@ -295,6 +304,65 @@ void split_pte_mapped_thp(void)
>         close(kpageflags_fd);
>  }
>
> +void split_file_backed_thp(void)
> +{
> +       int status;
> +       int fd;
> +       ssize_t num_written;
> +       char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
> +       const char *tmpfs_loc = mkdtemp(tmpfs_template);
> +       char testfile[INPUT_MAX];
> +
> +       status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
> +
> +       if (status) {
> +               printf("Unable to create a tmpfs for testing\n");
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
> +       if (status >= INPUT_MAX) {
> +               printf("Fail to create file-backed THP split testing file\n");
> +               goto cleanup;
> +       }
> +
> +       fd = open(testfile, O_CREAT|O_WRONLY);
> +       if (fd == -1) {
> +               perror("Cannot open testing file\n");
> +               goto cleanup;
> +       }
> +
> +       /* write something to the file, so a file-backed THP can be allocated */
> +       num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc));
> +       close(fd);
> +
> +       if (num_written < 1) {
> +               printf("Fail to write data to testing file\n");
> +               goto cleanup;
> +       }
> +
> +       /* split the file-backed THP */
> +       write_debugfs(PATH_FMT, testfile, 0, 1024);
> +
> +       status = unlink(testfile);
> +       if (status)
> +               perror("Cannot remove testing file\n");
> +
> +cleanup:
> +       status = umount(tmpfs_loc);
> +       if (status) {
> +               printf("Unable to umount %s\n", tmpfs_loc);
> +               exit(EXIT_FAILURE);
> +       }
> +       status = rmdir(tmpfs_loc);
> +       if (status) {
> +               perror("cannot remove tmp dir");
> +               exit(EXIT_FAILURE);
> +       }
> +
> +       printf("file-backed THP split test done, please check dmesg for more information\n");
> +}
> +
>  int main(int argc, char **argv)
>  {
>         if (geteuid() != 0) {
> @@ -308,6 +376,7 @@ int main(int argc, char **argv)
>
>         split_pmd_thp();
>         split_pte_mapped_thp();
> +       split_file_backed_thp();
>
>         return 0;
>  }
> --
> 2.30.1
>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.
  2021-03-16 22:23 ` Yang Shi
@ 2021-03-17 14:51   ` Zi Yan
  0 siblings, 0 replies; 9+ messages in thread
From: Zi Yan @ 2021-03-17 14:51 UTC (permalink / raw)
  To: Yang Shi
  Cc: Linux MM, Linux Kernel Mailing List, linux-kselftest,
	Kirill A . Shutemov, Andrew Morton, Shuah Khan, John Hubbard,
	Sandipan Das, David Hildenbrand, Mika Penttila

[-- Attachment #1: Type: text/plain, Size: 20377 bytes --]

On 16 Mar 2021, at 18:23, Yang Shi wrote:

> On Mon, Mar 15, 2021 at 1:34 PM Zi Yan <zi.yan@sent.com> wrote:
>>
>> From: Zi Yan <ziy@nvidia.com>
>>
>> We did not have a direct user interface of splitting the compound page
>> backing a THP and there is no need unless we want to expose the THP
>> implementation details to users. Make <debugfs>/split_huge_pages accept
>> a new command to do that.
>>
>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>> <debugfs>/split_huge_pages, THPs within the given virtual address range
>> from the process with the given pid are split. It is used to test
>> split_huge_page function. In addition, a selftest program is added to
>> tools/testing/selftests/vm to utilize the interface by splitting
>> PMD THPs and PTE-mapped THPs.
>>
>> This does not change the old behavior, i.e., writing 1 to the interface
>> to split all THPs in the system.
>>
>> Changelog:
>>
>> From v3:
>> 1. Factored out split huge pages in the given pid code to a separate
>>    function.
>> 2. Added the missing put_page for not split pages.
>> 3. pr_debug -> pr_info, make reading results simpler.
>>
>> From v2:
>>
>> 1. Reused existing <debugfs>/split_huge_pages interface. (suggested by
>>    Yang Shi)
>>
>> From v1:
>>
>> 1. Removed unnecessary calling to vma_migratable, spotted by kernel test
>>    robot <lkp@intel.com>.
>> 2. Dropped the use of find_mm_struct and code it directly, since there
>>    is no need for the permission check in that function and the function
>>    is only available when migration is on.
>> 3. Added some comments in the selftest program to clarify how PTE-mapped
>>    THPs are formed.
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>  mm/huge_memory.c                              | 136 +++++++-
>>  tools/testing/selftests/vm/.gitignore         |   1 +
>>  tools/testing/selftests/vm/Makefile           |   1 +
>>  .../selftests/vm/split_huge_page_test.c       | 313 ++++++++++++++++++
>>  4 files changed, 444 insertions(+), 7 deletions(-)
>>  create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index bff92dea5ab3..3bfee54e2cd0 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -7,6 +7,7 @@
>>
>>  #include <linux/mm.h>
>>  #include <linux/sched.h>
>> +#include <linux/sched/mm.h>
>>  #include <linux/sched/coredump.h>
>>  #include <linux/sched/numa_balancing.h>
>>  #include <linux/highmem.h>
>> @@ -2922,16 +2923,14 @@ static struct shrinker deferred_split_shrinker = {
>>  };
>>
>>  #ifdef CONFIG_DEBUG_FS
>> -static int split_huge_pages_set(void *data, u64 val)
>> +static void split_huge_pages_all(void)
>>  {
>>         struct zone *zone;
>>         struct page *page;
>>         unsigned long pfn, max_zone_pfn;
>>         unsigned long total = 0, split = 0;
>>
>> -       if (val != 1)
>> -               return -EINVAL;
>> -
>> +       pr_info("Split all THPs\n");
>>         for_each_populated_zone(zone) {
>>                 max_zone_pfn = zone_end_pfn(zone);
>>                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
>> @@ -2959,11 +2958,134 @@ static int split_huge_pages_set(void *data, u64 val)
>>         }
>>
>>         pr_info("%lu of %lu THP split\n", split, total);
>> +}
>>
>> -       return 0;
>> +static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>> +                               unsigned long vaddr_end)
>> +{
>> +       int ret = 0;
>> +       struct task_struct *task;
>> +       struct mm_struct *mm;
>> +       unsigned long total = 0, split = 0;
>> +       unsigned long addr;
>> +
>> +       vaddr_start &= PAGE_MASK;
>> +       vaddr_end &= PAGE_MASK;
>> +
>> +       /* Find the task_struct from pid */
>> +       rcu_read_lock();
>> +       task = find_task_by_vpid(pid);
>> +       if (!task) {
>> +               rcu_read_unlock();
>> +               ret = -ESRCH;
>> +               goto out;
>> +       }
>> +       get_task_struct(task);
>> +       rcu_read_unlock();
>> +
>> +       /* Find the mm_struct */
>> +       mm = get_task_mm(task);
>> +       put_task_struct(task);
>> +
>> +       if (!mm) {
>> +               ret = -EINVAL;
>> +               goto out;
>> +       }
>> +
>> +       pr_info("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
>> +                pid, vaddr_start, vaddr_end);
>> +
>> +       mmap_read_lock(mm);
>> +       /*
>> +        * always increase addr by PAGE_SIZE, since we could have a PTE page
>> +        * table filled with PTE-mapped THPs, each of which is distinct.
>> +        */
>> +       for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
>> +               struct vm_area_struct *vma = find_vma(mm, addr);
>> +               unsigned int follflags;
>> +               struct page *page;
>> +
>> +               if (!vma || addr < vma->vm_start)
>> +                       break;
>
> I think you could skip some special vmas, i.e. VM_HUGETLB, VM_PFNMAP,
> VM_MIXEDMAP and VM_IO, this should save some time for some cases.
>

Sure. Will do.

>> +
>> +               /* FOLL_DUMP to ignore special (like zero) pages */
>> +               follflags = FOLL_GET | FOLL_DUMP;
>> +               page = follow_page(vma, addr, follflags);
>> +
>> +               if (IS_ERR(page))
>> +                       break;
>> +               if (!page)
>> +                       break;
>
> Shouldn't use continue here and in the above IS_ERR branch? The addr
> might be not mapped, but the next one might be mapped.
>

My original intention was just to test split on known THP ranges, so
the code did not anticipate any error from follow_page. But you point
makes sense. Will let the split process continue even if there is an
error or unmapped pages.

>> +
>> +               if (!is_transparent_hugepage(page))
>> +                       goto next;
>> +
>> +               total++;
>> +               if (!can_split_huge_page(compound_head(page), NULL))
>> +                       goto next;
>> +
>> +               if (!trylock_page(page))
>> +                       goto next;
>> +
>> +               if (!split_huge_page(page))
>> +                       split++;
>> +
>> +               unlock_page(page);
>> +next:
>> +               put_page(page);
>> +       }
>> +       mmap_read_unlock(mm);
>> +       mmput(mm);
>> +
>> +       pr_info("%lu of %lu THP split\n", split, total);
>> +
>> +out:
>> +       return ret;
>>  }
>> -DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
>> -               "%llu\n");
>> +
>> +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
>> +                               size_t count, loff_t *ppops)
>> +{
>> +       static DEFINE_MUTEX(mutex);
>
> May have a more descriptive name for the mutex?

Sure. split_debug_mutex.

>
>> +       ssize_t ret;
>> +       char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
>> +       int pid;
>> +       unsigned long vaddr_start, vaddr_end;
>> +
>> +       ret = mutex_lock_interruptible(&mutex);
>> +       if (ret)
>> +               return ret;
>> +
>> +       ret = -EFAULT;
>> +
>> +       memset(input_buf, 0, 80);
>> +       if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>> +               goto out;
>> +
>> +       input_buf[79] = '\0';
>> +       ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>> +       if (ret == 1 && pid == 1) {
>> +               split_huge_pages_all();
>> +               ret = strlen(input_buf);
>> +               goto out;
>> +       } else if (ret != 3) {
>> +               ret = -EINVAL;
>> +               goto out;
>> +       }
>> +
>> +       if (!split_huge_pages_pid(pid, vaddr_start, vaddr_end))
>> +               ret = strlen(input_buf);
>
> Why not return the errno from split_huge_pages_pid()? The
> split_huge_pages_pid() may return -ESRCH or -EINVAL.

You are right. This is also causing a return error code issue spotted
by the bot. I am changing it to return error code from split_huge_pages_pid.

>
>> +out:
>> +       mutex_unlock(&mutex);
>> +       return ret;
>> +
>> +}
>> +
>> +static const struct file_operations split_huge_pages_fops = {
>> +       .owner   = THIS_MODULE,
>> +       .write   = split_huge_pages_write,
>> +       .llseek  = no_llseek,
>> +};
>>
>>  static int __init split_huge_pages_debugfs(void)
>>  {
>> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
>> index 9a35c3f6a557..1f651e85ed60 100644
>> --- a/tools/testing/selftests/vm/.gitignore
>> +++ b/tools/testing/selftests/vm/.gitignore
>> @@ -22,3 +22,4 @@ map_fixed_noreplace
>>  write_to_hugetlbfs
>>  hmm-tests
>>  local_config.*
>> +split_huge_page_test
>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
>> index d42115e4284d..4cbc91d6869f 100644
>> --- a/tools/testing/selftests/vm/Makefile
>> +++ b/tools/testing/selftests/vm/Makefile
>> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
>>  TEST_GEN_FILES += thuge-gen
>>  TEST_GEN_FILES += transhuge-stress
>>  TEST_GEN_FILES += userfaultfd
>> +TEST_GEN_FILES += split_huge_page_test
>>
>>  ifeq ($(MACHINE),x86_64)
>>  CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
>> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
>> new file mode 100644
>> index 000000000000..9f33ddbb3182
>> --- /dev/null
>> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
>> @@ -0,0 +1,313 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
>> + * address range in a process via <debugfs>/split_huge_pages interface.
>> + */
>> +
>> +#define _GNU_SOURCE
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <unistd.h>
>> +#include <inttypes.h>
>> +#include <string.h>
>> +#include <fcntl.h>
>> +#include <sys/mman.h>
>> +#include <malloc.h>
>> +#include <stdbool.h>
>> +
>> +uint64_t pagesize;
>> +unsigned int pageshift;
>> +uint64_t pmd_pagesize;
>> +
>> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
>> +#define SMAP_PATH "/proc/self/smaps"
>> +#define INPUT_MAX 80
>> +
>> +#define PFN_MASK     ((1UL<<55)-1)
>> +#define KPF_THP      (1UL<<22)
>> +
>> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
>> +{
>> +       uint64_t paddr;
>> +       uint64_t page_flags;
>> +
>> +       if (pagemap_file) {
>> +               pread(pagemap_file, &paddr, sizeof(paddr),
>> +                       ((long)vaddr >> pageshift) * sizeof(paddr));
>> +
>> +               if (kpageflags_file) {
>> +                       pread(kpageflags_file, &page_flags, sizeof(page_flags),
>> +                               (paddr & PFN_MASK) * sizeof(page_flags));
>> +
>> +                       return !!(page_flags & KPF_THP);
>> +               }
>> +       }
>> +       return 0;
>> +}
>> +
>> +
>> +static uint64_t read_pmd_pagesize(void)
>> +{
>> +       int fd;
>> +       char buf[20];
>> +       ssize_t num_read;
>> +
>> +       fd = open(PMD_SIZE_PATH, O_RDONLY);
>> +       if (fd == -1) {
>> +               perror("Open hpage_pmd_size failed");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +       num_read = read(fd, buf, 19);
>> +       if (num_read < 1) {
>> +               close(fd);
>> +               perror("Read hpage_pmd_size failed");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +       buf[num_read] = '\0';
>> +       close(fd);
>> +
>> +       return strtoul(buf, NULL, 10);
>> +}
>> +
>> +static int write_file(const char *path, const char *buf, size_t buflen)
>> +{
>> +       int fd;
>> +       ssize_t numwritten;
>> +
>> +       fd = open(path, O_WRONLY);
>> +       if (fd == -1)
>> +               return 0;
>> +
>> +       numwritten = write(fd, buf, buflen - 1);
>> +       close(fd);
>> +       if (numwritten < 1)
>> +               return 0;
>> +
>> +       return (unsigned int) numwritten;
>> +}
>> +
>> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
>> +{
>> +       char input[INPUT_MAX];
>> +       int ret;
>> +
>> +       ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
>> +                       vaddr_end);
>> +       if (ret >= INPUT_MAX) {
>> +               printf("%s: Debugfs input is too long\n", __func__);
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
>> +               perror(SPLIT_DEBUGFS);
>> +               exit(EXIT_FAILURE);
>> +       }
>> +}
>> +
>> +#define MAX_LINE_LENGTH 500
>> +
>> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
>> +{
>> +       while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
>> +               if (!strncmp(buf, pattern, strlen(pattern)))
>> +                       return true;
>> +       }
>> +       return false;
>> +}
>> +
>> +static uint64_t check_huge(void *addr)
>> +{
>> +       uint64_t thp = 0;
>> +       int ret;
>> +       FILE *fp;
>> +       char buffer[MAX_LINE_LENGTH];
>> +       char addr_pattern[MAX_LINE_LENGTH];
>> +
>> +       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
>> +                      (unsigned long) addr);
>> +       if (ret >= MAX_LINE_LENGTH) {
>> +               printf("%s: Pattern is too long\n", __func__);
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +
>> +       fp = fopen(SMAP_PATH, "r");
>> +       if (!fp) {
>> +               printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
>> +               exit(EXIT_FAILURE);
>> +       }
>> +       if (!check_for_pattern(fp, addr_pattern, buffer))
>> +               goto err_out;
>> +
>> +       /*
>> +        * Fetch the AnonHugePages: in the same block and check the number of
>> +        * hugepages.
>> +        */
>> +       if (!check_for_pattern(fp, "AnonHugePages:", buffer))
>> +               goto err_out;
>> +
>> +       if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
>> +               printf("Reading smap error\n");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +err_out:
>> +       fclose(fp);
>> +       return thp;
>> +}
>> +
>> +void split_pmd_thp(void)
>> +{
>> +       char *one_page;
>> +       size_t len = 4 * pmd_pagesize;
>> +       uint64_t thp_size;
>> +       size_t i;
>> +
>> +       one_page = memalign(pmd_pagesize, len);
>
> Need to check if one_page is NULL.

Right. Thanks.

>
>> +
>> +       madvise(one_page, len, MADV_HUGEPAGE);
>> +
>> +       for (i = 0; i < len; i++)
>> +               one_page[i] = (char)i;
>> +
>> +       thp_size = check_huge(one_page);
>> +       if (!thp_size) {
>> +               printf("No THP is allocatd");
>
> s/allocted/allocated

Will change.
>
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       /* split all THPs */
>> +       write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
>> +
>> +       for (i = 0; i < len; i++)
>> +               if (one_page[i] != (char)i) {
>> +                       printf("%ld byte corrupted\n", i);
>> +                       exit(EXIT_FAILURE);
>> +               }
>> +
>> +
>> +       thp_size = check_huge(one_page);
>> +       if (thp_size) {
>> +               printf("Still %ld kB AnonHugePages not split\n", thp_size);
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       printf("Split huge pages successful\n");
>> +       free(one_page);
>> +}
>> +
>> +void split_pte_mapped_thp(void)
>> +{
>> +       char *one_page, *pte_mapped, *pte_mapped2;
>> +       size_t len = 4 * pmd_pagesize;
>> +       uint64_t thp_size;
>> +       size_t i;
>> +       const char *pagemap_template = "/proc/%d/pagemap";
>> +       const char *kpageflags_proc = "/proc/kpageflags";
>> +       char pagemap_proc[255];
>> +       int pagemap_fd;
>> +       int kpageflags_fd;
>> +
>> +       if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
>> +               perror("get pagemap proc error");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +       pagemap_fd = open(pagemap_proc, O_RDONLY);
>> +
>> +       if (pagemap_fd == -1) {
>> +               perror("read pagemap:");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       kpageflags_fd = open(kpageflags_proc, O_RDONLY);
>> +
>> +       if (kpageflags_fd == -1) {
>> +               perror("read kpageflags:");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
>> +                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>> +
>> +       madvise(one_page, len, MADV_HUGEPAGE);
>> +
>> +       for (i = 0; i < len; i++)
>> +               one_page[i] = (char)i;
>> +
>> +       thp_size = check_huge(one_page);
>> +       if (!thp_size) {
>> +               printf("No THP is allocatd");
>
> s/allocatd/allocated

Will change.
>
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       /* remap the first pagesize of first THP */
>> +       pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
>> +
>> +       /* remap the Nth pagesize of Nth THP */
>> +       for (i = 1; i < 4; i++) {
>> +               pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
>> +                                    pagesize, pagesize,
>> +                                    MREMAP_MAYMOVE|MREMAP_FIXED,
>> +                                    pte_mapped + pagesize * i);
>> +               if (pte_mapped2 == (char *)-1) {
>> +                       perror("mremap failed");
>> +                       exit(EXIT_FAILURE);
>> +               }
>> +       }
>> +
>> +       /* smap does not show THPs after mremap, use kpageflags instead */
>> +       thp_size = 0;
>> +       for (i = 0; i < pagesize * 4; i++)
>> +               if (i % pagesize == 0 &&
>> +                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>> +                       thp_size++;
>> +
>> +       if (thp_size != 4) {
>> +               printf("Some THPs are missing during mremap\n");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       /* split all remapped THPs */
>> +       write_debugfs(getpid(), (uint64_t)pte_mapped,
>> +                     (uint64_t)pte_mapped + pagesize * 4);
>> +
>> +       /* smap does not show THPs after mremap, use kpageflags instead */
>> +       thp_size = 0;
>> +       for (i = 0; i < pagesize * 4; i++) {
>> +               if (pte_mapped[i] != (char)i) {
>> +                       printf("%ld byte corrupted\n", i);
>> +                       exit(EXIT_FAILURE);
>> +               }
>> +               if (i % pagesize == 0 &&
>> +                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>> +                       thp_size++;
>> +       }
>> +
>> +       if (thp_size) {
>> +               printf("Still %ld THPs not split\n", thp_size);
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       printf("Split PTE-mapped huge pages successful\n");
>> +       munmap(one_page, len);
>> +       close(pagemap_fd);
>> +       close(kpageflags_fd);
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> +       if (geteuid() != 0) {
>> +               printf("Please run the benchmark as root\n");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       pagesize = getpagesize();
>> +       pageshift = ffs(pagesize) - 1;
>> +       pmd_pagesize = read_pmd_pagesize();
>> +
>> +       split_pmd_thp();
>> +       split_pte_mapped_thp();
>> +
>> +       return 0;
>> +}
>> --
>> 2.30.1
>>

Thanks for your review and comments.

—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split.
  2021-03-16 23:18   ` Yang Shi
@ 2021-03-17 15:00     ` Zi Yan
  2021-03-17 16:34       ` Yang Shi
  0 siblings, 1 reply; 9+ messages in thread
From: Zi Yan @ 2021-03-17 15:00 UTC (permalink / raw)
  To: Yang Shi
  Cc: Linux MM, Linux Kernel Mailing List, linux-kselftest,
	Kirill A . Shutemov, Andrew Morton, Shuah Khan, John Hubbard,
	Sandipan Das, David Hildenbrand, Mika Penttila

[-- Attachment #1: Type: text/plain, Size: 10849 bytes --]

On 16 Mar 2021, at 19:18, Yang Shi wrote:

> On Mon, Mar 15, 2021 at 1:34 PM Zi Yan <zi.yan@sent.com> wrote:
>>
>> From: Zi Yan <ziy@nvidia.com>
>>
>> Further extend <debugfs>/split_huge_pages to accept
>> "<path>,<off_start>,<off_end>" for file-backed THP split tests since
>> tmpfs may have file backed by THP that mapped nowhere.
>>
>> Update selftest program to test file-backed THP split too.
>>
>> Suggested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>  mm/huge_memory.c                              | 95 ++++++++++++++++++-
>>  .../selftests/vm/split_huge_page_test.c       | 79 ++++++++++++++-
>>  2 files changed, 166 insertions(+), 8 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 3bfee54e2cd0..da91ee97d944 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3043,12 +3043,72 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>         return ret;
>>  }
>>
>> +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
>> +                               pgoff_t off_end)
>> +{
>> +       struct filename *file;
>> +       struct file *candidate;
>> +       struct address_space *mapping;
>> +       int ret = -EINVAL;
>> +       pgoff_t off_cur;
>> +       unsigned long total = 0, split = 0;
>> +
>> +       file = getname_kernel(file_path);
>> +       if (IS_ERR(file))
>> +               return ret;
>> +
>> +       candidate = file_open_name(file, O_RDONLY, 0);
>> +       if (IS_ERR(candidate))
>> +               goto out;
>> +
>> +       pr_info("split file-backed THPs in file: %s, offset: [0x%lx - 0x%lx]\n",
>> +                file_path, off_start, off_end);
>> +
>> +       mapping = candidate->f_mapping;
>> +
>> +       for (off_cur = off_start; off_cur < off_end;) {
>> +               struct page *fpage = pagecache_get_page(mapping, off_cur,
>> +                                               FGP_ENTRY | FGP_HEAD, 0);
>> +
>> +               if (xa_is_value(fpage) || !fpage) {
>
> Why do you have FGP_ENTRY? It seems it would return page instead of
> NULL if page is value. So I think you could remove FGP_ENTRY and
> xa_is_value() check as well.

The comment on FGP_ENTRY says “If there is a shadow/swap/DAX entry, return
it instead of allocating a new page to replace it”. I do not think we
want to allocate new pages here. I mostly follow the use of pagecache_get_page()
in shmem_getpage_gfp without swapin or allocating new pages.

>
>> +                       off_cur += PAGE_SIZE;
>> +                       continue;
>> +               }
>> +
>> +               if (!is_transparent_hugepage(fpage)) {
>> +                       off_cur += PAGE_SIZE;
>> +                       goto next;
>> +               }
>> +               total++;
>> +               off_cur = fpage->index + thp_size(fpage);
>> +
>> +               if (!trylock_page(fpage))
>> +                       goto next;
>> +
>> +               if (!split_huge_page(fpage))
>> +                       split++;
>> +
>> +               unlock_page(fpage);
>> +next:
>> +               put_page(fpage);
>> +       }
>> +
>> +       filp_close(candidate, NULL);
>> +       ret = 0;
>> +
>> +       pr_info("%lu of %lu file-backed THP split\n", split, total);
>> +out:
>> +       putname(file);
>> +       return ret;
>> +}
>> +
>>  static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
>>                                 size_t count, loff_t *ppops)
>>  {
>>         static DEFINE_MUTEX(mutex);
>>         ssize_t ret;
>> -       char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
>> +       /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
>> +       char input_buf[MAX_INPUT];
>
> I didn't find where MAX_INPUT is defined in your patch. Just saw
> include/uapi/linux/limits.h have it defined. Is it the one you really
> refer to?

Yeah, I want to use 255 as the max input size and find MAX_INPUT. From your comment,
I think it is better to define a MACRO here explicitly.


>>         int pid;
>>         unsigned long vaddr_start, vaddr_end;
>>
>> @@ -3058,11 +3118,40 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
>>
>>         ret = -EFAULT;
>>
>> -       memset(input_buf, 0, 80);
>> +       memset(input_buf, 0, MAX_INPUT);
>>         if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>>                 goto out;
>>
>> -       input_buf[79] = '\0';
>> +       input_buf[MAX_INPUT - 1] = '\0';
>> +
>> +       if (input_buf[0] == '/') {
>> +               char *tok;
>> +               char *buf = input_buf;
>> +               char file_path[MAX_INPUT];
>> +               pgoff_t off_start = 0, off_end = 0;
>> +               size_t input_len = strlen(input_buf);
>> +
>> +               tok = strsep(&buf, ",");
>> +               if (tok) {
>> +                       strncpy(file_path, tok, MAX_INPUT);
>> +               } else {
>> +                       ret = -EINVAL;
>> +                       goto out;
>> +               }
>> +
>> +               ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
>> +               if (ret != 2) {
>> +                       pr_info("ret: %ld\n", ret);
>> +                       ret = -EINVAL;
>> +                       goto out;
>> +               }
>> +               ret = split_huge_pages_in_file(file_path, off_start, off_end);
>> +               if (!ret)
>> +                       ret = input_len;
>> +
>> +               goto out;
>> +       }
>> +
>>         ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>>         if (ret == 1 && pid == 1) {
>>                 split_huge_pages_all();
>> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
>> index 9f33ddbb3182..0202702f7eda 100644
>> --- a/tools/testing/selftests/vm/split_huge_page_test.c
>> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
>> @@ -7,11 +7,13 @@
>>  #define _GNU_SOURCE
>>  #include <stdio.h>
>>  #include <stdlib.h>
>> +#include <stdarg.h>
>>  #include <unistd.h>
>>  #include <inttypes.h>
>>  #include <string.h>
>>  #include <fcntl.h>
>>  #include <sys/mman.h>
>> +#include <sys/mount.h>
>>  #include <malloc.h>
>>  #include <stdbool.h>
>>
>> @@ -24,6 +26,9 @@ uint64_t pmd_pagesize;
>>  #define SMAP_PATH "/proc/self/smaps"
>>  #define INPUT_MAX 80
>>
>> +#define PID_FMT "%d,0x%lx,0x%lx"
>> +#define PATH_FMT "%s,0x%lx,0x%lx"
>> +
>>  #define PFN_MASK     ((1UL<<55)-1)
>>  #define KPF_THP      (1UL<<22)
>>
>> @@ -87,13 +92,16 @@ static int write_file(const char *path, const char *buf, size_t buflen)
>>         return (unsigned int) numwritten;
>>  }
>>
>> -static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
>> +static void write_debugfs(const char *fmt, ...)
>>  {
>>         char input[INPUT_MAX];
>>         int ret;
>> +       va_list argp;
>> +
>> +       va_start(argp, fmt);
>> +       ret = vsnprintf(input, INPUT_MAX, fmt, argp);
>> +       va_end(argp);
>>
>> -       ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
>> -                       vaddr_end);
>>         if (ret >= INPUT_MAX) {
>>                 printf("%s: Debugfs input is too long\n", __func__);
>>                 exit(EXIT_FAILURE);
>> @@ -178,7 +186,8 @@ void split_pmd_thp(void)
>>         }
>>
>>         /* split all THPs */
>> -       write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
>> +       write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
>> +               (uint64_t)one_page + len);
>>
>>         for (i = 0; i < len; i++)
>>                 if (one_page[i] != (char)i) {
>> @@ -269,7 +278,7 @@ void split_pte_mapped_thp(void)
>>         }
>>
>>         /* split all remapped THPs */
>> -       write_debugfs(getpid(), (uint64_t)pte_mapped,
>> +       write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
>>                       (uint64_t)pte_mapped + pagesize * 4);
>>
>>         /* smap does not show THPs after mremap, use kpageflags instead */
>> @@ -295,6 +304,65 @@ void split_pte_mapped_thp(void)
>>         close(kpageflags_fd);
>>  }
>>
>> +void split_file_backed_thp(void)
>> +{
>> +       int status;
>> +       int fd;
>> +       ssize_t num_written;
>> +       char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
>> +       const char *tmpfs_loc = mkdtemp(tmpfs_template);
>> +       char testfile[INPUT_MAX];
>> +
>> +       status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
>> +
>> +       if (status) {
>> +               printf("Unable to create a tmpfs for testing\n");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
>> +       if (status >= INPUT_MAX) {
>> +               printf("Fail to create file-backed THP split testing file\n");
>> +               goto cleanup;
>> +       }
>> +
>> +       fd = open(testfile, O_CREAT|O_WRONLY);
>> +       if (fd == -1) {
>> +               perror("Cannot open testing file\n");
>> +               goto cleanup;
>> +       }
>> +
>> +       /* write something to the file, so a file-backed THP can be allocated */
>> +       num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc));
>> +       close(fd);
>> +
>> +       if (num_written < 1) {
>> +               printf("Fail to write data to testing file\n");
>> +               goto cleanup;
>> +       }
>> +
>> +       /* split the file-backed THP */
>> +       write_debugfs(PATH_FMT, testfile, 0, 1024);
>> +
>> +       status = unlink(testfile);
>> +       if (status)
>> +               perror("Cannot remove testing file\n");
>> +
>> +cleanup:
>> +       status = umount(tmpfs_loc);
>> +       if (status) {
>> +               printf("Unable to umount %s\n", tmpfs_loc);
>> +               exit(EXIT_FAILURE);
>> +       }
>> +       status = rmdir(tmpfs_loc);
>> +       if (status) {
>> +               perror("cannot remove tmp dir");
>> +               exit(EXIT_FAILURE);
>> +       }
>> +
>> +       printf("file-backed THP split test done, please check dmesg for more information\n");
>> +}
>> +
>>  int main(int argc, char **argv)
>>  {
>>         if (geteuid() != 0) {
>> @@ -308,6 +376,7 @@ int main(int argc, char **argv)
>>
>>         split_pmd_thp();
>>         split_pte_mapped_thp();
>> +       split_file_backed_thp();
>>
>>         return 0;
>>  }
>> --
>> 2.30.1
>>

Thanks for the comments. :)

—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split.
  2021-03-17 15:00     ` Zi Yan
@ 2021-03-17 16:34       ` Yang Shi
  0 siblings, 0 replies; 9+ messages in thread
From: Yang Shi @ 2021-03-17 16:34 UTC (permalink / raw)
  To: Zi Yan
  Cc: Linux MM, Linux Kernel Mailing List, linux-kselftest,
	Kirill A . Shutemov, Andrew Morton, Shuah Khan, John Hubbard,
	Sandipan Das, David Hildenbrand, Mika Penttila

On Wed, Mar 17, 2021 at 8:00 AM Zi Yan <ziy@nvidia.com> wrote:
>
> On 16 Mar 2021, at 19:18, Yang Shi wrote:
>
> > On Mon, Mar 15, 2021 at 1:34 PM Zi Yan <zi.yan@sent.com> wrote:
> >>
> >> From: Zi Yan <ziy@nvidia.com>
> >>
> >> Further extend <debugfs>/split_huge_pages to accept
> >> "<path>,<off_start>,<off_end>" for file-backed THP split tests since
> >> tmpfs may have file backed by THP that mapped nowhere.
> >>
> >> Update selftest program to test file-backed THP split too.
> >>
> >> Suggested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> >> Signed-off-by: Zi Yan <ziy@nvidia.com>
> >> ---
> >>  mm/huge_memory.c                              | 95 ++++++++++++++++++-
> >>  .../selftests/vm/split_huge_page_test.c       | 79 ++++++++++++++-
> >>  2 files changed, 166 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 3bfee54e2cd0..da91ee97d944 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3043,12 +3043,72 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> >>         return ret;
> >>  }
> >>
> >> +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
> >> +                               pgoff_t off_end)
> >> +{
> >> +       struct filename *file;
> >> +       struct file *candidate;
> >> +       struct address_space *mapping;
> >> +       int ret = -EINVAL;
> >> +       pgoff_t off_cur;
> >> +       unsigned long total = 0, split = 0;
> >> +
> >> +       file = getname_kernel(file_path);
> >> +       if (IS_ERR(file))
> >> +               return ret;
> >> +
> >> +       candidate = file_open_name(file, O_RDONLY, 0);
> >> +       if (IS_ERR(candidate))
> >> +               goto out;
> >> +
> >> +       pr_info("split file-backed THPs in file: %s, offset: [0x%lx - 0x%lx]\n",
> >> +                file_path, off_start, off_end);
> >> +
> >> +       mapping = candidate->f_mapping;
> >> +
> >> +       for (off_cur = off_start; off_cur < off_end;) {
> >> +               struct page *fpage = pagecache_get_page(mapping, off_cur,
> >> +                                               FGP_ENTRY | FGP_HEAD, 0);
> >> +
> >> +               if (xa_is_value(fpage) || !fpage) {
> >
> > Why do you have FGP_ENTRY? It seems it would return page instead of
> > NULL if page is value. So I think you could remove FGP_ENTRY and
> > xa_is_value() check as well.
>
> The comment on FGP_ENTRY says “If there is a shadow/swap/DAX entry, return
> it instead of allocating a new page to replace it”. I do not think we
> want to allocate new pages here. I mostly follow the use of pagecache_get_page()
> in shmem_getpage_gfp without swapin or allocating new pages.

Yes, you are correct. I overlooked that.

>
> >
> >> +                       off_cur += PAGE_SIZE;
> >> +                       continue;
> >> +               }
> >> +
> >> +               if (!is_transparent_hugepage(fpage)) {
> >> +                       off_cur += PAGE_SIZE;
> >> +                       goto next;
> >> +               }
> >> +               total++;
> >> +               off_cur = fpage->index + thp_size(fpage);
> >> +
> >> +               if (!trylock_page(fpage))
> >> +                       goto next;
> >> +
> >> +               if (!split_huge_page(fpage))
> >> +                       split++;
> >> +
> >> +               unlock_page(fpage);
> >> +next:
> >> +               put_page(fpage);
> >> +       }
> >> +
> >> +       filp_close(candidate, NULL);
> >> +       ret = 0;
> >> +
> >> +       pr_info("%lu of %lu file-backed THP split\n", split, total);
> >> +out:
> >> +       putname(file);
> >> +       return ret;
> >> +}
> >> +
> >>  static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
> >>                                 size_t count, loff_t *ppops)
> >>  {
> >>         static DEFINE_MUTEX(mutex);
> >>         ssize_t ret;
> >> -       char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
> >> +       /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
> >> +       char input_buf[MAX_INPUT];
> >
> > I didn't find where MAX_INPUT is defined in your patch. Just saw
> > include/uapi/linux/limits.h have it defined. Is it the one you really
> > refer to?
>
> Yeah, I want to use 255 as the max input size and find MAX_INPUT. From your comment,
> I think it is better to define a MACRO here explicitly.
>
>
> >>         int pid;
> >>         unsigned long vaddr_start, vaddr_end;
> >>
> >> @@ -3058,11 +3118,40 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
> >>
> >>         ret = -EFAULT;
> >>
> >> -       memset(input_buf, 0, 80);
> >> +       memset(input_buf, 0, MAX_INPUT);
> >>         if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
> >>                 goto out;
> >>
> >> -       input_buf[79] = '\0';
> >> +       input_buf[MAX_INPUT - 1] = '\0';
> >> +
> >> +       if (input_buf[0] == '/') {
> >> +               char *tok;
> >> +               char *buf = input_buf;
> >> +               char file_path[MAX_INPUT];
> >> +               pgoff_t off_start = 0, off_end = 0;
> >> +               size_t input_len = strlen(input_buf);
> >> +
> >> +               tok = strsep(&buf, ",");
> >> +               if (tok) {
> >> +                       strncpy(file_path, tok, MAX_INPUT);
> >> +               } else {
> >> +                       ret = -EINVAL;
> >> +                       goto out;
> >> +               }
> >> +
> >> +               ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
> >> +               if (ret != 2) {
> >> +                       pr_info("ret: %ld\n", ret);
> >> +                       ret = -EINVAL;
> >> +                       goto out;
> >> +               }
> >> +               ret = split_huge_pages_in_file(file_path, off_start, off_end);
> >> +               if (!ret)
> >> +                       ret = input_len;
> >> +
> >> +               goto out;
> >> +       }
> >> +
> >>         ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
> >>         if (ret == 1 && pid == 1) {
> >>                 split_huge_pages_all();
> >> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
> >> index 9f33ddbb3182..0202702f7eda 100644
> >> --- a/tools/testing/selftests/vm/split_huge_page_test.c
> >> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
> >> @@ -7,11 +7,13 @@
> >>  #define _GNU_SOURCE
> >>  #include <stdio.h>
> >>  #include <stdlib.h>
> >> +#include <stdarg.h>
> >>  #include <unistd.h>
> >>  #include <inttypes.h>
> >>  #include <string.h>
> >>  #include <fcntl.h>
> >>  #include <sys/mman.h>
> >> +#include <sys/mount.h>
> >>  #include <malloc.h>
> >>  #include <stdbool.h>
> >>
> >> @@ -24,6 +26,9 @@ uint64_t pmd_pagesize;
> >>  #define SMAP_PATH "/proc/self/smaps"
> >>  #define INPUT_MAX 80
> >>
> >> +#define PID_FMT "%d,0x%lx,0x%lx"
> >> +#define PATH_FMT "%s,0x%lx,0x%lx"
> >> +
> >>  #define PFN_MASK     ((1UL<<55)-1)
> >>  #define KPF_THP      (1UL<<22)
> >>
> >> @@ -87,13 +92,16 @@ static int write_file(const char *path, const char *buf, size_t buflen)
> >>         return (unsigned int) numwritten;
> >>  }
> >>
> >> -static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
> >> +static void write_debugfs(const char *fmt, ...)
> >>  {
> >>         char input[INPUT_MAX];
> >>         int ret;
> >> +       va_list argp;
> >> +
> >> +       va_start(argp, fmt);
> >> +       ret = vsnprintf(input, INPUT_MAX, fmt, argp);
> >> +       va_end(argp);
> >>
> >> -       ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
> >> -                       vaddr_end);
> >>         if (ret >= INPUT_MAX) {
> >>                 printf("%s: Debugfs input is too long\n", __func__);
> >>                 exit(EXIT_FAILURE);
> >> @@ -178,7 +186,8 @@ void split_pmd_thp(void)
> >>         }
> >>
> >>         /* split all THPs */
> >> -       write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
> >> +       write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
> >> +               (uint64_t)one_page + len);
> >>
> >>         for (i = 0; i < len; i++)
> >>                 if (one_page[i] != (char)i) {
> >> @@ -269,7 +278,7 @@ void split_pte_mapped_thp(void)
> >>         }
> >>
> >>         /* split all remapped THPs */
> >> -       write_debugfs(getpid(), (uint64_t)pte_mapped,
> >> +       write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
> >>                       (uint64_t)pte_mapped + pagesize * 4);
> >>
> >>         /* smap does not show THPs after mremap, use kpageflags instead */
> >> @@ -295,6 +304,65 @@ void split_pte_mapped_thp(void)
> >>         close(kpageflags_fd);
> >>  }
> >>
> >> +void split_file_backed_thp(void)
> >> +{
> >> +       int status;
> >> +       int fd;
> >> +       ssize_t num_written;
> >> +       char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
> >> +       const char *tmpfs_loc = mkdtemp(tmpfs_template);
> >> +       char testfile[INPUT_MAX];
> >> +
> >> +       status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
> >> +
> >> +       if (status) {
> >> +               printf("Unable to create a tmpfs for testing\n");
> >> +               exit(EXIT_FAILURE);
> >> +       }
> >> +
> >> +       status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
> >> +       if (status >= INPUT_MAX) {
> >> +               printf("Fail to create file-backed THP split testing file\n");
> >> +               goto cleanup;
> >> +       }
> >> +
> >> +       fd = open(testfile, O_CREAT|O_WRONLY);
> >> +       if (fd == -1) {
> >> +               perror("Cannot open testing file\n");
> >> +               goto cleanup;
> >> +       }
> >> +
> >> +       /* write something to the file, so a file-backed THP can be allocated */
> >> +       num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc));
> >> +       close(fd);
> >> +
> >> +       if (num_written < 1) {
> >> +               printf("Fail to write data to testing file\n");
> >> +               goto cleanup;
> >> +       }
> >> +
> >> +       /* split the file-backed THP */
> >> +       write_debugfs(PATH_FMT, testfile, 0, 1024);
> >> +
> >> +       status = unlink(testfile);
> >> +       if (status)
> >> +               perror("Cannot remove testing file\n");
> >> +
> >> +cleanup:
> >> +       status = umount(tmpfs_loc);
> >> +       if (status) {
> >> +               printf("Unable to umount %s\n", tmpfs_loc);
> >> +               exit(EXIT_FAILURE);
> >> +       }
> >> +       status = rmdir(tmpfs_loc);
> >> +       if (status) {
> >> +               perror("cannot remove tmp dir");
> >> +               exit(EXIT_FAILURE);
> >> +       }
> >> +
> >> +       printf("file-backed THP split test done, please check dmesg for more information\n");
> >> +}
> >> +
> >>  int main(int argc, char **argv)
> >>  {
> >>         if (geteuid() != 0) {
> >> @@ -308,6 +376,7 @@ int main(int argc, char **argv)
> >>
> >>         split_pmd_thp();
> >>         split_pte_mapped_thp();
> >> +       split_file_backed_thp();
> >>
> >>         return 0;
> >>  }
> >> --
> >> 2.30.1
> >>
>
> Thanks for the comments. :)
>
> —
> Best Regards,
> Yan Zi


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-03-17 17:53 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-15 20:33 [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests Zi Yan
2021-03-15 20:33 ` [PATCH v4 2/2] mm: huge_memory: debugfs for file-backed THP split Zi Yan
2021-03-16 23:18   ` Yang Shi
2021-03-17 15:00     ` Zi Yan
2021-03-17 16:34       ` Yang Shi
2021-03-16  0:36 ` [PATCH v4 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests kernel test robot
2021-03-16  1:10   ` Zi Yan
2021-03-16 22:23 ` Yang Shi
2021-03-17 14:51   ` Zi Yan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).