All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gregory Price <gourry.memverge@gmail.com>
To: linux-mm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org,
	linux-api@vger.kernel.org, linux-cxl@vger.kernel.org,
	luto@kernel.org, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, dave.hansen@linux.intel.com, hpa@zytor.com,
	arnd@arndb.de, akpm@linux-foundation.org, x86@kernel.org,
	Gregory Price <gregory.price@memverge.com>
Subject: [RFC PATCH 3/3] mm/migrate: Create move_phys_pages syscall
Date: Thu,  7 Sep 2023 03:54:53 -0400	[thread overview]
Message-ID: <20230907075453.350554-4-gregory.price@memverge.com> (raw)
In-Reply-To: <20230907075453.350554-1-gregory.price@memverge.com>

Similar to the move_pages system call, instead of taking a pid and
list of virtual addresses, this system call takes a list of physical
addresses.

Because there is no task to validate the memory policy against, each
page needs to be interrogated to determine whether the migration is
valid, and all tasks that map it need to be interrogated.

This is accomplished via an rmap_walk on the folio containing
the page, and interrogating all tasks that map the page.

Each page must be interrogated individually, which should be
considered when using this to migrate shared regions.

The remaining logic is the same as the move_pages syscall. One
change to do_pages_move is made (to check whether an mm_struct is
passed) in order to re-use the existing migration code.

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl  |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
 include/linux/syscalls.h                |   5 +
 include/uapi/asm-generic/unistd.h       |   8 +-
 kernel/sys_ni.c                         |   1 +
 mm/migrate.c                            | 178 +++++++++++++++++++++++-
 tools/include/uapi/asm-generic/unistd.h |   8 +-
 7 files changed, 197 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..25db6d71af0c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -457,3 +457,4 @@
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
+454	i386	move_phys_pages		sys_move_phys_pages
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..9676f2e7698c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,6 +375,7 @@
 451	common	cachestat		sys_cachestat
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
+454	common	move_phys_pages		sys_move_phys_pages
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..6860675a942f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -821,6 +821,11 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 				const int __user *nodes,
 				int __user *status,
 				int flags);
+asmlinkage long sys_move_phys_pages(unsigned long nr_pages,
+				    const void __user * __user *pages,
+				    const int __user *nodes,
+				    int __user *status,
+				    int flags);
 asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t  pid, int sig,
 		siginfo_t __user *uinfo);
 asmlinkage long sys_perf_event_open(
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..8838fcfaf261 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -823,8 +823,14 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 
+/* CONFIG_MMU only */
+#ifndef __ARCH_NOMMU
+#define __NR_move_phys_pages 454
+__SYSCALL(__NR_move_phys_pages, sys_move_phys_pages)
+#endif
+
 #undef __NR_syscalls
-#define __NR_syscalls 453
+#define __NR_syscalls 455
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e137c1385c56..07441b10f92a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -192,6 +192,7 @@ COND_SYSCALL(migrate_pages);
 COND_SYSCALL(move_pages);
 COND_SYSCALL(set_mempolicy_home_node);
 COND_SYSCALL(cachestat);
+COND_SYSCALL(move_phys_pages);
 
 COND_SYSCALL(perf_event_open);
 COND_SYSCALL(accept4);
diff --git a/mm/migrate.c b/mm/migrate.c
index 3506b8202937..8a6f1eb6e512 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2161,6 +2161,101 @@ static int move_pages_and_store_status(int node,
 	return store_status(status, start, node, i - start);
 }
 
+struct rmap_page_ctxt {
+	bool found;
+	bool migratable;
+	bool node_allowed;
+	int node;
+};
+
+/*
+ * Walks each vma mapping a given page and determines if those
+ * vma's are both migratable, and that the target node is within
+ * the allowed cpuset of the owning task.
+ */
+static bool phys_page_migratable(struct folio *folio,
+				 struct vm_area_struct *vma,
+				 unsigned long address,
+				 void *arg)
+{
+	struct rmap_page_ctxt *ctxt = (struct rmap_page_ctxt *)arg;
+	struct task_struct *owner = vma->vm_mm->owner;
+	nodemask_t task_nodes = cpuset_mems_allowed(owner);
+
+	ctxt->found |= true;
+	ctxt->migratable &= vma_migratable(vma);
+	ctxt->node_allowed &= node_isset(ctxt->node, task_nodes);
+
+	return ctxt->migratable && ctxt->node_allowed;
+}
+
+static struct folio *phys_migrate_get_folio(struct page *page)
+{
+	struct folio *folio;
+
+	folio = page_folio(page);
+	if (!folio_test_lru(folio) || !folio_try_get(folio))
+		return NULL;
+	if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+		folio_put(folio);
+		folio = NULL;
+	}
+	return folio;
+}
+
+/*
+ * Validates the physical address is online and migratable.  Walks the folio
+ * containing the page to validate the vma is migratable and the cpuset node
+ * restrictions.  Then calls add_page_for_migration to isolate it from the
+ * LRU and place it into the given pagelist.
+ * Returns:
+ *     errno - if the page is not online, migratable, or can't be isolated
+ *     0 - when it doesn't have to be migrated because it is already on the
+ *         target node
+ *     1 - when it has been queued
+ */
+static int add_phys_page_for_migration(const void __user *p, int node,
+				       struct list_head *pagelist,
+				       bool migrate_all)
+{
+	unsigned long pfn;
+	struct page *page;
+	struct folio *folio;
+	int err;
+	struct rmap_page_ctxt rmctxt = {
+		.found = false,
+		.migratable = true,
+		.node_allowed = true,
+		.node = node
+	};
+	struct rmap_walk_control rwc = {
+		.rmap_one = phys_page_migratable,
+		.arg = &rmctxt
+	};
+
+	pfn = ((unsigned long)p) >> PAGE_SHIFT;
+	page = pfn_to_online_page(pfn);
+	if (!page || PageTail(page))
+		return -ENOENT;
+
+	folio = phys_migrate_get_folio(page);
+	if (folio) {
+		rmap_walk(folio, &rwc);
+		folio_put(folio);
+	}
+
+	if (!rmctxt.found)
+		err = -ENOENT;
+	else if (!rmctxt.migratable)
+		err = -EFAULT;
+	else if (!rmctxt.node_allowed)
+		err = -EACCES;
+	else
+		err = add_page_for_migration(page, node, pagelist, migrate_all);
+
+	return err;
+}
+
 /*
  * Migrate an array of page address onto an array of nodes and fill
  * the corresponding array of status.
@@ -2214,8 +2309,13 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 		 * Errors in the page lookup or isolation are not fatal and we simply
 		 * report them via status
 		 */
-		err = add_virt_page_for_migration(mm, p, current_node, &pagelist,
-					     flags & MPOL_MF_MOVE_ALL);
+		if (mm)
+			err = add_virt_page_for_migration(mm, p, current_node, &pagelist,
+						     flags & MPOL_MF_MOVE_ALL);
+		else
+			err = add_phys_page_for_migration(p, current_node, &pagelist,
+					flags & MPOL_MF_MOVE_ALL);
+
 
 		if (err > 0) {
 			/* The page is successfully queued for migration */
@@ -2303,6 +2403,36 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 	mmap_read_unlock(mm);
 }
 
+/*
+ * Determine the nodes of an array of pages and store it in an array of status.
+ */
+static void do_phys_pages_stat_array(unsigned long nr_pages,
+				     const void __user **pages, int *status)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long pfn = (unsigned long)(*pages) >> PAGE_SHIFT;
+		struct page *page = pfn_to_online_page(pfn);
+		int err = -ENOENT;
+
+		if (!page)
+			goto set_status;
+
+		get_page(page);
+
+		if (!is_zone_device_page(page))
+			err = page_to_nid(page);
+
+		put_page(page);
+set_status:
+		*status = err;
+
+		pages++;
+		status++;
+	}
+}
+
 static int get_compat_pages_array(const void __user *chunk_pages[],
 				  const void __user * __user *pages,
 				  unsigned long chunk_nr)
@@ -2345,7 +2475,10 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
 				break;
 		}
 
-		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
+		if (mm)
+			do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
+		else
+			do_phys_pages_stat_array(chunk_nr, chunk_pages, chunk_status);
 
 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
 			break;
@@ -2446,6 +2579,45 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
 	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
 }
 
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+static int kernel_move_phys_pages(unsigned long nr_pages,
+				  const void __user * __user *pages,
+				  const int __user *nodes,
+				  int __user *status, int flags)
+{
+	int err;
+	nodemask_t target_nodes;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* All tasks mapping each page is checked in phys_page_migratable */
+	nodes_setall(target_nodes);
+	if (nodes)
+		err = do_pages_move(NULL, target_nodes, nr_pages, pages,
+			nodes, status, flags);
+	else
+		err = do_pages_stat(NULL, nr_pages, pages, status);
+
+	return err;
+}
+
+SYSCALL_DEFINE5(move_phys_pages, unsigned long, nr_pages,
+		const void __user * __user *, pages,
+		const int __user *, nodes,
+		int __user *, status, int, flags)
+{
+	return kernel_move_phys_pages(nr_pages, pages, nodes, status, flags);
+}
+
+
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Returns true if this is a safe migration target node for misplaced NUMA
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index fd6c1cb585db..b140ad444946 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -820,8 +820,14 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 #define __NR_cachestat 451
 __SYSCALL(__NR_cachestat, sys_cachestat)
 
+/* CONFIG_MMU only */
+#ifndef __ARCH_NOMMU
+#define __NR_move_phys_pages 454
+__SYSCALL(__NR_move_phys_pages, sys_move_phys_pages)
+#endif
+
 #undef __NR_syscalls
-#define __NR_syscalls 452
+#define __NR_syscalls 455
 
 /*
  * 32 bit systems traditionally used different
-- 
2.39.1


  parent reply	other threads:[~2023-09-08 16:44 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-07  7:54 [RFC 0/3] sys_move_phy_pages system call Gregory Price
2023-09-07  7:54 ` [RFC PATCH 1/3] mm/migrate: remove unused mm argument from do_move_pages_to_node Gregory Price
2023-09-07  7:54 ` [RFC PATCH 2/3] mm/migrate: refactor add_page_for_migration for code re-use Gregory Price
2023-09-07  7:54 ` Gregory Price [this message]
2023-09-08 21:59   ` [RFC PATCH 3/3] mm/migrate: Create move_phys_pages syscall kernel test robot
2023-09-08 22:33   ` kernel test robot
2023-09-08 22:43   ` kernel test robot
2023-09-09  8:03   ` Arnd Bergmann
2023-09-08  7:46     ` Gregory Price
2023-09-09 15:18       ` Arnd Bergmann
2023-09-10 12:52         ` Gregory Price
2023-09-11 17:26           ` Arnd Bergmann
2023-09-10 15:01             ` Gregory Price
2023-09-10 20:36   ` Jonathan Corbet
2023-09-10 11:49     ` Gregory Price
2023-09-19  3:34       ` Andy Lutomirski
2023-09-19 16:31         ` Gregory Price
2023-09-19 17:59           ` Andy Lutomirski
2023-09-19 18:20             ` Gregory Price
2023-09-19 18:52               ` Andy Lutomirski
2023-09-19 19:59                 ` Gregory Price
2023-09-19  0:17   ` Thomas Gleixner
2023-09-19 15:57     ` Gregory Price
2023-09-19 16:37     ` Gregory Price

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230907075453.350554-4-gregory.price@memverge.com \
    --to=gourry.memverge@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=arnd@arndb.de \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=gregory.price@memverge.com \
    --cc=hpa@zytor.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.