From: Zi Yan <zi.yan@sent.com>
To: Dave Hansen <dave.hansen@linux.intel.com>,
	Yang Shi <yang.shi@linux.alibaba.com>,
	Keith Busch <keith.busch@intel.com>,
	Fengguang Wu <fengguang.wu@intel.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>,
	Michal Hocko <mhocko@kernel.org>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	Mel Gorman <mgorman@techsingularity.net>,
	John Hubbard <jhubbard@nvidia.com>,
	Mark Hairgrove <mhairgrove@nvidia.com>,
	Nitin Gupta <nigupta@nvidia.com>,
	Javier Cabezas <jcabezas@nvidia.com>,
	David Nellans <dnellans@nvidia.com>, Zi Yan <ziy@nvidia.com>
Subject: [RFC PATCH 17/25] exchange page: Add exchange_pages() syscall.
Date: Wed,  3 Apr 2019 19:00:38 -0700
Message-ID: <20190404020046.32741-18-zi.yan@sent.com>
In-Reply-To: <20190404020046.32741-1-zi.yan@sent.com>

From: Zi Yan <ziy@nvidia.com>

Add an exchange_pages() syscall that lets users exchange two lists of
pages, similar to the move_pages() syscall: the page at from_pages[i]
is exchanged with the page at to_pages[i], and per-page errors are
reported through the status array.
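
A minimal usage sketch (not part of this patch), assuming the x86_64
syscall number 335 added below; MPOL_MF_MOVE comes from <numaif.h>,
while MPOL_MF_MOVE_MT and MPOL_MF_MOVE_CONCUR are introduced by
earlier patches in this series:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <numaif.h>	/* MPOL_MF_MOVE */

	#ifndef __NR_exchange_pages
	#define __NR_exchange_pages 335	/* x86_64 entry added below */
	#endif

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		/* Two private anonymous pages whose contents we swap. */
		char *a = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		char *b = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		const void *from_pages[1] = { a };
		const void *to_pages[1] = { b };
		int status[1];

		memset(a, 'A', psz);
		memset(b, 'B', psz);

		/* pid == 0 targets the calling process. */
		if (syscall(__NR_exchange_pages, 0, 1UL, from_pages,
			    to_pages, status, MPOL_MF_MOVE) < 0)
			perror("exchange_pages");
		else
			printf("status[0]=%d a=%c b=%c\n", status[0],
			       a[0], b[0]);
		return 0;
	}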

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 include/linux/syscalls.h               |   5 +
 mm/exchange.c                          | 346 +++++++++++++++++++++++++++++++++
 3 files changed, 352 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92ee0b4..863a21e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+335	common	exchange_pages		__x64_sys_exchange_pages
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e446806..2c1eb49 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1203,6 +1203,11 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
 			unsigned long fd, unsigned long pgoff);
 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
 
+asmlinkage long sys_exchange_pages(pid_t pid, unsigned long nr_pages,
+				const void __user * __user *from_pages,
+				const void __user * __user *to_pages,
+				int __user *status,
+				int flags);
 
 /*
  * Not a real system call, but a placeholder for syscalls which are
diff --git a/mm/exchange.c b/mm/exchange.c
index 45c7013..48e344e 100644
--- a/mm/exchange.c
+++ b/mm/exchange.c
@@ -22,6 +22,7 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h> /* buffer_migrate_page  */
 #include <linux/backing-dev.h>
+#include <linux/sched/mm.h>
 
 
 #include "internal.h"
@@ -1212,3 +1213,348 @@ int exchange_pages_concur(struct list_head *exchange_list,
 
 	return nr_failed?-EFAULT:0;
 }
+
+static int store_status(int __user *status, int start, int value, int nr)
+{
+	while (nr-- > 0) {
+		if (put_user(value, status + start))
+			return -EFAULT;
+		start++;
+	}
+
+	return 0;
+}
+
+static int do_exchange_page_list(struct mm_struct *mm,
+		struct list_head *from_pagelist, struct list_head *to_pagelist,
+		bool migrate_mt, bool migrate_concur)
+{
+	int err;
+	struct exchange_page_info *one_pair;
+	LIST_HEAD(exchange_page_list);
+
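+	/*
+	 * Pair the two lists off: the i-th page on from_pagelist is
+	 * exchanged with the i-th page on to_pagelist.
+	 */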
+	while (!list_empty(from_pagelist)) {
+		struct page *from_page, *to_page;
+
+		from_page = list_first_entry_or_null(from_pagelist, struct page, lru);
+		to_page = list_first_entry_or_null(to_pagelist, struct page, lru);
+
+		if (!from_page || !to_page)
+			break;
+
+		one_pair = kzalloc(sizeof(struct exchange_page_info), GFP_ATOMIC);
+		if (!one_pair) {
+			err = -ENOMEM;
+			break;
+		}
+
+		list_del(&from_page->lru);
+		list_del(&to_page->lru);
+
+		one_pair->from_page = from_page;
+		one_pair->to_page = to_page;
+
+		list_add_tail(&one_pair->list, &exchange_page_list);
+	}
+
+	if (migrate_concur)
+		err = exchange_pages_concur(&exchange_page_list,
+			MIGRATE_SYNC | (migrate_mt ? MIGRATE_MT : MIGRATE_SINGLETHREAD),
+			MR_SYSCALL);
+	else
+		err = exchange_pages(&exchange_page_list,
+			MIGRATE_SYNC | (migrate_mt ? MIGRATE_MT : MIGRATE_SINGLETHREAD),
+			MR_SYSCALL);
+
+	while (!list_empty(&exchange_page_list)) {
+		struct exchange_page_info *one_pair =
+			list_first_entry(&exchange_page_list,
+							 struct exchange_page_info, list);
+
+		list_del(&one_pair->list);
+		kfree(one_pair);
+	}
+
+	if (!list_empty(from_pagelist))
+		putback_movable_pages(from_pagelist);
+
+	if (!list_empty(to_pagelist))
+		putback_movable_pages(to_pagelist);
+
+	return err;
+}
+
+static int add_page_for_exchange(struct mm_struct *mm,
+		unsigned long from_addr, unsigned long to_addr,
+		struct list_head *from_pagelist, struct list_head *to_pagelist,
+		bool migrate_all)
+{
+	struct vm_area_struct *from_vma, *to_vma;
+	struct page *from_page, *to_page;
+	LIST_HEAD(err_page_list);
+	unsigned int follflags;
+	int err;
+
+	err = -EFAULT;
+	from_vma = find_vma(mm, from_addr);
+	if (!from_vma || from_addr < from_vma->vm_start ||
+		!vma_migratable(from_vma))
+		goto set_from_status;
+
+	/* FOLL_DUMP to ignore special (like zero) pages */
+	follflags = FOLL_GET | FOLL_DUMP;
+	from_page = follow_page(from_vma, from_addr, follflags);
+
+	err = PTR_ERR(from_page);
+	if (IS_ERR(from_page))
+		goto set_from_status;
+
+	err = -ENOENT;
+	if (!from_page)
+		goto set_from_status;
+
+	err = -EACCES;
+	if (page_mapcount(from_page) > 1 && !migrate_all)
+		goto put_and_set_from_page;
+
+	if (PageHuge(from_page)) {
+		if (PageHead(from_page) &&
+		    isolate_huge_page(from_page, &err_page_list))
+			err = 0;
+		goto put_and_set_from_page;
+	} else if (PageTransCompound(from_page)) {
+		if (PageTail(from_page)) {
+			err = -EACCES;
+			goto put_and_set_from_page;
+		}
+	}
+
+	err = isolate_lru_page(from_page);
+	if (!err)
+		mod_node_page_state(page_pgdat(from_page), NR_ISOLATED_ANON +
+					page_is_file_cache(from_page), hpage_nr_pages(from_page));
+put_and_set_from_page:
+	/*
+	 * Drop the reference taken by follow_page(FOLL_GET).  If the
+	 * page was isolated, isolate_lru_page() took its own extra
+	 * reference, so this put_page() only removes the duplicate;
+	 * otherwise it drops the last reference we hold.
+	 */
+	put_page(from_page);
+set_from_status:
+	if (err)
+		goto out;
+
+	/* to pages  */
+	err = -EFAULT;
+	to_vma = find_vma(mm, to_addr);
+	if (!to_vma ||
+		to_addr < to_vma->vm_start ||
+		!vma_migratable(to_vma))
+		goto set_to_status;
+
+	/* FOLL_DUMP to ignore special (like zero) pages */
+	to_page = follow_page(to_vma, to_addr, follflags);
+
+	err = PTR_ERR(to_page);
+	if (IS_ERR(to_page))
+		goto set_to_status;
+
+	err = -ENOENT;
+	if (!to_page)
+		goto set_to_status;
+
+	err = -EACCES;
+	if (page_mapcount(to_page) > 1 &&
+			!migrate_all)
+		goto put_and_set_to_page;
+
+	if (PageHuge(to_page)) {
+		if (PageHead(to_page) &&
+		    isolate_huge_page(to_page, &err_page_list))
+			err = 0;
+		goto put_and_set_to_page;
+	} else if (PageTransCompound(to_page)) {
+		if (PageTail(to_page)) {
+			err = -EACCES;
+			goto put_and_set_to_page;
+		}
+	}
+
+	err = isolate_lru_page(to_page);
+	if (!err)
+		mod_node_page_state(page_pgdat(to_page), NR_ISOLATED_ANON +
+					page_is_file_cache(to_page), hpage_nr_pages(to_page));
+put_and_set_to_page:
+	/*
+	 * Drop the reference taken by follow_page(FOLL_GET).  If the
+	 * page was isolated, isolate_lru_page() took its own extra
+	 * reference, so this put_page() only removes the duplicate;
+	 * otherwise it drops the last reference we hold.
+	 */
+	put_page(to_page);
+set_to_status:
+	if (!err) {
+		if ((PageHuge(from_page) != PageHuge(to_page)) ||
+			(PageTransHuge(from_page) != PageTransHuge(to_page))) {
+			list_add(&from_page->lru, &err_page_list);
+			list_add(&to_page->lru, &err_page_list);
+		} else {
+			list_add_tail(&from_page->lru, from_pagelist);
+			list_add_tail(&to_page->lru, to_pagelist);
+		}
+	} else
+		list_add(&from_page->lru, &err_page_list);
+out:
+	if (!list_empty(&err_page_list))
+		putback_movable_pages(&err_page_list);
+	return err;
+}
+
+/*
+ * Exchange the pages at an array of "from" addresses with the pages
+ * at a matching array of "to" addresses and fill the corresponding
+ * array of status with the per-pair results.
+ */
+static int do_pages_exchange(struct mm_struct *mm, nodemask_t task_nodes,
+			 unsigned long nr_pages,
+			 const void __user * __user *from_pages,
+			 const void __user * __user *to_pages,
+			 int __user *status, int flags)
+{
+	LIST_HEAD(from_pagelist);
+	LIST_HEAD(to_pagelist);
+	int start, i;
+	int err = 0, err1;
+
+	migrate_prep();
+
+	down_read(&mm->mmap_sem);
+	for (i = start = 0; i < nr_pages; i++) {
+		const void __user *from_p, *to_p;
+		unsigned long from_addr, to_addr;
+
+		err = -EFAULT;
+		if (get_user(from_p, from_pages + i))
+			goto out_flush;
+		if (get_user(to_p, to_pages + i))
+			goto out_flush;
+
+		from_addr = (unsigned long)from_p;
+		to_addr = (unsigned long)to_p;
+
+		/*
+		 * Errors in the page lookup or isolation are not fatal; we
+		 * simply report them via the status array.
+		 */
+		err = add_page_for_exchange(mm, from_addr, to_addr,
+				&from_pagelist, &to_pagelist,
+				flags & MPOL_MF_MOVE_ALL);
+
+		if (!err)
+			continue;
+
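+		/*
+		 * This pair failed lookup or isolation: record the error
+		 * in slot i, then exchange the pairs queued so far and
+		 * mark them as succeeded.
+		 */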
+		err = store_status(status, i, err, 1);
+		if (err)
+			goto out_flush;
+
+		err = do_exchange_page_list(mm, &from_pagelist, &to_pagelist,
+				flags & MPOL_MF_MOVE_MT,
+				flags & MPOL_MF_MOVE_CONCUR);
+		if (err)
+			goto out;
+		if (i > start) {
+			err = store_status(status, start, 0, i - start);
+			if (err)
+				goto out;
+		}
+		/* Slot i already holds the error; restart after it. */
+		start = i + 1;
+	}
+out_flush:
+	/* Make sure we do not overwrite the existing error */
+	err1 = do_exchange_page_list(mm, &from_pagelist, &to_pagelist,
+				flags & MPOL_MF_MOVE_MT,
+				flags & MPOL_MF_MOVE_CONCUR);
+	if (!err1)
+		err1 = store_status(status, start, 0, i - start);
+	if (!err)
+		err = err1;
+out:
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
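+/*
+ * exchange_pages(2): exchange the pages at the addresses in from_pages
+ * with the pages at the addresses in to_pages inside the address space
+ * of task "pid" (0 means the calling process).
+ */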
+SYSCALL_DEFINE6(exchange_pages, pid_t, pid, unsigned long, nr_pages,
+		const void __user * __user *, from_pages,
+		const void __user * __user *, to_pages,
+		int __user *, status, int, flags)
+{
+	const struct cred *cred = current_cred(), *tcred;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int err;
+	nodemask_t task_nodes;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|
+				  MPOL_MF_MOVE_ALL|
+				  MPOL_MF_MOVE_MT|
+				  MPOL_MF_MOVE_CONCUR))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	rcu_read_lock();
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	get_task_struct(task);
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	tcred = __task_cred(task);
+	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		rcu_read_unlock();
+		err = -EPERM;
+		goto out;
+	}
+	rcu_read_unlock();
+
+	err = security_task_movememory(task);
+	if (err)
+		goto out;
+
+	task_nodes = cpuset_mems_allowed(task);
+	mm = get_task_mm(task);
+	put_task_struct(task);
+
+	if (!mm)
+		return -EINVAL;
+
+	err = do_pages_exchange(mm, task_nodes, nr_pages, from_pages,
+				    to_pages, status, flags);
+
+	mmput(mm);
+
+	return err;
+
+out:
+	put_task_struct(task);
+
+	return err;
+}
-- 
2.7.4


