All of lore.kernel.org
 help / color / mirror / Atom feed
From: Zi Yan <zi.yan@sent.com>
To: Dave Hansen <dave.hansen@linux.intel.com>,
	Yang Shi <yang.shi@linux.alibaba.com>,
	Keith Busch <keith.busch@intel.com>,
	Fengguang Wu <fengguang.wu@intel.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>,
	Michal Hocko <mhocko@kernel.org>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	Mel Gorman <mgorman@techsingularity.net>,
	John Hubbard <jhubbard@nvidia.com>,
	Mark Hairgrove <mhairgrove@nvidia.com>,
	Nitin Gupta <nigupta@nvidia.com>,
	Javier Cabezas <jcabezas@nvidia.com>,
	David Nellans <dnellans@nvidia.com>, Zi Yan <ziy@nvidia.com>
Subject: [RFC PATCH 20/25] memory manage: Add memory manage syscall.
Date: Wed,  3 Apr 2019 19:00:41 -0700	[thread overview]
Message-ID: <20190404020046.32741-21-zi.yan@sent.com> (raw)
In-Reply-To: <20190404020046.32741-1-zi.yan@sent.com>

From: Zi Yan <ziy@nvidia.com>

This prepares for the following patches to provide a user API to
manipulate pages in two memory nodes with the help of memcg.

missing memcg_max_size_node()

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 include/linux/sched/coredump.h         |   1 +
 include/linux/syscalls.h               |   5 ++
 include/uapi/linux/mempolicy.h         |   1 +
 mm/Makefile                            |   1 +
 mm/internal.h                          |   2 +
 mm/memory_manage.c                     | 109 +++++++++++++++++++++++++++++++++
 mm/mempolicy.c                         |   2 +-
 8 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 mm/memory_manage.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 863a21e..fa8def3 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -344,6 +344,7 @@
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
 335	common	exchange_pages	__x64_sys_exchange_pages
+336	common	mm_manage		__x64_sys_mm_manage
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index ecdc654..9aa9d94b 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -73,6 +73,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_OOM_VICTIM		25	/* mm is the oom victim */
 #define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
+#define MMF_MM_MANAGE		27
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
 				 MMF_DISABLE_THP_MASK)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2c1eb49..47d56c5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1208,6 +1208,11 @@ asmlinkage long sys_exchange_pages(pid_t pid, unsigned long nr_pages,
 				const void __user * __user *to_pages,
 				int __user *status,
 				int flags);
+asmlinkage long sys_mm_manage(pid_t pid, unsigned long nr_pages,
+				unsigned long maxnode,
+				const unsigned long __user *old_nodes,
+				const unsigned long __user *new_nodes,
+				int flags);
 
 /*
  * Not a real system call, but a placeholder for syscalls which are
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index a9d03e5..4722bb7 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -52,6 +52,7 @@ enum {
 #define MPOL_MF_MOVE_DMA (1<<5)	/* Use DMA page copy routine */
 #define MPOL_MF_MOVE_MT  (1<<6)	/* Use multi-threaded page copy routine */
 #define MPOL_MF_MOVE_CONCUR  (1<<7)	/* Move pages in a batch */
+#define MPOL_MF_EXCHANGE	(1<<8)	/* Exchange pages */
 
 #define MPOL_MF_VALID	(MPOL_MF_STRICT   | 	\
 			 MPOL_MF_MOVE     | 	\
diff --git a/mm/Makefile b/mm/Makefile
index 2f1f1ad..5302d79 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -47,6 +47,7 @@ obj-y += memblock.o
 obj-y += copy_page.o
 obj-y += exchange.o
 obj-y += exchange_page.o
+obj-y += memory_manage.o
 
 ifdef CONFIG_MMU
 	obj-$(CONFIG_ADVISE_SYSCALLS)	+= madvise.o
diff --git a/mm/internal.h b/mm/internal.h
index cf63bf6..94feb14 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -574,5 +574,7 @@ bool buffer_migrate_lock_buffers(struct buffer_head *head,
 int writeout(struct address_space *mapping, struct page *page);
 int expected_page_refs(struct address_space *mapping, struct page *page);
 
+int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
+		     unsigned long maxnode);
 
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/memory_manage.c b/mm/memory_manage.c
new file mode 100644
index 0000000..b8f3654
--- /dev/null
+++ b/mm/memory_manage.c
@@ -0,0 +1,109 @@
+/*
+ * A syscall used to move pages between two nodes.
+ */
+
+#include <linux/sched/mm.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/nodemask.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include "internal.h"
+
+
+SYSCALL_DEFINE6(mm_manage, pid_t, pid, unsigned long, nr_pages,
+		unsigned long, maxnode,
+		const unsigned long __user *, slow_nodes,
+		const unsigned long __user *, fast_nodes,
+		int, flags)
+{
+	const struct cred *cred = current_cred(), *tcred;
+	struct task_struct *task;
+	struct mm_struct *mm = NULL;
+	int err;
+	nodemask_t task_nodes;
+	nodemask_t *slow;
+	nodemask_t *fast;
+	NODEMASK_SCRATCH(scratch);
+
+	if (!scratch)
+		return -ENOMEM;
+
+	slow = &scratch->mask1;
+	fast = &scratch->mask2;
+
+	err = get_nodes(slow, slow_nodes, maxnode);
+	if (err)
+		goto out;
+
+	err = get_nodes(fast, fast_nodes, maxnode);
+	if (err)
+		goto out;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE_MT|
+				  MPOL_MF_MOVE_DMA|
+				  MPOL_MF_MOVE_CONCUR|
+				  MPOL_MF_EXCHANGE))
+		return -EINVAL;
+
+	/* Find the mm_struct */
+	rcu_read_lock();
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		rcu_read_unlock();
+		err = -ESRCH;
+		goto out;
+	}
+	get_task_struct(task);
+
+	err = -EINVAL;
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	tcred = __task_cred(task);
+	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		rcu_read_unlock();
+		err = -EPERM;
+		goto out_put;
+	}
+	rcu_read_unlock();
+
+	err = security_task_movememory(task);
+	if (err)
+		goto out_put;
+
+	task_nodes = cpuset_mems_allowed(task);
+	mm = get_task_mm(task);
+	put_task_struct(task);
+
+	if (!mm) {
+		err = -EINVAL;
+		goto out;
+	}
+	if (test_bit(MMF_MM_MANAGE, &mm->flags)) {
+		mmput(mm);
+		goto out;
+	} else {
+		set_bit(MMF_MM_MANAGE, &mm->flags);
+	}
+
+
+	clear_bit(MMF_MM_MANAGE, &mm->flags);
+	mmput(mm);
+out:
+	NODEMASK_SCRATCH_FREE(scratch);
+
+	return err;
+
+out_put:
+	put_task_struct(task);
+	goto out;
+
+}
\ No newline at end of file
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e30049..168d17f8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1249,7 +1249,7 @@ static long do_mbind(unsigned long start, unsigned long len,
  */
 
 /* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
+int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
-- 
2.7.4


  parent reply	other threads:[~2019-04-04  2:09 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-04  2:00 [RFC PATCH 00/25] Accelerate page migration and use memcg for PMEM management Zi Yan
2019-04-04  2:00 ` [RFC PATCH 01/25] mm: migrate: Change migrate_mode to support combination migration modes Zi Yan
2019-04-04  2:00 ` [RFC PATCH 02/25] mm: migrate: Add mode parameter to support future page copy routines Zi Yan
2019-04-04  2:00 ` [RFC PATCH 03/25] mm: migrate: Add a multi-threaded page migration function Zi Yan
2019-04-04  2:00 ` [RFC PATCH 04/25] mm: migrate: Add copy_page_multithread into migrate_pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 05/25] mm: migrate: Add vm.accel_page_copy in sysfs to control page copy acceleration Zi Yan
2019-04-04  2:00 ` [RFC PATCH 06/25] mm: migrate: Make the number of copy threads adjustable via sysctl Zi Yan
2019-04-04  2:00 ` [RFC PATCH 07/25] mm: migrate: Add copy_page_dma to use DMA Engine to copy pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 08/25] mm: migrate: Add copy_page_dma into migrate_page_copy Zi Yan
2019-04-04  2:00 ` [RFC PATCH 09/25] mm: migrate: Add copy_page_lists_dma_always to support copy a list of pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 10/25] mm: migrate: copy_page_lists_mt() to copy a page list using multi-threads Zi Yan
2019-04-04  2:00 ` [RFC PATCH 11/25] mm: migrate: Add concurrent page migration into move_pages syscall Zi Yan
2019-04-04  2:00 ` [RFC PATCH 12/25] exchange pages: new page migration mechanism: exchange_pages() Zi Yan
2019-04-04  2:00 ` [RFC PATCH 13/25] exchange pages: add multi-threaded exchange pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 14/25] exchange pages: concurrent " Zi Yan
2019-04-04  2:00 ` [RFC PATCH 15/25] exchange pages: exchange anonymous page and file-backed page Zi Yan
2019-04-04  2:00 ` [RFC PATCH 16/25] exchange page: Add THP exchange support Zi Yan
2019-04-04  2:00 ` [RFC PATCH 17/25] exchange page: Add exchange_page() syscall Zi Yan
2019-04-04  2:00 ` [RFC PATCH 18/25] memcg: Add per node memory usage&max stats in memcg Zi Yan
2019-04-04  2:00 ` [RFC PATCH 19/25] mempolicy: add MPOL_F_MEMCG flag, enforcing memcg memory limit Zi Yan
2019-04-04  2:00 ` Zi Yan [this message]
2019-04-04  2:00 ` [RFC PATCH 21/25] mm: move update_lru_sizes() to mm_inline.h for broader use Zi Yan
2019-04-04  2:00 ` [RFC PATCH 22/25] memory manage: active/inactive page list manipulation in memcg Zi Yan
2019-04-04  2:00 ` [RFC PATCH 23/25] memory manage: page migration based page manipulation between NUMA nodes Zi Yan
2019-04-04  2:00 ` [RFC PATCH 24/25] memory manage: limit migration batch size Zi Yan
2019-04-04  2:00 ` [RFC PATCH 25/25] memory manage: use exchange pages to memory manage to improve throughput Zi Yan
2019-04-04  7:13 ` [RFC PATCH 00/25] Accelerate page migration and use memcg for PMEM management Michal Hocko
2019-04-05  0:32 ` Yang Shi
2019-04-05 17:20   ` Zi Yan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190404020046.32741-21-zi.yan@sent.com \
    --to=zi.yan@sent.com \
    --cc=akpm@linux-foundation.org \
    --cc=daniel.m.jordan@oracle.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dnellans@nvidia.com \
    --cc=fengguang.wu@intel.com \
    --cc=jcabezas@nvidia.com \
    --cc=jhubbard@nvidia.com \
    --cc=keith.busch@intel.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@techsingularity.net \
    --cc=mhairgrove@nvidia.com \
    --cc=mhocko@kernel.org \
    --cc=nigupta@nvidia.com \
    --cc=vbabka@suse.cz \
    --cc=yang.shi@linux.alibaba.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.