Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
WARNING: multiple messages have this Message-ID
From: Minchan Kim <minchan@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: LKML <linux-kernel@vger.kernel.org>,
	linux-mm <linux-mm@kvack.org>, Michal Hocko <mhocko@suse.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Tim Murray <timmurray@google.com>,
	Joel Fernandes <joel@joelfernandes.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Daniel Colascione <dancol@google.com>,
	Shakeel Butt <shakeelb@google.com>,
	Sonny Rao <sonnyrao@google.com>,
	Brian Geffon <bgeffon@google.com>,
	Minchan Kim <minchan@kernel.org>
Subject: [RFC 5/7] mm: introduce external memory hinting API
Date: Mon, 20 May 2019 12:52:52 +0900
Message-ID: <20190520035254.57579-6-minchan@kernel.org> (raw)
In-Reply-To: <20190520035254.57579-1-minchan@kernel.org>

There is some usecase that centralized userspace daemon want to give
a memory hint like MADV_[COOL|COLD] to other process. Android's
ActivityManagerService is one of them.

It's similar in spirit to madvise(MADV_WONTNEED), but the information
required to make the reclaim decision is not known to the app. Instead,
it is known to the centralized userspace daemon(ActivityManagerService),
and that daemon must be able to initiate reclaim on its own without
any app involvement.

To solve the issue, this patch introduces new syscall process_madvise(2)
which works based on pidfd so it could give a hint to the exeternal
process.

int process_madvise(int pidfd, void *addr, size_t length, int advise);

All advises madvise provides can be supported in process_madvise, too.
Since it could affect other process's address range, only privileged
process(CAP_SYS_PTRACE) or something else(e.g., being the same UID)
gives it the right to ptrrace the process could use it successfully.

Please suggest better idea if you have other idea about the permission.

* from v1r1
  * use ptrace capability - surenb, dancol

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 include/linux/proc_fs.h                |  1 +
 include/linux/syscalls.h               |  2 ++
 include/uapi/asm-generic/unistd.h      |  2 ++
 kernel/signal.c                        |  2 +-
 kernel/sys_ni.c                        |  1 +
 mm/madvise.c                           | 45 ++++++++++++++++++++++++++
 8 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4cd5f982b1e5..5b9dd55d6b57 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
 425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
 426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
 427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
+428	i386	process_madvise		sys_process_madvise		__ia32_sys_process_madvise
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 64ca0d06259a..0e5ee78161c9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
 425	common	io_uring_setup		__x64_sys_io_uring_setup
 426	common	io_uring_enter		__x64_sys_io_uring_enter
 427	common	io_uring_register	__x64_sys_io_uring_register
+428	common	process_madvise		__x64_sys_process_madvise
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 52a283ba0465..f8545d7c5218 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -122,6 +122,7 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
 
 #endif /* CONFIG_PROC_FS */
 
+extern struct pid *pidfd_to_pid(const struct file *file);
 struct net;
 
 static inline struct proc_dir_entry *proc_net_mkdir(
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..21c6c9a62006 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -872,6 +872,8 @@ asmlinkage long sys_munlockall(void);
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 				unsigned char __user * vec);
 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
+asmlinkage long sys_process_madvise(int pid_fd, unsigned long start,
+				size_t len, int behavior);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 			unsigned long prot, unsigned long pgoff,
 			unsigned long flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index dee7292e1df6..7ee82ce04620 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -832,6 +832,8 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
 #define __NR_io_uring_register 427
 __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
+#define __NR_process_madvise 428
+__SYSCALL(__NR_process_madvise, sys_process_madvise)
 
 #undef __NR_syscalls
 #define __NR_syscalls 428
diff --git a/kernel/signal.c b/kernel/signal.c
index 1c86b78a7597..04e75daab1f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3620,7 +3620,7 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
 	return copy_siginfo_from_user(kinfo, info);
 }
 
-static struct pid *pidfd_to_pid(const struct file *file)
+struct pid *pidfd_to_pid(const struct file *file)
 {
 	if (file->f_op == &pidfd_fops)
 		return file->private_data;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4d9ae5ea6caf..5277421795ab 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -278,6 +278,7 @@ COND_SYSCALL(mlockall);
 COND_SYSCALL(munlockall);
 COND_SYSCALL(mincore);
 COND_SYSCALL(madvise);
+COND_SYSCALL(process_madvise);
 COND_SYSCALL(remap_file_pages);
 COND_SYSCALL(mbind);
 COND_SYSCALL_COMPAT(mbind);
diff --git a/mm/madvise.c b/mm/madvise.c
index 119e82e1f065..af02aa17e5c1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/page_idle.h>
+#include <linux/proc_fs.h>
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
@@ -16,6 +17,7 @@
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -1140,3 +1142,46 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
 	return madvise_core(current, start, len_in, behavior);
 }
+
+SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
+		size_t, len_in, int, behavior)
+{
+	int ret;
+	struct fd f;
+	struct pid *pid;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+
+	f = fdget(pidfd);
+	if (!f.file)
+		return -EBADF;
+
+	pid = pidfd_to_pid(f.file);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto err;
+	}
+
+	ret = -EINVAL;
+	rcu_read_lock();
+	tsk = pid_task(pid, PIDTYPE_PID);
+	if (!tsk) {
+		rcu_read_unlock();
+		goto err;
+	}
+	get_task_struct(tsk);
+	rcu_read_unlock();
+	mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS);
+	if (!mm || IS_ERR(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		if (ret == -EACCES)
+			ret = -EPERM;
+		goto err;
+	}
+	ret = madvise_core(tsk, start, len_in, behavior);
+	mmput(mm);
+	put_task_struct(tsk);
+err:
+	fdput(f);
+	return ret;
+}
-- 
2.21.0.1020.gf2820cf01a-goog


From: Hillf Danton <hdanton@sina.com>
To: Minchan Kim <minchan@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	LKML <linux-kernel@vger.kernel.org>,
	linux-mm <linux-mm@kvack.org>, Michal Hocko <mhocko@suse.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Tim Murray <timmurray@google.com>,
	Joel Fernandes <joel@joelfernandes.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Daniel Colascione <dancol@google.com>,
	Shakeel Butt <shakeelb@google.com>,
	Sonny Rao <sonnyrao@google.com>,
	Brian Geffon <bgeffon@google.com>
Subject: Re: [RFC 5/7] mm: introduce external memory hinting API
Date: Wed, 29 May 2019 11:41:23 +0800
Message-ID: <20190520035254.57579-6-minchan@kernel.org> (raw)
Message-ID: <20190529034123.Ydws0HZGButPCKc2uBNRJHio08XPB3nB1CUeGF-D3fs@z> (raw)
In-Reply-To: <20190520035254.57579-1-minchan@kernel.org>


On Mon, 20 May 2019 12:52:52 +0900 Minchan Kim wrote:
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -355,6 +355,7 @@
>  425	common	io_uring_setup		__x64_sys_io_uring_setup
>  426	common	io_uring_enter		__x64_sys_io_uring_enter
>  427	common	io_uring_register	__x64_sys_io_uring_register
> +428	common	process_madvise		__x64_sys_process_madvise
>  
Much better if something similar is added for arm64.

>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -832,6 +832,8 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
>  __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
>  #define __NR_io_uring_register 427
>  __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
> +#define __NR_process_madvise 428
> +__SYSCALL(__NR_process_madvise, sys_process_madvise)
>  
>  #undef __NR_syscalls
>  #define __NR_syscalls 428

Seems __NR_syscalls needs to increment by one.

BR
Hillf


  parent reply index

Thread overview: 138+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-20  3:52 [RFC 0/7] introduce memory hinting API for external process Minchan Kim
2019-05-20  3:52 ` [RFC 1/7] mm: introduce MADV_COOL Minchan Kim
2019-05-20  8:16   ` Michal Hocko
2019-05-20  8:19     ` Michal Hocko
2019-05-20 15:08       ` Suren Baghdasaryan
2019-05-20 22:55       ` Minchan Kim
2019-05-20 22:54     ` Minchan Kim
2019-05-21  6:04       ` Michal Hocko
2019-05-21  9:11         ` Minchan Kim
2019-05-21 10:05           ` Michal Hocko
2019-05-28  8:53   ` Hillf Danton
2019-05-28 10:58   ` Minchan Kim
2019-05-20  3:52 ` [RFC 2/7] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM Minchan Kim
2019-05-20 16:50   ` Johannes Weiner
2019-05-20 22:57     ` Minchan Kim
2019-05-20  3:52 ` [RFC 3/7] mm: introduce MADV_COLD Minchan Kim
2019-05-20  8:27   ` Michal Hocko
2019-05-20 23:00     ` Minchan Kim
2019-05-21  6:08       ` Michal Hocko
2019-05-21  9:13         ` Minchan Kim
2019-05-28 14:54   ` Hillf Danton
2019-05-30  0:45   ` Minchan Kim
2019-05-20  3:52 ` [RFC 4/7] mm: factor out madvise's core functionality Minchan Kim
2019-05-20 14:26   ` Oleksandr Natalenko
2019-05-21  1:26     ` Minchan Kim
2019-05-21  6:36       ` Oleksandr Natalenko
2019-05-21  6:50         ` Michal Hocko
2019-05-21  7:06           ` Oleksandr Natalenko
2019-05-21 10:52             ` Minchan Kim
2019-05-21 11:00               ` Michal Hocko
2019-05-21 11:24                 ` Minchan Kim
2019-05-21 11:32                   ` Michal Hocko
2019-05-21 10:49         ` Minchan Kim
2019-05-21 10:55           ` Michal Hocko
2019-05-20  3:52 ` Minchan Kim [this message]
2019-05-20  9:18   ` [RFC 5/7] mm: introduce external memory hinting API Michal Hocko
2019-05-21  2:41     ` Minchan Kim
2019-05-21  6:17       ` Michal Hocko
2019-05-21 10:32         ` Minchan Kim
2019-05-21  9:01   ` Christian Brauner
2019-05-21 11:35     ` Minchan Kim
2019-05-21 11:51       ` Christian Brauner
2019-05-21 15:31   ` Oleg Nesterov
2019-05-27  7:43     ` Minchan Kim
2019-05-27 15:12       ` Oleg Nesterov
2019-05-27 23:33         ` Minchan Kim
2019-05-28  7:23           ` Michal Hocko
2019-05-29  3:41   ` Hillf Danton
2019-05-30  0:38   ` Minchan Kim
2019-05-20  3:52 ` [RFC 6/7] mm: extend process_madvise syscall to support vector arrary Minchan Kim
2019-05-20  9:22   ` Michal Hocko
2019-05-21  2:48     ` Minchan Kim
2019-05-21  6:24       ` Michal Hocko
2019-05-21 10:26         ` Minchan Kim
2019-05-21 10:37           ` Michal Hocko
2019-05-27  7:49             ` Minchan Kim
2019-05-29 10:08               ` Daniel Colascione
2019-05-29 10:33                 ` Michal Hocko
2019-05-30  2:17                   ` Minchan Kim
2019-05-30  6:57                     ` Michal Hocko
2019-05-30  8:02                       ` Minchan Kim
2019-05-30 16:19                         ` Daniel Colascione
2019-05-30 18:47                         ` Michal Hocko
2019-05-29  4:14   ` Hillf Danton
2019-05-30  0:35   ` Minchan Kim
2019-05-20  3:52 ` [RFC 7/7] mm: madvise support MADV_ANONYMOUS_FILTER and MADV_FILE_FILTER Minchan Kim
2019-05-20  9:28   ` Michal Hocko
2019-05-21  2:55     ` Minchan Kim
2019-05-21  6:26       ` Michal Hocko
2019-05-27  7:58         ` Minchan Kim
2019-05-27 12:44           ` Michal Hocko
2019-05-28  3:26             ` Minchan Kim
2019-05-28  6:29               ` Michal Hocko
2019-05-28  8:13                 ` Minchan Kim
2019-05-28  8:31                   ` Daniel Colascione
2019-05-28  8:49                     ` Minchan Kim
2019-05-28  9:08                       ` Michal Hocko
2019-05-28  9:39                         ` Daniel Colascione
2019-05-28 10:33                           ` Michal Hocko
2019-05-28 11:21                             ` Daniel Colascione
2019-05-28 11:49                               ` Michal Hocko
2019-05-28 12:11                                 ` Daniel Colascione
2019-05-28 12:32                                   ` Michal Hocko
2019-05-28 10:32                         ` Minchan Kim
2019-05-28 10:41                           ` Michal Hocko
2019-05-28 11:12                             ` Minchan Kim
2019-05-28 11:28                               ` Michal Hocko
2019-05-28 11:42                                 ` Daniel Colascione
2019-05-28 11:56                                   ` Michal Hocko
2019-05-28 12:18                                     ` Daniel Colascione
2019-05-28 12:38                                       ` Michal Hocko
2019-05-28 12:10                                   ` Minchan Kim
2019-05-28 11:44                                 ` Minchan Kim
2019-05-28 11:51                                   ` Daniel Colascione
2019-05-28 12:06                                   ` Michal Hocko
2019-05-28 12:22                                     ` Minchan Kim
2019-05-28 11:28                             ` Daniel Colascione
2019-05-21 15:33       ` Johannes Weiner
2019-05-22  1:50         ` Minchan Kim
2019-05-29  4:36   ` Hillf Danton
2019-05-30  1:00   ` Minchan Kim
2019-05-20  6:37 ` [RFC 0/7] introduce memory hinting API for external process Anshuman Khandual
2019-05-20 16:59   ` Tim Murray
2019-05-21  2:55     ` Anshuman Khandual
2019-05-21  5:14       ` Minchan Kim
2019-05-21 10:34       ` Michal Hocko
2019-05-28 10:50         ` Anshuman Khandual
2019-05-21 12:56       ` Shakeel Butt
2019-05-22  4:15         ` Brian Geffon
2019-05-22  4:23         ` Brian Geffon
2019-05-20  9:28 ` Michal Hocko
2019-05-20 14:42 ` Oleksandr Natalenko
2019-05-21  2:56   ` Minchan Kim
2019-05-20 16:46 ` Johannes Weiner
2019-05-21  4:39   ` Minchan Kim
2019-05-21  6:32     ` Michal Hocko
2019-05-21  1:44 ` Matthew Wilcox
2019-05-21  5:01   ` Minchan Kim
2019-05-21  6:34   ` Michal Hocko
2019-05-21  8:42 ` Christian Brauner
2019-05-21 11:05   ` Minchan Kim
2019-05-21 11:30     ` Christian Brauner
2019-05-21 11:39       ` Christian Brauner
2019-05-22  5:11         ` Daniel Colascione
2019-05-22  8:22           ` Christian Brauner
2019-05-22 13:16             ` Daniel Colascione
2019-05-22 14:52               ` Christian Brauner
2019-05-22 15:17                 ` Daniel Colascione
2019-05-22 15:48                   ` Christian Brauner
2019-05-22 15:57                     ` Daniel Colascione
2019-05-22 16:01                       ` Christian Brauner
2019-05-22 16:01                         ` Daniel Colascione
2019-05-23 13:07                           ` Minchan Kim
2019-05-27  8:06                             ` Minchan Kim
2019-05-21 11:41       ` Minchan Kim
2019-05-21 12:04         ` Christian Brauner
2019-05-21 12:15           ` Oleksandr Natalenko
2019-05-21 12:53 ` Shakeel Butt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190520035254.57579-6-minchan@kernel.org \
    --to=minchan@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=bgeffon@google.com \
    --cc=dancol@google.com \
    --cc=hannes@cmpxchg.org \
    --cc=joel@joelfernandes.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.com \
    --cc=shakeelb@google.com \
    --cc=sonnyrao@google.com \
    --cc=surenb@google.com \
    --cc=timmurray@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git