All of lore.kernel.org
 help / color / mirror / Atom feed
From: Shawn Landden <slandden@gmail.com>
To: unlisted-recipients:; (no To-header on input)
Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, Shawn Landden <slandden@gmail.com>
Subject: [RFC] EPOLL_KILLME: New flag to epoll_wait() that subscribes process to death row (new syscall)
Date: Tue, 31 Oct 2017 22:32:44 -0700	[thread overview]
Message-ID: <20171101053244.5218-1-slandden@gmail.com> (raw)

It is common for services to be stateless around their main event loop.
If a process passes the EPOLL_KILLME flag to epoll_wait5() then it
tells the kernel that this epoll_wait5() call need not complete, and the
kernel may send SIGKILL to the process if resources get tight.

See my systemd patch: https://github.com/shawnl/systemd/tree/killme

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/eventpoll.c                         | 74 +++++++++++++++++++++++++++++++++-
 include/linux/eventpoll.h              |  2 +
 include/linux/sched.h                  |  3 ++
 include/uapi/asm-generic/unistd.h      |  5 ++-
 include/uapi/linux/eventpoll.h         |  3 ++
 kernel/exit.c                          |  2 +
 mm/oom_kill.c                          | 17 ++++++++
 9 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..040e5d02bdcc 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
+385	i386	epoll_wait5		sys_epoll_wait5
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..c72802e8cf65 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	epoll_wait5		sys_epoll_wait5
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..76d1c91d940b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -297,6 +297,14 @@ static LIST_HEAD(visited_list);
  */
 static LIST_HEAD(tfile_check_list);
 
+static LIST_HEAD(deathrow_q);
+static long deathrow_len __read_mostly;
+
+/* TODO: Can this lock be removed by using atomic instructions to update
+ * queue?
+ */
+static DEFINE_MUTEX(deathrow_mutex);
+
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -314,6 +322,15 @@ struct ctl_table epoll_table[] = {
 		.extra1		= &zero,
 		.extra2		= &long_max,
 	},
+	{
+		.procname	= "deathrow_size",
+		.data		= &deathrow_len,
+		.maxlen		= sizeof(deathrow_len),
+		.mode		= 0444,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &long_max,
+	},
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
@@ -2164,9 +2181,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
+ *
+ * A flags argument cannot be added to epoll_pwait because it already has
+ * the maximum number of arguments (6). Can this be fixed?
  */
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
-		int, maxevents, int, timeout)
+SYSCALL_DEFINE5(epoll_wait5, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, int, flags)
 {
 	int error;
 	struct fd f;
@@ -2199,14 +2219,44 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 	 */
 	ep = f.file->private_data;
 
+	/* Check the EPOLL_* constants for conflicts.  */
+	BUILD_BUG_ON(EPOLL_KILLME == EPOLL_CLOEXEC);
+
+	if (flags & ~EPOLL_KILLME)
+		return -EINVAL;
+
+	if (flags & EPOLL_KILLME) {
+		/* Put process on death row. */
+		mutex_lock(&deathrow_mutex);
+		deathrow_len++;
+		list_add(&current->se.deathrow, &deathrow_q);
+		current->se.on_deathrow = 1;
+		mutex_unlock(&deathrow_mutex);
+	}
+
 	/* Time to fish for events ... */
 	error = ep_poll(ep, events, maxevents, timeout);
 
+	if (flags & EPOLL_KILLME) {
+		/* Remove process from death row. */
+		mutex_lock(&deathrow_mutex);
+		current->se.on_deathrow = 0;
+		list_del(&current->se.deathrow);
+		deathrow_len--;
+		mutex_unlock(&deathrow_mutex);
+	}
+
 error_fput:
 	fdput(f);
 	return error;
 }
 
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	return sys_epoll_wait5(epfd, events, maxevents, timeout, 0);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
@@ -2297,6 +2347,26 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 }
 #endif
 
+/* Clean up after an EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+int exit_killme(void)
+{
+	if (current->se.on_deathrow) {
+		mutex_lock(&deathrow_mutex);
+		current->se.on_deathrow = 0;
+		list_del(&current->se.deathrow);
+		mutex_unlock(&deathrow_mutex);
+	}
+
+	return 0;
+}
+
+struct list_head *eventpoll_deathrow_list(void)
+{
+	return &deathrow_q;
+}
+
 static int __init eventpoll_init(void)
 {
 	struct sysinfo si;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 2f14ac73d01d..f1e28d468de5 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -20,6 +20,8 @@
 /* Forward declarations to avoid compiler errors */
 struct file;
 
+int exit_killme(void);
+struct list_head *eventpoll_deathrow_list(void);
 
 #ifdef CONFIG_EPOLL
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..66462bf27a29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -380,6 +380,9 @@ struct sched_entity {
 	struct list_head		group_node;
 	unsigned int			on_rq;
 
+	unsigned			on_deathrow:1;
+	struct list_head		deathrow;
+
 	u64				exec_start;
 	u64				sum_exec_runtime;
 	u64				vruntime;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a5eb51..843553a39388 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -893,8 +893,11 @@ __SYSCALL(__NR_fork, sys_fork)
 __SYSCALL(__NR_fork, sys_ni_syscall)
 #endif /* CONFIG_MMU */
 
+#define __NR_epoll_wait5 1080
+__SYSCALL(__NR_epoll_wait5, sys_epoll_wait5)
+
 #undef __NR_syscalls
-#define __NR_syscalls (__NR_fork+1)
+#define __NR_syscalls (__NR_fork+2)
 
 #endif /* __ARCH_WANT_SYSCALL_DEPRECATED */
 
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index f4d5c998cc2b..ce150a3e7248 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -21,6 +21,9 @@
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
 
+/* Flags for epoll_wait5.  */
+#define EPOLL_KILLME 0x00000001
+
 /* Valid opcodes to issue to sys_epoll_ctl() */
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..cd089bdc5b17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/eventpoll.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -917,6 +918,7 @@ void __noreturn do_exit(long code)
 		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	exit_tasks_rcu_finish();
+	exit_killme();
 
 	lockdep_free_task(tsk);
 	do_task_dead();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..d6252772d593 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -1029,6 +1030,22 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * Check death row.
+	 */
+	if (!list_empty(eventpoll_deathrow_list())) {
+		struct list_head *l = eventpoll_deathrow_list();
+		struct task_struct *ts = list_first_entry(l,
+					 struct task_struct, se.deathrow);
+
+		pr_debug("Killing pid %u from EPOLL_KILLME death row.",
+			ts->pid);
+
+		/* We use SIGKILL so as to cleanly interrupt ep_poll() */
+		kill_pid(task_pid(ts), SIGKILL, 1);
+		return true;
+	}
+
 	/*
 	 * The OOM killer does not compensate for IO-less reclaim.
 	 * pagefault_out_of_memory lost its gfp context so we have to
-- 
2.15.0.rc2

WARNING: multiple messages have this Message-ID (diff)
From: Shawn Landden <slandden@gmail.com>
Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, Shawn Landden <slandden@gmail.com>
Subject: [RFC] EPOLL_KILLME: New flag to epoll_wait() that subscribes process to death row (new syscall)
Date: Tue, 31 Oct 2017 22:32:44 -0700	[thread overview]
Message-ID: <20171101053244.5218-1-slandden@gmail.com> (raw)

It is common for services to be stateless around their main event loop.
If a process passes the EPOLL_KILLME flag to epoll_wait5() then it
tells the kernel that this epoll_wait5() call need not complete, and the
kernel may send SIGKILL to the process if resources get tight.

See my systemd patch: https://github.com/shawnl/systemd/tree/killme

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/eventpoll.c                         | 74 +++++++++++++++++++++++++++++++++-
 include/linux/eventpoll.h              |  2 +
 include/linux/sched.h                  |  3 ++
 include/uapi/asm-generic/unistd.h      |  5 ++-
 include/uapi/linux/eventpoll.h         |  3 ++
 kernel/exit.c                          |  2 +
 mm/oom_kill.c                          | 17 ++++++++
 9 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..040e5d02bdcc 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
+385	i386	epoll_wait5		sys_epoll_wait5
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..c72802e8cf65 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	epoll_wait5		sys_epoll_wait5
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..76d1c91d940b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -297,6 +297,14 @@ static LIST_HEAD(visited_list);
  */
 static LIST_HEAD(tfile_check_list);
 
+static LIST_HEAD(deathrow_q);
+static long deathrow_len __read_mostly;
+
+/* TODO: Can this lock be removed by using atomic instructions to update
+ * queue?
+ */
+static DEFINE_MUTEX(deathrow_mutex);
+
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -314,6 +322,15 @@ struct ctl_table epoll_table[] = {
 		.extra1		= &zero,
 		.extra2		= &long_max,
 	},
+	{
+		.procname	= "deathrow_size",
+		.data		= &deathrow_len,
+		.maxlen		= sizeof(deathrow_len),
+		.mode		= 0444,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &long_max,
+	},
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
@@ -2164,9 +2181,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
+ *
+ * A flags argument cannot be added to epoll_pwait because it already has
+ * the maximum number of arguments (6). Can this be fixed?
  */
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
-		int, maxevents, int, timeout)
+SYSCALL_DEFINE5(epoll_wait5, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, int, flags)
 {
 	int error;
 	struct fd f;
@@ -2199,14 +2219,44 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 	 */
 	ep = f.file->private_data;
 
+	/* Check the EPOLL_* constants for conflicts.  */
+	BUILD_BUG_ON(EPOLL_KILLME == EPOLL_CLOEXEC);
+
+	if (flags & ~EPOLL_KILLME)
+		return -EINVAL;
+
+	if (flags & EPOLL_KILLME) {
+		/* Put process on death row. */
+		mutex_lock(&deathrow_mutex);
+		deathrow_len++;
+		list_add(&current->se.deathrow, &deathrow_q);
+		current->se.on_deathrow = 1;
+		mutex_unlock(&deathrow_mutex);
+	}
+
 	/* Time to fish for events ... */
 	error = ep_poll(ep, events, maxevents, timeout);
 
+	if (flags & EPOLL_KILLME) {
+		/* Remove process from death row. */
+		mutex_lock(&deathrow_mutex);
+		current->se.on_deathrow = 0;
+		list_del(&current->se.deathrow);
+		deathrow_len--;
+		mutex_unlock(&deathrow_mutex);
+	}
+
 error_fput:
 	fdput(f);
 	return error;
 }
 
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	return sys_epoll_wait5(epfd, events, maxevents, timeout, 0);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
@@ -2297,6 +2347,26 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 }
 #endif
 
+/* Clean up after an EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+int exit_killme(void)
+{
+	if (current->se.on_deathrow) {
+		mutex_lock(&deathrow_mutex);
+		current->se.on_deathrow = 0;
+		list_del(&current->se.deathrow);
+		mutex_unlock(&deathrow_mutex);
+	}
+
+	return 0;
+}
+
+struct list_head *eventpoll_deathrow_list(void)
+{
+	return &deathrow_q;
+}
+
 static int __init eventpoll_init(void)
 {
 	struct sysinfo si;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 2f14ac73d01d..f1e28d468de5 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -20,6 +20,8 @@
 /* Forward declarations to avoid compiler errors */
 struct file;
 
+int exit_killme(void);
+struct list_head *eventpoll_deathrow_list(void);
 
 #ifdef CONFIG_EPOLL
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..66462bf27a29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -380,6 +380,9 @@ struct sched_entity {
 	struct list_head		group_node;
 	unsigned int			on_rq;
 
+	unsigned			on_deathrow:1;
+	struct list_head		deathrow;
+
 	u64				exec_start;
 	u64				sum_exec_runtime;
 	u64				vruntime;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a5eb51..843553a39388 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -893,8 +893,11 @@ __SYSCALL(__NR_fork, sys_fork)
 __SYSCALL(__NR_fork, sys_ni_syscall)
 #endif /* CONFIG_MMU */
 
+#define __NR_epoll_wait5 1080
+__SYSCALL(__NR_epoll_wait5, sys_epoll_wait5)
+
 #undef __NR_syscalls
-#define __NR_syscalls (__NR_fork+1)
+#define __NR_syscalls (__NR_fork+2)
 
 #endif /* __ARCH_WANT_SYSCALL_DEPRECATED */
 
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index f4d5c998cc2b..ce150a3e7248 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -21,6 +21,9 @@
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
 
+/* Flags for epoll_wait5.  */
+#define EPOLL_KILLME 0x00000001
+
 /* Valid opcodes to issue to sys_epoll_ctl() */
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..cd089bdc5b17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/eventpoll.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -917,6 +918,7 @@ void __noreturn do_exit(long code)
 		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	exit_tasks_rcu_finish();
+	exit_killme();
 
 	lockdep_free_task(tsk);
 	do_task_dead();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..d6252772d593 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -1029,6 +1030,22 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * Check death row.
+	 */
+	if (!list_empty(eventpoll_deathrow_list())) {
+		struct list_head *l = eventpoll_deathrow_list();
+		struct task_struct *ts = list_first_entry(l,
+					 struct task_struct, se.deathrow);
+
+		pr_debug("Killing pid %u from EPOLL_KILLME death row.",
+			ts->pid);
+
+		/* We use SIGKILL so as to cleanly interrupt ep_poll() */
+		kill_pid(task_pid(ts), SIGKILL, 1);
+		return true;
+	}
+
 	/*
 	 * The OOM killer does not compensate for IO-less reclaim.
 	 * pagefault_out_of_memory lost its gfp context so we have to
-- 
2.15.0.rc2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)
From: Shawn Landden <slandden@gmail.com>
Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, Shawn Landden <slandden@gmail.com>
Subject: [RFC] EPOLL_KILLME: New flag to epoll_wait() that subscribes process to death row (new syscall)
Date: Tue, 31 Oct 2017 22:32:44 -0700	[thread overview]
Message-ID: <20171101053244.5218-1-slandden@gmail.com> (raw)

It is common for services to be stateless around their main event loop.
If a process passes the EPOLL_KILLME flag to epoll_wait5() then it
tells the kernel that this epoll_wait5() call need not complete, and the
kernel may send SIGKILL to the process if resources get tight.

See my systemd patch: https://github.com/shawnl/systemd/tree/killme

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/eventpoll.c                         | 74 +++++++++++++++++++++++++++++++++-
 include/linux/eventpoll.h              |  2 +
 include/linux/sched.h                  |  3 ++
 include/uapi/asm-generic/unistd.h      |  5 ++-
 include/uapi/linux/eventpoll.h         |  3 ++
 kernel/exit.c                          |  2 +
 mm/oom_kill.c                          | 17 ++++++++
 9 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..040e5d02bdcc 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
+385	i386	epoll_wait5		sys_epoll_wait5
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..c72802e8cf65 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	epoll_wait5		sys_epoll_wait5
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..76d1c91d940b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -297,6 +297,14 @@ static LIST_HEAD(visited_list);
  */
 static LIST_HEAD(tfile_check_list);
 
+static LIST_HEAD(deathrow_q);
+static long deathrow_len __read_mostly;
+
+/* TODO: Can this lock be removed by using atomic instructions to update
+ * queue?
+ */
+static DEFINE_MUTEX(deathrow_mutex);
+
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -314,6 +322,15 @@ struct ctl_table epoll_table[] = {
 		.extra1		= &zero,
 		.extra2		= &long_max,
 	},
+	{
+		.procname	= "deathrow_size",
+		.data		= &deathrow_len,
+		.maxlen		= sizeof(deathrow_len),
+		.mode		= 0444,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &long_max,
+	},
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
@@ -2164,9 +2181,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
+ *
+ * A flags argument cannot be added to epoll_pwait because it already has
+ * the maximum number of arguments (6). Can this be fixed?
  */
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
-		int, maxevents, int, timeout)
+SYSCALL_DEFINE5(epoll_wait5, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, int, flags)
 {
 	int error;
 	struct fd f;
@@ -2199,14 +2219,44 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 	 */
 	ep = f.file->private_data;
 
+	/* Check the EPOLL_* constants for conflicts.  */
+	BUILD_BUG_ON(EPOLL_KILLME == EPOLL_CLOEXEC);
+
+	if (flags & ~EPOLL_KILLME)
+		return -EINVAL;
+
+	if (flags & EPOLL_KILLME) {
+		/* Put process on death row. */
+		mutex_lock(&deathrow_mutex);
+		deathrow_len++;
+		list_add(&current->se.deathrow, &deathrow_q);
+		current->se.on_deathrow = 1;
+		mutex_unlock(&deathrow_mutex);
+	}
+
 	/* Time to fish for events ... */
 	error = ep_poll(ep, events, maxevents, timeout);
 
+	if (flags & EPOLL_KILLME) {
+		/* Remove process from death row. */
+		mutex_lock(&deathrow_mutex);
+		current->se.on_deathrow = 0;
+		list_del(&current->se.deathrow);
+		deathrow_len--;
+		mutex_unlock(&deathrow_mutex);
+	}
+
 error_fput:
 	fdput(f);
 	return error;
 }
 
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	return sys_epoll_wait5(epfd, events, maxevents, timeout, 0);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
@@ -2297,6 +2347,26 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 }
 #endif
 
+/* Clean up after an EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+int exit_killme(void)
+{
+	if (current->se.on_deathrow) {
+		mutex_lock(&deathrow_mutex);
+		current->se.on_deathrow = 0;
+		list_del(&current->se.deathrow);
+		mutex_unlock(&deathrow_mutex);
+	}
+
+	return 0;
+}
+
+struct list_head *eventpoll_deathrow_list(void)
+{
+	return &deathrow_q;
+}
+
 static int __init eventpoll_init(void)
 {
 	struct sysinfo si;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 2f14ac73d01d..f1e28d468de5 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -20,6 +20,8 @@
 /* Forward declarations to avoid compiler errors */
 struct file;
 
+int exit_killme(void);
+struct list_head *eventpoll_deathrow_list(void);
 
 #ifdef CONFIG_EPOLL
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..66462bf27a29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -380,6 +380,9 @@ struct sched_entity {
 	struct list_head		group_node;
 	unsigned int			on_rq;
 
+	unsigned			on_deathrow:1;
+	struct list_head		deathrow;
+
 	u64				exec_start;
 	u64				sum_exec_runtime;
 	u64				vruntime;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a5eb51..843553a39388 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -893,8 +893,11 @@ __SYSCALL(__NR_fork, sys_fork)
 __SYSCALL(__NR_fork, sys_ni_syscall)
 #endif /* CONFIG_MMU */
 
+#define __NR_epoll_wait5 1080
+__SYSCALL(__NR_epoll_wait5, sys_epoll_wait5)
+
 #undef __NR_syscalls
-#define __NR_syscalls (__NR_fork+1)
+#define __NR_syscalls (__NR_fork+2)
 
 #endif /* __ARCH_WANT_SYSCALL_DEPRECATED */
 
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index f4d5c998cc2b..ce150a3e7248 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -21,6 +21,9 @@
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
 
+/* Flags for epoll_wait5.  */
+#define EPOLL_KILLME 0x00000001
+
 /* Valid opcodes to issue to sys_epoll_ctl() */
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..cd089bdc5b17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/eventpoll.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -917,6 +918,7 @@ void __noreturn do_exit(long code)
 		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	exit_tasks_rcu_finish();
+	exit_killme();
 
 	lockdep_free_task(tsk);
 	do_task_dead();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..d6252772d593 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -1029,6 +1030,22 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * Check death row.
+	 */
+	if (!list_empty(eventpoll_deathrow_list())) {
+		struct list_head *l = eventpoll_deathrow_list();
+		struct task_struct *ts = list_first_entry(l,
+					 struct task_struct, se.deathrow);
+
+		pr_debug("Killing pid %u from EPOLL_KILLME death row.",
+			ts->pid);
+
+		/* We use SIGKILL so as to cleanly interrupt ep_poll() */
+		kill_pid(task_pid(ts), SIGKILL, 1);
+		return true;
+	}
+
 	/*
 	 * The OOM killer does not compensate for IO-less reclaim.
 	 * pagefault_out_of_memory lost its gfp context so we have to
-- 
2.15.0.rc2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

             reply	other threads:[~2017-11-01  5:32 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-11-01  5:32 Shawn Landden [this message]
2017-11-01  5:32 ` [RFC] EPOLL_KILLME: New flag to epoll_wait() that subscribes process to death row (new syscall) Shawn Landden
2017-11-01  5:32 ` Shawn Landden
2017-11-01 14:04 ` Matthew Wilcox
2017-11-01 14:04   ` Matthew Wilcox
2017-11-01 15:16 ` Colin Walters
2017-11-01 15:16   ` Colin Walters
2017-11-01 15:22   ` Colin Walters
2017-11-01 15:22     ` Colin Walters
2017-11-03  9:22     ` peter enderborg
2017-11-03  9:22       ` peter enderborg
2017-11-03  9:22       ` peter enderborg
2017-11-01 19:02   ` Shawn Landden
2017-11-01 19:37     ` Colin Walters
2017-11-01 19:37       ` Colin Walters
2017-11-01 19:43       ` Shawn Landden
2017-11-01 20:54       ` Shawn Landden
2017-11-02 15:24       ` Shawn Paul Landden
2017-11-02 15:24         ` Shawn Paul Landden
2017-11-01 19:05   ` Shawn Landden
2017-11-01 22:10 ` Tetsuo Handa
2017-11-01 22:10   ` Tetsuo Handa
2017-11-02  7:36   ` Shawn Landden
2017-11-02 15:45 ` Michal Hocko
2017-11-02 15:45   ` Michal Hocko
2017-11-03  6:35 ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops Shawn Landden
2017-11-03  6:35   ` Shawn Landden
2017-11-03  6:35   ` Shawn Landden
2017-11-03  9:09   ` Michal Hocko
2017-11-03  9:09     ` Michal Hocko
2017-11-18  4:45     ` Shawn Landden
2017-11-19  4:19       ` Matthew Wilcox
2017-11-19  4:19         ` Matthew Wilcox
2017-11-19  4:19         ` Matthew Wilcox
2017-11-20  8:35       ` Michal Hocko
2017-11-20  8:35         ` Michal Hocko
2017-11-21  4:48         ` Shawn Landden
2017-11-21  4:48           ` Shawn Landden
2017-11-21  7:05           ` Michal Hocko
2017-11-21  7:05             ` Michal Hocko
2017-11-18 20:33     ` Shawn Landden
2017-11-18 20:33       ` Shawn Landden
2017-11-15 21:11   ` Pavel Machek
2017-11-21  4:49   ` [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight Shawn Landden
2017-11-21  4:49     ` Shawn Landden
2017-11-21  4:49     ` Shawn Landden
2017-11-21  4:56     ` Shawn Landden
2017-11-21  4:56       ` Shawn Landden
2017-11-21  5:16     ` [RFC v4] " Shawn Landden
2017-11-21  5:16       ` Shawn Landden
2017-11-21  5:16       ` Shawn Landden
2017-11-21  5:26       ` Shawn Landden
2017-11-21  5:26         ` Shawn Landden
2017-11-21  9:14       ` Thomas Gleixner
2017-11-21  9:14         ` Thomas Gleixner
2017-11-22 10:29   ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops peter enderborg
2017-11-22 10:29     ` peter enderborg
2017-11-22 10:29     ` peter enderborg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171101053244.5218-1-slandden@gmail.com \
    --to=slandden@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.