BPF Archive on lore.kernel.org
 help / color / Atom feed
* [RFC][PATCH] net/bpfilter:  Remove this broken and apparently unmantained
       [not found]   ` <202006051903.C44988B@keescook>
@ 2020-06-06 19:20     ` Eric W. Biederman
  2020-06-06 20:19       ` Alexei Starovoitov
                         ` (2 more replies)
  0 siblings, 3 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-06 19:20 UTC (permalink / raw)
  To: Kees Cook
  Cc: Tetsuo Handa, akpm, ast, davem, viro, Alexei Starovoitov, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, Linus Torvalds


Tetsuo Handa recently noticed that the exec support of bpfilter is buggy.
https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/

I agree with Al that Tetsuo's patch does not lend clarity to the code in
exec.  At a rough glance Tetsuo's patch does appear correct.

There have been no replies from the people who I expect would be
maintainers of the code.  When I look at the history of the code all it
appears to have received since it was merged was trivial maintenance
updates.  There has been no apparent work to finish fleshing out the
code to do what it was aimed to do.

Examining the code, the pid handling is questionable.  The custom hook
into do_exit might prevent it but it appears that shutdown_umh has every
possibility of sending SIGKILL to the wrong process.

The Kconfig documentation lists this code as experimental.

The code only supports ipv4, not ipv6: another strong sign that this
code has not been going anywhere.

So as far as I can tell this bpfilter code was an experiment that did
not succeed and now no one cares about it.

So let's fix all of the bugs by removing the code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---

Kees, Tetsuo.  Unless someone chimes in and says they care I will
rebase this patch onto -rc1 to ensure I haven't missed something
because of the merge window and send this to Linus.

 fs/exec.c                        |  38 ++-----
 include/linux/binfmts.h          |   1 -
 include/linux/sched.h            |   8 --
 include/linux/umh.h              |  11 ---
 kernel/exit.c                    |   1 -
 kernel/umh.c                     | 163 +------------------------------
 net/Kconfig                      |   1 -
 net/Makefile                     |   1 -
 net/bpfilter/.gitignore          |   2 -
 net/bpfilter/Kconfig             |  16 ---
 net/bpfilter/Makefile            |  21 ----
 net/bpfilter/bpfilter_kern.c     | 128 ------------------------
 net/bpfilter/bpfilter_umh_blob.S |   7 --
 net/bpfilter/main.c              |  64 ------------
 net/bpfilter/msgfmt.h            |  17 ----
 net/ipv4/Makefile                |   1 -
 net/ipv4/bpfilter/Makefile       |   2 -
 net/ipv4/bpfilter/sockopt.c      |  78 ---------------
 18 files changed, 12 insertions(+), 548 deletions(-)
 delete mode 100644 net/bpfilter/.gitignore
 delete mode 100644 net/bpfilter/Kconfig
 delete mode 100644 net/bpfilter/Makefile
 delete mode 100644 net/bpfilter/bpfilter_kern.c
 delete mode 100644 net/bpfilter/bpfilter_umh_blob.S
 delete mode 100644 net/bpfilter/main.c
 delete mode 100644 net/bpfilter/msgfmt.h
 delete mode 100644 net/ipv4/bpfilter/Makefile
 delete mode 100644 net/ipv4/bpfilter/sockopt.c

diff --git a/fs/exec.c b/fs/exec.c
index e8599236290d..e6c24dabc1e4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1795,13 +1795,14 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int __do_execve_file(int fd, struct filename *filename,
-			    struct user_arg_ptr argv,
-			    struct user_arg_ptr envp,
-			    int flags, struct file *file)
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
 {
 	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
+	struct file *file;
 	struct files_struct *displaced;
 	int retval;
 
@@ -1840,8 +1841,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;
 
-	if (!file)
-		file = do_open_execat(fd, filename, flags);
+	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1849,9 +1849,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	sched_exec();
 
 	bprm->file = file;
-	if (!filename) {
-		bprm->filename = "none";
-	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
+	if (fd == AT_FDCWD || filename->name[0] == '/') {
 		bprm->filename = filename->name;
 	} else {
 		if (filename->name[0] == '\0')
@@ -1912,8 +1910,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	task_numa_free(current, false);
 	free_bprm(bprm);
 	kfree(pathbuf);
-	if (filename)
-		putname(filename);
+	putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
 	return retval;
@@ -1944,27 +1941,10 @@ static int __do_execve_file(int fd, struct filename *filename,
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
-	if (filename)
-		putname(filename);
+	putname(filename);
 	return retval;
 }
 
-static int do_execveat_common(int fd, struct filename *filename,
-			      struct user_arg_ptr argv,
-			      struct user_arg_ptr envp,
-			      int flags)
-{
-	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
-}
-
-int do_execve_file(struct file *file, void *__argv, void *__envp)
-{
-	struct user_arg_ptr argv = { .ptr.native = __argv };
-	struct user_arg_ptr envp = { .ptr.native = __envp };
-
-	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
-}
-
 int do_execve(struct filename *filename,
 	const char __user *const __user *__argv,
 	const char __user *const __user *__envp)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index aece1b340e7d..e01eddc42750 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -142,6 +142,5 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
-int do_execve_file(struct file *file, void *__argv, void *__envp);
 
 #endif /* _LINUX_BINFMTS_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4418f5cb8324..73d0eb46a67f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1986,14 +1986,6 @@ static inline void rseq_execve(struct task_struct *t)
 
 #endif
 
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-
 #ifdef CONFIG_DEBUG_RSEQ
 
 void rseq_syscall(struct pt_regs *regs);
diff --git a/include/linux/umh.h b/include/linux/umh.h
index 0c08de356d0d..128ae92e6983 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -22,10 +22,8 @@ struct subprocess_info {
 	const char *path;
 	char **argv;
 	char **envp;
-	struct file *file;
 	int wait;
 	int retval;
-	pid_t pid;
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
@@ -43,15 +41,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
-struct umh_info {
-	const char *cmdline;
-	struct file *pipe_to_umh;
-	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
-	pid_t pid;
-};
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
 
 extern int
 call_usermodehelper_exec(struct subprocess_info *info, int wait);
diff --git a/kernel/exit.c b/kernel/exit.c
index ce2a75bc0ade..989f1ada0bf1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -795,7 +795,6 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..a9a6032e08a6 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -26,8 +26,6 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
 
 #include <trace/events/module.h>
 
@@ -39,7 +37,6 @@ static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
 static DEFINE_SPINLOCK(umh_sysctl_lock);
 static DECLARE_RWSEM(umhelper_sem);
 static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
 
 static void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
@@ -102,16 +99,9 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	sub_info->pid = task_pid_nr(current);
-	if (sub_info->file) {
-		retval = do_execve_file(sub_info->file,
-					sub_info->argv, sub_info->envp);
-		if (!retval)
-			current->flags |= PF_UMH;
-	} else
-		retval = do_execve(getname_kernel(sub_info->path),
-				   (const char __user *const __user *)sub_info->argv,
-				   (const char __user *const __user *)sub_info->envp);
+	retval = do_execve(getname_kernel(sub_info->path),
+			   (const char __user *const __user *)sub_info->argv,
+			   (const char __user *const __user *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
@@ -405,133 +395,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-		int (*init)(struct subprocess_info *info, struct cred *new),
-		void (*cleanup)(struct subprocess_info *info), void *data)
-{
-	struct subprocess_info *sub_info;
-	struct umh_info *info = data;
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
-	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
-	if (!sub_info)
-		return NULL;
-
-	sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!sub_info->argv) {
-		kfree(sub_info);
-		return NULL;
-	}
-
-	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-	sub_info->path = "none";
-	sub_info->file = file;
-	sub_info->init = init;
-	sub_info->cleanup = cleanup;
-	sub_info->data = data;
-	return sub_info;
-}
-
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
-{
-	struct umh_info *umh_info = info->data;
-	struct file *from_umh[2];
-	struct file *to_umh[2];
-	int err;
-
-	/* create pipe to send data to umh */
-	err = create_pipe_files(to_umh, 0);
-	if (err)
-		return err;
-	err = replace_fd(0, to_umh[0], 0);
-	fput(to_umh[0]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		return err;
-	}
-
-	/* create pipe to receive data from umh */
-	err = create_pipe_files(from_umh, 0);
-	if (err) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		return err;
-	}
-	err = replace_fd(1, from_umh[1], 0);
-	fput(from_umh[1]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		fput(from_umh[0]);
-		return err;
-	}
-
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	return 0;
-}
-
-static void umh_clean_and_save_pid(struct subprocess_info *info)
-{
-	struct umh_info *umh_info = info->data;
-
-	argv_free(info->argv);
-	umh_info->pid = info->pid;
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
-	struct subprocess_info *sub_info;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
-	int err;
-
-	file = shmem_kernel_file_setup("", len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
-	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
-						  umh_clean_and_save_pid, info);
-	if (!sub_info)
-		goto out;
-
-	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
-out:
-	fput(file);
-	return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
 
 /**
  * call_usermodehelper_exec - start a usermode application
@@ -689,26 +552,6 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umh_info *info;
-	pid_t pid = tsk->pid;
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
-
 struct ctl_table usermodehelper_table[] = {
 	{
 		.procname	= "bset",
diff --git a/net/Kconfig b/net/Kconfig
index df8d8c9bd021..56066e279336 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -209,7 +209,6 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
-source "net/bpfilter/Kconfig"
 
 source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 07ea48160874..5148cce5f588 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX_SCM)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
-obj-$(CONFIG_BPFILTER)		+= bpfilter/
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore
deleted file mode 100644
index f34e85ee8204..000000000000
--- a/net/bpfilter/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-bpfilter_umh
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
deleted file mode 100644
index fed9290e3b41..000000000000
--- a/net/bpfilter/Kconfig
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-menuconfig BPFILTER
-	bool "BPF based packet filtering framework (BPFILTER)"
-	depends on NET && BPF && INET
-	help
-	  This builds experimental bpfilter framework that is aiming to
-	  provide netfilter compatible functionality via BPF
-
-if BPFILTER
-config BPFILTER_UMH
-	tristate "bpfilter kernel module with user mode helper"
-	depends on CC_CAN_LINK
-	default m
-	help
-	  This builds bpfilter kernel module with embedded user mode helper
-endif
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
deleted file mode 100644
index 36580301da70..000000000000
--- a/net/bpfilter/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the Linux BPFILTER layer.
-#
-
-hostprogs := bpfilter_umh
-bpfilter_umh-objs := main.o
-KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi
-HOSTCC := $(CC)
-
-ifeq ($(CONFIG_BPFILTER_UMH), y)
-# builtin bpfilter_umh should be compiled with -static
-# since rootfs isn't mounted at the time of __init
-# function is called and do_execv won't find elf interpreter
-KBUILD_HOSTLDFLAGS += -static
-endif
-
-$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh
-
-obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
-bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
deleted file mode 100644
index c0f0990f30b6..000000000000
--- a/net/bpfilter/bpfilter_kern.c
+++ /dev/null
@@ -1,128 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/umh.h>
-#include <linux/bpfilter.h>
-#include <linux/sched.h>
-#include <linux/sched/signal.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include "msgfmt.h"
-
-extern char bpfilter_umh_start;
-extern char bpfilter_umh_end;
-
-static void shutdown_umh(void)
-{
-	struct task_struct *tsk;
-
-	if (bpfilter_ops.stop)
-		return;
-
-	tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID);
-	if (tsk) {
-		send_sig(SIGKILL, tsk, 1);
-		put_task_struct(tsk);
-	}
-}
-
-static void __stop_umh(void)
-{
-	if (IS_ENABLED(CONFIG_INET))
-		shutdown_umh();
-}
-
-static int __bpfilter_process_sockopt(struct sock *sk, int optname,
-				      char __user *optval,
-				      unsigned int optlen, bool is_set)
-{
-	struct mbox_request req;
-	struct mbox_reply reply;
-	loff_t pos;
-	ssize_t n;
-	int ret = -EFAULT;
-
-	req.is_set = is_set;
-	req.pid = current->pid;
-	req.cmd = optname;
-	req.addr = (long __force __user)optval;
-	req.len = optlen;
-	if (!bpfilter_ops.info.pid)
-		goto out;
-	n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req),
-			   &pos);
-	if (n != sizeof(req)) {
-		pr_err("write fail %zd\n", n);
-		__stop_umh();
-		ret = -EFAULT;
-		goto out;
-	}
-	pos = 0;
-	n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply),
-			&pos);
-	if (n != sizeof(reply)) {
-		pr_err("read fail %zd\n", n);
-		__stop_umh();
-		ret = -EFAULT;
-		goto out;
-	}
-	ret = reply.status;
-out:
-	return ret;
-}
-
-static int start_umh(void)
-{
-	int err;
-
-	/* fork usermode process */
-	err = fork_usermode_blob(&bpfilter_umh_start,
-				 &bpfilter_umh_end - &bpfilter_umh_start,
-				 &bpfilter_ops.info);
-	if (err)
-		return err;
-	bpfilter_ops.stop = false;
-	pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid);
-
-	/* health check that usermode process started correctly */
-	if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
-		shutdown_umh();
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int __init load_umh(void)
-{
-	int err;
-
-	mutex_lock(&bpfilter_ops.lock);
-	if (!bpfilter_ops.stop) {
-		err = -EFAULT;
-		goto out;
-	}
-	err = start_umh();
-	if (!err && IS_ENABLED(CONFIG_INET)) {
-		bpfilter_ops.sockopt = &__bpfilter_process_sockopt;
-		bpfilter_ops.start = &start_umh;
-	}
-out:
-	mutex_unlock(&bpfilter_ops.lock);
-	return err;
-}
-
-static void __exit fini_umh(void)
-{
-	mutex_lock(&bpfilter_ops.lock);
-	if (IS_ENABLED(CONFIG_INET)) {
-		shutdown_umh();
-		bpfilter_ops.start = NULL;
-		bpfilter_ops.sockopt = NULL;
-	}
-	mutex_unlock(&bpfilter_ops.lock);
-}
-module_init(load_umh);
-module_exit(fini_umh);
-MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
deleted file mode 100644
index 9ea6100dca87..000000000000
--- a/net/bpfilter/bpfilter_umh_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-	.section .rodata, "a"
-	.global bpfilter_umh_start
-bpfilter_umh_start:
-	.incbin "net/bpfilter/bpfilter_umh"
-	.global bpfilter_umh_end
-bpfilter_umh_end:
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
deleted file mode 100644
index 05e1cfc1e5cd..000000000000
--- a/net/bpfilter/main.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-#include <sys/uio.h>
-#include <errno.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "../../include/uapi/linux/bpf.h"
-#include <asm/unistd.h>
-#include "msgfmt.h"
-
-FILE *debug_f;
-
-static int handle_get_cmd(struct mbox_request *cmd)
-{
-	switch (cmd->cmd) {
-	case 0:
-		return 0;
-	default:
-		break;
-	}
-	return -ENOPROTOOPT;
-}
-
-static int handle_set_cmd(struct mbox_request *cmd)
-{
-	return -ENOPROTOOPT;
-}
-
-static void loop(void)
-{
-	while (1) {
-		struct mbox_request req;
-		struct mbox_reply reply;
-		int n;
-
-		n = read(0, &req, sizeof(req));
-		if (n != sizeof(req)) {
-			fprintf(debug_f, "invalid request %d\n", n);
-			return;
-		}
-
-		reply.status = req.is_set ?
-			handle_set_cmd(&req) :
-			handle_get_cmd(&req);
-
-		n = write(1, &reply, sizeof(reply));
-		if (n != sizeof(reply)) {
-			fprintf(debug_f, "reply failed %d\n", n);
-			return;
-		}
-	}
-}
-
-int main(void)
-{
-	debug_f = fopen("/dev/kmsg", "w");
-	setvbuf(debug_f, 0, _IOLBF, 0);
-	fprintf(debug_f, "Started bpfilter\n");
-	loop();
-	fclose(debug_f);
-	return 0;
-}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
deleted file mode 100644
index 98d121c62945..000000000000
--- a/net/bpfilter/msgfmt.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NET_BPFILTER_MSGFMT_H
-#define _NET_BPFILTER_MSGFMT_H
-
-struct mbox_request {
-	__u64 addr;
-	__u32 len;
-	__u32 is_set;
-	__u32 cmd;
-	__u32 pid;
-};
-
-struct mbox_reply {
-	__u32 status;
-};
-
-#endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9e1a186a3671..e4c1cb8df316 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -16,7 +16,6 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
 	     metrics.o netlink.o nexthop.o
 
-obj-$(CONFIG_BPFILTER) += bpfilter/
 
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
deleted file mode 100644
index 00af5305e05a..000000000000
--- a/net/ipv4/bpfilter/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_BPFILTER) += sockopt.o
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
deleted file mode 100644
index 0480918bfc7c..000000000000
--- a/net/ipv4/bpfilter/sockopt.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/bpfilter.h>
-#include <uapi/linux/bpf.h>
-#include <linux/wait.h>
-#include <linux/kmod.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-
-struct bpfilter_umh_ops bpfilter_ops;
-EXPORT_SYMBOL_GPL(bpfilter_ops);
-
-static void bpfilter_umh_cleanup(struct umh_info *info)
-{
-	mutex_lock(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
-	fput(info->pipe_to_umh);
-	fput(info->pipe_from_umh);
-	info->pid = 0;
-	mutex_unlock(&bpfilter_ops.lock);
-}
-
-static int bpfilter_mbox_request(struct sock *sk, int optname,
-				 char __user *optval,
-				 unsigned int optlen, bool is_set)
-{
-	int err;
-	mutex_lock(&bpfilter_ops.lock);
-	if (!bpfilter_ops.sockopt) {
-		mutex_unlock(&bpfilter_ops.lock);
-		request_module("bpfilter");
-		mutex_lock(&bpfilter_ops.lock);
-
-		if (!bpfilter_ops.sockopt) {
-			err = -ENOPROTOOPT;
-			goto out;
-		}
-	}
-	if (bpfilter_ops.stop) {
-		err = bpfilter_ops.start();
-		if (err)
-			goto out;
-	}
-	err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set);
-out:
-	mutex_unlock(&bpfilter_ops.lock);
-	return err;
-}
-
-int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
-			    unsigned int optlen)
-{
-	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
-}
-
-int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
-			    int __user *optlen)
-{
-	int len;
-
-	if (get_user(len, optlen))
-		return -EFAULT;
-
-	return bpfilter_mbox_request(sk, optname, optval, len, false);
-}
-
-static int __init bpfilter_sockopt_init(void)
-{
-	mutex_init(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
-	bpfilter_ops.info.cmdline = "bpfilter_umh";
-	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
-
-	return 0;
-}
-device_initcall(bpfilter_sockopt_init);
-- 
2.20.1

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter:  Remove this broken and apparently unmantained
  2020-06-06 19:20     ` [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained Eric W. Biederman
@ 2020-06-06 20:19       ` Alexei Starovoitov
  2020-06-06 22:33         ` Linus Torvalds
  2020-06-06 20:43       ` Matthew Wilcox
  2020-06-07  1:13       ` Tetsuo Handa
  2 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-06 20:19 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Kees Cook, Tetsuo Handa, akpm, ast, davem, viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, Linus Torvalds

On Sat, Jun 06, 2020 at 02:20:36PM -0500, Eric W. Biederman wrote:
> 
> Tetsuo Honda recently noticed that the exec support of bpfilter is buggy.
> https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
> 
> I agree with Al that Tetsuo's patch does not lend clarity to the code in
> exec.  At a rough glance Tetsuo's patch does appear correct.
> 
> There have been no replies from the people who I expect would be
> maintainers of the code.  When I look at the history of the code all it
> appears to have received since it was merged was trivial maintenance
> updates.  There has been no apparent work to finish fleshing out the
> code to do what it is was aimed to do.
> 
> Examinine the code the pid handling is questionable.  The custom hook
> into do_exit might prevent it but it appears that shutdown_umh has every
> possibility of sending SIGKILL to the wrong process.
> 
> The Kconfig documentation lists this code as experimental.
> 
> The code only supports ipv4 not ipv6 another strong sign that this
> code has not been going anywhere.
> 
> So as far as I can tell this bpfilter code was an experiment that did
> not succeed and now no one cares about it.
> 
> So let's fix all of the bugs by removing the code.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> 
> Kees, Tesuo.  Unless someone chimes in and says they care I will
> rebase this patch onto -rc1 to ensure I haven't missed something
> because of the merge window and send this to Linus.

NACKed-by: Alexei Starovoitov <ast@kernel.org>

Please mention specific bugs and let's fix them.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter:  Remove this broken and apparently unmantained
  2020-06-06 19:20     ` [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained Eric W. Biederman
  2020-06-06 20:19       ` Alexei Starovoitov
@ 2020-06-06 20:43       ` Matthew Wilcox
  2020-06-07 15:51         ` Eric W. Biederman
  2020-06-07  1:13       ` Tetsuo Handa
  2 siblings, 1 reply; 194+ messages in thread
From: Matthew Wilcox @ 2020-06-06 20:43 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Kees Cook, Tetsuo Handa, akpm, ast, davem, viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, Linus Torvalds

On Sat, Jun 06, 2020 at 02:20:36PM -0500, Eric W. Biederman wrote:
> @@ -39,7 +37,6 @@ static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
>  static DEFINE_SPINLOCK(umh_sysctl_lock);
>  static DECLARE_RWSEM(umhelper_sem);
>  static LIST_HEAD(umh_list);
> -static DEFINE_MUTEX(umh_list_lock);

You can delete the umh_list too; you've deleted all its users.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-06 20:19       ` Alexei Starovoitov
@ 2020-06-06 22:33         ` Linus Torvalds
  2020-06-07  1:49           ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Linus Torvalds @ 2020-06-06 22:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sat, Jun 6, 2020 at 1:20 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> Please mention specific bugs and let's fix them.

Well, Eric did mention one explicit bug, and several "looks dodgy" bugs.

And the fact is, this isn't used.

It's clever, and I like the concept, but it was probably a mistake to
do this as a user-mode-helper thing.

If people really convert netfilter rules to bpf, they'll likely do so
in user space. This bpfilter thing hasn't gone anywhere, and it _has_
caused problems.

So Alexei, I think the burden of proof is not on Eric, but on you.

Eric's claim is that

 (a) it has bugs (and yes, he pointed to at least one)

 (b) it's not doing anything useful

 (c) it's a maintenance issue for execve, which is what Eric maintains.

So you can't just dismiss this, ignore the reported bug, and say
"we'll fix them".

That only answers (a) (well, it _would_ have answered (a), except you
actually didn't even read Eric's report of existing bugs).

What is your answer to (b)-(c)?

             Linus

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-06 19:20     ` [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained Eric W. Biederman
  2020-06-06 20:19       ` Alexei Starovoitov
  2020-06-06 20:43       ` Matthew Wilcox
@ 2020-06-07  1:13       ` Tetsuo Handa
  2 siblings, 0 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-07  1:13 UTC (permalink / raw)
  To: Bruno Meneguele
  Cc: Eric W. Biederman, Kees Cook, akpm, ast, davem, viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Linus Torvalds

On 2020/06/07 4:20, Eric W. Biederman wrote:
> Kees, Tesuo.  Unless someone chimes in and says they care I will
> rebase this patch onto -rc1 to ensure I haven't missed something
> because of the merge window and send this to Linus.

Is the exec support of bpfilter already supported by distributions?
I can see that RHEL8's 4.18 kernel contains several changelogs
( https://git.centos.org/rpms/kernel/blob/c8/f/SPECS/kernel.spec )
that might be relevant to the exec support of bpfilter.
If it is not supported yet, removing the code would be a choice.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-06 22:33         ` Linus Torvalds
@ 2020-06-07  1:49           ` Alexei Starovoitov
  2020-06-07  2:19             ` Linus Torvalds
                               ` (2 more replies)
  0 siblings, 3 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-07  1:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric W. Biederman, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sat, Jun 06, 2020 at 03:33:14PM -0700, Linus Torvalds wrote:
> On Sat, Jun 6, 2020 at 1:20 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > Please mention specific bugs and let's fix them.
> 
> Well, Eric did mention one explicit bug, and several "looks dodgy" bugs.
> 
> And the fact is, this isn't used.
> 
> It's clever, and I like the concept, but it was probably a mistake to
> do this as a user-mode-helper thing.
> 
> If people really convert netfilter rules to bpf, they'll likely do so
> in user space. This bpfilter thing hasn't gone anywhere, and it _has_
> caused problems.
> 
> So Alexei, I think the burden of proof is not on Eric, but on you.
> 
> Eric's claim is that
> 
>  (a) it has bugs (and yes, he pointed to at lelast one)

the patch from March 12 ?
I thought it landed long ago. Is there an issue with it?
'handling is questionable' is not very constructive.

>  (b) it's not doing anything useful

true.

>  (b) it's a maintenance issue for execve, which is what Eric maintains.

I'm not aware of execve issues. I don't remember being cc-ed on them.
To me this 'lets remove everything' patch comes out of nowhere with
a link to three month old patch as a justification.

> So you can't just dismiss this, ignore the reported bug, and say
> "we'll fix them".
> 
> That only answers (a) (well, it _would_ have answered (a)., except you
> actually didn't even read Eric's report of existing bugs).
> 
> What is your answer to (b)-(c)?

So far we had two attempts at converting netfilter rules to bpf. Both ended up
with user space implementation and short cuts. bpf side didn't have loops and
couldn't support 10k+ rules. That is what stalled the effort. imo it's a
pointless corner case, but to be a true replacement people kept bringing it up
as something valid. Now we have bpf iterator concept and soon bpf will be able
to handle millions of rules. Folks are also realizing that this effort has
to be project managed appropriately. Will it materialize in patches tomorrow?
Unlikely. Probably another 6 months at least. Also outside of netfilter
conversion we've started /proc extension effort that will use the same umh
facility. It won't be ready tomorrow either, but both need umh. initrd is not
an option due to operational constraints. We need a way to ship kernel tarball
where bpf things are ready at boot. I suspect /proc extensions patches will
land sooner. A couple of months ago people used umh to do ovs->xdp translation. It
didn't land. People argued that the same thing can be achieved in user space
and they were correct. So you're right that for most folks user space is the
answer. But there are cases where kernel has to have these things before
systemd starts.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  1:49           ` Alexei Starovoitov
@ 2020-06-07  2:19             ` Linus Torvalds
  2020-06-07 16:09               ` Eric W. Biederman
  2020-06-08 16:20               ` Alexei Starovoitov
  2020-06-07  2:31             ` Tetsuo Handa
  2020-06-07  5:58             ` Eric W. Biederman
  2 siblings, 2 replies; 194+ messages in thread
From: Linus Torvalds @ 2020-06-07  2:19 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sat, Jun 6, 2020 at 6:49 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>>
> I'm not aware of execve issues. I don't remember being cc-ed on them.
> To me this 'lets remove everything' patch comes out of nowhere with
> a link to three month old patch as a justification.

Well, it's out of nowhere as far as bpf is concerned, but we've had a
fair amount of discussions about execve cleanups (and a fair amount of
work too, not just discussion) lately

So it comes out of "execve is rather grotty", and trying to make it
simpler and have fewer special cases.

> So far we had two attempts at converting netfilter rules to bpf. Both ended up
> with user space implementation and short cuts.

So I have a question: are we convinced that doing this "netfilter
conversion" in user space is required at all?

I realize that yes, running clang is not something we'd want to do in
kernel space, that's not what I'm asking.

But how much might be doable at kernel compile time (run clang to
generate bpf statically when building the kernel) together with some
simplistic run-time parameterized JITting for the table details that
the kernel could do on its own without a real compiler?

Because the problem with this code isn't the "use bpf for netfilter
rules", it's the "run a user mode helper". The execve thing is
actually only incidental, it also ends up being a somewhat interesting
issue wrt namespacing and security (and bootstrapping - I'm not
convinced people want to have a clang bpf compiler in initrd etc).

So particularly if we accept the fact that we won't necessarily need
all of netfilter converted in general - some will be just translated
entirely independently in user space and not use netfilter at all
(just bpf loaded normally)

IOW there would potentially only be a (fairly small?) core set that
the kernel would need to be able to handle "natively".

Am I just blathering?

                  Linus

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  1:49           ` Alexei Starovoitov
  2020-06-07  2:19             ` Linus Torvalds
@ 2020-06-07  2:31             ` Tetsuo Handa
  2020-06-08 16:23               ` Alexei Starovoitov
  2020-06-07  5:58             ` Eric W. Biederman
  2 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-07  2:31 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On 2020/06/07 10:49, Alexei Starovoitov wrote:
> So you're right that for most folks user space is the
> answer. But there are cases where kernel has to have these things before
> systemd starts.

Why can't such cases use the init= kernel command line argument?
The program specified via init= kernel command line argument can do anything
before systemd (a.k.a. global init process) starts.

By the way, from the LSM perspective, doing a lot of things before global init
process starts is not desirable, for access decision can be made only after policy
is loaded (which is generally when /sbin/init on a device specified via root=
kernel command line argument becomes ready). Since
fork_usermode_blob((void *) "#!/bin/true\n", 12, info) is possible, I worry that
the ability to start userspace code is abused for bypassing dentry/inode-based
permission checks.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  1:49           ` Alexei Starovoitov
  2020-06-07  2:19             ` Linus Torvalds
  2020-06-07  2:31             ` Tetsuo Handa
@ 2020-06-07  5:58             ` Eric W. Biederman
  2020-06-07 11:56               ` Eric W. Biederman
  2020-06-08 16:33               ` Alexei Starovoitov
  2 siblings, 2 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-07  5:58 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Sat, Jun 06, 2020 at 03:33:14PM -0700, Linus Torvalds wrote:
>> On Sat, Jun 6, 2020 at 1:20 PM Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> >
>> > Please mention specific bugs and let's fix them.
>> 
>> Well, Eric did mention one explicit bug, and several "looks dodgy" bugs.
>> 
>> And the fact is, this isn't used.
>> 
>> It's clever, and I like the concept, but it was probably a mistake to
>> do this as a user-mode-helper thing.
>> 
>> If people really convert netfilter rules to bpf, they'll likely do so
>> in user space. This bpfilter thing hasn't gone anywhere, and it _has_
>> caused problems.
>> 
>> So Alexei, I think the burden of proof is not on Eric, but on you.
>> 
>> Eric's claim is that
>> 
>>  (a) it has bugs (and yes, he pointed to at least one)
>
> the patch from March 12 ?
> I thought it landed long ago. Is there an issue with it?
> 'handling is questionable' is not very constructive.

It was half a fix.  Tetsuo still doesn't know how to fix tomoyo to work
with fork_usermode_blob.

He was asking for your feedback and you did not give it.

The truth is Tetsuo's fix was only a fix for the symptoms.  It was not a
good fix to the code.

>>  (b) it's not doing anything useful
>
> true.
>
>>  (c) it's a maintenance issue for execve, which is what Eric maintains.
>
> I'm not aware of execve issues. I don't remember being cc-ed on them.
> To me this 'lets remove everything' patch comes out of nowhere with
> a link to three month old patch as a justification.

I needed to know how dead the code is and your reply has confirmed
that the code is dead.

Deleting the code is much easier than the detailed careful work it would
take to make code that is in use work correctly.

>> So you can't just dismiss this, ignore the reported bug, and say
>> "we'll fix them".
>> 
>> That only answers (a) (well, it _would_ have answered (a)., except you
>> actually didn't even read Eric's report of existing bugs).
>> 
>> What is your answer to (b)-(c)?
>
> So far we had two attempts at converting netfilter rules to bpf. Both ended up
> with user space implementation and short cuts. bpf side didn't have loops and
> couldn't support 10k+ rules. That is what stalled the effort. imo it's a
> pointless corner case, but to be a true replacement people kept bringing it up
> as something valid. Now we have bpf iterator concept and soon bpf will be able
> to handle millions of rules. Also folks are also realizing that this effort has
> to be project managed appropriately. Will it materialize in patches tomorrow?
> Unlikely. Probably another 6 month at least. Also outside of netfilter
> conversion we've started /proc extension effort that will use the same umh
> facility. It won't be ready tomorrow as well, but both need umh.

Given that I am one of the folks who looks after proc I haven't seen
that either.  The direction I have seen in the last 20 years is people
figuring out how to reduce proc not really how to extend it so I can't
imagine what a /proc extension effort is.

> initrd is not
> an option due to operational constraints. We need a way to ship kernel tarball
> where bpf things are ready at boot. I suspect /proc extensions patches will
> land sooner. Couple month ago people used umh to do ovs->xdp translatation. It
> didn't land. People argued that the same thing can be achieved in user space
> and they were correct. So you're right that for most folks user space is the
> answer. But there are cases where kernel has to have these things before
> systemd starts.

You may have a valid case for doing things in the kernel before systemd
starts.  The current mechanism is fundamentally in conflict with the
LSMs which is an unresolved problem.

I don't see why you can't have a userspace process that does:

	pid = fork();
        if (pid == 0) {
        	/* Do bpf stuff */
        }
        else if (pid > 0) {
        	execve("/sbin/init", ...);
        }

You can build an initramfs with that code right into the kernel, so
I can't imagine the existing mechanisms being insufficient.

That said the fork_usermode_blob code needs to be taken out and
rewritten so as not to impose a burden on the rest of the code.  There
is no reason why code that is called only one time can not allocate a
filename and pass it to __do_execve_file.

There is no reason to allow modules access to any of that functionality
if you need something before an initramfs can be processed.

exit_umh() is completely unnecessary; all that is needed is a reference
to a struct pid.

There are all of these layers and abstractions but with only the single
user in net/bpfilter/bpfilter_kern.c they all appear to have been
jumbled together without good layering in between them.

That is just what I see from looking at the code quickly.

All of those problems need to be addressed before fork_usermode_blob
grows any real users.

As for other users that want to use fork_usermode_blob in the future:
currently it is not at all straightforward to tell whether the code in
the kernel is correct or not.  So before it grows any living users the code
needs to be rewritten so that it is easy to tell that it is correct.

I have sympathy with your efforts, but since the code is currently dead
and in need of work, I will write a good version of removing
CONFIG_BPFILTER_UMH on top of -rc1, leaving CONFIG_BPFILTER.

That should give you a solid foundation to build upon, while making the
kernel maintainable.

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  5:58             ` Eric W. Biederman
@ 2020-06-07 11:56               ` Eric W. Biederman
  2020-06-08 16:35                 ` Alexei Starovoitov
  2020-06-08 16:33               ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-07 11:56 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

ebiederm@xmission.com (Eric W. Biederman) writes:

> I have sympathy with your efforts but since the code is currently dead,
> and in need of work.  I will write a good version of removing
> CONFIG_BPFILTER_UMH on top of -rc1, leaving CONFIG_BPFILTER.

Of course when I just limit my code removal to code that depends upon
the user mode helper all that is left is a Kconfig entry and
include/uapi/linux/bpfilter.h.

I don't get it.

I also noticed that the type of do_execve_file is wrong. envp and argv
are not "void *", they should be "const char __user *const __user *__argv".

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter:  Remove this broken and apparently unmantained
  2020-06-06 20:43       ` Matthew Wilcox
@ 2020-06-07 15:51         ` Eric W. Biederman
  0 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-07 15:51 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Kees Cook, Tetsuo Handa, akpm, ast, davem, viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, Linus Torvalds

Matthew Wilcox <willy@infradead.org> writes:

> On Sat, Jun 06, 2020 at 02:20:36PM -0500, Eric W. Biederman wrote:
>> @@ -39,7 +37,6 @@ static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
>>  static DEFINE_SPINLOCK(umh_sysctl_lock);
>>  static DECLARE_RWSEM(umhelper_sem);
>>  static LIST_HEAD(umh_list);
>> -static DEFINE_MUTEX(umh_list_lock);
>
> You can delete the umh_list too; you've deleted all its users.

Good catch, thank you.

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  2:19             ` Linus Torvalds
@ 2020-06-07 16:09               ` Eric W. Biederman
  2020-06-08 16:20               ` Alexei Starovoitov
  1 sibling, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-07 16:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexei Starovoitov, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Sat, Jun 6, 2020 at 6:49 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>>>
>> I'm not aware of execve issues. I don't remember being cc-ed on them.
>> To me this 'lets remove everything' patch comes out of nowhere with
>> a link to three month old patch as a justification.
>
> Well, it's out of nowhere as far as bpf is concerned, but we've had a
> fair amount of discussions about execve cleanups (and a fair amount of
> work too, not just discussion) lately
>
> So it comes out of "execve is rather grotty", and trying to make it
> simpler have fewer special cases.
>
>> So far we had two attempts at converting netfilter rules to bpf. Both ended up
>> with user space implementation and short cuts.
>
> So I have a question: are we convinced that doing this "netfilter
> conversion" in user space is required at all?
>
> I realize that yes, running clang is not something we'd want to do in
> kernel space, that's not what I'm asking.
>
> But how much might be doable at kernel compile time (run clang to
> generate bpf statically when building the kernel) together with some
> simplistic run-time parameterized JITting for the table details that
> the kernel could do on its own without a real compiler?
>
> Because the problem with this code isn't the "use bpf for netfilter
> rules", it's the "run a user mode helper". The execve thing is
> actually only incidental, it also ends up being a somewhat interesting
> issue wrt namespacing and security (and bootstrapping - I'm not
> convinced people want to have a clang bpf compiler in initrd etc).
>
> So particularly if we accept the fact that we won't necessarily need
> all of netfilter converted in general - some will be just translated
> entirely independently in user space and not use netfilter at all
> (just bpf loaded normally)
>
> IOW there would potentially only be a (fairly small?) core set that
> the kernel would need to be able to handle "natively".
>
> Am I just blathering?

I wish I could answer you.

All the code does at this time is connect some ipv4 bpfilter specific
setsockopt commands to a usermode helper with a read pipe and a write
pipe.  The userspace portion does absolutely nothing with those
commands.

I don't have the foggiest idea what that code hopes to be doing when
that code is fully fleshed out.

If the goal is to become a backwards compatible compiler from historic
netfilter commands to bpf it isn't the craziest design in the world.
But that isn't what is implemented today.

With no users it just isn't clear what the code needs to be doing so I
can't tell what needs to be done to bugs in the code.  I can't answer
which behaviors do the users care about.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  2:19             ` Linus Torvalds
  2020-06-07 16:09               ` Eric W. Biederman
@ 2020-06-08 16:20               ` Alexei Starovoitov
  2020-06-08 16:40                 ` Greg KH
                                   ` (2 more replies)
  1 sibling, 3 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-08 16:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric W. Biederman, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sat, Jun 06, 2020 at 07:19:56PM -0700, Linus Torvalds wrote:
> On Sat, Jun 6, 2020 at 6:49 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >>
> > I'm not aware of execve issues. I don't remember being cc-ed on them.
> > To me this 'lets remove everything' patch comes out of nowhere with
> > a link to three month old patch as a justification.
> 
> Well, it's out of nowhere as far as bpf is concerned, but we've had a
> fair amount of discussions about execve cleanups (and a fair amount of
> work too, not just discussion) lately
> 
> So it comes out of "execve is rather grotty", and trying to make it
> simpler have fewer special cases.
> 
> > So far we had two attempts at converting netfilter rules to bpf. Both ended up
> > with user space implementation and short cuts.
> 
> So I have a question: are we convinced that doing this "netfilter
> conversion" in user space is required at all?
> 
> I realize that yes, running clang is not something we'd want to do in
> kernel space, that's not what I'm asking.
> 
> But how much might be doable at kernel compile time (run clang to
> generate bpf statically when building the kernel) together with some
> simplistic run-time parameterized JITting for the table details that
> the kernel could do on its own without a real compiler?

Right. There is no room for large user space application like clang
in vmlinux. The idea for umh was to stay small and self contained.
Its advantage vs kernel module is to execute with user privs
and use normal syscalls to drive kernel instead of export_symbol apis.

There are two things in this discussion: bpfilter, which intercepts
netfilter sockopt, and the elf file embedded into vmlinux that executes
as a user process.
The pro/con of bpfilter approach is hard to argue now because
bpfilter itself didn't materialize yet. I'm fine with removal of that part
from the kernel, but I'm still arguing that 'embed elf into vmlinux'
is necessary, useful and there is no alternative.
There are builtin kernel modules. 'elf in vmlinux' is similar to that.
The primary use case is bpf driven features like bpf_lsm.
bpf_lsm needs to load many different bpf programs, create bpf maps, populate
them, attach to lsm hooks to make the whole thing ready. That initialization of
bpf_lsm is currently done after everything booted, but folks want it to be
active much earlier, like other LSMs.
Take android for example. It can certify vmlinux, but not boot fs image.
vmlinux needs to apply security policy via bpf_lsm during the boot.
In such case 'embedded elf in vmlinux' would start early, do its thing
via bpf syscall and exit. Unlike bpfilter approach it won't stay running.
Its job is to setup all bpf things and quit.
Theoretically we can do it as proper kernel module, but then it would mean huge
refactoring of all bpf syscall commands to be accessible from the kernel module.
It's simpler to embed elf into vmlinux and run it as user process doing normal
syscalls. I can imagine that in other cases this elf executable would keep
running after setup.
It doesn't have to be bpf related. Folks thought they can do usb drivers
running in user space and ready at boot. 'elf in vmlinux' would work as well.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  2:31             ` Tetsuo Handa
@ 2020-06-08 16:23               ` Alexei Starovoitov
  2020-06-08 23:22                 ` Tetsuo Handa
  2020-06-09 20:02                 ` Eric W. Biederman
  0 siblings, 2 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-08 16:23 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sun, Jun 07, 2020 at 11:31:05AM +0900, Tetsuo Handa wrote:
> On 2020/06/07 10:49, Alexei Starovoitov wrote:
> > So you're right that for most folks user space is the
> > answer. But there are cases where kernel has to have these things before
> > systemd starts.
> 
> Why such cases can't use init= kernel command line argument?
> The program specified via init= kernel command line argument can do anything
> before systemd (a.k.a. global init process) starts.
> 
> By the way, from the LSM perspective, doing a lot of things before global init
> process starts is not desirable, for access decision can be made only after policy
> is loaded (which is generally when /sbin/init on a device specified via root=
> kernel command line argument becomes ready). Since
> fork_usermode_blob((void *) "#!/bin/true\n", 12, info) is possible, I worry that
> the ability to start userspace code is abused for bypassing dentry/inode-based
> permission checks.

bpf_lsm is that thing that needs to load and start acting early.
It's somewhat chicken and egg. fork_usermode_blob() will start a process
that will load and apply security policy to all further forks and execs.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07  5:58             ` Eric W. Biederman
  2020-06-07 11:56               ` Eric W. Biederman
@ 2020-06-08 16:33               ` Alexei Starovoitov
  1 sibling, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-08 16:33 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sun, Jun 07, 2020 at 12:58:12AM -0500, Eric W. Biederman wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Sat, Jun 06, 2020 at 03:33:14PM -0700, Linus Torvalds wrote:
> >> On Sat, Jun 6, 2020 at 1:20 PM Alexei Starovoitov
> >> <alexei.starovoitov@gmail.com> wrote:
> >> >
> >> > Please mention specific bugs and let's fix them.
> >> 
> >> Well, Eric did mention one explicit bug, and several "looks dodgy" bugs.
> >> 
> >> And the fact is, this isn't used.
> >> 
> >> It's clever, and I like the concept, but it was probably a mistake to
> >> do this as a user-mode-helper thing.
> >> 
> >> If people really convert netfilter rules to bpf, they'll likely do so
> >> in user space. This bpfilter thing hasn't gone anywhere, and it _has_
> >> caused problems.
> >> 
> >> So Alexei, I think the burden of proof is not on Eric, but on you.
> >> 
> >> Eric's claim is that
> >> 
> >>  (a) it has bugs (and yes, he pointed to at least one)
> >
> > the patch from March 12 ?
> > I thought it landed long ago. Is there an issue with it?
> > 'handling is questionable' is not very constructive.
> 
> It was half a fix.  Tetsuo still doesn't know how to fix tomoyo to work
> with fork_usermode_blob.
> 
> He was asking for your feedback and you did not give it.
> 
> The truth is Tetsuo's fix was only a fix for the symptoms.  It was not a
> good fix to the code.
> 
> >>  (b) it's not doing anything useful
> >
> > true.
> >
> >>  (c) it's a maintenance issue for execve, which is what Eric maintains.
> >
> > I'm not aware of execve issues. I don't remember being cc-ed on them.
> > To me this 'lets remove everything' patch comes out of nowhere with
> > a link to three month old patch as a justification.
> 
> I needed to know how dead the code is and your reply has confirmed
> that the code is dead.
> 
> Deleting the code is much easier than the detailed careful work it would
> take to make code that is in use work correctly.
> 
> >> So you can't just dismiss this, ignore the reported bug, and say
> >> "we'll fix them".
> >> 
> >> That only answers (a) (well, it _would_ have answered (a)., except you
> >> actually didn't even read Eric's report of existing bugs).
> >> 
> >> What is your answer to (b)-(c)?
> >
> > So far we had two attempts at converting netfilter rules to bpf. Both ended up
> > with user space implementation and short cuts. bpf side didn't have loops and
> > couldn't support 10k+ rules. That is what stalled the effort. imo it's a
> > pointless corner case, but to be a true replacement people kept bringing it up
> > as something valid. Now we have bpf iterator concept and soon bpf will be able
> > to handle millions of rules. Also folks are also realizing that this effort has
> > to be project managed appropriately. Will it materialize in patches tomorrow?
> > Unlikely. Probably another 6 month at least. Also outside of netfilter
> > conversion we've started /proc extension effort that will use the same umh
> > facility. It won't be ready tomorrow as well, but both need umh.
> 
> Given that I am one of the folks who looks after proc I haven't seen
> that either.  The direction I have seen in the last 20 years is people
> figuring out how to reduce proc not really how to extend it so I can't
> imagine what a /proc extension effort is.

We already made it extensible without changing /proc.
Folks can mount bpffs into /newproc, pin bpf prog in there and it
will be cat-able.
It's not quite /proc, of course. It's a flexible alternative
with unstable cat-able files that are kernel specific.

> 
> > initrd is not
> > an option due to operational constraints. We need a way to ship kernel tarball
> > where bpf things are ready at boot. I suspect /proc extensions patches will
> > land sooner. Couple month ago people used umh to do ovs->xdp translatation. It
> > didn't land. People argued that the same thing can be achieved in user space
> > and they were correct. So you're right that for most folks user space is the
> > answer. But there are cases where kernel has to have these things before
> > systemd starts.
> 
> You may have a valid case for doing things in the kernel before systemd
> starts.  The current mechanism is fundamentally in conflict with the
> LSMs which is an unresolved problem.

It's the other way around. fork_usermode_blob is a mechanism to launch bpf_lsm.

> I don't see why you can't have a userspace process that does:
> 
> 	pid = fork();
>         if (pid == 0) {
>         	/* Do bpf stuff */
>         }
>         else if (pid > 0) {
>         	execve("/sbin/init", ...);
>         }
> 
> You can build an initramfs with that code right into the kernel, so
> I can't imagine the existing mechanisms being insufficient.

that doesn't work for android.
It also doesn't work for us. We ship the kernel package.
It has vmlinux and kernel modules. That's it.

> That said the fork_usermode_blob code needs to be taken out and
> rewritten so as not to impose a burden on the rest of the code.  There
> is no reason why code that is called only one time can not allocate a
> filename and pass it to __do_execve_file.

Sure. Let's alloc filename.

> There is no reason to allow modules access to any of that functionality
> if you need something before an initramfs can be processed.
> 
> exit_umh() is completely unnecessary all that is needed is a reference
> to a struct pid.

So there are no bugs, but there are a few layering concerns, right?
Let's switch to pid from task_struct.

> There are all of these layers and abstractions but with only the single
> user in net/bpfilter/bpfilter_kern.c they all appear to have been
> jumbled together without good layering inbetween then.

I'm totally fine tweaking the layering if it makes exec code easier
to maintain.
Sounds like alloc filename and pid vs task_struct are the only things
that need to be tweaked.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-07 11:56               ` Eric W. Biederman
@ 2020-06-08 16:35                 ` Alexei Starovoitov
  0 siblings, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-08 16:35 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sun, Jun 07, 2020 at 06:56:38AM -0500, Eric W. Biederman wrote:
> ebiederm@xmission.com (Eric W. Biederman) writes:
> 
> > I have sympathy with your efforts but since the code is currently dead,
> > and in need of work.  I will write a good version of removing
> > CONFIG_BPFILTER_UMH on top of -rc1, leaving CONFIG_BPFILTER.
> 
> Of course when I just limit my code removal to code that depends upon
> the user mode helper all that is left is a Kconfig entry and
> include/uapi/linux/bpfilter.h.

This bit you can actually remove.
bpfilter didn't materialize.
But 'elf in vmlinux' is useful. Please keep it.

> I also noticed that the type of do_exeve_file is wrong. envp and argv
> are not "void *", they should be "const char __user *const __user *__argv".

Sounds like a trivial fix.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-08 16:20               ` Alexei Starovoitov
@ 2020-06-08 16:40                 ` Greg KH
  2020-06-08 18:35                 ` Kees Cook
  2020-06-09 19:51                 ` Eric W. Biederman
  2 siblings, 0 replies; 194+ messages in thread
From: Greg KH @ 2020-06-08 16:40 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Tetsuo Handa,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele

On Mon, Jun 08, 2020 at 09:20:27AM -0700, Alexei Starovoitov wrote:
> On Sat, Jun 06, 2020 at 07:19:56PM -0700, Linus Torvalds wrote:
> > On Sat, Jun 6, 2020 at 6:49 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >>
> > > I'm not aware of execve issues. I don't remember being cc-ed on them.
> > > To me this 'lets remove everything' patch comes out of nowhere with
> > > a link to three month old patch as a justification.
> > 
> > Well, it's out of nowhere as far as bpf is concerned, but we've had a
> > fair amount of discussions about execve cleanups (and a fair amount of
> > work too, not just discussion) lately
> > 
> > So it comes out of "execve is rather grotty", and trying to make it
> > simpler have fewer special cases.
> > 
> > > So far we had two attempts at converting netfilter rules to bpf. Both ended up
> > > with user space implementation and short cuts.
> > 
> > So I have a question: are we convinced that doing this "netfilter
> > conversion" in user space is required at all?
> > 
> > I realize that yes, running clang is not something we'd want to do in
> > kernel space, that's not what I'm asking.
> > 
> > But how much might be doable at kernel compile time (run clang to
> > generate bpf statically when building the kernel) together with some
> > simplistic run-time parameterized JITting for the table details that
> > the kernel could do on its own without a real compiler?
> 
> Right. There is no room for large user space application like clang
> in vmlinux. The idea for umh was to stay small and self contained.
> Its advantage vs kernel module is to execute with user privs
> and use normal syscalls to drive kernel instead of export_symbol apis.
> 
> There are two things in this discussion. bpfilter that intercepting
> netfilter sockopt and elf file embedded into vmlinux that executes
> as user process.
> The pro/con of bpfilter approach is hard to argue now because
> bpfilter itself didn't materialize yet. I'm fine with removal of that part
> from the kernel, but I'm still arguing that 'embed elf into vmlinux'
> is necessary, useful and there is no alternative.
> There are builtin kernel modules. 'elf in vmlinux' is similar to that.
> The primary use case is bpf driven features like bpf_lsm.
> bpf_lsm needs to load many different bpf programs, create bpf maps, populate
> them, attach to lsm hooks to make the whole thing ready. That initialization of
> bpf_lsm is currently done after everything booted, but folks want it to be
> active much early. Like other LSMs.
> Take android for example. It can certify vmlinux, but not boot fs image.

Huh?  dm-verity does this for the whole "boot fs partition" already,
right?  Or one of the "dm-" modules...

> vmlinux needs to apply security policy via bpf_lsm during the boot.
> In such case 'embedded elf in vmlinux' would start early, do its thing
> via bpf syscall and exit. Unlike bpfilter approach it won't stay running.
> Its job is to setup all bpf things and quit.
> Theoretically we can do it as proper kernel module, but then it would mean huge
> refactoring of all bpf syscall commands to be accessible from the kernel module.
> It's simpler to embed elf into vmlinux and run it as user process doing normal
> syscalls. I can imagine that in other cases this elf executable would keep
> running after setup.
> It doesn't have to be bpf related. Folks thought they can do usb drivers
> running in user space and ready at boot. 'elf in vmlinux' would work as well.

I still want to work on the "usb drivers in userspace" like this, it's
on my TODO for this year.  But don't let that "sometime in the future
wish" keep this code around if no one is currently using it.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-08 16:20               ` Alexei Starovoitov
  2020-06-08 16:40                 ` Greg KH
@ 2020-06-08 18:35                 ` Kees Cook
  2020-06-09  1:26                   ` Alexei Starovoitov
  2020-06-09 19:51                 ` Eric W. Biederman
  2 siblings, 1 reply; 194+ messages in thread
From: Kees Cook @ 2020-06-08 18:35 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Mon, Jun 08, 2020 at 09:20:27AM -0700, Alexei Starovoitov wrote:
> Take android for example. It can certify vmlinux, but not boot fs image.

Huh? Yes it does, and for a while now. Android uses dm-verity[1]
and fs-verity[2].

[1] https://source.android.com/security/verifiedboot/dm-verity
    https://www.kernel.org/doc/html/latest/admin-guide/device-mapper/verity.html
[2] https://source.android.com/security/apksigning/v3
    https://www.kernel.org/doc/html/latest/filesystems/fsverity.html

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-08 16:23               ` Alexei Starovoitov
@ 2020-06-08 23:22                 ` Tetsuo Handa
  2020-06-09  1:28                   ` Alexei Starovoitov
  2020-06-09 20:02                 ` Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-08 23:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On 2020/06/09 1:23, Alexei Starovoitov wrote:
> On Sun, Jun 07, 2020 at 11:31:05AM +0900, Tetsuo Handa wrote:
>> On 2020/06/07 10:49, Alexei Starovoitov wrote:
>>> So you're right that for most folks user space is the
>>> answer. But there are cases where kernel has to have these things before
>>> systemd starts.
>>
>> Why such cases can't use init= kernel command line argument?
>> The program specified via init= kernel command line argument can do anything
>> before systemd (a.k.a. global init process) starts.
>>
>> By the way, from the LSM perspective, doing a lot of things before global init
>> process starts is not desirable, for access decision can be made only after policy
>> is loaded (which is generally when /sbin/init on a device specified via root=
>> kernel command line argument becomes ready). Since
>> fork_usermode_blob((void *) "#!/bin/true\n", 12, info) is possible, I worry that
>> the ability to start userspace code is abused for bypassing dentry/inode-based
>> permission checks.
> 
> bpf_lsm is that thing that needs to load and start acting early.
> It's somewhat chicken and egg. fork_usermode_blob() will start a process
> that will load and apply security policy to all further forks and execs.

fork_usermode_blob() would start a process in userspace, but early in the boot
stage means that things in the kernel might not be ready to serve for userspace
processes (e.g. we can't open a shared library before a filesystem containing
that file becomes ready, we can't mount a filesystem before mount point becomes
ready, we can't access mount point before a device that contains that directory
becomes ready).

TOMOYO LSM module uses call_usermodehelper() from tomoyo_load_policy() in order to
load and apply security policy. What is so nice with fork_usermode_blob() compared
to existing call_usermodehelper(), at the cost of confusing LSM modules by allowing
file-less execve() request from fork_usermode_blob() ?

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-08 18:35                 ` Kees Cook
@ 2020-06-09  1:26                   ` Alexei Starovoitov
  2020-06-09 15:37                     ` Kees Cook
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-09  1:26 UTC (permalink / raw)
  To: Kees Cook
  Cc: Linus Torvalds, Eric W. Biederman, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Mon, Jun 08, 2020 at 11:35:12AM -0700, Kees Cook wrote:
> On Mon, Jun 08, 2020 at 09:20:27AM -0700, Alexei Starovoitov wrote:
> > Take android for example. It can certify vmlinux, but not boot fs image.
> 
> Huh? Yes it does, and for a while now. It uses Android uses dm-verity[1]
> and fs-verity[2].

I didn't mean 'certified' like untrusted or insecure.
I meant the vendor kernel has to satisfy and pass SDK checks to be
certified as an android phone whereas vendor can put more or less whatever
they like on the fs. Their own bloatware, etc.
So for android to make sure something is part of the whole sw package
it has to come from the kernel and its modules.
Well, at least that's what I've been told.
Similarly kernel upgrade doesn't necessarily include boot image upgrade.
In that sense 'elf in vmlinux' addresses packaging issue.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-08 23:22                 ` Tetsuo Handa
@ 2020-06-09  1:28                   ` Alexei Starovoitov
  2020-06-09  5:29                     ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-09  1:28 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Tue, Jun 09, 2020 at 08:22:13AM +0900, Tetsuo Handa wrote:
> On 2020/06/09 1:23, Alexei Starovoitov wrote:
> > On Sun, Jun 07, 2020 at 11:31:05AM +0900, Tetsuo Handa wrote:
> >> On 2020/06/07 10:49, Alexei Starovoitov wrote:
> >>> So you're right that for most folks user space is the
> >>> answer. But there are cases where kernel has to have these things before
> >>> systemd starts.
> >>
> >> Why such cases can't use init= kernel command line argument?
> >> The program specified via init= kernel command line argument can do anything
> >> before systemd (a.k.a. global init process) starts.
> >>
> >> By the way, from the LSM perspective, doing a lot of things before global init
> >> process starts is not desirable, for access decision can be made only after policy
> >> is loaded (which is generally when /sbin/init on a device specified via root=
> >> kernel command line argument becomes ready). Since
> >> fork_usermode_blob((void *) "#!/bin/true\n", 12, info) is possible, I worry that
> >> the ability to start userspace code is abused for bypassing dentry/inode-based
> >> permission checks.
> > 
> > bpf_lsm is that thing that needs to load and start acting early.
> > It's somewhat chicken and egg. fork_usermode_blob() will start a process
> > that will load and apply security policy to all further forks and execs.
> 
> fork_usermode_blob() would start a process in userspace, but early in the boot
> stage means that things in the kernel might not be ready to serve for userspace
> processes (e.g. we can't open a shared library before a filesystem containing
> that file becomes ready, we can't mount a filesystem before mount point becomes
> ready, we can't access mount point before a device that contains that directory
> becomes ready).
> 
> TOMOYO LSM module uses call_usermodehelper() from tomoyo_load_policy() in order to
> load and apply security policy. What is so nice with fork_usermode_blob() compared
> to existing call_usermodehelper(), at the cost of confusing LSM modules by allowing
> file-less execve() request from fork_usermode_blob() ?

For the same reason you did commit 0e4ae0e0dec6 ("TOMOYO: Make several options configurable.")
Quoting your words from that commit:
"To be able to start using enforcing mode from the early stage of boot sequence,
 this patch adds support for activating access control without calling external
 policy loader program."

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-09  1:28                   ` Alexei Starovoitov
@ 2020-06-09  5:29                     ` Tetsuo Handa
  2020-06-09 22:32                       ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-09  5:29 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On 2020/06/09 10:28, Alexei Starovoitov wrote:
>> TOMOYO LSM module uses call_usermodehelper() from tomoyo_load_policy() in order to
>> load and apply security policy. What is so nice with fork_usermode_blob() compared
>> to existing call_usermodehelper(), at the cost of confusing LSM modules by allowing
>> file-less execve() request from fork_usermode_blob() ?
> 
> For the same reason you did commit 0e4ae0e0dec6 ("TOMOYO: Make several options configurable.")
> Quoting your words from that commit:
> "To be able to start using enforcing mode from the early stage of boot sequence,
>  this patch adds support for activating access control without calling external
>  policy loader program."
> 

I can't catch what you mean. That commit is to allow not to call usermode helper.

You can't start a usermode helper which requires access to filesystems (e.g. ELF loaders,
shared libraries) before call_usermodehelper() can start a usermode helper which requires
access to filesystems. Under such a restricted condition, what is nice with starting a
usermode helper? Programs which can be started under such condition will be quite limited.
My question is: why you can't use existing call_usermodehelper() (if you need to call
a usermode helper) ?


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-09  1:26                   ` Alexei Starovoitov
@ 2020-06-09 15:37                     ` Kees Cook
  0 siblings, 0 replies; 194+ messages in thread
From: Kees Cook @ 2020-06-09 15:37 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Mon, Jun 08, 2020 at 06:26:36PM -0700, Alexei Starovoitov wrote:
> On Mon, Jun 08, 2020 at 11:35:12AM -0700, Kees Cook wrote:
> > On Mon, Jun 08, 2020 at 09:20:27AM -0700, Alexei Starovoitov wrote:
> > > Take android for example. It can certify vmlinux, but not boot fs image.
> > 
> > Huh? Yes it does, and for a while now. It uses Android uses dm-verity[1]
> > and fs-verity[2].
> 
> I didn't mean 'certified' like untrusted or insecure.
> I meant the vendor kernel has to satisfy and pass SDK checks to be
> certified as an android phone whereas vendor can put more or less whatever
> they like on the fs. Their own bloatware, etc.
> So for android to make sure something is part of the whole sw package
> it has to come from the kernel and its modules.
> Well, at least that's what I've been told.
> Similarly kernel upgrade doesn't necessary include boot image upgrade.
> In that sense 'elf in vmlinux' addresses packaging issue.

Well, it's much more complex than that, but I see what you mean: it's an
ELF tied to a specific kernel, like modules are. But Android's control
over modules covers such an ELF as well, if it were separate from the
kernel. Regardless, we're off in the weeds. :)

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-08 16:20               ` Alexei Starovoitov
  2020-06-08 16:40                 ` Greg KH
  2020-06-08 18:35                 ` Kees Cook
@ 2020-06-09 19:51                 ` Eric W. Biederman
  2 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-09 19:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Kees Cook, Tetsuo Handa, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Sat, Jun 06, 2020 at 07:19:56PM -0700, Linus Torvalds wrote:
>> On Sat, Jun 6, 2020 at 6:49 PM Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> >>
>> > I'm not aware of execve issues. I don't remember being cc-ed on them.
>> > To me this 'lets remove everything' patch comes out of nowhere with
>> > a link to three month old patch as a justification.
>> 
>> Well, it's out of nowhere as far as bpf is concerned, but we've had a
>> fair amount of discussions about execve cleanups (and a fair amount of
>> work too, not just discussion) lately
>> 
>> So it comes out of "execve is rather grotty", and trying to make it
>> simpler have fewer special cases.
>> 
>> > So far we had two attempts at converting netfilter rules to bpf. Both ended up
>> > with user space implementation and short cuts.
>> 
>> So I have a question: are we convinced that doing this "netfilter
>> conversion" in user space is required at all?
>> 
>> I realize that yes, running clang is not something we'd want to do in
>> kernel space, that's not what I'm asking.
>> 
>> But how much might be doable at kernel compile time (run clang to
>> generate bpf statically when building the kernel) together with some
>> simplistic run-time parameterized JITting for the table details that
>> the kernel could do on its own without a real compiler?
>
> Right. There is no room for large user space application like clang
> in vmlinux. The idea for umh was to stay small and self contained.
> Its advantage vs kernel module is to execute with user privs
> and use normal syscalls to drive kernel instead of export_symbol apis.
>
> There are two things in this discussion. bpfilter that intercepting
> netfilter sockopt and elf file embedded into vmlinux that executes
> as user process.
> The pro/con of bpfilter approach is hard to argue now because
> bpfilter itself didn't materialize yet. I'm fine with removal of that part
> from the kernel, but I'm still arguing that 'embed elf into vmlinux'
> is necessary, useful and there is no alternative.
> There are builtin kernel modules. 'elf in vmlinux' is similar to that.
> The primary use case is bpf driven features like bpf_lsm.
> bpf_lsm needs to load many different bpf programs, create bpf maps, populate
> them, attach to lsm hooks to make the whole thing ready. That initialization of
> bpf_lsm is currently done after everything booted, but folks want it to be
> active much early. Like other LSMs.
> Take android for example. It can certify vmlinux, but not boot fs image.
> vmlinux needs to apply security policy via bpf_lsm during the boot.
> In such case 'embedded elf in vmlinux' would start early, do its thing
> via bpf syscall and exit. Unlike bpfilter approach it won't stay running.
> Its job is to setup all bpf things and quit.
> Theoretically we can do it as proper kernel module, but then it would mean huge
> refactoring of all bpf syscall commands to be accessible from the kernel module.
> It's simpler to embed elf into vmlinux and run it as user process doing normal
> syscalls. I can imagine that in other cases this elf executable would keep
> running after setup.
> It doesn't have to be bpf related. Folks thought they can do usb drivers
> running in user space and ready at boot. 'elf in vmlinux' would work as well.

To be 100% clear.  This is not a rejection of the concept behind
fork_usermode_blob.

I see nothing fundamentally wrong with the concept and I have no problem
sorting out the details and remerging that code when it is ready.

If there is a user of fork_usermode_blob that should be ready for the
next merge window let's keep the code, and let's come up with some clean
fixes to waiting for a process and for passing a struct file to exec.

If it is simply coming one of these days like moving usb drivers into
userspace let's come back to the concept when we have a user ready to
use it.  What exists today will still be in the git history for people
to find.

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-08 16:23               ` Alexei Starovoitov
  2020-06-08 23:22                 ` Tetsuo Handa
@ 2020-06-09 20:02                 ` Eric W. Biederman
  2020-06-09 23:56                   ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-09 20:02 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> bpf_lsm is that thing that needs to load and start acting early.
> It's somewhat chicken and egg. fork_usermode_blob() will start a process
> that will load and apply security policy to all further forks and
> execs.

What is the timeframe for bpf_lsm patches wanting to use
fork_usermode_blob()?

Are we possibly looking at something that will be ready for the next
merge window?

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-09  5:29                     ` Tetsuo Handa
@ 2020-06-09 22:32                       ` Alexei Starovoitov
  2020-06-09 23:30                         ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-09 22:32 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Tue, Jun 09, 2020 at 02:29:09PM +0900, Tetsuo Handa wrote:
> On 2020/06/09 10:28, Alexei Starovoitov wrote:
> >> TOMOYO LSM module uses call_usermodehelper() from tomoyo_load_policy() in order to
> >> load and apply security policy. What is so nice with fork_usermode_blob() compared
> >> to existing call_usermodehelper(), at the cost of confusing LSM modules by allowing
> >> file-less execve() request from fork_usermode_blob() ?
> > 
> > For the same reason you did commit 0e4ae0e0dec6 ("TOMOYO: Make several options configurable.")
> > Quoting your words from that commit:
> > "To be able to start using enforcing mode from the early stage of boot sequence,
> >  this patch adds support for activating access control without calling external
> >  policy loader program."
> > 
> 
> I can't catch what you mean. That commit is to allow not to call usermode helper.
> 
> You can't start a usermode helper which requires access to filesystems (e.g. ELF loaders,
> shared libraries) before call_usermodehelper() can start a usermode helper which requires
> access to filesystems. Under such a restricted condition, what is nice with starting a
> usermode helper? Programs which can be started under such condition will be quite limited.
> My question is: why you can't use existing call_usermodehelper() (if you need to call
> a usermode helper) ?

I think the confusion comes from the assumption that usermode blob is a dynamic file that
needs ld.so, shared libs and rootfs. This mode is supported by the blob loading
logic, but it's not a primary use case. It's nice to be able to compile
that blob with -g and be able to 'gdb -p' into it. That works and is very
convenient when it comes to debugging. Compare that to debugging a kernel module!
It's pretty cool to have vmlinux with kernel module-like feature
that folks can breakpoint and single step while the kernel is running.
That's how we've been developing bpfilter. Sadly the other two patches
(by Davem and Daniel) didn't land:
https://lore.kernel.org/patchwork/patch/902785/
https://lore.kernel.org/patchwork/patch/902783/
and without them bpfilter looks completely useless.

The main mode of bpfilter operation was envisioned as rootfs-less.
It must work with any init= including busybox. For production the bpfilter
user mode blob was compiled as static binary with no dependencies.
So there is no path to point to. It should be ready before pid 1
will do its first iptables sys_setsockopt. If user reboots the kernel
with different init= cmdline the bpfilter should still be doing its job.
Like builtin kernel module.
Anyway bpfilter is only one of the use cases for 'elf in vmlinux'.
I think better name would have been 'user space kernel modules'.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-09 22:32                       ` Alexei Starovoitov
@ 2020-06-09 23:30                         ` Tetsuo Handa
  2020-06-10  0:05                           ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-09 23:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On 2020/06/10 7:32, Alexei Starovoitov wrote:
>> You can't start a usermode helper which requires access to filesystems (e.g. ELF loaders,
>> shared libraries) before call_usermodehelper() can start a usermode helper which requires
>> access to filesystems. Under such a restricted condition, what is nice with starting a
>> usermode helper? Programs which can be started under such condition will be quite limited.
>> My question is: why you can't use existing call_usermodehelper() (if you need to call
>> a usermode helper) ?
> 
> I think the confusion comes from assumption that usermode blob is a dynamic file that
> needs ld.so, shared libs and rootfs.

Yes, I assume that usermode blob needs to be able to access the rootfs.

>                                      This mode is supported by the blob loading
> logic, but it's not a primary use case. It's nice to be able to compile
> that blob with -g and be able to 'gdb -p' into it.

Where can the gdb come from when the rootfs is not accessible?

>                                                    That works and very
> convenient when it comes to debugging. Compare that to debugging a kernel module!

Userspace is convenient for debugging, at the cost of robustness (e.g. being killed
by SIGKILL).

> 
> The main mode of bpfilter operation was envisioned as rootfs-less.
> It must work with any init= including busybox. For production the bpfilter
> user mode blob was compiled as static binary with no dependencies.

I still can't imagine. Compiling a user mode blob as a static binary is possible.
But what does 'It must work with any init=' mean? The use of init= depends on
the rootfs being ready.

> So there is no path to point to. It should be ready before pid 1
> will do its first iptables sys_setsockopt.

There has to be at least the root directory in order to use init= parameter.

What does the "pid 1" mean? Why you can't specify your "user mode blob" using init=
parameter and then transfer the control of "pid 1" from your "user mode blob" to
"some program which will do its first iptables sys_setsockopt()" ?

>                                            If user reboots the kernel
> with different init= cmdline the bpfilter should still be doing its job.
> Like builtin kernel module.

Even when rebooting the kernel with different init= cmdline, you have a space for
running your "user mode blob" (e.g.

  init=/path/to/your/user/mode/blob init_after_blob=/path/to/some/program/which/will/do/something/else

), don't you?

There is no need to use call_usermodehelper(), let alone fork_usermode_blob()...

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-09 20:02                 ` Eric W. Biederman
@ 2020-06-09 23:56                   ` Alexei Starovoitov
  2020-06-10 21:12                     ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-09 23:56 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Tue, Jun 09, 2020 at 03:02:30PM -0500, Eric W. Biederman wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > bpf_lsm is that thing that needs to load and start acting early.
> > It's somewhat chicken and egg. fork_usermode_blob() will start a process
> > that will load and apply security policy to all further forks and
> > execs.
> 
> What is the timeframe for bpf_lsm patches wanting to use
> fork_usermode_blob()?
> 
> Are we possibly looking at something that will be ready for the next
> merge window?

In bpf space there are these that want to use usermode_blobs:
1. bpfilter itself.
First of all I think we made a mistake delaying landing the main patches:
https://lore.kernel.org/patchwork/patch/902785/
https://lore.kernel.org/patchwork/patch/902783/
without them bpfilter is indeed dead. That probably was the reason
no one was brave enough to continue working on it.
So I think the landed skeleton of bpfilter can be removed.
I think no user space code will notice that include/uapi/linux/bpfilter.h
is gone. So it won't be considered as user space breakage.
Similarly CONFIG_BPFILTER can be nuked too.
bpftool is checking for it (see tools/bpf/bpftool/feature.c)
but it's fine to remove it.
I still think that the approach taken was a correct one, but
lifting that project off the ground was too much for three of us.
So when it's staffed appropriately we can re-add that code.

2. bpf_lsm.
It's very active at the moment. I'm working on it as well
(sleepable progs is targeting that), but I'm not sure when folks
would have to have it during the boot. So far it sounds that
they're addressing more critical needs first. "bpf_lsm ready at boot"
came up several times during "bpf office hours" conference calls,
so it's certainly on the radar. If I were to guess I don't think
bpf_lsm will use usermode_blobs in the next 6 weeks.
More likely 2-4 months.

3. bpf iterator.
It's already a capable extension of several things in /proc.
See https://lore.kernel.org/bpf/20200509175921.2477493-1-yhs@fb.com/
Cat-ing bpf program as "cat /sys/fs/bpf/my_ipv6_route"
will produce the same human output as "cat /proc/net/ipv6_route".
The key difference is that bpf is all tracing based and it's unstable.
struct fib6_info can change and prog will stop loading.
There are a few FIXMEs in there. Those are being addressed right now.
After that the next step is to make cat-able progs available
right after boot via usermode_blobs.
Unlike cases 1 and 2 here we don't care that they appear before pid 1.
They can certainly be chef installed and started as services.
But they are kernel dependent, so deploying them to production
is much more complicated when they're done as separate rpm.
Testing is harder and so on. Operational issues pile up when something
that is almost like a kernel module is done as a separate package.
Hence usermode_blob fits the best.
Of course we were not planning to add a bunch of them to kernel tree.
The idea was to add only _one_ such cat-able bpf prog and have it as
a selftest for usermode_blob + bpf_iter. What we want our users to
see in 'cat my_ipv6_route' is probably different from other companies.
These patches will likely be using usermode_blob() in the next month.

But we don't need to wait. We can make the progress right now.
How about we remove bpfilter uapi and rename net/bpfilter/bpfilter_kern.c
into net/umb/umb_test.c only to exercise Makefile to build elf file
from simple main.c including .S with incbin trick
and kernel side that does fork_usermode_blob().
And that's it.
net/ipv4/bpfilter/sockopt.c and kconfig can be removed.
That would be enough base to do use cases 2 and 3 above.
Having such selftest will be enough to adjust the layering
for fork_usermode_blob(), right?
If I understood you correctly you want to replace pid_t
in 'struct umh_info' with proper 'struct pid' pointer that
is refcounted, so user process's exit is clean? What else?

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-09 23:30                         ` Tetsuo Handa
@ 2020-06-10  0:05                           ` Alexei Starovoitov
  2020-06-10  3:08                             ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-10  0:05 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Wed, Jun 10, 2020 at 08:30:31AM +0900, Tetsuo Handa wrote:
> On 2020/06/10 7:32, Alexei Starovoitov wrote:
> >> You can't start a usermode helper which requires access to filesystems (e.g. ELF loaders,
> >> shared libraries) before call_usermodehelper() can start a usermode helper which requires
> >> access to filesystems. Under such a restricted condition, what is nice with starting a
> >> usermode helper? Programs which can be started under such condition will be quite limited.
> >> My question is: why you can't use existing call_usermodehelper() (if you need to call
> >> a usermode helper) ?
> > 
> > I think the confusion comes from assumption that usermode blob is a dynamic file that
> > needs ld.so, shared libs and rootfs.
> 
> Yes, I assume that usermode blob needs to be able to access the rootfs.
> 
> >                                      This mode is supported by the blob loading
> > logic, but it's not a primary use case. It's nice to be able to compile
> > that blob with -g and be able to 'gdb -p' into it.
> 
> Where can the gdb come from when the rootfs is not accessible?
> 
> >                                                    That works and very
> > convenient when it comes to debugging. Compare that to debugging a kernel module!
> 
> Userspace is convenient for debugging, at the cost of robustness (e.g. being killed
> by SIGKILL).
> 
> > 
> > The main mode of bpfilter operation was envisioned as rootfs-less.
> > It must work with any init= including busybox. For production the bpfilter
> > user mode blob was compiled as static binary with no dependencies.
> 
> I still can't imagine. Compiling a user mode blob as a static binary is possible.
> But what does 'It must work with any init=' mean? The use of init= depends on
> the rootfs being ready.
> 
> > So there is no path to point to. It should be ready before pid 1
> > will do its first iptables sys_setsockopt.
> 
> There has to be at least the root directory in order to use init= parameter.

I think you're still missing that usermode_blob is completely fs-less.
It doesn't need any fs to work.

> 
> What does the "pid 1" mean? Why you can't specify your "user mode blob" using init=
> parameter and then transfer the control of "pid 1" from your "user mode blob" to
> "some program which will do its first iptables sys_setsockopt()" ?

because init= is user cmdline and usermode_blob() is used by the kernel feature.
they are independent.

> >                                            If user reboots the kernel
> > with different init= cmdline the bpfilter should still be doing its job.
> > Like builtin kernel module.
> 
> Even when rebooting the kernel with different init= cmdline, you have a space for
> running your "user mode blob" (e.g.
> 
>   init=/path/to/your/user/mode/blob init_after_blob=/path/to/some/program/which/will/do/something/else
> 
> ), don't you?
> 
> There is no need to use call_usermodehelper(), let alone fork_usermode_blob()...

Using the same argument there is no need for kernel modules and certainly
no need for builtin kernel modules. That back and forth is not going anywhere.
Let's table it.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-10  0:05                           ` Alexei Starovoitov
@ 2020-06-10  3:08                             ` Tetsuo Handa
  2020-06-10  3:32                               ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-10  3:08 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On 2020/06/10 9:05, Alexei Starovoitov wrote:
> I think you're still missing that usermode_blob is completely fs-less.
> It doesn't need any fs to work.

fork_usermode_blob() allows usage like fork_usermode_blob("#!/bin/sh").
A problem for LSMs is not "It doesn't need any fs to work." but "It can access any fs and
it can issue arbitrary syscalls.".

LSM modules switch their security context upon execve(), based on available information like
"What is the !AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
"What is the AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
"What are argv[]/envp[] for the requested program passed to execve()?", "What is the inode's
security context passed to execve()?" etc. And file-less execve() request (a.k.a.
fork_usermode_blob()) makes pathname information (which pathname-based LSMs depend on)
unavailable.

Since fork_usermode_blob() can execute arbitrary code in userspace, fork_usermode_blob() can
allow execution of e.g. statically linked HTTP server and statically linked DBMS server, without
giving LSM modules a chance to understand the intent of individual file-less execve() request.
If many different statically linked programs were executed via fork_usermode_blob(), how can LSM
modules determine whether a syscall from a file-less process should be permitted/denied?

By the way, TOMOYO LSM wants to know meaningful AT_SYMLINK_NOFOLLOW pathname and !AT_SYMLINK_NOFOLLOW
pathname, and currently there is no API that allows obtaining both pathnames atomically. But that is a
different problem, for what this mail thread is discussing would be whether we can avoid file-less
execve() request (i.e. get rid of fork_usermode_blob()).


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-10  3:08                             ` Tetsuo Handa
@ 2020-06-10  3:32                               ` Alexei Starovoitov
  2020-06-10  7:30                                 ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-10  3:32 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, Eric W. Biederman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Wed, Jun 10, 2020 at 12:08:20PM +0900, Tetsuo Handa wrote:
> On 2020/06/10 9:05, Alexei Starovoitov wrote:
> > I think you're still missing that usermode_blob is completely fs-less.
> > It doesn't need any fs to work.
> 
> fork_usermode_blob() allows usage like fork_usermode_blob("#!/bin/sh").
> A problem for LSMs is not "It doesn't need any fs to work." but "It can access any fs and
> it can issue arbitrary syscalls.".
> 
> LSM modules switch their security context upon execve(), based on available information like
> "What is the !AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
> "What is the AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
> "What are argv[]/envp[] for the requested program passed to execve()?", "What is the inode's
> security context passed to execve()?" etc. And file-less execve() request (a.k.a.
> fork_usermode_blob()) makes pathname information (which pathname-based LSMs depend on)
> unavailable.
> 
> Since fork_usermode_blob() can execute arbitrary code in userspace, fork_usermode_blob() can
> allow execution of e.g. statically linked HTTP server and statically linked DBMS server, without
> giving LSM modules a chance to understand the intent of individual file-less execve() request.
> If many different statically linked programs were executed via fork_usermode_blob(), how LSM
> modules can determine whether a syscall from a file-less process should be permitted/denied?

What you're saying is tomoyo doesn't trust kernel modules that are built-in
as part of vmlinux and doesn't trust vmlinux build.
I cannot really comprehend that since it means that tomoyo doesn't trust itself.

> By the way, TOMOYO LSM wants to know meaningful AT_SYMLINK_NOFOLLOW pathname and !AT_SYMLINK_NOFOLLOW
> pathname, and currently there is no API for allow obtaining both pathnames atomically. But that is a
> different problem, for what this mail thread is discussing would be whether we can avoid file-less
> execve() request (i.e. get rid of fork_usermode_blob()).

tomoyo does path name resolution as a string and using that for security?
I'm looking at tomoyo_realpath*() and tomoyo_pathcmp(). Ouch.
Path-based security is an anti-pattern of security.
I didn't realize tomoyo is so broken.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-10  3:32                               ` Alexei Starovoitov
@ 2020-06-10  7:30                                 ` Tetsuo Handa
  2020-06-10 16:24                                   ` Casey Schaufler
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-10  7:30 UTC (permalink / raw)
  To: linux-security-module
  Cc: Alexei Starovoitov, Linus Torvalds, Eric W. Biederman, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele

Forwarding to LSM-ML. Security people, any comments?

On 2020/06/10 12:32, Alexei Starovoitov wrote:
> On Wed, Jun 10, 2020 at 12:08:20PM +0900, Tetsuo Handa wrote:
>> On 2020/06/10 9:05, Alexei Starovoitov wrote:
>>> I think you're still missing that usermode_blob is completely fs-less.
>>> It doesn't need any fs to work.
>>
>> fork_usermode_blob() allows usage like fork_usermode_blob("#!/bin/sh").
>> A problem for LSMs is not "It doesn't need any fs to work." but "It can access any fs and
>> it can issue arbitrary syscalls.".
>>
>> LSM modules switch their security context upon execve(), based on available information like
>> "What is the !AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
>> "What is the AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
>> "What are argv[]/envp[] for the requested program passed to execve()?", "What is the inode's
>> security context passed to execve()?" etc. And file-less execve() request (a.k.a.
>> fork_usermode_blob()) makes pathname information (which pathname-based LSMs depend on)
>> unavailable.
>>
>> Since fork_usermode_blob() can execute arbitrary code in userspace, fork_usermode_blob() can
>> allow execution of e.g. statically linked HTTP server and statically linked DBMS server, without
>> giving LSM modules a chance to understand the intent of individual file-less execve() request.
>> If many different statically linked programs were executed via fork_usermode_blob(), how LSM
>> modules can determine whether a syscall from a file-less process should be permitted/denied?
> 
> What you're saying is tomoyo doesn't trust kernel modules that are built-in
> as part of vmlinux and doesn't trust vmlinux build.
> I cannot really comprehend that since it means that tomoyo doesn't trust itself.
> 
>> By the way, TOMOYO LSM wants to know meaningful AT_SYMLINK_NOFOLLOW pathname and !AT_SYMLINK_NOFOLLOW
>> pathname, and currently there is no API for allow obtaining both pathnames atomically. But that is a
>> different problem, for what this mail thread is discussing would be whether we can avoid file-less
>> execve() request (i.e. get rid of fork_usermode_blob()).
> 
> tomoyo does path name resolution as a string and using that for security?
> I'm looking at tomoyo_realpath*() and tomoyo_pathcmp(). Ouch.
> Path based security is anti pattern of security.
> I didn't realize tomoyo so broken.
> 

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-10  7:30                                 ` Tetsuo Handa
@ 2020-06-10 16:24                                   ` Casey Schaufler
  0 siblings, 0 replies; 194+ messages in thread
From: Casey Schaufler @ 2020-06-10 16:24 UTC (permalink / raw)
  To: Tetsuo Handa, linux-security-module
  Cc: Alexei Starovoitov, Linus Torvalds, Eric W. Biederman, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, Casey Schaufler

On 6/10/2020 12:30 AM, Tetsuo Handa wrote:
> Forwarding to LSM-ML. Security people, any comments?
>
> On 2020/06/10 12:32, Alexei Starovoitov wrote:
>> On Wed, Jun 10, 2020 at 12:08:20PM +0900, Tetsuo Handa wrote:
>>> On 2020/06/10 9:05, Alexei Starovoitov wrote:
>>>> I think you're still missing that usermode_blob is completely fs-less.
>>>> It doesn't need any fs to work.
>>> fork_usermode_blob() allows usage like fork_usermode_blob("#!/bin/sh").
>>> A problem for LSMs is not "It doesn't need any fs to work." but "It can access any fs and
>>> it can issue arbitrary syscalls.".
>>>
>>> LSM modules switch their security context upon execve(), based on available information like
>>> "What is the !AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
>>> "What is the AT_SYMLINK_NOFOLLOW pathname for the requested program passed to execve()?",
>>> "What are argv[]/envp[] for the requested program passed to execve()?", "What is the inode's
>>> security context passed to execve()?" etc. And file-less execve() request (a.k.a.
>>> fork_usermode_blob()) makes pathname information (which pathname-based LSMs depend on)
>>> unavailable.
>>>
>>> Since fork_usermode_blob() can execute arbitrary code in userspace, fork_usermode_blob() can
>>> allow execution of e.g. statically linked HTTP server and statically linked DBMS server, without
>>> giving LSM modules a chance to understand the intent of individual file-less execve() request.
>>> If many different statically linked programs were executed via fork_usermode_blob(), how LSM
>>> modules can determine whether a syscall from a file-less process should be permitted/denied?
>> What you're saying is tomoyo doesn't trust kernel modules that are built-in
>> as part of vmlinux and doesn't trust vmlinux build.
>> I cannot really comprehend that since it means that tomoyo doesn't trust itself.

That's not a rational conclusion.

>>> By the way, TOMOYO LSM wants to know meaningful AT_SYMLINK_NOFOLLOW pathname and !AT_SYMLINK_NOFOLLOW
>>> pathname, and currently there is no API for allow obtaining both pathnames atomically. But that is a
>>> different problem, for what this mail thread is discussing would be whether we can avoid file-less
>>> execve() request (i.e. get rid of fork_usermode_blob()).
>> tomoyo does path name resolution as a string and using that for security?
>> I'm looking at tomoyo_realpath*() and tomoyo_pathcmp(). Ouch.
>> Path based security is anti pattern of security.
>> I didn't realize tomoyo so broken.

A lawyer would respond "asked and answered". This argument is
old. We had it in the 1980's with Unix systems. While you can't
identify a *object* using a path name, you can and must use a
path name to identify *user intentions*. If that were not the case
the audit system would be massively less sophisticated. Whether
path name based controls are valuable on a system with the
namespace characteristics of Linux (complete anarchy) is in the
eye of the beholder.

We have Linux Security Modules (LSM) because, as Linus put it,
"security people are insane" and incapable of agreeing on anything.
Security is inherently subjective. AppArmor makes some people feel safe,
while others like SELinux. I understand that eBPF is now the cat's
pajamas. We don't go ripping out existing security just because
someone thinks poorly of it. Security features don't go in all that
often without some malice aforethought.



^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-09 23:56                   ` Alexei Starovoitov
@ 2020-06-10 21:12                     ` Eric W. Biederman
  2020-06-11 23:31                       ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-10 21:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Tue, Jun 09, 2020 at 03:02:30PM -0500, Eric W. Biederman wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > bpf_lsm is that thing that needs to load and start acting early.
>> > It's somewhat chicken and egg. fork_usermode_blob() will start a process
>> > that will load and apply security policy to all further forks and
>> > execs.
>> 
>> What is the timeframe for bpf_lsm patches wanting to use
>> fork_usermode_blob()?
>> 
>> Are we possibly looking at something that will be ready for the next
>> merge window?
>
> In bpf space there are these that want to use usermode_blobs:
> 1. bpfilter itself.
> First of all I think we made a mistake delaying landing the main patches:
> https://lore.kernel.org/patchwork/patch/902785/
> https://lore.kernel.org/patchwork/patch/902783/
> without them bpfilter is indeed dead. That probably was the reason
> no one was brave enough to continue working on it.
> So I think the landed skeleton of bpfilter can be removed.
> I think no user space code will notice that include/uapi/linux/bpfilter.h
> is gone. So it won't be considered as user space breakage.
> Similarly CONFIG_BPFILTER can be nuked too.
> bpftool is checking for it (see tools/bpf/bpftool/feature.c)
> but it's fine to remove it.
> I still think that the approach taken was a correct one, but
> lifting that project off the ground was too much for three of us.
> So when it's staffed appropriately we can re-add that code.
>
> 2. bpf_lsm.
> It's very active at the moment. I'm working on it as well
> (sleepable progs is targeting that), but I'm not sure when folks
> would have to have it during the boot. So far it sounds that
> they're addressing more critical needs first. "bpf_lsm ready at boot"
> came up several times during "bpf office hours" conference calls,
> so it's certainly on the radar. If I to guess I don't think
> bpf_lsm will use usermode_blobs in the next 6 weeks.
> More likely 2-4 month.
>
> 3. bpf iterator.
> It's already capable extension of several things in /proc.
> See https://lore.kernel.org/bpf/20200509175921.2477493-1-yhs@fb.com/
> Cat-ing bpf program as "cat /sys/fs/bpf/my_ipv6_route"
> will produce the same human output as "cat /proc/net/ipv6_route".
> The key difference is that bpf is all tracing based and it's unstable.
> struct fib6_info can change and prog will stop loading.
> There are few FIXME in there. That is being addressed right now.
> After that the next step is to make cat-able progs available
> right after boot via usermode_blobs.
> Unlike cases 1 and 2 here we don't care that they appear before pid 1.
> They can certainly be chef installed and started as services.
> But they are kernel dependent, so deploying them to production
> is much more complicated when they're done as separate rpm.
> Testing is harder and so on. Operational issues pile up when something
> that almost like kernel module is done as a separate package.
> Hence usermode_blob fits the best.
> Of course we were not planning to add a bunch of them to kernel tree.
> The idea was to add only _one_ such cat-able bpf prog and have it as
> a selftest for usermode_blob + bpf_iter. What we want our users to
> see in 'cat my_ipv6_route' is probably different from other companies.
> These patches will likely be using usermode_blob() in the next month.
>
> But we don't need to wait. We can make the progress right now.
> How about we remove bpfilter uapi and rename net/bpfilter/bpfilter_kern.c
> into net/umb/umb_test.c only to exercise Makefile to build elf file
> from simple main.c including .S with incbin trick
> and kernel side that does fork_usermode_blob().
> And that's it.
> net/ipv4/bpfilter/sockopt.c and kconfig can be removed.
> That would be enough base to do use cases 2 and 3 above.
> Having such selftest will be enough to adjust the layering
> for fork_usermode_blob(), right?

If I understand correctly you are asking people to support out of tree
code.  I see some justification for this functionality for in-tree code.
For out of tree code there really is no way to understand, support, or
maintain the code.

We probably also need to have a conversation about why this
functionality is a better choice than using a compiled in initramfs,
such as can be had by setting CONFIG_INITRAMFS_SOURCE.

Even with this write up and the conversations so far I don't understand
what problem fork_usermode_blob is supposed to be solving.  Is there
anything kernel version dependent about bpf_lsm?  For me the primary
justification of something like fork_usermode_blob is something that is
for all practical purposes a kernel module but it just happens to run in
usermode.

From what little I know about bpf_lsm that isn't the case.  So far all
you have mentioned is that bpf_lsm needs to load early.  That seems like
something that could be solved by a couple of lines init/main.c that
forks and exec's a program before init if it is present.  Maybe that
also needs a bit of protection so the bootloader can't override the
binary.

The entire concept of a loadable lsm has me scratching my head.  Last
time that concept was seriously looked at, the races for initializing per
object data were difficult enough to deal with that modular support was
removed from all of the existing lsms.

Not to mention there are places where the lsm hooks are a pretty lousy
API and will be refactored to make things better with no thought of any
out of tree code.

> If I understood you correctly you want to replace pid_t
> in 'struct umh_info' with proper 'struct pid' pointer that
> is refcounted, so user process's exit is clean? What else?

No "if (filename)" or "if (file)" on the exec code paths.  No extra case
for the LSM's to have to deal with.  Nothing fork_usermode_blob does is
something that can't be done from userspace as far as execve is
concerned so there is no justification for any special cases in the core
of the exec code.

Getting the deny_write_count and the reference count correct on the file
argument as well as getting BPRM_FLAGS_PATH_INACCESSIBLE set.

Using the proper type for argv and envp.

Those are the things I know of that need to be addressed.


Getting the code refactored so that the do_open_execat can be called
in do_execveat_common instead of __do_execve_file is enough of a
challenge of code motion I really would rather not do that.   Unfortunately that is
the only way I can see right now to have both do_execveat_common and
do_execve_file pass in a struct file.

Calling deny_write_access and get_file in do_execve_file and probably
a bit more is the only way I can see to cleanly isolate the special
cases fork_usermode_blob brings to the table.


Strictly speaking I am also aware of the issue that the kernel has to
use set_fs(KERNEL_DS) to allow argv and envp to exist in kernel space
instead of userspace.  That needs to be fixed as well, but for all
kernel uses of exec.  So any work fixing fork_usermode_blob can ignore
that issue.

Eric




^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-10 21:12                     ` Eric W. Biederman
@ 2020-06-11 23:31                       ` Alexei Starovoitov
  2020-06-12  0:57                         ` Tetsuo Handa
  2020-06-13 14:08                         ` Eric W. Biederman
  0 siblings, 2 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-11 23:31 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Wed, Jun 10, 2020 at 04:12:29PM -0500, Eric W. Biederman wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Tue, Jun 09, 2020 at 03:02:30PM -0500, Eric W. Biederman wrote:
> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >> 
> >> > bpf_lsm is that thing that needs to load and start acting early.
> >> > It's somewhat chicken and egg. fork_usermode_blob() will start a process
> >> > that will load and apply security policy to all further forks and
> >> > execs.
> >> 
> >> What is the timeframe for bpf_lsm patches wanting to use
> >> fork_usermode_blob()?
> >> 
> >> Are we possibly looking at something that will be ready for the next
> >> merge window?
> >
> > In bpf space there are these that want to use usermode_blobs:
> > 1. bpfilter itself.
> > First of all I think we made a mistake delaying landing the main patches:
> > https://lore.kernel.org/patchwork/patch/902785/
> > https://lore.kernel.org/patchwork/patch/902783/
> > without them bpfilter is indeed dead. That probably was the reason
> > no one was brave enough to continue working on it.
> > So I think the landed skeleton of bpfilter can be removed.
> > I think no user space code will notice that include/uapi/linux/bpfilter.h
> > is gone. So it won't be considered as user space breakage.
> > Similarly CONFIG_BPFILTER can be nuked too.
> > bpftool is checking for it (see tools/bpf/bpftool/feature.c)
> > but it's fine to remove it.
> > I still think that the approach taken was a correct one, but
> > lifting that project off the ground was too much for three of us.
> > So when it's staffed appropriately we can re-add that code.
> >
> > 2. bpf_lsm.
> > It's very active at the moment. I'm working on it as well
> > (sleepable progs is targeting that), but I'm not sure when folks
> > would have to have it during the boot. So far it sounds that
> > they're addressing more critical needs first. "bpf_lsm ready at boot"
> > came up several times during "bpf office hours" conference calls,
> > so it's certainly on the radar. If I to guess I don't think
> > bpf_lsm will use usermode_blobs in the next 6 weeks.
> > More likely 2-4 month.
> >
> > 3. bpf iterator.
> > It's already capable extension of several things in /proc.
> > See https://lore.kernel.org/bpf/20200509175921.2477493-1-yhs@fb.com/
> > Cat-ing bpf program as "cat /sys/fs/bpf/my_ipv6_route"
> > will produce the same human output as "cat /proc/net/ipv6_route".
> > The key difference is that bpf is all tracing based and it's unstable.
> > struct fib6_info can change and prog will stop loading.
> > There are few FIXME in there. That is being addressed right now.
> > After that the next step is to make cat-able progs available
> > right after boot via usermode_blobs.
> > Unlike cases 1 and 2 here we don't care that they appear before pid 1.
> > They can certainly be chef installed and started as services.
> > But they are kernel dependent, so deploying them to production
> > is much more complicated when they're done as separate rpm.
> > Testing is harder and so on. Operational issues pile up when something
> > that almost like kernel module is done as a separate package.
> > Hence usermode_blob fits the best.
> > Of course we were not planning to add a bunch of them to kernel tree.
> > The idea was to add only _one_ such cat-able bpf prog and have it as
> > a selftest for usermode_blob + bpf_iter. What we want our users to
> > see in 'cat my_ipv6_route' is probably different from other companies.
> > These patches will likely be using usermode_blob() in the next month.
> >
> > But we don't need to wait. We can make the progress right now.
> > How about we remove bpfilter uapi and rename net/bpfilter/bpfilter_kern.c
> > into net/umb/umb_test.c only to exercise Makefile to build elf file
> > from simple main.c including .S with incbin trick
> > and kernel side that does fork_usermode_blob().
> > And that's it.
> > net/ipv4/bpfilter/sockopt.c and kconfig can be removed.
> > That would be enough base to do use cases 2 and 3 above.
> > Having such selftest will be enough to adjust the layering
> > for fork_usermode_blob(), right?
> 
> If I understand correctly you are asking people to support out of tree
> code.  I see some justification for this functionality for in-tree code.
> For out of tree code there really is no way to understand support or
> maintain the code.

It's just like saying that sys_finit_module() is there to support out
of tree code. There are in- and out- tree modules and there will be
in- and out- of tree bpf programs, but the focus is on those that
are relevant for the long term future of the kernel.
The 1 case above is in-tree only. There is nothing in bpfilter
that makes sense out of tree.
The 2 case (bpf_lsm) is primarily in-tree. Security is something
everyone wants their own way, but the majority of bpf_lsm functionality
should live in-tree.
The 3 case is mostly out-of-tree. If there was an obvious way to
extend /proc it could have been in-tree, but no one will agree.

> We probably also need to have a conversation about why this
> functionality is a better choice that using a compiled in initramfs,
> such as can be had by setting CONFIG_INITRAMFS_SOURCE.

I explained it several times already. I don't see how initramfs solves 1 and 2.

> Even with this write up and the conversations so far I don't understand
> what problem fork_usermode_blob is supposed to be solving.  Is there
> anything kernel version dependent about bpf_lsm?  For me the primary
> justification of something like fork_usermode_blob is something that is
> for all practical purposes a kernel module but it just happens to run in
> usermode.

that's what it is. It's a kernel module that runs in user space.

> From what little I know about bpf_lsm that isn't the case.  So far all

It is.

> you have mentioned is that bpf_lsm needs to load early.  That seems like
> something that could be solved by a couple of lines init/main.c that
> forks and exec's a program before init if it is present.  Maybe that
> also needs a bit of protection so the bootloader can't override the
> binary.
> 
> The entire concept of a loadable lsm has me scratching my head.  Last
> time that concept was seriously looked at the races for initializing per
> object data were difficult enough to deal with modular support was
> removed from all of the existing lsms.

I'm not sure what races you're talking about.
usermode_blob will interact with kernel via syscalls and other standard
communication mechanism.

> Not to mention there are places where the lsm hooks are a pretty lousy
> API and will be refactored to make things better with no thought of any
> out of tree code.

I don't see how refactoring LSM hooks is relevant in this discussion.

> 
> > If I understood you correctly you want to replace pid_t
> > in 'struct umh_info' with proper 'struct pid' pointer that
> > is refcounted, so user process's exit is clean? What else?
> 
> No "if (filename)" or "if (file)" on the exec code paths.  No extra case
> for the LSM's to have to deal with.  Nothing fork_usermode_blob does is
> something that can't be done from userspace as far as execve is
> concerned so there is no justification for any special cases in the core
> of the exec code.

Adding getname_kernel() instead of filename==NULL is trivial enough
and makes sense as a cleanup.
But where do you see 'if (file)' ?
The correct 'file' pointer is passed from shmem_kernel_file_setup() all
the way to exec.

> Getting the deny_write_count and the reference count correct on the file
> argument as well as getting BPRM_FLAGS_PATH_INACCESSIBLE set.

There is no fd because there is no task, but there is a file. I think 
do_execve should assume BINPRM_FLAGS_PATH_INACCESSIBLE in this case.

> Using the proper type for argv and envp.

I guess that's going to be a part of other cleanup.

> Those are the things I know of that need to be addressed.
> 
> 
> Getting the code refactored so that the do_open_execat can be called
> in do_execveat_common instead of __do_execve_file is enough of a
> challenge of code motion I really would rather not do that.   Unfortunately that is
> the only way I can see right now to have both do_execveat_common and
> do_execve_file pass in a struct file.

The 'struct file' is there. Please take another look at the code.

> Calling deny_write_access and get_file in do_execve_file and probably
> a bit more is the only way I can see to cleanly isoloate the special
> cases fork_usermode_blob brings to the table.
> 
> 
> Strictly speaking I am also aware of the issue that the kernel has to
> use set_fs(KERNEL_DS) to allow argv and envp to exist in kernel space
> instead of userspace.  That needs to be fixed as well, but for all
> kernel uses of exec.  So any work fixing fork_usermode_blob can ignore
> that issue.

well, this is the problem of usermodehelper_exec.
usermode_blob doesn't use argv/envp.
They could be NULL for all practical purposes.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-11 23:31                       ` Alexei Starovoitov
@ 2020-06-12  0:57                         ` Tetsuo Handa
  2020-06-13  3:38                           ` Alexei Starovoitov
  2020-06-13 14:08                         ` Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-12  0:57 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric W. Biederman
  Cc: Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele

On 2020/06/12 8:31, Alexei Starovoitov wrote:
> On Wed, Jun 10, 2020 at 04:12:29PM -0500, Eric W. Biederman wrote:
>> We probably also need to have a conversation about why this
>> functionality is a better choice that using a compiled in initramfs,
>> such as can be had by setting CONFIG_INITRAMFS_SOURCE.

I agree. CONFIG_INITRAMFS_SOURCE or call_usermodehelper() should be fine.

>> Even with this write up and the conversations so far I don't understand
>> what problem fork_usermode_blob is supposed to be solving.  Is there
>> anything kernel version dependent about bpf_lsm?  For me the primary
>> justification of something like fork_usermode_blob is something that is
>> for all practical purposes a kernel module but it just happens to run in
>> usermode.
> 
> that's what it is. It's a kernel module that runs in user space.
> 

How can the code running in the userspace memory be protected? Like you said

  It's nice to be able to compile that blob with -g and be able to 'gdb -p' into it.
  That works and very convenient when it comes to debugging. Compare that to debugging
  a kernel module!

, the userspace memory can be easily interfered with from userspace. The kernel module
running in kernel space is protected (unless methods like /dev/{mem,kmem} are used)
but the kernel module running in user space is not protected.

You said

  What you're saying is tomoyo doesn't trust kernel modules that are built-in
  as part of vmlinux and doesn't trust vmlinux build.

but the word 'trust' has multiple aspects. One aspect is "can the program
contain malicious code?" which would be mitigated by cryptographic signing
technology. But another aspect is "does the program contain vulnerabilities or
bugs?" which would be mitigated by updating programs as soon as possible.
Yet another aspect is "is the program protected from interference?" which would
be mitigated by enforcing a sandbox like seccomp. But to enforce it, we need
information for identifying what the code needs to do.

We might need to invent built-in "protected userspace" because existing
"unprotected userspace" is not trustworthy enough to run kernel modules.
That's not just inventing fork_usermode_blob().



>> Strictly speaking I am also aware of the issue that the kernel has to
>> use set_fs(KERNEL_DS) to allow argv and envp to exist in kernel space
>> instead of userspace.  That needs to be fixed as well, but for all
>> kernel uses of exec.  So any work fixing fork_usermode_blob can ignore
>> that issue.
> 
> well, this is the problem of usermodehelper_exec.
> usermode_blob doesn't use argv/envp.
> They could be NULL for all practical purpose.
> 

That's what TOMOYO LSM does not like. You said

  tomoyo does path name resolution as a string and using that for security?
  I'm looking at tomoyo_realpath*() and tomoyo_pathcmp(). Ouch.
  Path based security is anti pattern of security.

but, like Casey mentioned, pathnames/argv/envp etc. represent *user intentions*
for controlling what that code can do.

A method for anonymously running arbitrary code in userspace memory (which
can be interfered with) is so painful. I won't be able to trust kernel modules running
in userspace memory.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-12  0:57                         ` Tetsuo Handa
@ 2020-06-13  3:38                           ` Alexei Starovoitov
  2020-06-13  4:22                             ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-13  3:38 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Fri, Jun 12, 2020 at 09:57:40AM +0900, Tetsuo Handa wrote:
> 
> , the userspace memory can be easily interfered from userspace. The kernel module
> running in kernel space is protected (unless methods like /dev/{mem,kmem} are used)
> but the kernel module running in user space is not protected.

huh? One user process 'can easily interfere' with memory of other process?

> 
> You said
> 
>   What you're saying is tomoyo doesn't trust kernel modules that are built-in
>   as part of vmlinux and doesn't trust vmlinux build.
> 
> but the word 'trust' has multiple aspects. One of aspects is "can the program
> contain malicious code?" which would be mitigated by cryptographic signing

usermode_blob is either part of kernel module or part of vmlinux.
If it's part of vmlinux it's inherently trusted.
If it's part of ko it's signed along with the rest of ko.

> We might need to invent built-in "protected userspace" because existing
> "unprotected userspace" is not trustworthy enough to run kernel modules.
> That's not just inventing fork_usermode_blob().

sorry, but this makes no sense at all to me.

> can be interfered) is so painful. I won't be able to trust kernel modules running
> in userspace memory.

The part that I suspect is still missing is what triggers fork_usermode_blob().
It's always kernel code == trusted code.

Currently we have the following:

vmlinux {
  core code of kernel
  built-in mod_A {
    kernel code
  }
  built-in mod_B {
    kernel code
  }
}
loadable mod_C {
  kernel code
}

With fork_usermode_blob() kernel modules can delegate parts of their
functionality to run in user space:

vmlinux {
  core code of kernel
  built-in mod_A {
    kernel code
    code to run in user space
  }
  built-in mod_B {
    kernel code
    code to run in user space
  }
}
loadable mod_C {
  kernel code
  code to run in user space
}

The interface between kernel part of .ko and user part of .ko is
specific to that particular kernel module. It's not a typical user space.
Take bpfilter, for example. It has its own format of structures
that are being passed between kernel side of bpfilter and user side
of bpfilter. It's bpfilter's internal interface that doesn't
create user api. bpfilter in kernel 5.x could be passing different
structs vs bpfilter in kernel 6.x.
It also cannot be started via init=, but it has to be ready
if pid 1 needs it.
Say, bpfilter was compiled as loadable kernel module.
In such case bpfilter.ko will not be loaded into the kernel
until the first iptables sockopt. It may never be loaded.
But when loaded the bpfilter.ko will start its user space side
via fork_usermode_blob() that is specific to that version of .ko.
Other kernel modules like bpf_lsm.ko will decide what's appropriate
for them in terms of when user side should start and exit.
Likely bpf_lsm.ko would want to be built-in.
In all cases kernel module cannot rely on traditional usermode_helper,
because usermode_helper is true uapi. The admin can boot kernel 5.x
and it must work with usermode_helpers installed in rootfs. The admin
will reboot to kernel 6.x and it should still work without changing rootfs.
Whereas usermode_blobs are ko and kernel specific.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-13  3:38                           ` Alexei Starovoitov
@ 2020-06-13  4:22                             ` Tetsuo Handa
  0 siblings, 0 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-13  4:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On 2020/06/13 12:38, Alexei Starovoitov wrote:
> On Fri, Jun 12, 2020 at 09:57:40AM +0900, Tetsuo Handa wrote:
>>
>> , the userspace memory can be easily interfered from userspace. The kernel module
>> running in kernel space is protected (unless methods like /dev/{mem,kmem} are used)
>> but the kernel module running in user space is not protected.
> 
> huh? One user process 'can easily interfere' with memory of other process?

It is an execution environment problem.

Somebody can send SIGKILL (e.g. OOM-killer, SysRq-i) to kill kernel code running as
a usermode process, somebody can send SIGSTOP to make kernel code running as a usermode
process non-functional, somebody can run /usr/bin/strace in order to eavesdrop on secret
data used by kernel code running as a usermode process etc.

>> can be interfered) is so painful. I won't be able to trust kernel modules running
>> in userspace memory.
> 
> The part that I suspect is still missing is what triggers fork_usermode_blob().
> It's always kernel code == trusted code.

How can that part be guaranteed?
In future somebody might add a caller that allows

  sys_execute_anonymously_in_usermode(const char code, const int len) {
     return fork_usermode_blob(code, len);
  }

or something similar.

> The interface between kernel part of .ko and user part of .ko is
> specific to that particular kernel module. It's not a typical user space.

How can that part be guaranteed? A caller can pass arbitrary code including
typical user space program (e.g. /bin/sh).

> But when loaded the bpfilter.ko will start its user space side
> via fork_usermode_blob() that is specific to that version of .ko.

How can we guarantee that its user space side started via fork_usermode_blob()
is not disturbed (e.g. SIGKILL, SIGSTOP, /usr/bin/strace) ?

I consider the reliability (from a "robustness" perspective) of fork_usermode_blob()
to be the same as that of the CONFIG_INITRAMFS_SOURCE, call_usermodehelper() or init= approaches.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-11 23:31                       ` Alexei Starovoitov
  2020-06-12  0:57                         ` Tetsuo Handa
@ 2020-06-13 14:08                         ` Eric W. Biederman
  2020-06-13 15:33                           ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-13 14:08 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Wed, Jun 10, 2020 at 04:12:29PM -0500, Eric W. Biederman wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > On Tue, Jun 09, 2020 at 03:02:30PM -0500, Eric W. Biederman wrote:
>> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> >> 
>> >> > bpf_lsm is that thing that needs to load and start acting early.
>> >> > It's somewhat chicken and egg. fork_usermode_blob() will start a process
>> >> > that will load and apply security policy to all further forks and
>> >> > execs.
>> >> 
>> >> What is the timeframe for bpf_lsm patches wanting to use
>> >> fork_usermode_blob()?
>> >> 
>> >> Are we possibly looking at something that will be ready for the next
>> >> merge window?
>> >
>> > In bpf space there are these that want to use usermode_blobs:
>> > 1. bpfilter itself.
>> > First of all I think we made a mistake delaying landing the main patches:
>> > https://lore.kernel.org/patchwork/patch/902785/
>> > https://lore.kernel.org/patchwork/patch/902783/
>> > without them bpfilter is indeed dead. That probably was the reason
>> > no one was brave enough to continue working on it.
>> > So I think the landed skeleton of bpfilter can be removed.
>> > I think no user space code will notice that include/uapi/linux/bpfilter.h
>> > is gone. So it won't be considered as user space breakage.
>> > Similarly CONFIG_BPFILTER can be nuked too.
>> > bpftool is checking for it (see tools/bpf/bpftool/feature.c)
>> > but it's fine to remove it.
>> > I still think that the approach taken was a correct one, but
>> > lifting that project off the ground was too much for three of us.
>> > So when it's staffed appropriately we can re-add that code.
>> >
>> > 2. bpf_lsm.
>> > It's very active at the moment. I'm working on it as well
>> > (sleepable progs is targeting that), but I'm not sure when folks
>> > would have to have it during the boot. So far it sounds that
>> > they're addressing more critical needs first. "bpf_lsm ready at boot"
>> > came up several times during "bpf office hours" conference calls,
>> > so it's certainly on the radar. If I to guess I don't think
>> > bpf_lsm will use usermode_blobs in the next 6 weeks.
>> > More likely 2-4 month.
>> >
>> > 3. bpf iterator.
>> > It's already capable extension of several things in /proc.
>> > See https://lore.kernel.org/bpf/20200509175921.2477493-1-yhs@fb.com/
>> > Cat-ing bpf program as "cat /sys/fs/bpf/my_ipv6_route"
>> > will produce the same human output as "cat /proc/net/ipv6_route".
>> > The key difference is that bpf is all tracing based and it's unstable.
>> > struct fib6_info can change and prog will stop loading.
>> > There are few FIXME in there. That is being addressed right now.
>> > After that the next step is to make cat-able progs available
>> > right after boot via usermode_blobs.
>> > Unlike cases 1 and 2 here we don't care that they appear before pid 1.
>> > They can certainly be chef installed and started as services.
>> > But they are kernel dependent, so deploying them to production
>> > is much more complicated when they're done as separate rpm.
>> > Testing is harder and so on. Operational issues pile up when something
>> > that almost like kernel module is done as a separate package.
>> > Hence usermode_blob fits the best.
>> > Of course we were not planning to add a bunch of them to kernel tree.
>> > The idea was to add only _one_ such cat-able bpf prog and have it as
>> > a selftest for usermode_blob + bpf_iter. What we want our users to
>> > see in 'cat my_ipv6_route' is probably different from other companies.
>> > These patches will likely be using usermode_blob() in the next month.
>> >
>> > But we don't need to wait. We can make the progress right now.
>> > How about we remove bpfilter uapi and rename net/bpfilter/bpfilter_kern.c
>> > into net/umb/umb_test.c only to exercise Makefile to build elf file
>> > from simple main.c including .S with incbin trick
>> > and kernel side that does fork_usermode_blob().
>> > And that's it.
>> > net/ipv4/bpfilter/sockopt.c and kconfig can be removed.
>> > That would be enough base to do use cases 2 and 3 above.
>> > Having such selftest will be enough to adjust the layering
>> > for fork_usermode_blob(), right?
>> 
>> If I understand correctly you are asking people to support out of tree
>> code.  I see some justification for this functionality for in-tree code.
>> For out of tree code there really is no way to understand support or
>> maintain the code.
>
> It's just like saying that sys_finit_module() is there to support out
> of tree code. There are in- and out- tree modules and there will be
> in- and out- of tree bpf programs, but the focus is on those that
> are relevant for the long term future of the kernel.
> The 1 case above is in-tree only. There is nothing in bpfilter
> that makes sense out of tree.
> The 2 case (bpf_lsm) is primarily in-tree. Security is something
> everyone wants its own way, but majority of bpf_lsm functionality
> should live in-tree.
> The 3 case is mostly out-of-tree. If there was obvious way to
> extend /proc it could have been in-tree, but no one will agree.
>
>> We probably also need to have a conversation about why this
>> functionality is a better choice that using a compiled in initramfs,
>> such as can be had by setting CONFIG_INITRAMFS_SOURCE.
>
> I explained it several times already. I don't see how initramfs solves 1 and 2.

You said whatever it was needed to live in the kernel image.
A compiled in initramfs that can be supplemented by a initramfs from the
bootloader is that compiled in situation.  That is what is implemented
with CONFIG_INITRAMFS_SOURCE.

>> Even with this write up and the conversations so far I don't understand
>> what problem fork_usermode_blob is supposed to be solving.  Is there
>> anything kernel version dependent about bpf_lsm?  For me the primary
>> justification of something like fork_usermode_blob is something that is
>> for all practical purposes a kernel module but it just happens to run in
>> usermode.
>
> that's what it is. It's a kernel module that runs in user space.
>
>> From what little I know about bpf_lsm that isn't the case.  So far all
>
> It is.

So the bpf programs will live in the kernel?  Where can I look or has
that part not been merged yet?

>> you have mentioned is that bpf_lsm needs to load early.  That seems like
>> something that could be solved by a couple of lines init/main.c that
>> forks and exec's a program before init if it is present.  Maybe that
>> also needs a bit of protection so the bootloader can't override the
>> binary.
>> 
>> The entire concept of a loadable lsm has me scratching my head.  Last
>> time that concept was seriously looked at the races for initializing per
>> object data were difficult enough to deal with modular support was
>> removed from all of the existing lsms.
>
> I'm not sure what races you're talking about.
> usermode_blob will interact with kernel via syscalls and other standard
> communication mechanism.

The races between the kernel allocating objects, say inodes, and the code
that places security labels or other marks onto those objects so an
LSM can later make security decisions.

>> Not to mention there are places where the lsm hooks are a pretty lousy
>> API and will be refactored to make things better with no thought of any
>> out of tree code.
>
> I don't see how refactoring LSM hooks is relevant in this discussion.

We were talking about the bpf_lsm, and I have been refactoring the lsm
hooks that run through exec, because they did not have obvious calling
conventions.

If all of the bpf programs for bpf_lsm live in the kernel tree then I
guess there is no problem, but I don't see why in that case you are
using bpf instead of compiling things to C.

When I looked at the bpf_lsm code all I saw were hooks that call out to
bpf programs.  So I made the rather obvious assumption that those bpf
programs are loaded like any other bpf programs from ordinary userspace.

>> > If I understood you correctly you want to replace pid_t
>> > in 'struct umh_info' with proper 'struct pid' pointer that
>> > is refcounted, so user process's exit is clean? What else?
>> 
>> No "if (filename)" or "if (file)" on the exec code paths.  No extra case
>> for the LSM's to have to deal with.  Nothing fork_usermode_blob does is
>> something that can't be done from userspace as far as execve is
>> concerned so there is no justification for any special cases in the core
>> of the exec code.
>
> Adding getname_kernel() instead of filename==NULL is trivial enough
> and makes sense as a cleanup.
> But where do you see 'if (file)' ?
> The correct 'file' pointer is passed from shmem_kernel_file_setup() all
> the way to exec.

In the middle of __do_execve_file:

	if (!file)
		file = do_open_execat(fd, filename, flags);
	retval = PTR_ERR(file);
	if (IS_ERR(file))
		goto out_unmark;

Then just after it we have:

	if (!filename) {
		bprm->filename = "none";
	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
		bprm->filename = filename->name;
	} else {
		if (filename->name[0] == '\0')
			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
		else
			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
					    fd, filename->name);
		if (!pathbuf) {
			retval = -ENOMEM;
			goto out_unmark;
		}
		/*
		 * Record that a name derived from an O_CLOEXEC fd will be
		 * inaccessible after exec. Relies on having exclusive access to
		 * current->files (due to unshare_files above).
		 */
		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
		bprm->filename = pathbuf;
	}

So we have two core cases in the code that are specific to
fork_usermode_blob, and quite frankly they are both wrong today.

The if (!filename) case does not set BINPRM_FLAGS_PATH_INACCESSIBLE.
The if (file) case does not call get_file(file) and deny_write_access(file).

>> Getting the deny_write_count and the reference count correct on the file
>> argument as well as getting BPRM_FLAGS_PATH_INACCESSIBLE set.
>
> There is no fd because there is no task, but there is a file. I think 
> do_execve should assume BINPRM_FLAGS_PATH_INACCESSIBLE in this case.

I agree; that is why I said it needs to set that flag.  As you can see above,
that flag is not set today.

Having BINPRM_FLAGS_PATH_INACCESSIBLE set would slightly relieve
Tetsuo's concern about fork_usermode_blob having a shell script.

>> Using the proper type for argv and envp.
>
> I guess that's going to be a part of other cleanup.

No.  Already answered but again.  Today do_execve_file has the prototype:

int do_execve_file(struct file *file, void *__argv, void *__envp);

void * is definitely not the type that is passed to __argv and __envp,
and that needs to be fixed for that function to remain.

>> Those are the things I know of that need to be addressed.
>> 
>> 
>> Getting the code refactored so that the do_open_execat can be called
>> in do_execveat_common instead of __do_execve_file is enough of a
>> challenge of code motion I really would rather not do that.   Unfortunately that is
>> the only way I can see right now to have both do_execveat_common and
>> do_execve_file pass in a struct file.
>
> The 'struct file' is there. Please take another look at the code.

do_execveat_common passes a NULL file.
To be maintainable in this scenario, do_execveat_common instead needs to
perform the do_open_execat itself and pass the file.

That would allow separating out what is specific to do_execve_file
from do_execveat_common.

Which would fundamentally simplify the logic in what is today
__do_execve_file.  The way things are factored today it takes serious
digging to figure out what is going on with the !file and !filename
arguments.  We can and should do much better.

Another possible solution which would clean up the code in exec and make
things easier to understand is that there could exist a directory in an
initramfs filesystem somewhere with the names of the programs matching
the module names, and do_execveat_common could just be called with a cwd
on that directory and the name of the module.  Which would remove the
need to have a do_execve_file and do_execve could be used like
everywhere else.

I don't understand why fork_usermode_blob has to reimplement
INITRAMFS_SOURCE but perhaps there is good reason for it.

>> Calling deny_write_access and get_file in do_execve_file and probably
>> a bit more is the only way I can see to cleanly isoloate the special
>> cases fork_usermode_blob brings to the table.
>> 
>> 
>> Strictly speaking I am also aware of the issue that the kernel has to
>> use set_fs(KERNEL_DS) to allow argv and envp to exist in kernel space
>> instead of userspace.  That needs to be fixed as well, but for all
>> kernel uses of exec.  So any work fixing fork_usermode_blob can ignore
>> that issue.
>
> well, this is the problem of usermodehelper_exec.
> usermode_blob doesn't use argv/envp.
> They could be NULL for all practical purpose.

fork_usermode_blob
   call_usermode_helper_setup_file
      split_argv

Admittedly the only argument split_argv is called on currently is
"bpfilter_umh".  But the code does try and support more.

Should that split_argv be removed?

....

Anyway you asked what needs to be done in exec.

I am in the middle of cleaning up exec.  Send the patches that address
the issues and make this mess not a maintenance issue, and I will be
happy to leave fork_usermode_blob alone.  Otherwise I plan to just
remove the code for now as it is all dead at the moment.

So much of the argument has been "the code which is not merged yet needs
to do X" that this conversation has not been able to point at concrete
parts of the code and argue about them.  I strongly suspect that for this
code to be maintainable we need the in-tree users so we can discuss what
is going on.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-13 14:08                         ` Eric W. Biederman
@ 2020-06-13 15:33                           ` Alexei Starovoitov
  2020-06-13 16:14                             ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-13 15:33 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sat, Jun 13, 2020 at 7:13 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> I am in the middle of cleaning up exec.  Send the patches that address
> the issues and make this mess not a maintenance issue, and I will be
> happy to leave fork_usermode_blob alone.  Otherwise I plan to just
> remove the code for now as it is all dead at the moment.

May be stop being a jerk first ?
It's a Nack to remove the code.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-13 15:33                           ` Alexei Starovoitov
@ 2020-06-13 16:14                             ` Alexei Starovoitov
  2020-06-14 14:51                               ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-13 16:14 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sat, Jun 13, 2020 at 8:33 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Sat, Jun 13, 2020 at 7:13 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> > I am in the middle of cleaning up exec.  Send the patches that address
> > the issues and make this mess not a maintenance issue, and I will be
> > happy to leave fork_usermode_blob alone.  Otherwise I plan to just
> > remove the code for now as it is all dead at the moment.
>
> May be stop being a jerk first ?
> It's a Nack to remove the code.

I'm happy to work on changes, but your removal threats must stop
before we can continue discussion. ok?

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-13 16:14                             ` Alexei Starovoitov
@ 2020-06-14 14:51                               ` Eric W. Biederman
  2020-06-16  1:55                                 ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-14 14:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Sat, Jun 13, 2020 at 8:33 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>>
>> On Sat, Jun 13, 2020 at 7:13 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>> > I am in the middle of cleaning up exec.  Send the patches that address
>> > the issues and make this mess not a maintenance issue, and I will be
>> > happy to leave fork_usermode_blob alone.  Otherwise I plan to just
>> > remove the code for now as it is all dead at the moment.
>>
>> May be stop being a jerk first ?
>> It's a Nack to remove the code.
>
> I'm happy to work on changes, but your removal threats must stop
> before we can continue discussion. ok?

I am looking for reasons to not remove the code.  What I can't
personally do is justify spending the time fixing unused code.

The code is currently unused, and has an implementation that can be
improved.  All of which argues for removal on technical grounds.

The implementation issues are a problem for maintaining other code so
they need to be addressed, again that argues for removal on technical
grounds.

There are some aspirations to use fork_usermode_blob but no commitments
that I can see to actually use this code.

If someone who cares can step up so other developers don't have to deal
with the maintenance problems, then there is no problem in keeping the
code.




Now there is one technical issue I see that has implications for how
this gets fixed.  The current implementation requires that 2 copies
of the user mode executable be kept.

int fork_usermode_blob(void *data, size_t len, struct umh_info *info);


The function fork_usermode_blob is passed an array and a length.  Today
that array is stored in .rodata.  Not in an init section where it could
be discarded.

Now userspace in general and exec in particular requires the executable
to be in a mmapable file.  So fork_usermode_blob creates a mini
filesystem that only supports one file and no file names and opens
a file within it, and passes that open file to exec.

If creation of the filesystem and copying of the data can be separated
from the actual execution of the code, then there will be no need to
keep 2 copies of the executable in memory.  If the file was also given a
name there would be no need for fork_usermode_blob to open the file.
All fork_usermode_blob would need to do is make it possible for
exec to find that file.

The implication this has for fixing the issues with exec is that once
the file has a name fork_usermode_blob no longer needs to preopen the
file and call do_execve_file.  Instead fork_usermode_blob can call
do_execve.  Which means do_execve_file and all of its strange corner
cases can go away.

We have all of the infrastructure to decode a cpio in init/initramfs.c
so it would be practically no code at all to place the code into a
filesystem instead of just into a file at startup time.  At which
point it could be guaranteed that the section the filesystem lives in is
an init section and is not used past the point of loading it into a
filesystem.  Making the code use half the memory.

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-14 14:51                               ` Eric W. Biederman
@ 2020-06-16  1:55                                 ` Alexei Starovoitov
  2020-06-16 16:21                                   ` Alexei Starovoitov
  2020-06-23 18:04                                   ` Eric W. Biederman
  0 siblings, 2 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-16  1:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Sun, Jun 14, 2020 at 09:51:09AM -0500, Eric W. Biederman wrote:
> 
> There are some aspirations to use fork_usermode_blob but no commitments
> that I can see to actually use this code.

huh? I've listed three projects with concrete timelines that are going to use
fork_usermode_blob.

> If someone who cares can step up so other developers don't have to deal
> with the maintenance problems, then there is no problem in keeping the
> code.

That code has been there for two years and wasn't causing 'maintenance
problems'. Quite the opposite was happening. Initial way of embedding blob
into ko has changed quite a bit thanks to work from Masahiro.
See commit 8e75887d321d ("bpfilter: include bpfilter_umh in assembly instead of using objcopy")

What is happening that this bit of code is somehow in the way of some refactoring
that you're doing (I'm not even sure what kind of refactoring you have in
mind), but instead of working with the community on the best ways to do this
refactoring you're arguing for removal just to avoid tweaking few lines of code.

> Now there is one technical issue I see that has implications for how
> this gets fixed.  The current implementation requires that 2 copies
> of the user mode executable be kept.
> 
> int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
> 
> 
> The function fork_usermode_blob is passed an array and a length.  Today
> that array is stored in .rodata.  Not in an init section where it could
> be discarded.

It's a one line change in bpfilter_umh_blob.S to make it .init section,
but for bpfilter init may not work.
For some ko init is appropriate for some other it's not.

> Now userspace in general and exec in particular requires the executable
> to be in a mmapable file.  So fork_usermode_blob creates a mini
> filesystem that only supports one file and no file names and opens
> a file within it, and passes that open file to exec.
> 
> If creation of the filesystem and copying of the data can be separated
> from the actual execution of the code, then there will be no need to
> keep 2 copies of the executable in memory.  If the file was also given a
> name there would be no need for fork_usermode_blob to open the file.
> All fork_usermode_blob would need to do is make make it possible for
> exec to find that file.
> 
> The implification this has for fixing the issues with exec is that once
> the file has a name fork_usermode_blob no longer needs to preopen the
> file and call do_execve_file.  Instead fork_usermode_blob can call
> do_execve.  Which means do_execve_file and all of it's strange corner
> cases can go away.
> 
> We have all of the infrastructure to decode a cpio in init/initramfs.c
> so it would be practically no code at all to place the code into a
> filesystem instead of just into a file at startup time.  At which
> point it could be guaranteed that the section the filesystem lives in is
> an init section and is not used past the point of loading it into a
> filesystem.  Making the code use half the memory.

Could you please re-read the explanation just up the thread:
https://lore.kernel.org/bpf/20200613033821.l62q2ed5ligheyhu@ast-mbp/
that goes into detail how bpfilter is invoking this blob.
Now explain how initramfs could work?
How bpfilter can load its blob when bpfilter.ko was loaded into
the kernel a day after boot ? Where is initramfs?
bpfilter can be normal ko and builtin. In both cases it cannot rely on
a path. That path may not exist. initramfs is not present after boot.
Any path based approach has serious disadvantages.
The ko cannot rely on an external fs hierarchy. The ko is a self contained
object. It has kernel and user code. A blob inside ko is like another kernel
function of that particular ko, but it runs in user space. The root fs could
have been corrupted but ko needs to be operational if it was builtin.

Another reason why single fs (initramfs or other) doesn't work is multiple
ko-s. Theoretically all ko-s can agree on dir layout, but ko-s are built and
loaded at different times. Say we put all possible blobs from all ko-s into
some new special fs that is available during the boot and after the boot. In
such case the majority of that ram is going to be wasted. Since ko-s may not
need that blob to run or ko-s may not even load, but ram is wasted anyway.
All these show stoppers with fs and path were considered two years ago
when design of user mode blobs was done.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-16  1:55                                 ` Alexei Starovoitov
@ 2020-06-16 16:21                                   ` Alexei Starovoitov
  2020-06-23 18:04                                   ` Eric W. Biederman
  1 sibling, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-16 16:21 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Mon, Jun 15, 2020 at 6:55 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> >
> > int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
> >
> >
> > The function fork_usermode_blob is passed an array and a length.  Today
> > that array is stored in .rodata.  Not in a init section where it could
> > be discared.
>
> It's a one line change in bpfilter_umh_blob.S to make it .init section,
> but for bpfilter init may not work.
> For some ko init is appropriate for some other it's not.

since I remember discussing the desire to have only one copy of the blob
with Andy back then I did a bit of git archeology.
Sure enough it was in .init.rodata when usermode_blob() landed.
But then commit 61fbf5933d42 ("net: bpfilter: restart bpfilter_umh
when error occurred")
added blob restart logic.
It's kinda questionable whether bpfilter needs restart or not.
But because the kernel module now starts the blob multiple times it had to move
them from .init.rodata to .rodata.
Regardless the point is that init or not-init is a decision of a
particular kernel module.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-16  1:55                                 ` Alexei Starovoitov
  2020-06-16 16:21                                   ` Alexei Starovoitov
@ 2020-06-23 18:04                                   ` Eric W. Biederman
  2020-06-23 18:35                                     ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-23 18:04 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele


Sigh.  I was busy last week so I left reading this until now in the
hopes I would see something reasonable.

What I see is rejecting of everything that is said to you.

What I do not see are patches fixing issues.  I will await patches.

Eric



^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-23 18:04                                   ` Eric W. Biederman
@ 2020-06-23 18:35                                     ` Alexei Starovoitov
  2020-06-23 18:53                                       ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-23 18:35 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Tue, Jun 23, 2020 at 01:04:02PM -0500, Eric W. Biederman wrote:
> 
> Sigh.  I was busy last week so I left reading this until now in the
> hopes I would see something reasonable.
> 
> What I see is rejecting of everything that is said to you.
> 
> What I do not see are patches fixing issues.  I will await patches.

huh?
I can say exactly the same. You keep ignoring numerous points I brought up.
You still haven't showed what kind of refactoring you have in mind and
why fork_blob is in its way.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-23 18:35                                     ` Alexei Starovoitov
@ 2020-06-23 18:53                                       ` Eric W. Biederman
  2020-06-23 19:40                                         ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-23 18:53 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Tue, Jun 23, 2020 at 01:04:02PM -0500, Eric W. Biederman wrote:
>> 
>> Sigh.  I was busy last week so I left reading this until now in the
>> hopes I would see something reasonable.
>> 
>> What I see is rejecting of everything that is said to you.
>> 
>> What I do not see are patches fixing issues.  I will await patches.
>
> huh?
> I can say exactly the same. You keep ignoring numerous points I brought up.
> You still haven't showed what kind of refactoring you have in mind and
> why fork_blob is in its way.

That is correct.  What I wind up doing with exec is irrelevant.

What is relevant is getting correct working code on the fork_blob path.
Something that is clean enough that whatever weird things it winds up
doing are readable.  The way things are intermingled today it took 2
years for someone to realize there was a basic reference counting bug.

This isn't work anyone else can do because there are not yet any real in
tree users of fork_blob.  The fact that no one else can make
substantial changes to the code because it has no users is what gets in
the way of maintenance.


One of the 2 year old bugs that needs to be fixed is that some LSMs
work in terms of paths.  Tetsuo has been very gracious in pointing that
out.  Either a path needs to be provided or the LSMs that work in terms
of paths need to be fixed.



Now I really don't care how the bugs are fixed.


My recommendation for long term maintenance is to split fork_blob into 2
functions: fs_from_blob, and the ordinary call_usermodehelper_exec.
That removes the need for any special support for anything in the exec
path because your blob will also have a path for your file, and the
file in the filesystem can be reused for restart.

That feels like the least long term work on everyone.

But with no in-tree users none of us can do anything but guess what
the actual requirements of fork_usermode_blob are.


So patches to fix the bugs please.

Eric





^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-23 18:53                                       ` Eric W. Biederman
@ 2020-06-23 19:40                                         ` Alexei Starovoitov
  2020-06-24  1:51                                           ` Tetsuo Handa
  2020-06-24 12:13                                           ` Eric W. Biederman
  0 siblings, 2 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-23 19:40 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Tue, Jun 23, 2020 at 01:53:48PM -0500, Eric W. Biederman wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Tue, Jun 23, 2020 at 01:04:02PM -0500, Eric W. Biederman wrote:
> >> 
> >> Sigh.  I was busy last week so I left reading this until now in the
> >> hopes I would see something reasonable.
> >> 
> >> What I see is rejecting of everything that is said to you.
> >> 
> >> What I do not see are patches fixing issues.  I will await patches.
> >
> > huh?
> > I can say exactly the same. You keep ignoring numerous points I brought up.
> > You still haven't showed what kind of refactoring you have in mind and
> > why fork_blob is in its way.
> 
> That is correct.  What I wind up doing with exec is irrelevant.
> 
> What is relevant is getting correct working code on the fork_blob path.
> Something that is clean enough that whatever weird things it winds up
> doing are readable.  The way things are intermingled today it took 2
> years for someone to realize there was a basic reference counting bug.

There is no refcnt bug. It was a user error on tomoyo side.
fork_blob() works as expected.

> This isn't work anyone else can do because there are not yet any real in
> tree users of fork_blob.  The fact that no one else can make
> substantials changes to the code because it has no users is what gets in
> the way of maintenance.

Not true either.
bpfilter is a full blown user. bpfilter itself didn't go anywhere,
but that's a different story.

> One of the 2 year old bugs that needs to be fixed is that some LSMs
> work in terms of paths.  Tetsuo has been very gracious in pointing that
> out.  Either a path needs to be provided or the LSMs that work in terms
> of paths need to be fixed.

Not true again.
usermode_blob is part of the kernel module.
Kernel module when loaded doesn't have path.
tomoyo has to fix itself.

> Now I really don't care how the bugs are fixed.
> 
> 
> My recomendation for long term maintenance is to split fork_blob into 2
> functions: fs_from_blob, and the ordinary call_usermodehelper_exec.

what is fs_from_blob() ?
If you mean to create a file system from a blob then it makes no sense.
Please read upthread why. I'm not going to repeat the same points.

> So patches to fix the bugs please.

There are bugs. Ok?
This pointless thread is happening because you want to do some refactoring
of the code and somehow believe that fork_blob is in your way.
If you cannot do refactoring without screaming about removal and misreading
implementation details maybe you shouldn't be doing that refactoring.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-23 19:40                                         ` Alexei Starovoitov
@ 2020-06-24  1:51                                           ` Tetsuo Handa
  2020-06-24  4:00                                             ` Alexei Starovoitov
  2020-06-24 12:13                                           ` Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-24  1:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric W. Biederman
  Cc: Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele

On 2020/06/24 4:40, Alexei Starovoitov wrote:
> There is no refcnt bug. It was a user error on tomoyo side.
> fork_blob() works as expected.

Absolutely wrong! Any check which returns an error during current->in_execve == 1
will cause this refcnt bug. You are simply ignoring that there is possibility
that execve() fails.

> Not true again.
> usermode_blob is part of the kernel module.

Disagree.

> Kernel module when loaded doesn't have path.

Disagree.

Kernel modules can be trusted via module signature mechanism, and the byte array
(which contains code / data) is protected by keeping that byte array within the
kernel address space. Therefore, pathname based security does not need to complain
that there is no pathname when kernel module is loaded.

However, regarding usermode_blob, although the byte array (which contains code / data)
might be initially loaded from the kernel space (which is protected), that byte array
is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
in the user address space. Thus, LSM modules (including pathname based security) want
to control how that byte array can behave.

> tomoyo has to fix itself.

TOMOYO needs to somehow handle /dev/fd/ case from execveat(), but fork_blob() is a
different story.

On 2020/06/24 3:53, Eric W. Biederman wrote:
> This isn't work anyone else can do because there are not yet any real in
> tree users of fork_blob.  The fact that no one else can make
> substantials changes to the code because it has no users is what gets in
> the way of maintenance.

It sounds to me that fork_blob() is a dangerous interface which anonymously
allows arbitrary behavior in an unprotected environment. Therefore,

> Either a path needs to be provided or the LSMs that work in terms
> of paths need to be fixed.

LSM modules want to control how that byte array can behave. But Alexei
still does not explain how information for LSM modules can be provided.

> My recomendation for long term maintenance is to split fork_blob into 2
> functions: fs_from_blob, and the ordinary call_usermodehelper_exec.
> That removes the need for any special support for anything in the exec
> path because your blob will also have a path for your file, and the
> file in the filesystem can be reused for restart.

Yes, that would be an approach for providing information for LSM modules.

> But with no in-tree users none of us can do anything bug guess what
> the actual requirements of fork_usermode_blob are.

Exactly. Since it is not explained why the usermode process started by
fork_usermode_blob() cannot interfere (or be interfered by) the rest of
the system (including normal usermode processes), the byte array comes from
the kernel address space is insufficient for convincing LSM modules to
ignore what that byte array can do.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24  1:51                                           ` Tetsuo Handa
@ 2020-06-24  4:00                                             ` Alexei Starovoitov
  2020-06-24  4:58                                               ` Tetsuo Handa
  2020-06-24  6:05                                               ` Alexei Starovoitov
  0 siblings, 2 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-24  4:00 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Wed, Jun 24, 2020 at 10:51:15AM +0900, Tetsuo Handa wrote:
> On 2020/06/24 4:40, Alexei Starovoitov wrote:
> > There is no refcnt bug. It was a user error on tomoyo side.
> > fork_blob() works as expected.
> 
> Absolutely wrong! Any check which returns an error during current->in_execve == 1
> will cause this refcnt bug. You are simply ignoring that there is possibility
> that execve() fails.

you mean security_bprm_creds_for_exec() denying exec?
hmm. got it. refcnt model needs to change then.

> > Not true again.
> > usermode_blob is part of the kernel module.
> 
> Disagree.

Disagree with what? that blob is part of kernel module? huh?
what is it then?

> 
> > Kernel module when loaded doesn't have path.
> 
> Disagree.
> 
> Kernel modules can be trusted via module signature mechanism, and the byte array
> (which contains code / data) is protected by keeping that byte array within the
> kernel address space. Therefore, pathname based security does not need to complain
> that there is no pathname when kernel module is loaded.

I already explained upthread that blob is part of .rodata or .init.rodata
of kernel module and covered by the same signature mechanism.

> However, regarding usermode_blob, although the byte array (which contains code / data)
> might be initially loaded from the kernel space (which is protected), that byte array
> is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
> in the user address space. Thus, LSM modules (including pathname based security) want
> to control how that byte array can behave.

It's privileged memory regardless. root can poke into kernel or any process memory.

> On 2020/06/24 3:53, Eric W. Biederman wrote:
> > This isn't work anyone else can do because there are not yet any real in
> > tree users of fork_blob.  The fact that no one else can make
> > substantials changes to the code because it has no users is what gets in
> > the way of maintenance.
> 
> It sounds to me that fork_blob() is a dangerous interface which anonymously
> allows arbitrary behavior in an unprotected environment. Therefore,

I think you missed the part that user blob is signed as part of kernel module.

> > Either a path needs to be provided or the LSMs that work in terms
> > of paths need to be fixed.
> 
> LSM modules want to control how that byte array can behave. But Alexei
> still does not explain how information for LSM modules can be provided.

huh?
please see net/bpfilter/.

> 
> > My recomendation for long term maintenance is to split fork_blob into 2
> > functions: fs_from_blob, and the ordinary call_usermodehelper_exec.
> > That removes the need for any special support for anything in the exec
> > path because your blob will also have a path for your file, and the
> > file in the filesystem can be reused for restart.
> 
> Yes, that would be an approach for providing information for LSM modules.
> 
> > But with no in-tree users none of us can do anything bug guess what
> > the actual requirements of fork_usermode_blob are.
> 
> Exactly. Since it is not explained why the usermode process started by
> fork_usermode_blob() cannot interfere (or be interfered by) the rest of
> the system (including normal usermode processes), the byte array comes from
> the kernel address space is insufficient for convincing LSM modules to
> ignore what that byte array can do.

Sounds like tomoyo doesn't trust kernel modules. I don't think that is
fixable with any amount of explanation.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24  4:00                                             ` Alexei Starovoitov
@ 2020-06-24  4:58                                               ` Tetsuo Handa
  2020-06-24  6:39                                                 ` Alexei Starovoitov
  2020-06-24  6:05                                               ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-24  4:58 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On 2020/06/24 13:00, Alexei Starovoitov wrote:
>> However, regarding usermode_blob, although the byte array (which contains code / data)
>> might be initially loaded from the kernel space (which is protected), that byte array
>> is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
>> in the user address space. Thus, LSM modules (including pathname based security) want
>> to control how that byte array can behave.
> 
> It's privileged memory regardless. root can poke into kernel or any process memory.

LSM is there to restrict processes running as "root".
Your "root can poke into kernel or any process memory." response is out of step with the times.

Initial byte array used for usermode blob might be protected because of "part of .rodata or
.init.rodata of kernel module", but that byte array after started in userspace is no longer
protected. I don't trust such byte array as "part of kernel module", and I'm asking you how
such byte array does not interfere (or be interfered by) the rest of the system.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24  4:00                                             ` Alexei Starovoitov
  2020-06-24  4:58                                               ` Tetsuo Handa
@ 2020-06-24  6:05                                               ` Alexei Starovoitov
  2020-06-24 14:18                                                 ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-24  6:05 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Tue, Jun 23, 2020 at 9:00 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Jun 24, 2020 at 10:51:15AM +0900, Tetsuo Handa wrote:
> > On 2020/06/24 4:40, Alexei Starovoitov wrote:
> > > There is no refcnt bug. It was a user error on tomoyo side.
> > > fork_blob() works as expected.
> >
> > Absolutely wrong! Any check which returns an error during current->in_execve == 1
> > will cause this refcnt bug. You are simply ignoring that there is possibility
> > that execve() fails.
>
> you mean security_bprm_creds_for_exec() denying exec?
> hmm. got it. refcnt model needs to change then.

I think the following trivial change should do it:

diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..f80dd2a93ca4 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -512,7 +512,9 @@ int fork_usermode_blob(void *data, size_t len,
struct umh_info *info)
        file = shmem_kernel_file_setup("", len, 0);
        if (IS_ERR(file))
                return PTR_ERR(file);
-
+       err = deny_write_access(file);
+       if (err)
+               goto out_fput;
        written = kernel_write(file, data, len, &pos);
        if (written != len) {
                err = written;
@@ -532,8 +534,11 @@ int fork_usermode_blob(void *data, size_t len,
struct umh_info *info)
                mutex_lock(&umh_list_lock);
                list_add(&info->list, &umh_list);
                mutex_unlock(&umh_list_lock);
+               return 0;
        }
 out:
+       allow_write_access(file);
+out_fput:
        fput(file);
        return err;
 }

I'll do more tests tomorrow and send it with SOB.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24  4:58                                               ` Tetsuo Handa
@ 2020-06-24  6:39                                                 ` Alexei Starovoitov
  2020-06-24  7:05                                                   ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-24  6:39 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Wed, Jun 24, 2020 at 01:58:33PM +0900, Tetsuo Handa wrote:
> On 2020/06/24 13:00, Alexei Starovoitov wrote:
> >> However, regarding usermode_blob, although the byte array (which contains code / data)
> >> might be initially loaded from the kernel space (which is protected), that byte array
> >> is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
> >> in the user address space. Thus, LSM modules (including pathname based security) want
> >> to control how that byte array can behave.
> > 
> > It's privileged memory regardless. root can poke into kernel or any process memory.
> 
> LSM is there to restrict processes running as "root".

hmm. do you really mean that it's possible for an LSM to restrict CAP_SYS_ADMIN effectively?
LSM can certainly provide extra level of foolproof-ness against accidental
mistakes, but it's not a security boundary.

> Your "root can poke into kernel or any process memory." response is out of step with the times.
> 
> Initial byte array used for usermode blob might be protected because of "part of .rodata or
> .init.rodata of kernel module", but that byte array after started in userspace is no longer
> protected. 
>
> I don't trust such byte array as "part of kernel module", and I'm asking you how
> such byte array does not interfere (or be interfered by) the rest of the system.

Could you please explain the attack vector that you see in such scenario?
How elf binaries embedded in the kernel modules different from pid 1?
If anything can peek into their memory the system is compromised.
Say, there are no user blobs in kernel modules. How pid 1 memory is different
from all the JITed images? How is it different for all memory regions shared
between kernel and user processes?
I see an opportunity for an LSM to provide a protection against non-security
bugs when system is running trusted apps, but not when arbitrary code can
execute under root.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24  6:39                                                 ` Alexei Starovoitov
@ 2020-06-24  7:05                                                   ` Tetsuo Handa
  2020-06-24 15:41                                                     ` Casey Schaufler
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-24  7:05 UTC (permalink / raw)
  To: linux-security-module
  Cc: Alexei Starovoitov, Eric W. Biederman, Linus Torvalds, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, Casey Schaufler

Forwarding to LSM-ML again. Any comments?

On 2020/06/24 15:39, Alexei Starovoitov wrote:
> On Wed, Jun 24, 2020 at 01:58:33PM +0900, Tetsuo Handa wrote:
>> On 2020/06/24 13:00, Alexei Starovoitov wrote:
>>>> However, regarding usermode_blob, although the byte array (which contains code / data)
>>>> might be initially loaded from the kernel space (which is protected), that byte array
>>>> is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
>>>> in the user address space. Thus, LSM modules (including pathname based security) want
>>>> to control how that byte array can behave.
>>>
>>> It's privileged memory regardless. root can poke into kernel or any process memory.
>>
>> LSM is there to restrict processes running as "root".
> 
> hmm. do you really mean that it's possible for an LSM to restrict CAP_SYS_ADMIN effectively?
> LSM can certainly provide extra level of foolproof-ness against accidental
> mistakes, but it's not a security boundary.
> 
>> Your "root can poke into kernel or any process memory." response is out of step with the times.
>>
>> Initial byte array used for usermode blob might be protected because of "part of .rodata or
>> .init.rodata of kernel module", but that byte array after started in userspace is no longer
>> protected. 
>>
>> I don't trust such byte array as "part of kernel module", and I'm asking you how
>> such byte array does not interfere (or be interfered by) the rest of the system.
> 
> Could you please explain the attack vector that you see in such scenario?
> How elf binaries embedded in the kernel modules different from pid 1?
> If anything can peek into their memory the system is compromised.
> Say, there are no user blobs in kernel modules. How pid 1 memory is different
> from all the JITed images? How is it different for all memory regions shared
> between kernel and user processes?
> I see an opportunity for an LSM to provide a protection against non-security
> bugs when system is running trusted apps, but not when arbitrary code can
> execute under root.
> 


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-23 19:40                                         ` Alexei Starovoitov
  2020-06-24  1:51                                           ` Tetsuo Handa
@ 2020-06-24 12:13                                           ` Eric W. Biederman
  2020-06-24 14:26                                             ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-24 12:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Tue, Jun 23, 2020 at 01:53:48PM -0500, Eric W. Biederman wrote:

> There is no refcnt bug. It was a user error on tomoyo side.
> fork_blob() works as expected.

Nope.  I have independently confirmed it myself.

fork_usermode_blob holds a reference and puts that reference.  An
additional reference is needed for execve to hold and put.

Now can you write some patches to make that obvious please?

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24  6:05                                               ` Alexei Starovoitov
@ 2020-06-24 14:18                                                 ` Alexei Starovoitov
  0 siblings, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-24 14:18 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Tue, Jun 23, 2020 at 11:05 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jun 23, 2020 at 9:00 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Jun 24, 2020 at 10:51:15AM +0900, Tetsuo Handa wrote:
> > > On 2020/06/24 4:40, Alexei Starovoitov wrote:
> > > > There is no refcnt bug. It was a user error on tomoyo side.
> > > > fork_blob() works as expected.
> > >
> > > Absolutely wrong! Any check which returns an error during current->in_execve == 1
> > > will cause this refcnt bug. You are simply ignoring that there is possibility
> > > that execve() fails.
> >
> > you mean security_bprm_creds_for_exec() denying exec?
> > hmm. got it. refcnt model needs to change then.
>
> I think the following trivial change should do it:
>
> diff --git a/kernel/umh.c b/kernel/umh.c
> index 79f139a7ca03..f80dd2a93ca4 100644
> --- a/kernel/umh.c
> +++ b/kernel/umh.c
> @@ -512,7 +512,9 @@ int fork_usermode_blob(void *data, size_t len,
> struct umh_info *info)
>         file = shmem_kernel_file_setup("", len, 0);
>         if (IS_ERR(file))
>                 return PTR_ERR(file);
> -
> +       err = deny_write_access(file);
> +       if (err)
> +               goto out_fput;
>         written = kernel_write(file, data, len, &pos);
>         if (written != len) {
>                 err = written;
> @@ -532,8 +534,11 @@ int fork_usermode_blob(void *data, size_t len,
> struct umh_info *info)
>                 mutex_lock(&umh_list_lock);
>                 list_add(&info->list, &umh_list);
>                 mutex_unlock(&umh_list_lock);
> +               return 0;
>         }
>  out:
> +       allow_write_access(file);
> +out_fput:
>         fput(file);
>         return err;
>  }
>
> I'll do more tests tomorrow...

yeah. sorry. -enocoffee. It needs more work.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24 12:13                                           ` Eric W. Biederman
@ 2020-06-24 14:26                                             ` Alexei Starovoitov
  2020-06-24 23:14                                               ` Tetsuo Handa
  2020-06-26 11:30                                               ` Eric W. Biederman
  0 siblings, 2 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-24 14:26 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

On Wed, Jun 24, 2020 at 5:17 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>
> > On Tue, Jun 23, 2020 at 01:53:48PM -0500, Eric W. Biederman wrote:
>
> > There is no refcnt bug. It was a user error on tomoyo side.
> > fork_blob() works as expected.
>
> Nope.  I have independently confirmed it myself.

I guess you've tried Tetsuo's fork_blob("#!/bin/true") kernel module ?
yes. that fails. It was never meant to be used for this.
With elf blob it works, but breaks if there are rejections
in things like security_bprm_creds_for_exec().
In my mind that path was 'must succeed or kernel module is toast'.
Like passing NULL into a function that doesn't check for it.
Working on a fix for that since Tetsuo cares.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24  7:05                                                   ` Tetsuo Handa
@ 2020-06-24 15:41                                                     ` Casey Schaufler
  2020-06-24 17:54                                                       ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Casey Schaufler @ 2020-06-24 15:41 UTC (permalink / raw)
  To: Tetsuo Handa, linux-security-module
  Cc: Alexei Starovoitov, Eric W. Biederman, Linus Torvalds, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, Casey Schaufler

On 6/24/2020 12:05 AM, Tetsuo Handa wrote:
> Forwarding to LSM-ML again. Any comments?

Hey, BPF folks - you *really* need to do better about keeping the LSM
community in the loop when you're discussing LSM issues. 

>
> On 2020/06/24 15:39, Alexei Starovoitov wrote:
>> On Wed, Jun 24, 2020 at 01:58:33PM +0900, Tetsuo Handa wrote:
>>> On 2020/06/24 13:00, Alexei Starovoitov wrote:
>>>>> However, regarding usermode_blob, although the byte array (which contains code / data)
>>>>> might be initially loaded from the kernel space (which is protected), that byte array
>>>>> is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
>>>>> in the user address space. Thus, LSM modules (including pathname based security) want
>>>>> to control how that byte array can behave.
>>>> It's privileged memory regardless. root can poke into kernel or any process memory.
>>> LSM is there to restrict processes running as "root".
>> hmm. do you really mean that it's possible for an LSM to restrict CAP_SYS_ADMIN effectively?

I think that SELinux works hard to do just that. SELinux implements its own
privilege model that is tangential to the capabilities model.

More directly, it is simple to create a security module to provide finer privilege
granularity than capabilities. I have one lurking in a source tree, and I would
be surprised if it's the only one waiting for the next round of LSM stacking.

>> LSM can certainly provide extra level of foolproof-ness against accidental
>> mistakes, but it's not a security boundary.

Gasp! Them's fight'n words. How do you justify such an outrageous claim?

>>> Your "root can poke into kernel or any process memory." response is out of step with the times.
>>>
>>> Initial byte array used for usermode blob might be protected because of "part of .rodata or
>>> .init.rodata of kernel module", but that byte array after started in userspace is no longer
>>> protected. 
>>>
>>> I don't trust such byte array as "part of kernel module", and I'm asking you how
>>> such byte array does not interfere (or be interfered by) the rest of the system.
>> Could you please explain the attack vector that you see in such scenario?
>> How elf binaries embedded in the kernel modules different from pid 1?
>> If anything can peek into their memory the system is compromised.
>> Say, there are no user blobs in kernel modules. How pid 1 memory is different
>> from all the JITed images? How is it different for all memory regions shared
>> between kernel and user processes?
>> I see an opportunity for an LSM to provide a protection against non-security
>> bugs when system is running trusted apps, but not when arbitrary code can
>> execute under root.
>>


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24 15:41                                                     ` Casey Schaufler
@ 2020-06-24 17:54                                                       ` Alexei Starovoitov
  2020-06-24 19:48                                                         ` Casey Schaufler
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-24 17:54 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: Tetsuo Handa, linux-security-module, Eric W. Biederman,
	Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele

On Wed, Jun 24, 2020 at 08:41:37AM -0700, Casey Schaufler wrote:
> On 6/24/2020 12:05 AM, Tetsuo Handa wrote:
> > Forwarding to LSM-ML again. Any comments?
> 
> Hey, BPF folks - you *really* need to do better about keeping the LSM
> community in the loop when you're discussing LSM issues. 
> 
> >
> > On 2020/06/24 15:39, Alexei Starovoitov wrote:
> >> On Wed, Jun 24, 2020 at 01:58:33PM +0900, Tetsuo Handa wrote:
> >>> On 2020/06/24 13:00, Alexei Starovoitov wrote:
> >>>>> However, regarding usermode_blob, although the byte array (which contains code / data)
> >>>>> might be initially loaded from the kernel space (which is protected), that byte array
> >>>>> is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
> >>>>> in the user address space. Thus, LSM modules (including pathname based security) want
> >>>>> to control how that byte array can behave.
> >>>> It's privileged memory regardless. root can poke into kernel or any process memory.
> >>> LSM is there to restrict processes running as "root".
> >> hmm. do you really mean that it's possible for an LSM to restrict CAP_SYS_ADMIN effectively?
> 
> I think that SELinux works hard to do just that. SELinux implements it's own
> privilege model that is tangential to the capabilities model.

of course. no argument here.

> More directly, it is simple to create a security module to provide finer privilege
> granularity than capabilities. I have one lurking in a source tree, and I would
> be surprised if it's the only one waiting for the next round of LSM stacking.

no one is arguing with that either.

> 
> >> LSM can certainly provide extra level of foolproof-ness against accidental
> >> mistakes, but it's not a security boundary.
> 
> Gasp! Them's fight'n words. How do you justify such an outrageous claim?

.. for root user processes.
What's outrageous about that?
Did you capture the context, or are you just replying to a few sentences out of context?

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24 17:54                                                       ` Alexei Starovoitov
@ 2020-06-24 19:48                                                         ` Casey Schaufler
  0 siblings, 0 replies; 194+ messages in thread
From: Casey Schaufler @ 2020-06-24 19:48 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, linux-security-module, Eric W. Biederman,
	Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele

On 6/24/2020 10:54 AM, Alexei Starovoitov wrote:
> On Wed, Jun 24, 2020 at 08:41:37AM -0700, Casey Schaufler wrote:
>> On 6/24/2020 12:05 AM, Tetsuo Handa wrote:
>>> Forwarding to LSM-ML again. Any comments?
>> Hey, BPF folks - you *really* need to do better about keeping the LSM
>> community in the loop when you're discussing LSM issues. 
>>
>>> On 2020/06/24 15:39, Alexei Starovoitov wrote:
>>>> On Wed, Jun 24, 2020 at 01:58:33PM +0900, Tetsuo Handa wrote:
>>>>> On 2020/06/24 13:00, Alexei Starovoitov wrote:
>>>>>>> However, regarding usermode_blob, although the byte array (which contains code / data)
>>>>>>> might be initially loaded from the kernel space (which is protected), that byte array
>>>>>>> is no longer protected (e.g. SIGKILL, strace()) when executed because they are placed
>>>>>>> in the user address space. Thus, LSM modules (including pathname based security) want
>>>>>>> to control how that byte array can behave.
>>>>>> It's privileged memory regardless. root can poke into kernel or any process memory.
>>>>> LSM is there to restrict processes running as "root".
>>>> hmm. do you really mean that it's possible for an LSM to restrict CAP_SYS_ADMIN effectively?
>> I think that SELinux works hard to do just that. SELinux implements it's own
>> privilege model that is tangential to the capabilities model.
> of course. no argument here.
>
>> More directly, it is simple to create a security module to provide finer privilege
>> granularity than capabilities. I have one lurking in a source tree, and I would
>> be surprised if it's the only one waiting for the next round of LSM stacking.
> no one is arguing with that either.
>
>>>> LSM can certainly provide extra level of foolproof-ness against accidental
>>>> mistakes, but it's not a security boundary.
>> Gasp! Them's fight'n words. How do you justify such an outrageous claim?
> .. for root user processes.
> What's outrageous about that?
> Did you capture the context or just replying to few sentences out of the context?

As I mentioned above, you need to include the LSM list in these discussions
if you don't want "out of context" comments. I replied to what's presented.
And regardless of the context, saying that an LSM can't provide a security
boundary for "root user processes" is just wrong. Obviously there's been more
to the conversation than is included here.



^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24 14:26                                             ` Alexei Starovoitov
@ 2020-06-24 23:14                                               ` Tetsuo Handa
  2020-06-25  1:35                                                 ` Alexei Starovoitov
  2020-06-25 12:56                                                 ` [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained Stephen Smalley
  2020-06-26 11:30                                               ` Eric W. Biederman
  1 sibling, 2 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-24 23:14 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric W. Biederman
  Cc: Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	linux-security-module, Casey Schaufler

On 2020/06/24 23:26, Alexei Starovoitov wrote:
> On Wed, Jun 24, 2020 at 5:17 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>>
>>> On Tue, Jun 23, 2020 at 01:53:48PM -0500, Eric W. Biederman wrote:
>>
>>> There is no refcnt bug. It was a user error on tomoyo side.
>>> fork_blob() works as expected.
>>
>> Nope.  I have independently confirmed it myself.
> 
> I guess you've tried Tetsuo's fork_blob("#!/bin/true") kernel module ?
> yes. that fails. It never meant to be used for this.
> With elf blob it works, but breaks if there are rejections
> in things like security_bprm_creds_for_exec().
> In my mind that path was 'must succeed or kernel module is toast'.
> Like passing NULL into a function that doesn't check for it.
> Working on a fix for that since Tetsuo cares.
> 

What is unfortunate for pathname-based LSMs is that fork_usermode_blob() creates
a file with an empty filename. I can imagine that somebody would start abusing
fork_usermode_blob() as an interface for starting programs like modprobe, hotplug,
udevd and sshd. If such a situation arises, how does fork_usermode_blob() provide
information for identifying the intent of such execve() requests?

fork_usermode_blob() might also be problematic for inode-based LSMs (like
SELinux and Smack) because it seems that fork_usermode_blob() has no chance
to associate appropriate security labels based on the content of the byte array,
since the files are created on demand. Is fork_usermode_blob() friendly to inode-based
LSMs?

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24 23:14                                               ` Tetsuo Handa
@ 2020-06-25  1:35                                                 ` Alexei Starovoitov
  2020-06-25  6:38                                                   ` Tetsuo Handa
  2020-06-25 12:56                                                 ` [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained Stephen Smalley
  1 sibling, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-25  1:35 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, linux-security-module, Casey Schaufler

On Thu, Jun 25, 2020 at 08:14:20AM +0900, Tetsuo Handa wrote:
> On 2020/06/24 23:26, Alexei Starovoitov wrote:
> > On Wed, Jun 24, 2020 at 5:17 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >>
> >>> On Tue, Jun 23, 2020 at 01:53:48PM -0500, Eric W. Biederman wrote:
> >>
> >>> There is no refcnt bug. It was a user error on tomoyo side.
> >>> fork_blob() works as expected.
> >>
> >> Nope.  I have independently confirmed it myself.
> > 
> > I guess you've tried Tetsuo's fork_blob("#!/bin/true") kernel module ?
> > yes. that fails. It never meant to be used for this.
> > With elf blob it works, but breaks if there are rejections
> > in things like security_bprm_creds_for_exec().
> > In my mind that path was 'must succeed or kernel module is toast'.
> > Like passing NULL into a function that doesn't check for it.
> > Working on a fix for that since Tetsuo cares.
> > 
> 
> What is unhappy for pathname based LSMs is that fork_usermode_blob() creates
> a file with empty filename. I can imagine that somebody would start abusing
> fork_usermode_blob() as an interface for starting programs like modprobe, hotplug,
> udevd and sshd. When such situation happened, how fork_usermode_blob() provides
> information for identifying the intent of such execve() requests?
> 
> fork_usermode_blob() might also be an unhappy behavior for inode based LSMs (like
> SELinux and Smack) because it seems that fork_usermode_blob() can't have a chance
> to associate appropriate security labels based on the content of the byte array
> because files are created on-demand. Is fork_usermode_blob() friendly to inode
> based LSMs?

blob is started by a kernel module. Regardless of path existence that kernel module
could have disabled any LSM and any kernel security mechanism.
People who write out of tree kernel modules found ways to bypass EXPORT_SYMBOL
with and without _GPL. Modules can do anything. It's only the number of hoops
they need to jump through to get what they want. 
Signed and in-tree kernel module is the only way to protect the integrity of the system.
That's why user blob is part of kernel module elf object and it's covered by the same
module signature verification logic.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25  1:35                                                 ` Alexei Starovoitov
@ 2020-06-25  6:38                                                   ` Tetsuo Handa
  2020-06-25  9:57                                                     ` Greg KH
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-25  6:38 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, linux-security-module, Casey Schaufler

On 2020/06/25 10:35, Alexei Starovoitov wrote:
>> What is unhappy for pathname based LSMs is that fork_usermode_blob() creates
>> a file with empty filename. I can imagine that somebody would start abusing
>> fork_usermode_blob() as an interface for starting programs like modprobe, hotplug,
>> udevd and sshd. When such situation happened, how fork_usermode_blob() provides
>> information for identifying the intent of such execve() requests?
>>
>> fork_usermode_blob() might also be an unhappy behavior for inode based LSMs (like
>> SELinux and Smack) because it seems that fork_usermode_blob() can't have a chance
>> to associate appropriate security labels based on the content of the byte array
>> because files are created on-demand. Is fork_usermode_blob() friendly to inode
>> based LSMs?
> 
> blob is started by a kernel module. Regardless of path existence that kernel module
> could have disabled any LSM and any kernel security mechanism.
> People who write out of tree kernel modules found ways to bypass EXPORT_SYMBOL
> with and without _GPL. Modules can do anything. It's only the number of hoops
> they need to jump through to get what they want.

So what? I know that. That's irrelevant to my questions.

> Signed and in-tree kernel module is the only way to protect the integrity of the system.
> That's why user blob is part of kernel module elf object and it's covered by the same
> module signature verification logic.

My questions are:

(1) "Signed and in-tree kernel module" assertion is pointless.
    In future, some of in-tree kernel modules might start using fork_usermode_blob()
    instead of call_usermodehelper(), with instructions containing what your initial
    use case does not use. There is no guarantee that such thing can't happen.
    Assuming that there will be multiple blobs, we need a way to identify these blobs.
    How does fork_usermode_blob() provide information for identification?

(2) Running some blob in usermode means that permission checks by LSM modules will
    be enforced. For example, socket's shutdown operation via shutdown() syscall from
    usermode blob will involve security_socket_shutdown() check.
    
    ----------
    int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
    {
            return sock->ops->shutdown(sock, how);
    }
    ----------
    
    ----------
    int __sys_shutdown(int fd, int how)
    {
            int err, fput_needed;
            struct socket *sock;
    
            sock = sockfd_lookup_light(fd, &err, &fput_needed);
            if (sock != NULL) {
                    err = security_socket_shutdown(sock, how);
                    if (!err)
                            err = sock->ops->shutdown(sock, how);
                    fput_light(sock->file, fput_needed);
            }
            return err;
    }
    
    SYSCALL_DEFINE2(shutdown, int, fd, int, how)
    {
            return __sys_shutdown(fd, how);
    }
    ----------
    
    I don't know what instructions your blob would contain. But even if the blobs
    containing your initial use case use only setsockopt()/getsockopt() syscalls,
    LSM modules have rights to inspect and reject these requests from usermode blobs
    via security_socket_setsockopt()/security_socket_getsockopt() hooks. In order to
    inspect these requests, LSM modules need information (so called "security context"),
    and fork_usermode_blob() has to be able to somehow teach that information to LSM
    modules. Pathname is one of information for pathname based LSM modules. Inode's
    security label is one of information for inode based LSM modules.
    
    call_usermodehelper() can teach LSM modules via pre-existing file's pathname and
    inode's security label at security_bprm_creds_for_exec()/security_bprm_check() etc.
    But since fork_usermode_blob() accepts only "byte array" and "length of byte array"
    arguments, I'm not sure whether LSM modules can obtain information needed for
    inspection. How does fork_usermode_blob() tell that information?

(3) Again, "root can poke into kernel or any process memory." assertion is pointless.
    Answering to your questions

      > hmm. do you really mean that it's possible for an LSM to restrict CAP_SYS_ADMIN effectively?
      Not every LSM module restricts CAP_* flags. But LSM modules can implement finer grained
      restrictions than plain CAP_* flags.

      > How elf binaries embedded in the kernel modules different from pid 1?
      No difference.

      > If anything can peek into their memory the system is compromised.
      Permission checks via LSM modules are there to prevent such behavior.

      > Say, there are no user blobs in kernel modules. How pid 1 memory is different
      > from all the JITed images? How is it different for all memory regions shared
      > between kernel and user processes?
      I don't know what "the JITed images" means. But I guess that the answer is
      "No difference".

    Then, I ask you back.

    Although the byte array (which contains code / data) might be initially loaded from
    the kernel space (which is protected), that byte array is no longer protected (e.g.
    SIGKILL, ptrace()) when executed because they are placed in the user address space.

    Why can't the usermode process started by fork_usermode_blob() interfere with (or be
    interfered with by) the rest of the system (including normal usermode processes)?
    And I guess that your answer is "the usermode process started by fork_usermode_blob()
    _can_ interfere with (and be interfered with by) the rest of the system, for they are
    nothing but normal usermode processes."

    Thus, LSM modules (including pathname based security) want to control how that byte
    array can behave. And how does fork_usermode_blob() tell necessary information?

Your answers up to now did not convince LSM modules to ignore what the usermode process
started by fork_usermode_blob() can do. If you again don't answer my questions, I'll
ack to https://lkml.kernel.org/r/875zc4c86z.fsf_-_@x220.int.ebiederm.org .


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25  6:38                                                   ` Tetsuo Handa
@ 2020-06-25  9:57                                                     ` Greg KH
  2020-06-25 11:03                                                       ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Greg KH @ 2020-06-25  9:57 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Alexei Starovoitov, Eric W. Biederman, Linus Torvalds, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, linux-security-module,
	Casey Schaufler

On Thu, Jun 25, 2020 at 03:38:14PM +0900, Tetsuo Handa wrote:
> On 2020/06/25 10:35, Alexei Starovoitov wrote:
> >> What is unhappy for pathname based LSMs is that fork_usermode_blob() creates
> >> a file with empty filename. I can imagine that somebody would start abusing
> >> fork_usermode_blob() as an interface for starting programs like modprobe, hotplug,
> >> udevd and sshd. When such situation happened, how fork_usermode_blob() provides
> >> information for identifying the intent of such execve() requests?
> >>
> >> fork_usermode_blob() might also be an unhappy behavior for inode based LSMs (like
> >> SELinux and Smack) because it seems that fork_usermode_blob() can't have a chance
> >> to associate appropriate security labels based on the content of the byte array
> >> because files are created on-demand. Is fork_usermode_blob() friendly to inode
> >> based LSMs?
> > 
> > blob is started by a kernel module. Regardless of path existence that kernel module
> > could have disabled any LSM and any kernel security mechanism.
> > People who write out of tree kernel modules found ways to bypass EXPORT_SYMBOL
> > with and without _GPL. Modules can do anything. It's only the number of hoops
> > they need to jump through to get what they want.
> 
> So what? I know that. That's irrelevant to my questions.
> 
> > Signed and in-tree kernel module is the only way to protect the integrity of the system.
> > That's why user blob is part of kernel module elf object and it's covered by the same
> > module signature verification logic.
> 
> My questions are:
> 
> (1) "Signed and in-tree kernel module" assertion is pointless.
>     In future, some of in-tree kernel modules might start using fork_usermode_blob()
>     instead of call_usermodehelper(), with instructions containing what your initial
>     use case does not use. There is no guarantee that such thing can't happen.

I hope that this would happen for some tools, what's wrong with that?
That means we can ship those programs from within the kernel source tree
instead of trying to rely on keeping a specific user/kernel api stable
for forever.

That would be a good thing, right?

>     Assuming that there will be multiple blobs, we need a way to identify these blobs.
>     How does fork_usermode_blob() provide information for identification?

If the kernel itself was running these blobs, why would LSM care about
it?  These are coming from "within the building!" don't you trust the
kernel already?

I don't understand the issue here.


> (2) Running some blob in usermode means that permission checks by LSM modules will
>     be enforced. For example, socket's shutdown operation via shutdown() syscall from
>     usermode blob will involve security_socket_shutdown() check.
>     
>     ----------
>     int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
>     {
>             return sock->ops->shutdown(sock, how);
>     }
>     ----------
>     
>     ----------
>     int __sys_shutdown(int fd, int how)
>     {
>             int err, fput_needed;
>             struct socket *sock;
>     
>             sock = sockfd_lookup_light(fd, &err, &fput_needed);
>             if (sock != NULL) {
>                     err = security_socket_shutdown(sock, how);
>                     if (!err)
>                             err = sock->ops->shutdown(sock, how);
>                     fput_light(sock->file, fput_needed);
>             }
>             return err;
>     }
>     
>     SYSCALL_DEFINE2(shutdown, int, fd, int, how)
>     {
>             return __sys_shutdown(fd, how);
>     }
>     ----------
>     
>     I don't know what instructions your blob would contain. But even if the blobs
>     containing your initial use case use only setsockopt()/getsockopt() syscalls,
>     LSM modules have rights to inspect and reject these requests from usermode blobs
>     via security_socket_setsockopt()/security_socket_getsockopt() hooks. In order to
>     inspect these requests, LSM modules need information (so called "security context"),
>     and fork_usermode_blob() has to be able to somehow teach that information to LSM
>     modules. Pathname is one of information for pathname based LSM modules. Inode's
>     security label is one of information for inode based LSM modules.
>     
>     call_usermodehelper() can teach LSM modules via pre-existing file's pathname and
>     inode's security label at security_bprm_creds_for_exec()/security_bprm_check() etc.
>     But since fork_usermode_blob() accepts only "byte array" and "length of byte array"
>     arguments, I'm not sure whether LSM modules can obtain information needed for
>     inspection. How does fork_usermode_blob() tell that information?

It would seem that the "security context" for those would be the same as
anything created before userspace launches today, right?  You handle
that ok, and this should be just the same.

But again, as these programs are coming from "within the kernel", why
would you want to disallow them?  If you don't want to allow them, don't
build them into your kernel?  :)

> (3) Again, "root can poke into kernel or any process memory." assertion is pointless.
>     Answering to your questions
> 
>       > hmm. do you really mean that it's possible for an LSM to restrict CAP_SYS_ADMIN effectively?
>       Not every LSM module restricts CAP_* flags. But LSM modules can implement finer grained
>       restrictions than plain CAP_* flags.
> 
>       > How elf binaries embedded in the kernel modules different from pid 1?
>       No difference.
> 
>       > If anything can peek into their memory the system is compromised.
>       Permission checks via LSM modules are there to prevent such behavior.
> 
>       > Say, there are no user blobs in kernel modules. How pid 1 memory is different
>       > from all the JITed images? How is it different for all memory regions shared
>       > between kernel and user processes?
>       I don't know what "the JITed images" means. But I guess that the answer is
>       "No difference".
> 
>     Then, I ask you back.
> 
>     Although the byte array (which contains code / data) might be initially loaded from
>     the kernel space (which is protected), that byte array is no longer protected (e.g.
>     SIGKILL, ptrace()) when executed because they are placed in the user address space.
>
>     Why the usermode process started by fork_usermode_blob() cannot interfere (or be
>     interfered by) the rest of the system (including normal usermode processes) ?
>     And I guess that your answer is "the usermode process started by fork_usermode_blob()
>     _can_ (and be interfered by) the rest of the system, for they are nothing but
>     normal usermode processes."
> 
>     Thus, LSM modules (including pathname based security) want to control how that byte
>     array can behave. And how does fork_usermode_blob() tell necessary information?

Think of these blobs just as any other kernel module would be today.
Right now I, as a kernel module, can read/write to any file in the
system, and do all sorts of other fun things.  You can't mediate that
today from a LSM, and this is just one other example of this.

The "only" change is that now this code is running in userspace context,
which for an overall security/system issue, should be better than
running it in kernel context, right?

Perhaps we just add new LSM hooks for every place that we call this new
function to run a blob?  That will give you the needed "the kernel is
about to run a blob that we think is a userspace USB IR filter driver",
or whatever the blob does.

Would that help out?

But, given that we don't even have any in-kernel users, all of this
feels like a lot of arguing over something that no one can currently
even have happen...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25  9:57                                                     ` Greg KH
@ 2020-06-25 11:03                                                       ` Tetsuo Handa
  2020-06-25 12:07                                                         ` Greg KH
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-25 11:03 UTC (permalink / raw)
  To: Greg KH
  Cc: Alexei Starovoitov, Eric W. Biederman, Linus Torvalds, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, linux-security-module,
	Casey Schaufler

On 2020/06/25 18:57, Greg KH wrote:
> On Thu, Jun 25, 2020 at 03:38:14PM +0900, Tetsuo Handa wrote:
>> My questions are:
>>
>> (1) "Signed and in-tree kernel module" assertion is pointless.
>>     In future, some of in-tree kernel modules might start using fork_usermode_blob()
>>     instead of call_usermodehelper(), with instructions containing what your initial
>>     use case does not use. There is no guarantee that such thing can't happen.
> 
> I hope that this would happen for some tools, what's wrong with that?
> That means we can ship those programs from within the kernel source tree
> instead of trying to rely on keeping a specific user/kernel api stable
> for forever.
> 
> That would be a good thing, right?

Some in-tree users might start embedding byte arrays containing userspace programs
like /bin/sh when building kernels. How can we prove that such a thing won't happen?
I consider that the byte array can contain arbitrary instructions (regardless of
some tools used for building the byte array).

> 
>>     Assuming that there will be multiple blobs, we need a way to identify these blobs.
>>     How does fork_usermode_blob() provide information for identification?
> 
> If the kernel itself was running these blobs, why would LSM care about
> it?  These are coming from "within the building!" don't you trust the
> kernel already?
> 
> I don't understand the issue here.

The byte array came from the kernel, but due to the possibility that "root can poke into
kernel or any process memory.", that byte array can become as untrusted as a byte
array coming from userspace. There is no concept like "the kernel itself _is_ running
these blobs". Only the fact that "the byte array _was_ copied from the kernel address space
(rather than from some file on the filesystem)" exists. We need a mechanism (ideally,
without counting on LSMs) to prevent peeking/poking etc. into/from the byte array
which was copied from the kernel address space to the user address space.



>>     call_usermodehelper() can teach LSM modules via pre-existing file's pathname and
>>     inode's security label at security_bprm_creds_for_exec()/security_bprm_check() etc.
>>     But since fork_usermode_blob() accepts only "byte array" and "length of byte array"
>>     arguments, I'm not sure whether LSM modules can obtain information needed for
>>     inspection. How does fork_usermode_blob() tell that information?
> 
> It would seem that the "security context" for those would be the same as
> anything created before userspace launches today, right?  You handle
> that ok, and this should be just the same.

I don't think so. Today when call_usermodehelper() is called, LSMs switch their security
context (at least TOMOYO does it) for further syscalls from the usermode process started
by the kernel context. But when fork_usermode_blob() is called, how can LSMs switch their
security context for further syscalls from the usermode process started by the kernel
context?

> 
> But again, as these programs are coming from "within the kernel", why
> would you want to disallow them?  If you don't want to allow them, don't
> build them into your kernel?  :)

I'm talking about not only "disallow unauthorized execve() request" but also "disallow
unauthorized syscalls after execve() request". Coming from the kernel is not important.



>>     Thus, LSM modules (including pathname based security) want to control how that byte
>>     array can behave. And how does fork_usermode_blob() tell necessary information?
> 
> Think of these blobs just as any other kernel module would be today.

No, I can't. How can we guarantee that the byte array which came from the kernel remains
intact despite the possibility of "root can poke into kernel or any process memory" ?

> Right now I, as a kernel module, can read/write to any file in the
> system, and do all sorts of other fun things.  You can't mediate that
> today from a LSM, and this is just one other example of this.

The fact that some functions (e.g. kernel_sock_shutdown()) bypass permission checks by LSMs
comes from a sort of trustness that the byte array kept inside the kernel address
space remains secure/intact.

> 
> The "only" change is that now this code is running in userspace context,
> which for an overall security/system issue, should be better than
> running it in kernel context, right?

As soon as that byte array is exposed outside of the kernel address space, processes
running such a byte array are considered insecure/tampered. We can't prove that
the byte array exposed outside of the kernel address space executes only a limited
set of instructions, and we have to perform permission checks by LSMs.

And LSMs need to receive the intent (or "security context" argument) from fork_usermode_blob()
for restricting further syscalls by the usermode process started via fork_usermode_blob().

> 
> Perhaps we just add new LSM hooks for every place that we call this new
> function to run a blob?  That will give you the needed "the kernel is
> about to run a blob that we think is a userspace USB IR filter driver",
> or whatever the blob does.

Yes, that would be the intent (or "security context" argument) that fork_usermode_blob()
is missing. Though I don't know how such a string-ish argument can be represented for
individual LSM modules...


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 11:03                                                       ` Tetsuo Handa
@ 2020-06-25 12:07                                                         ` Greg KH
  2020-06-25 14:21                                                           ` Tetsuo Handa
  2020-06-25 19:34                                                           ` David Miller
  0 siblings, 2 replies; 194+ messages in thread
From: Greg KH @ 2020-06-25 12:07 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Alexei Starovoitov, Eric W. Biederman, Linus Torvalds, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, linux-security-module,
	Casey Schaufler

On Thu, Jun 25, 2020 at 08:03:26PM +0900, Tetsuo Handa wrote:
> On 2020/06/25 18:57, Greg KH wrote:
> > On Thu, Jun 25, 2020 at 03:38:14PM +0900, Tetsuo Handa wrote:
> >> My questions are:
> >>
> >> (1) "Signed and in-tree kernel module" assertion is pointless.
> >>     In future, some of in-tree kernel modules might start using fork_usermode_blob()
> >>     instead of call_usermodehelper(), with instructions containing what your initial
> >>     use case does not use. There is no guarantee that such thing can't happen.
> > 
> > I hope that this would happen for some tools, what's wrong with that?
> > That means we can ship those programs from within the kernel source tree
> > instead of trying to rely on keeping a specific user/kernel api stable
> > for forever.
> > 
> > That would be a good thing, right?
> 
> Some in-tree users might start embedding byte array containing userspace programs
> like /bin/sh when building kernels. How can we prove that such thing won't happen?

We have the code, we can read it, we can say, "hey, looks like you are
including bash, do you want to do that?"  :)

> I consider that the byte array can contain arbitrary instructions (regardless of
> some tools used for building the byte array).

Sure, and is this a problem?

> >>     Assuming that there will be multiple blobs, we need a way to identify these blobs.
> >>     How does fork_usermode_blob() provide information for identification?
> > 
> > If the kernel itself was running these blobs, why would LSM care about
> > it?  These are coming from "within the building!" don't you trust the
> > kernel already?
> > 
> > I don't understand the issue here.
> 
> The byte array came from the kernel, but due to possibility of "root can poke into
> kernel or any process memory.", that byte array can become as untrusted as byte
> array coming from userspace. There is no concept like "the kernel itself _is_ running
> these blobs". Only a fact "the byte array _was_ copied from the kernel address space
> (rather than from some file on the filesystem)" exists. We need a mechanism (ideally,
> without counting on LSMs) for avoid peeking/poking etc. into/from the byte array
> which was copied from the kernel address space to user address space.

And what are you going to do with that if you can "look at the array"?
I really don't understand the objection here, why is this any different
than any other random kernel driver for what it can do?

> >>     call_usermodehelper() can teach LSM modules via pre-existing file's pathname and
> >>     inode's security label at security_bprm_creds_for_exec()/security_bprm_check() etc.
> >>     But since fork_usermode_blob() accepts only "byte array" and "length of byte array"
> >>     arguments, I'm not sure whether LSM modules can obtain information needed for
> >>     inspection. How does fork_usermode_blob() tell that information?
> > 
> > It would seem that the "security context" for those would be the same as
> > anything created before userspace launches today, right?  You handle
> > that ok, and this should be just the same.
> 
> I don't think so. Today when call_usermodehelper() is called, LSMs switch their security
> context (at least TOMOYO does it) for further syscalls from the usermode process started
> by the kernel context. But when fork_usermode_blob() is called, how LSMs can switch their
> security context for further syscalls from the usermode process started by the kernel
> context?

Ok, that makes a bit more sense.  Why not just do the same thing that
you do today with call_usermodehelper()?  The logic in a way is the
same, right?

> > But again, as these programs are coming from "within the kernel", why
> > would you want to disallow them?  If you don't want to allow them, don't
> > build them into your kernel?  :)
> 
> I'm talking about not only "disallow unauthorized execve() request" but also "disallow
> unauthorized syscalls after execve() request". Coming from the kernel is not important.

Ok, then do the same thing that you do for call_usermodehelper() to
prevent this.

> >>     Thus, LSM modules (including pathname based security) want to control how that byte
> >>     array can behave. And how does fork_usermode_blob() tell necessary information?
> > 
> > Think of these blobs just as any other kernel module would be today.
> 
> No, I can't. How can we guarantee that the byte array came from kernel remains intact
> despite the possibility of "root can poke into kernel or any process memory" ?

You guarantee it the same way you guarantee that the wifi driver really
is running the code you think it is running.  There is no difference
here.

> > Right now I, as a kernel module, can read/write to any file in the
> > system, and do all sorts of other fun things.  You can't mediate that
> > today from a LSM, and this is just one other example of this.
> 
> Some functions (e.g. kernel_sock_shutdown()) bypass permission checks by LSMs
> comes from a sort of trustness that the byte array kept inside kernel address
> space remains secure/intact.

And what is going to change that "trustness" here?  The byte array came
from the kernel address space to start with.  Are you thinking something
outside of the kernel will then tamper with those bytes to do something
else with them?  If so, shouldn't you be preventing that userspace
program that does the tampering from doing that in the first place with
the LSM running?

> > The "only" change is that now this code is running in userspace context,
> > which for an overall security/system issue, should be better than
> > running it in kernel context, right?
> 
> As soon as exposing that byte array outside of kernel address space, processes
> running such byte array are considered insecure/tampered.

Why?  Do you mean that you do not trust any program once it has been
started either?  If you can, why not do the same thing here?

> We can't prove that
> the byte array exposed to outside of kernel address space does only limited
> set of instructions, and we have to perform permission checks by LSMs.

Those checks should come through the same way you check any other
userspace program through an LSM.  Fix up the context like mentioned
above with call_usermodehelper() and you should be fine, right?

> And LSMs need to receive the intent (or "security context" argument) from fork_usermode_blob()
> for restricting further syscalls by the usermode process started via fork_usermode_blob().
> 
> > 
> > Perhaps we just add new LSM hooks for every place that we call this new
> > function to run a blob?  That will give you the needed "the kernel is
> > about to run a blob that we think is a userspace USB IR filter driver",
> > or whatever the blob does.
> 
> Yes, that would be the intent (or "security context" argument) fork_usermode_blob()
> is missing. Though I don't know how such stringuish argument can be represented for
> individual LSM modules...

The same way we do it today for any LSM callback?  i.e. by a new
function call :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24 23:14                                               ` Tetsuo Handa
  2020-06-25  1:35                                                 ` Alexei Starovoitov
@ 2020-06-25 12:56                                                 ` Stephen Smalley
  2020-06-25 13:25                                                   ` Greg Kroah-Hartman
  1 sibling, 1 reply; 194+ messages in thread
From: Stephen Smalley @ 2020-06-25 12:56 UTC (permalink / raw)
  To: Tetsuo Handa, Greg Kroah-Hartman
  Cc: Alexei Starovoitov, Eric W. Biederman, Linus Torvalds, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, linux-security-module,
	Casey Schaufler

On Wed, Jun 24, 2020 at 7:16 PM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
> What is unhappy for pathname based LSMs is that fork_usermode_blob() creates
> a file with empty filename. I can imagine that somebody would start abusing
> fork_usermode_blob() as an interface for starting programs like modprobe, hotplug,
> udevd and sshd. When such situation happened, how fork_usermode_blob() provides
> information for identifying the intent of such execve() requests?
>
> fork_usermode_blob() might also be an unhappy behavior for inode based LSMs (like
> SELinux and Smack) because it seems that fork_usermode_blob() can't have a chance
> to associate appropriate security labels based on the content of the byte array
> because files are created on-demand. Is fork_usermode_blob() friendly to inode
> based LSMs?

No, because we cannot label the inode based on the program's purpose
and therefore cannot configure an automatic transition to a suitable
security context for the process, unlike call_usermodehelper(). It is
important to note that the goal of such transitions is not merely to
restrict the program from doing bad things but also to protect the
program from untrustworthy inputs, e.g. one can run kmod/modprobe in a
domain that can only read from authorized kernel modules, prevent
following untrusted symlinks, etc.  Further, at present, the
implementation creates the inode via shmem_kernel_file_setup(), which
is supposed to be for inodes private to the kernel not exposed to
userspace (hence marked S_PRIVATE), which I believe in this case will
end up leaving the inode unlabeled but still end up firing checks in
the bprm hooks on the file inode, thereby potentially yielding denials
in SELinux on the exec of unlabeled files.  Not exactly what we would
want.  If users were to switch from using call_usermodehelper() to
fork_usermode_blob() we would need them to label the inode in some
manner to reflect the program purpose prior to exec.  I suppose they
could pass in some string key and SELinux could look it up in policy
to get a context to use or something.

On a different note, will the usermode blob be measured by IMA prior
to execution?  What ensures that the blob was actually embedded in the
kernel image and wasn't just supplied as data through exploitation of
a kernel vulnerability or malicious kernel module?  Yes, things are
already bad at that point but it would be good to be able to detect
launch of the malicious userspace payload regardless (kernel exploit
can't undo the measurement extended into the TPM even if it tampers
with the IMA measurement list in the kernel, nor fake a quote signed
by the TPM).

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 12:56                                                 ` [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained Stephen Smalley
@ 2020-06-25 13:25                                                   ` Greg Kroah-Hartman
  2020-06-25 14:26                                                     ` Stephen Smalley
  0 siblings, 1 reply; 194+ messages in thread
From: Greg Kroah-Hartman @ 2020-06-25 13:25 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: Tetsuo Handa, Alexei Starovoitov, Eric W. Biederman,
	Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	linux-security-module, Casey Schaufler

On Thu, Jun 25, 2020 at 08:56:10AM -0400, Stephen Smalley wrote:
> On Wed, Jun 24, 2020 at 7:16 PM Tetsuo Handa
> <penguin-kernel@i-love.sakura.ne.jp> wrote:
> > What is unhappy for pathname based LSMs is that fork_usermode_blob() creates
> > a file with empty filename. I can imagine that somebody would start abusing
> > fork_usermode_blob() as an interface for starting programs like modprobe, hotplug,
> > udevd and sshd. When such situation happened, how fork_usermode_blob() provides
> > information for identifying the intent of such execve() requests?
> >
> > fork_usermode_blob() might also be an unhappy behavior for inode based LSMs (like
> > SELinux and Smack) because it seems that fork_usermode_blob() can't have a chance
> > to associate appropriate security labels based on the content of the byte array
> > because files are created on-demand. Is fork_usermode_blob() friendly to inode
> > based LSMs?
> 
> No, because we cannot label the inode based on the program's purpose
> and therefore cannot configure an automatic transition to a suitable
> security context for the process, unlike call_usermodehelper().

Why, what prevents this?  Can you not just do that based on the "blob
address" or signature of it or something like that?  Right now you all
do this based on the inode of a random file on a disk; what's the difference
between that and a random blob in memory?

> It is
> important to note that the goal of such transitions is not merely to
> restrict the program from doing bad things but also to protect the
> program from untrustworthy inputs, e.g. one can run kmod/modprobe in a
> domain that can only read from authorized kernel modules, prevent
> following untrusted symlinks, etc.  Further, at present, the
> implementation creates the inode via shmem_kernel_file_setup(), which
> is supposed to be for inodes private to the kernel not exposed to
> userspace (hence marked S_PRIVATE), which I believe in this case will
> end up leaving the inode unlabeled but still end up firing checks in
> the bprm hooks on the file inode, thereby potentially yielding denials
> in SELinux on the exec of unlabeled files.  Not exactly what we would
> want.  If users were to switch from using call_usermodehelper() to
> fork_usermode_blob() we would need them to label the inode in some
> manner to reflect the program purpose prior to exec.  I suppose they
> could pass in some string key and SELinux could look it up in policy
> to get a context to use or something.

Sure, that would work.

> On a different note, will the usermode blob be measured by IMA prior
> to execution?  What ensures that the blob was actually embedded in the
> kernel image and wasn't just supplied as data through exploitation of
> a kernel vulnerability or malicious kernel module?

No reason it couldn't be passed to IMA for measuring, if people want to
do that.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 12:07                                                         ` Greg KH
@ 2020-06-25 14:21                                                           ` Tetsuo Handa
  2020-06-25 19:34                                                           ` David Miller
  1 sibling, 0 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-25 14:21 UTC (permalink / raw)
  To: Greg KH
  Cc: Alexei Starovoitov, Eric W. Biederman, Linus Torvalds, Kees Cook,
	Andrew Morton, Alexei Starovoitov, David Miller, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, linux-security-module,
	Casey Schaufler

On 2020/06/25 21:07, Greg KH wrote:
>>>>     call_usermodehelper() can teach LSM modules via pre-existing file's pathname and
>>>>     inode's security label at security_bprm_creds_for_exec()/security_bprm_check() etc.
>>>>     But since fork_usermode_blob() accepts only "byte array" and "length of byte array"
>>>>     arguments, I'm not sure whether LSM modules can obtain information needed for
>>>>     inspection. How does fork_usermode_blob() tell that information?
>>>
>>> It would seem that the "security context" for those would be the same as
>>> anything created before userspace launches today, right?  You handle
>>> that ok, and this should be just the same.
>>
>> I don't think so. Today when call_usermodehelper() is called, LSMs switch their security
>> context (at least TOMOYO does it) for further syscalls from the usermode process started
>> by the kernel context. But when fork_usermode_blob() is called, how LSMs can switch their
>> security context for further syscalls from the usermode process started by the kernel
>> context?
> 
> Ok, that makes a bit more sense.  Why not just do the same thing that
> you do today with call_usermodehelper()?  The logic in a way is the
> same, right?
> 

call_usermodehelper() provides information like "the kernel is about to run
/sbin/modprobe in order to load foo module" but fork_usermode_blob() does not
provide information like "the kernel is about to run a blob that we think is
a userspace USB IR filter driver". That is unfriendly to LSM modules.

> 
>>> Right now I, as a kernel module, can read/write to any file in the
>>> system, and do all sorts of other fun things.  You can't mediate that
>>> today from a LSM, and this is just one other example of this.
>>
>> Some functions (e.g. kernel_sock_shutdown()) bypass permission checks by LSMs
>> comes from a sort of trustness that the byte array kept inside kernel address
>> space remains secure/intact.
> 
> And what is going to change that "trustness" here?  The byte array came
> from the kernel address space to start with.  Are you thinking something
> outside of the kernel will then tamper with those bytes to do something
> else with them?

Right. e.g. ptrace() will allow reading/writing those bytes to do something
else with them. I guess 'gdb -p' does the same thing.

>                  If so, shouldn't you be preventing that userspace
> program that does the tampering from doing that in the first place with
> the LSM running?

SELinux can handle process isolation very well. But the reality is that none of the customers
I'm working for can afford to use SELinux because SELinux is too complicated to support.
Instead, they use proprietary antivirus kernel modules (which tamper with syscall tables
and/or security_hook_heads). Therefore, I wish that isolation between processes started by
fork_usermode_blob() and processes running normal usermode programs were implemented by
a built-in mechanism (like DAC), and I said

  We might need to invent built-in "protected userspace" because existing
  "unprotected userspace" is not trustworthy enough to run kernel modules.
  That's not just inventing fork_usermode_blob().

at https://lkml.kernel.org/r/62859212-df69-b913-c1e0-cd2e358d1adf@i-love.sakura.ne.jp .
I'm happy if we can implement such isolation without counting on in-tree LSMs.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 13:25                                                   ` Greg Kroah-Hartman
@ 2020-06-25 14:26                                                     ` Stephen Smalley
  2020-06-25 14:36                                                       ` Stephen Smalley
  2020-06-25 15:21                                                       ` Tetsuo Handa
  0 siblings, 2 replies; 194+ messages in thread
From: Stephen Smalley @ 2020-06-25 14:26 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Tetsuo Handa, Alexei Starovoitov, Eric W. Biederman,
	Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	linux-security-module, Casey Schaufler

On Thu, Jun 25, 2020 at 9:25 AM Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
>
> On Thu, Jun 25, 2020 at 08:56:10AM -0400, Stephen Smalley wrote:
> > No, because we cannot label the inode based on the program's purpose
> > and therefore cannot configure an automatic transition to a suitable
> > security context for the process, unlike call_usermodehelper().
>
> Why, what prevents this?  Can you not just do that based on the "blob
> address" or signature of it or something like that?  Right now you all
> do this based on inode of a random file on a disk, what's the difference
> between a random blob in memory?

Given some kind of key to identify the blob and look up a suitable
context in policy, I think it would work.  We just don't have that
with the current interface.  With /bin/kmod and the like, we have a
security xattr assigned to the file when it was created that we can
use as the basis for determining the process security context.

> > On a different note, will the usermode blob be measured by IMA prior
> > to execution?  What ensures that the blob was actually embedded in the
> > kernel image and wasn't just supplied as data through exploitation of
> > a kernel vulnerability or malicious kernel module?
>
> No reason it couldn't be passed to IMA for measuring, if people want to
> do that.

Actually, I think it probably happens already via IMA's existing hooks
but just wanted to confirm that IMA doesn't ignore S_PRIVATE inodes.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 14:26                                                     ` Stephen Smalley
@ 2020-06-25 14:36                                                       ` Stephen Smalley
  2020-06-25 15:21                                                       ` Tetsuo Handa
  1 sibling, 0 replies; 194+ messages in thread
From: Stephen Smalley @ 2020-06-25 14:36 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Tetsuo Handa, Alexei Starovoitov, Eric W. Biederman,
	Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	linux-security-module, Casey Schaufler

On Thu, Jun 25, 2020 at 10:26 AM Stephen Smalley
<stephen.smalley.work@gmail.com> wrote:
>
> On Thu, Jun 25, 2020 at 9:25 AM Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
> >
> > On Thu, Jun 25, 2020 at 08:56:10AM -0400, Stephen Smalley wrote:
> > > No, because we cannot label the inode based on the program's purpose
> > > and therefore cannot configure an automatic transition to a suitable
> > > security context for the process, unlike call_usermodehelper().
> >
> > Why, what prevents this?  Can you not just do that based on the "blob
> > address" or signature of it or something like that?  Right now you all
> > do this based on inode of a random file on a disk, what's the difference
> > between a random blob in memory?
>
> Given some kind of key to identify the blob and look up a suitable
> context in policy, I think it would work.  We just don't have that
> with the current interface.  With /bin/kmod and the like, we have a
> security xattr assigned to the file when it was created that we can
> use as the basis for determining the process security context.

Looks like info->cmdline could be used as that key if set; we would
just need a LSM hook to permit setting up the inode's security state
based on that key.  If that were passed to shmem_kernel_file_setup()
as the name argument, then that might also help path-based LSMs
although it seems potentially ambiguous.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 14:26                                                     ` Stephen Smalley
  2020-06-25 14:36                                                       ` Stephen Smalley
@ 2020-06-25 15:21                                                       ` Tetsuo Handa
  2020-06-25 16:03                                                         ` Stephen Smalley
  2020-06-25 16:06                                                         ` Casey Schaufler
  1 sibling, 2 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-25 15:21 UTC (permalink / raw)
  To: Stephen Smalley, Greg Kroah-Hartman, Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, linux-security-module, Casey Schaufler

On 2020/06/25 23:26, Stephen Smalley wrote:
> On Thu, Jun 25, 2020 at 9:25 AM Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
>>
>> On Thu, Jun 25, 2020 at 08:56:10AM -0400, Stephen Smalley wrote:
>>> No, because we cannot label the inode based on the program's purpose
>>> and therefore cannot configure an automatic transition to a suitable
>>> security context for the process, unlike call_usermodehelper().
>>
>> Why, what prevents this?  Can you not just do that based on the "blob
>> address" or signature of it or something like that?  Right now you all
>> do this based on inode of a random file on a disk, what's the difference
>> between a random blob in memory?
> 
> Given some kind of key to identify the blob and look up a suitable
> context in policy, I think it would work.  We just don't have that
> with the current interface.  With /bin/kmod and the like, we have a
> security xattr assigned to the file when it was created that we can
> use as the basis for determining the process security context.

My understanding is that fork_usermode_blob() is intended to be able to run
without filesystems so that usermode blobs can start even before global init
program (pid=1) starts.

But SELinux's policy is likely stored inside filesystems which would be
accessible only after global init program (pid=1) started.

Therefore, I wonder whether SELinux can look up a suitable context in policy
even if "some kind of key to identify the blob" is provided.
Also, since (at least some of) usermode blob processes started via
fork_usermode_blob() will remain after SELinux loads policy from filesystems,
I guess that we will need a method for moving already running usermode blob
processes to appropriate security contexts.

Is my understanding/concerns correct?


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 15:21                                                       ` Tetsuo Handa
@ 2020-06-25 16:03                                                         ` Stephen Smalley
  2020-06-25 16:06                                                         ` Casey Schaufler
  1 sibling, 0 replies; 194+ messages in thread
From: Stephen Smalley @ 2020-06-25 16:03 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Greg Kroah-Hartman, Alexei Starovoitov, Eric W. Biederman,
	Linus Torvalds, Kees Cook, Andrew Morton, Alexei Starovoitov,
	David Miller, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	linux-security-module, Casey Schaufler

On Thu, Jun 25, 2020 at 11:21 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2020/06/25 23:26, Stephen Smalley wrote:
> > On Thu, Jun 25, 2020 at 9:25 AM Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org> wrote:
> >>
> >> On Thu, Jun 25, 2020 at 08:56:10AM -0400, Stephen Smalley wrote:
> >>> No, because we cannot label the inode based on the program's purpose
> >>> and therefore cannot configure an automatic transition to a suitable
> >>> security context for the process, unlike call_usermodehelper().
> >>
> >> Why, what prevents this?  Can you not just do that based on the "blob
> >> address" or signature of it or something like that?  Right now you all
> >> do this based on inode of a random file on a disk, what's the difference
> >> between a random blob in memory?
> >
> > Given some kind of key to identify the blob and look up a suitable
> > context in policy, I think it would work.  We just don't have that
> > with the current interface.  With /bin/kmod and the like, we have a
> > security xattr assigned to the file when it was created that we can
> > use as the basis for determining the process security context.
>
> My understanding is that fork_usermode_blob() is intended to be able to run
> without filesystems so that usermode blobs can start even before global init
> program (pid=1) starts.
>
> But SELinux's policy is likely stored inside filesystems which would be
> accessible only after global init program (pid=1) started.
>
> Therefore, I wonder whether SELinux can look up a suitable context in policy
> even if "some kind of key to identify the blob" is provided.
> Also, since (at least some of) usermode blob processes started via
> fork_usermode_blob() will remain after SELinux loads policy from filesystems,
> I guess that we will need a method for moving already running usermode blob
> processes to appropriate security contexts.
>
> Is my understanding/concerns correct?

It isn't fundamentally different than the issue of program execution
from a filesystem prior to initial policy load, e.g. executing
programs from the initrd or executing init from the "real" root.
Absent a policy, the process will just remain in the initial
SID/context (kernel SID), which will later be mapped to a context when
policy is loaded.  Typical init programs address this by either
re-exec'ing themselves after policy load or by dynamically switching
contexts via write to /proc/self/attr/current.  The kernel doesn't try
to retroactively transition previously started processes; they are
expected to either exit prior to policy load (ala transient processes
run from initrd) or re-exec or set their contexts after policy load.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 15:21                                                       ` Tetsuo Handa
  2020-06-25 16:03                                                         ` Stephen Smalley
@ 2020-06-25 16:06                                                         ` Casey Schaufler
  1 sibling, 0 replies; 194+ messages in thread
From: Casey Schaufler @ 2020-06-25 16:06 UTC (permalink / raw)
  To: Tetsuo Handa, Stephen Smalley, Greg Kroah-Hartman, Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, linux-security-module, Casey Schaufler

On 6/25/2020 8:21 AM, Tetsuo Handa wrote:
> On 2020/06/25 23:26, Stephen Smalley wrote:
>> On Thu, Jun 25, 2020 at 9:25 AM Greg Kroah-Hartman
>> <gregkh@linuxfoundation.org> wrote:
>>> On Thu, Jun 25, 2020 at 08:56:10AM -0400, Stephen Smalley wrote:
>>>> No, because we cannot label the inode based on the program's purpose
>>>> and therefore cannot configure an automatic transition to a suitable
>>>> security context for the process, unlike call_usermodehelper().
>>> Why, what prevents this?  Can you not just do that based on the "blob
>>> address" or signature of it or something like that?  Right now you all
>>> do this based on inode of a random file on a disk, what's the difference
>>> between a random blob in memory?
>> Given some kind of key to identify the blob and look up a suitable
>> context in policy, I think it would work.  We just don't have that
>> with the current interface.  With /bin/kmod and the like, we have a
>> security xattr assigned to the file when it was created that we can
>> use as the basis for determining the process security context.

It should also be noted that Smack uses multiple xattrs. It is also
possible for multiple current security modules to use xattrs, including
capabilities. It's not sufficient to provide "the security xattr", there
would have to be provision for general security xattrs.

> My understanding is that fork_usermode_blob() is intended to be able to run
> without filesystems so that usermode blobs can start even before global init
> program (pid=1) starts.
>
> But SELinux's policy is likely stored inside filesystems which would be
> accessible only after global init program (pid=1) started.
>
> Therefore, I wonder whether SELinux can look up a suitable context in policy
> even if "some kind of key to identify the blob" is provided.

A security module has to have some sort of policy for whatever happens prior
to "policy load" as it is. That's not unique to this situation.



^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 12:07                                                         ` Greg KH
  2020-06-25 14:21                                                           ` Tetsuo Handa
@ 2020-06-25 19:34                                                           ` David Miller
  2020-06-26  1:36                                                             ` Linus Torvalds
  1 sibling, 1 reply; 194+ messages in thread
From: David Miller @ 2020-06-25 19:34 UTC (permalink / raw)
  To: greg
  Cc: penguin-kernel, alexei.starovoitov, ebiederm, torvalds, keescook,
	akpm, ast, viro, bpf, linux-fsdevel, daniel, kuba,
	yamada.masahiro, GLin, bmeneg, linux-security-module, casey

From: Greg KH <greg@kroah.com>
Date: Thu, 25 Jun 2020 14:07:25 +0200

> I really don't understand the objection here, why is this any different
> than any other random kernel driver for what it can do?

It's kernel code executing in userspace.  If you don't trust the
signed code you don't trust the signed code.

Nothing is magic about a piece of code executing in userspace.

I seriously think this discussion is trying to create an issue
that simply doesn't exist in reality.

If some kernel module executed "/bin/sh" it's the same problem.
There is no way to argue around this, so please stop doing so
it's silly.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-25 19:34                                                           ` David Miller
@ 2020-06-26  1:36                                                             ` Linus Torvalds
  2020-06-26  1:51                                                               ` Alexei Starovoitov
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
  0 siblings, 2 replies; 194+ messages in thread
From: Linus Torvalds @ 2020-06-26  1:36 UTC (permalink / raw)
  To: David Miller
  Cc: Greg Kroah-Hartman, Tetsuo Handa, Alexei Starovoitov,
	Eric W. Biederman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Thu, Jun 25, 2020 at 12:34 PM David Miller <davem@davemloft.net> wrote:
>
> It's kernel code executing in userspace.  If you don't trust the
> signed code you don't trust the signed code.
>
> Nothing is magic about a piece of code executing in userspace.

Well, there's one real issue: the most likely thing that code is going
to do is execute llvm to generate more code.

And that's I think the real security issue here: the context in which
the code executes. It may be triggered in one namespace, but what
namespaces and what rules should the thing actually then execute in.

So no, trying to dismiss this as "there are no security issues" is
bogus. There very much are security issues.

It's just that the current code that is just a dummy wrapper around
something that doesn't actually do anything doesn't happen to _show_
those issues, because it does nothing.

I've stayed away from this discussion because I wanted to see if it
went anywhere, but it doesn't seem to.

My personally strongest argument for removing this kernel code is
that it's been there for a couple of years now, and it has never
actually done anything useful, and there's no actual sign that it ever
will, or that there is a solid plan in place for it.

So to me, it really looks like it was an interesting idea, but one
that hasn't proven itself, and most certainly not one that has shown
itself to be the _right_ idea.

We can dance around the "what about security modules", but that
fundamental problem of "this code hasn't done anything useful for two
years and we don't even know if it's the right thing to do or what the
real security issues _will_ be" is I think the real issue here.

Hmm?

             Linus

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-26  1:36                                                             ` Linus Torvalds
@ 2020-06-26  1:51                                                               ` Alexei Starovoitov
  2020-06-26  4:58                                                                 ` Tetsuo Handa
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-26  1:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Eric W. Biederman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Thu, Jun 25, 2020 at 06:36:34PM -0700, Linus Torvalds wrote:
> On Thu, Jun 25, 2020 at 12:34 PM David Miller <davem@davemloft.net> wrote:
> >
> > It's kernel code executing in userspace.  If you don't trust the
> > signed code you don't trust the signed code.
> >
> > Nothing is magic about a piece of code executing in userspace.
> 
> Well, there's one real issue: the most likely thing that code is going
> to do is execute llvm to generate more code.
> 
> And that's I think the real security issue here: the context in which
> the code executes. It may be triggered in one namespace, but what
> namespaces and what rules should the thing actually then execute in.
> 
> So no, trying to dismiss this as "there are no security issues" is
> bogus. There very much are security issues.

I think you're referring to:

>>   We might need to invent built-in "protected userspace" because existing
>>   "unprotected userspace" is not trustworthy enough to run kernel modules.
>>   That's not just inventing fork_usermode_blob().

Another root process can modify the memory of usermode_blob process.
I think that's Tetsuo's point about lack of LSM hooks in kernel_sock_shutdown().
Obviously, kernel_sock_shutdown() can be called by kernel only.
I suspect he's imagining a hypothetical situation where kernel bits of kernel module
interact with userblob bits of kernel module.
Then another root process tampers with memory of userblob.
Then userblob interaction with kernel module can do kernel_sock_shutdown()
on something that initial design of kernel+userblob module didn't intend.
I think this is trivially enforceable without creating new features.
Existing security_ptrace_access_check() LSM hook can prevent tampering with
memory of userblob.

As far as userblob calling llvm and other things in sequence.
That is no different from systemd calling things.
security label can carry that execution context.

> My personally strongest argument for removing this kernel code is
> that it's been there for a couple of years now, and it has never
> actually done anything useful, and there's no actual sign that it ever
> will, or that there is a solid plan in place for it.

you probably missed the detailed plan:
https://lore.kernel.org/bpf/20200609235631.ukpm3xngbehfqthz@ast-mbp.dhcp.thefacebook.com/

Project #3 in the above is the one we're working on right now.
It should be ready to post in a week.

> We can dance around the "what about security modules", but that
> fundamental problem of "this code hasn't done anything useful for two
> years and we don't even know if it's the right thing to do or what the
> real security issues _will_ be" is I think the real issue here.

Please see above link. bpfilter didn't go anywhere, but fork_usermode_blob()
has plenty of use cases that will materialize soon.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-26  1:51                                                               ` Alexei Starovoitov
@ 2020-06-26  4:58                                                                 ` Tetsuo Handa
  2020-06-26  5:41                                                                   ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-26  4:58 UTC (permalink / raw)
  To: Alexei Starovoitov, Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Eric W. Biederman, Kees Cook,
	Andrew Morton, Alexei Starovoitov, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, LSM List, Casey Schaufler

On 2020/06/26 10:51, Alexei Starovoitov wrote:
> On Thu, Jun 25, 2020 at 06:36:34PM -0700, Linus Torvalds wrote:
>> On Thu, Jun 25, 2020 at 12:34 PM David Miller <davem@davemloft.net> wrote:
>>>
>>> It's kernel code executing in userspace.  If you don't trust the
>>> signed code you don't trust the signed code.
>>>
>>> Nothing is magic about a piece of code executing in userspace.
>>
>> Well, there's one real issue: the most likely thing that code is going
>> to do is execute llvm to generate more code.

Wow! Are we going to allow execution of such complicated programs?

I was hoping that fork_usermode_blob() accepts only simple program
like the content of "hello64" generated by

----------
; nasm -f elf64 hello64.asm && ld -s -m elf_x86_64 -o hello64 hello64.o
section .text
global _start

_start:
  mov rax, 1        ; write(
  mov rdi, 1        ;   1,
  mov rsi, msg      ;   "Hello world\n",
  mov rdx, 12       ;   12
  syscall           ; );
  mov rax, 231      ; _exit(
  mov rdi, 0        ;   0
  syscall           ; );

section .rodata
  msg: db "Hello world", 0x0a
----------

which can be contained by mechanisms like seccomp; there is no pathname
resolution, no networking access etc.

>>
>> And that's I think the real security issue here: the context in which
>> the code executes. It may be triggered in one namespace, but what
>> namespaces and what rules should the thing actually then execute in.
>>
>> So no, trying to dismiss this as "there are no security issues" is
>> bogus. There very much are security issues.
> 
> I think you're referring to:
> 
>>>   We might need to invent built-in "protected userspace" because existing
>>>   "unprotected userspace" is not trustworthy enough to run kernel modules.
>>>   That's not just inventing fork_usermode_blob().
> 
> Another root process can modify the memory of usermode_blob process.

I'm not familiar with ptrace(); I'm just using /usr/bin/strace and /usr/bin/ltrace .
What I'm worried about is that some root process tampers with the memory which initially
contained "hello64" above in order to make that memory behave differently.

For example, a usermode process started by fork_usermode_blob() which was initially
containing

----------
while (read(0, &uid, sizeof(uid)) == sizeof(uid)) {
    if (uid == 0)
        write(1, "OK\n", 3);
    else
        write(1, "NG\n", 3);
}
----------

can be somehow tampered like

----------
while (read(0, &uid, sizeof(uid)) == sizeof(uid)) {
    if (uid != 0)
        write(1, "OK\n", 3);
    else
        write(1, "NG\n", 3);
}
----------

due to interference from the rest of the system, how can we say "we trust kernel
code executing in userspace" ?

My question is: how is the byte array (which was copied from kernel space) kept secure/intact
under "root can poke into kernel or any process memory." environment? It is obvious that
we can't say "we trust kernel code executing in userspace" without some mechanism.

Currently fork_usermode_blob() is not providing security context for the byte array to be
executed. We could modify fork_usermode_blob() to provide security context for LSMs, but
I'll be more happy if we can implement that mechanism without counting on in-tree LSMs, for
SELinux is too complicated to support.

> I think that's Tetsuo's point about lack of LSM hooks is kernel_sock_shutdown().
> Obviously, kernel_sock_shutdown() can be called by kernel only.

I can't catch what you mean. The kernel code executing in userspace uses syscall
interface (e.g. SYSCALL_DEFINE2(shutdown, int, fd, int, how) path), doesn't it?

> I suspect he's imagining a hypothetical situation where kernel bits of kernel module
> interact with userblob bits of kernel module.
> Then another root process tampers with memory of userblob.

Yes, how to protect the memory of userblob is a concern. That the memory of userblob can
interfere with (or be interfered with by) the rest of the system is a problem.

> Then userblob interaction with kernel module can do kernel_sock_shutdown()
> on something that initial design of kernel+userblob module didn't intend.

I can't catch what you mean.

> I think this is trivially enforceable without creating new features.
> Existing security_ptrace_access_check() LSM hook can prevent tampering with
> memory of userblob.

There is security_ptrace_access_check() LSM hook, but no zero-configuration
method is available.

> 
> As far as userblob calling llvm and other things in sequence.
> That is no different from systemd calling things.

Right.

> security label can carry that execution context.

If files get a chance to be associated with appropriate pathname and
security label.

> 
>> My personally strongest argument for removing this kernel code is
>> that it's been there for a couple of years now, and it has never
>> actually done anything useful, and there's no actual sign that it ever
>> will, or that there is a solid plan in place for it.
> 
> you probably missed the detailed plan:
> https://lore.kernel.org/bpf/20200609235631.ukpm3xngbehfqthz@ast-mbp.dhcp.thefacebook.com/
> 
> The project #3 is the above is the one we're working on right now.
> It should be ready to post in a week.

I got a question on project #3. Given that "cat /sys/fs/bpf/my_ipv6_route"
produces the same human output as "cat /proc/net/ipv6_route", how security
checks which are done for "cat /proc/net/ipv6_route" can be enforced for
"cat /sys/fs/bpf/my_ipv6_route" ? Unless same security checks (e.g. permission
to read /proc/net/ipv6_route ) is enforced, such bpf usage sounds like a method
for bypassing existing security mechanisms.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-26  4:58                                                                 ` Tetsuo Handa
@ 2020-06-26  5:41                                                                   ` Alexei Starovoitov
  2020-06-26  6:20                                                                     ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-26  5:41 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Eric W. Biederman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Fri, Jun 26, 2020 at 01:58:35PM +0900, Tetsuo Handa wrote:
> On 2020/06/26 10:51, Alexei Starovoitov wrote:
> > On Thu, Jun 25, 2020 at 06:36:34PM -0700, Linus Torvalds wrote:
> >> On Thu, Jun 25, 2020 at 12:34 PM David Miller <davem@davemloft.net> wrote:
> >>>
> >>> It's kernel code executing in userspace.  If you don't trust the
> >>> signed code you don't trust the signed code.
> >>>
> >>> Nothing is magic about a piece of code executing in userspace.
> >>
> >> Well, there's one real issue: the most likely thing that code is going
> >> to do is execute llvm to generate more code.
> 
> Wow! Are we going to allow execution of such complicated programs?

No. llvm was _never_ intended to be run from the blob.
bpfilter was envisioned as self contained binary. If it needed to do
optimizations on generated bpf code it would have to do them internally.

> I was hoping that fork_usermode_blob() accepts only simple program
> like the content of "hello64" generated by

pretty much. statically compiled elf that is self contained.

> For example, a usermode process started by fork_usermode_blob() which was initially
> containing
> 
> ----------
> while (read(0, &uid, sizeof(uid)) == sizeof(uid)) {
>     if (uid == 0)
>         write(1, "OK\n", 3);
>     else
>         write(1, "NG\n", 3);
> }
> ----------
> 
> can be somehow tampered like
> 
> ----------
> while (read(0, &uid, sizeof(uid)) == sizeof(uid)) {
>     if (uid != 0)
>         write(1, "OK\n", 3);
>     else
>         write(1, "NG\n", 3);
> }
> ----------
> 
> due to interference from the rest of the system, how can we say "we trust kernel
> code executing in userspace" ?

I answered this already in the previous email.
Use security_ptrace_access_check() LSM hook to make sure that no other process
can tamper with blob's memory when it's running as user process.
In the future it would be trivial to add a new ptrace flag to
make sure that blob's memory is not ptraceable from the start.

> My question is: how is the byte array (which was copied from kernel space) kept secure/intact
> under "root can poke into kernel or any process memory." environment? It is obvious that
> we can't say "we trust kernel code executing in userspace" without some mechanism.

Already answered.

> Currently fork_usermode_blob() is not providing security context for the byte array to be
> executed. We could modify fork_usermode_blob() to provide security context for LSMs, but
> I'll be more happy if we can implement that mechanism without counting on in-tree LSMs, for
> SELinux is too complicated to support.

I'm pretty sure it was answered in the upthread by selinux folks.
Quick recap: we can add security labels, sha, strings, you_name_it to the blob that
lsm hooks can track.
We can also add another LSM hook to fork_usermode_blob(), so if tomoyo is so worried
about blobs it would be able to reject all of them without too much work.

> 
> > I think that's Tetsuo's point about lack of LSM hooks is kernel_sock_shutdown().
> > Obviously, kernel_sock_shutdown() can be called by kernel only.
> 
> I can't catch what you mean. The kernel code executing in userspace uses syscall
> interface (e.g. SYSCALL_DEFINE2(shutdown, int, fd, int, how) path), doesn't it?

yes.

> > I suspect he's imagining a hypothetical situation where kernel bits of kernel module
> > interact with userblob bits of kernel module.
> > Then another root process tampers with memory of userblob.
> 
> Yes, how to protect the memory of userblob is a concern. The memory of userblob can
> interfere (or can be interfered by) the rest of the system is a problem.

answered.

> > I think this is trivially enforceable without creating new features.
> > Existing security_ptrace_access_check() LSM hook can prevent tampering with
> > memory of userblob.
> 
> There is security_ptrace_access_check() LSM hook, but no zero-configuration
> method is available.

huh?
tomoyo is not using that hook, but selinux and many other LSMs do.
please learn from others.

> > security label can carry that execution context.
> 
> If files get a chance to be associated with appropriate pathname and
> security label.

I can easily add a fake pathname to the blob, but it won't help tomoyo.
That's what I was saying all along.
pathname based security provides false sense of security.

I'm pretty sure this old blog has been read by many folks who
are following this thread, but it's worth reminding again:
https://securityblog.org/2006/04/19/security-anti-pattern-path-based-access-control/
I cannot agree more with Joshua.
Here is a quote:
"The most obvious problem with this is that not all objects are files and thus do not have paths."

> >> My personally strongest argument for removing this kernel code is
> >> that it's been there for a couple of years now, and it has never
> >> actually done anything useful, and there's no actual sign that it ever
> >> will, or that there is a solid plan in place for it.
> > 
> > you probably missed the detailed plan:
> > https://lore.kernel.org/bpf/20200609235631.ukpm3xngbehfqthz@ast-mbp.dhcp.thefacebook.com/
> > 
> > The project #3 is the above is the one we're working on right now.
> > It should be ready to post in a week.
> 
> I got a question on project #3. Given that "cat /sys/fs/bpf/my_ipv6_route"
> produces the same human output as "cat /proc/net/ipv6_route", how security
> checks which are done for "cat /proc/net/ipv6_route" can be enforced for
> "cat /sys/fs/bpf/my_ipv6_route" ? Unless same security checks (e.g. permission
> to read /proc/net/ipv6_route ) is enforced, such bpf usage sounds like a method
> for bypassing existing security mechanisms.

Standard file permissions. Nothing to do with bpf.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-26  5:41                                                                   ` Alexei Starovoitov
@ 2020-06-26  6:20                                                                     ` Tetsuo Handa
  2020-06-26  6:39                                                                       ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-26  6:20 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Eric W. Biederman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/26 14:41, Alexei Starovoitov wrote:
>> I was hoping that fork_usermode_blob() accepts only simple program
>> like the content of "hello64" generated by
> 
> pretty much. statically compiled elf that is self contained.

But fork_usermode_blob() itself does not check that.

>> due to interference from the rest of the system, how can we say "we trust kernel
>> code executing in userspace" ?
> 
> I answered this already in the previous email.

Previous post is mostly summary for David Miller who responded

  It's kernel code executing in userspace.  If you don't trust the
  signed code you don't trust the signed code.

  Nothing is magic about a piece of code executing in userspace.

without understanding my concerns.

> Use security_ptrace_access_check() LSM hook to make sure that no other process
> can tamper with blob's memory when it's running as user process.

Yes, security_ptrace_access_check() hook is there. But see the reality explained later.

> In the future it would be trivial to add a new ptrace flag to
> make sure that blob's memory is not ptraceable from the start.

I guess it is some PF_* flag (like PF_KTHREAD is used for avoiding some interference).

>> There is security_ptrace_access_check() LSM hook, but no zero-configuration
>> method is available.
> 
> huh?
> tomoyo is not using that hook, but selinux and many other LSMs do.
> please learn from others.

What I am hoping is that we can restrict interference between usermode blob processes
and other processes without using LSMs, for the reality is

  (1) Linux kernel community does not allow legally accessing LSM infrastructure from
      loadable kernel modules since Linux 2.6.24.
  (2) Red Hat folks enable only SELinux in their kernels.
  (3) Customers I'm working for cannot afford enabling SELinux in their environments.

and therefore

  (4) I have to maintain loadable kernel module version of LSM modules which illegally
      access LSM infrastructure in order to implement single function LSM modules.

Implementing security_ptrace_access_check() hook in TOMOYO is not a solution.

>>> security label can carry that execution context.
>>
>> If files get a chance to be associated with appropriate pathname and
>> security label.
> 
> I can easily add a fake pathname to the blob, but it won't help tomoyo.
> That's what I was saying all along.
> pathname based security provides false sense of security.
> 
> I'm pretty sure this old blog has been read by many folks who
> are following this thread, but it's worth reminding again:
> https://securityblog.org/2006/04/19/security-anti-pattern-path-based-access-control/
> I cannot agree more with Joshua.
> Here is a quote:
> "The most obvious problem with this is that not all objects are files and thus do not have paths."

Don't you know that TOMOYO can coexist with SELinux/Smack/AppArmor since Linux 5.1 ? ;-)


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-26  6:20                                                                     ` Tetsuo Handa
@ 2020-06-26  6:39                                                                       ` Alexei Starovoitov
  0 siblings, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-26  6:39 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Eric W. Biederman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Fri, Jun 26, 2020 at 03:20:35PM +0900, Tetsuo Handa wrote:
> On 2020/06/26 14:41, Alexei Starovoitov wrote:
> >> I was hoping that fork_usermode_blob() accepts only simple program
> >> like the content of "hello64" generated by
> > 
> > pretty much. statically compiled elf that is self contained.
> 
> But fork_usermode_blob() itself does not check that.

As I said a few emails back, it's trivial to add such a check.

> > In the future it would be trivial to add a new ptrace flag to
> > make sure that blob's memory is not ptraceable from the start.
> 
> I guess it is some PF_* flag (like PF_KTHREAD is used for avoiding some interference).

Kinda.
I was thinking about PTRACE_MODE_xxx flag.

> What I am hoping is that we can restrict interference between usermode blob processes
> and other processes without using LSMs,

I don't see why not.
Extra peace of mind that blob memory is untouchable by other root processes is nice to have.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [RFC][PATCH] net/bpfilter: Remove this broken and apparently unmantained
  2020-06-24 14:26                                             ` Alexei Starovoitov
  2020-06-24 23:14                                               ` Tetsuo Handa
@ 2020-06-26 11:30                                               ` Eric W. Biederman
  1 sibling, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 11:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tetsuo Handa, Linus Torvalds, Kees Cook, Andrew Morton,
	Alexei Starovoitov, David Miller, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Wed, Jun 24, 2020 at 5:17 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>>
>> > On Tue, Jun 23, 2020 at 01:53:48PM -0500, Eric W. Biederman wrote:
>>
>> > There is no refcnt bug. It was a user error on tomoyo side.
>> > fork_blob() works as expected.
>>
>> Nope.  I have independently confirmed it myself.
>
> I guess you've tried Tetsuo's fork_blob("#!/bin/true") kernel module ?
> yes. that fails. It never meant to be used for this.
> With elf blob it works, but breaks if there are rejections
> in things like security_bprm_creds_for_exec().
> In my mind that path was 'must succeed or kernel module is toast'.
> Like passing NULL into a function that doesn't check for it.
> Working on a fix for that since Tetsuo cares.

No.  The reference counting issue is present with the elf blob.

It is very straightforward to see when you take a minute to look.

The file is created with shmem_kernel_file_setup in fork_usermode_blob.
The file is put in fork_usermode_blob
The file is put in free_bprm by exec.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-26  1:36                                                             ` Linus Torvalds
  2020-06-26  1:51                                                               ` Alexei Starovoitov
@ 2020-06-26 12:51                                                               ` Eric W. Biederman
  2020-06-26 12:53                                                                 ` [PATCH 01/14] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
                                                                                   ` (17 more replies)
  1 sibling, 18 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


Asking for people to fix their bugs in this user mode driver code has
been remarkably unproductive.  So here are my bug fixes.

I have tested them by booting with the code compiled in and
by killing "bpfilter_umh" and running iptables -vnL to restart
the userspace driver.

I have split the changes into small enough pieces so they should be
easily readable and testable.  

The changes lean into the preexisting interfaces in the kernel and
remove special cases for user mode driver code in favor of solutions
that don't need special cases.  This results in smaller code with
fewer bugs.

At a practical level this removes the maintenance burden of the
user mode drivers from the user mode helper code and from exec as
the special cases are removed.

Similarly the LSM interaction bugs are fixed by not having unnecessary
special cases for user mode drivers.

Please let me know if you see any bugs.  Once the code review is
finished I plan to take this through my tree.

Eric W. Biederman (14):
      umh: Capture the pid in umh_pipe_setup
      umh: Move setting PF_UMH into umh_pipe_setup
      umh: Rename the user mode driver helpers for clarity
      umh: Remove call_usermodehelper_setup_file.
      umh: Separate the user mode driver and the user mode helper support
      umd: For clarity rename umh_info umd_info
      umd: Rename umd_info.cmdline umd_info.driver_name
      umd: Transform fork_usermode_blob into fork_usermode_driver
      umh: Stop calling do_execve_file
      exec: Remove do_execve_file
      bpfilter: Move bpfilter_umh back into init data
      umd: Track user space drivers with struct pid
      bpfilter: Take advantage of the facilities of struct pid
      umd: Remove exit_umh

 fs/exec.c                        |  38 ++------
 include/linux/binfmts.h          |   1 -
 include/linux/bpfilter.h         |   7 +-
 include/linux/sched.h            |   9 --
 include/linux/umd.h              |  18 ++++
 include/linux/umh.h              |  15 ----
 kernel/Makefile                  |   1 +
 kernel/exit.c                    |   1 -
 kernel/umd.c                     | 183 +++++++++++++++++++++++++++++++++++++++
 kernel/umh.c                     | 171 +-----------------------------------
 net/bpfilter/bpfilter_kern.c     |  38 ++++----
 net/bpfilter/bpfilter_umh_blob.S |   2 +-
 net/ipv4/bpfilter/sockopt.c      |  20 +++--
 13 files changed, 249 insertions(+), 255 deletions(-)

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 01/14] umh: Capture the pid in umh_pipe_setup
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
@ 2020-06-26 12:53                                                                 ` Eric W. Biederman
  2020-06-26 12:53                                                                 ` [PATCH 02/14] umh: Move setting PF_UMH into umh_pipe_setup Eric W. Biederman
                                                                                   ` (16 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


The pid in struct subprocess_info is only used by umh_clean_and_save_pid to
write the pid into umh_info.

Instead always capture the pid in struct umh_info in umh_pipe_setup, removing
code that is specific to user mode drivers from the common path of
user mode helpers.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h | 1 -
 kernel/umh.c        | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 0c08de356d0d..aae16a0ebd0f 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -25,7 +25,6 @@ struct subprocess_info {
 	struct file *file;
 	int wait;
 	int retval;
-	pid_t pid;
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..c2a582b3a2bf 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,7 +102,6 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	sub_info->pid = task_pid_nr(current);
 	if (sub_info->file) {
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
@@ -468,6 +467,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
 	return 0;
 }
 
@@ -476,13 +476,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info)
 	struct umh_info *umh_info = info->data;
 
 	/* cleanup if umh_pipe_setup() was successful but exec failed */
-	if (info->pid && info->retval) {
+	if (info->retval) {
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
 
 	argv_free(info->argv);
-	umh_info->pid = info->pid;
 }
 
 /**
-- 
2.25.0

^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 02/14] umh: Move setting PF_UMH into umh_pipe_setup
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
  2020-06-26 12:53                                                                 ` [PATCH 01/14] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
@ 2020-06-26 12:53                                                                 ` Eric W. Biederman
  2020-06-26 12:54                                                                 ` [PATCH 03/14] umh: Rename the user mode driver helpers for clarity Eric W. Biederman
                                                                                   ` (15 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


I am separating the code specific to user mode drivers from the code
for ordinary user space helpers.  Move setting of PF_UMH from
call_usermodehelper_exec_async which is core user mode helper code
into umh_pipe_setup which is user mode driver code.

The code is equally as easy to write in one location as the other and
the movement minimizes the impact of the user mode driver code on the
core of the user mode helper code.

Setting PF_UMH unconditionally is harmless as an action will only
happen if it is paired with an entry on umh_list.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/umh.c b/kernel/umh.c
index c2a582b3a2bf..e6b9d6636850 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,12 +102,10 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file) {
+	if (sub_info->file)
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
-		if (!retval)
-			current->flags |= PF_UMH;
-	} else
+	else
 		retval = do_execve(getname_kernel(sub_info->path),
 				   (const char __user *const __user *)sub_info->argv,
 				   (const char __user *const __user *)sub_info->envp);
@@ -468,6 +466,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
 	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
 	return 0;
 }
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 03/14] umh: Rename the user mode driver helpers for clarity
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
  2020-06-26 12:53                                                                 ` [PATCH 01/14] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
  2020-06-26 12:53                                                                 ` [PATCH 02/14] umh: Move setting PF_UMH into umh_pipe_setup Eric W. Biederman
@ 2020-06-26 12:54                                                                 ` Eric W. Biederman
  2020-06-26 12:54                                                                 ` [PATCH 04/14] umh: Remove call_usermodehelper_setup_file Eric W. Biederman
                                                                                   ` (14 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:54 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


Now that the functionality of umh_pipe_setup and
umh_clean_and_save_pid has changed, their names are too specific and
don't make much sense.  Instead name them umd_setup and umd_cleanup
for their functional role in setting up user mode drivers.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/umh.c b/kernel/umh.c
index e6b9d6636850..0ffe0a08cdde 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -429,7 +429,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
 	return sub_info;
 }
 
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
 	struct file *from_umh[2];
@@ -470,7 +470,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	return 0;
 }
 
-static void umh_clean_and_save_pid(struct subprocess_info *info)
+static void umd_cleanup(struct subprocess_info *info)
 {
 	struct umh_info *umh_info = info->data;
 
@@ -520,8 +520,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
-						  umh_clean_and_save_pid, info);
+	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
+						  info);
 	if (!sub_info)
 		goto out;
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 04/14] umh: Remove call_usermodehelper_setup_file.
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (2 preceding siblings ...)
  2020-06-26 12:54                                                                 ` [PATCH 03/14] umh: Rename the user mode driver helpers for clarity Eric W. Biederman
@ 2020-06-26 12:54                                                                 ` Eric W. Biederman
  2020-06-26 12:55                                                                 ` [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
                                                                                   ` (13 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:54 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


The only caller of call_usermodehelper_setup_file is fork_usermode_blob.
In fork_usermode_blob replace call_usermodehelper_setup_file with
call_usermodehelper_setup and delete call_usermodehelper_setup_file.

For this to work the argv_free is moved from umd_cleanup (formerly
umh_clean_and_save_pid) to fork_usermode_blob.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  3 ---
 kernel/umh.c        | 42 +++++++++++-------------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index aae16a0ebd0f..de08af00c68a 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,9 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-			  int (*init)(struct subprocess_info *info, struct cred *new),
-			  void (*cleanup)(struct subprocess_info *), void *data);
 struct umh_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
diff --git a/kernel/umh.c b/kernel/umh.c
index 0ffe0a08cdde..14d63b5f29a7 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -402,33 +402,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-		int (*init)(struct subprocess_info *info, struct cred *new),
-		void (*cleanup)(struct subprocess_info *info), void *data)
-{
-	struct subprocess_info *sub_info;
-	struct umh_info *info = data;
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
-	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
-	if (!sub_info)
-		return NULL;
-
-	sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!sub_info->argv) {
-		kfree(sub_info);
-		return NULL;
-	}
-
-	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-	sub_info->path = "none";
-	sub_info->file = file;
-	sub_info->init = init;
-	sub_info->cleanup = cleanup;
-	sub_info->data = data;
-	return sub_info;
-}
-
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
@@ -479,8 +452,6 @@ static void umd_cleanup(struct subprocess_info *info)
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
-
-	argv_free(info->argv);
 }
 
 /**
@@ -501,7 +472,9 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 {
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
+	char **argv = NULL;
 	struct file *file;
 	ssize_t written;
 	loff_t pos = 0;
@@ -520,11 +493,16 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
-						  info);
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
 
+	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -532,6 +510,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 		mutex_unlock(&umh_list_lock);
 	}
 out:
+	if (argv)
+		argv_free(argv);
 	fput(file);
 	return err;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (3 preceding siblings ...)
  2020-06-26 12:54                                                                 ` [PATCH 04/14] umh: Remove call_usermodehelper_setup_file Eric W. Biederman
@ 2020-06-26 12:55                                                                 ` Eric W. Biederman
  2020-06-26 14:17                                                                   ` kernel test robot
  2020-06-26 16:22                                                                   ` Tetsuo Handa
  2020-06-26 12:55                                                                 ` [PATCH 06/14] umd: For clarity rename umh_info umd_info Eric W. Biederman
                                                                                   ` (12 subsequent siblings)
  17 siblings, 2 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:55 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


This makes it clear which code is part of the core user mode
helper support and which code is needed to implement user mode
drivers.

This makes the kernel smaller for everyone who does not use a usermode
driver.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h |   2 +-
 include/linux/umd.h      |  16 +++++
 include/linux/umh.h      |  10 ---
 kernel/Makefile          |   1 +
 kernel/umd.c             | 146 +++++++++++++++++++++++++++++++++++++++
 kernel/umh.c             | 139 -------------------------------------
 6 files changed, 164 insertions(+), 150 deletions(-)
 create mode 100644 include/linux/umd.h
 create mode 100644 kernel/umd.c

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index d815622cd31e..b42e44e29033 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -3,7 +3,7 @@
 #define _LINUX_BPFILTER_H
 
 #include <uapi/linux/bpfilter.h>
-#include <linux/umh.h>
+#include <linux/umd.h>
 
 struct sock;
 int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
diff --git a/include/linux/umd.h b/include/linux/umd.h
new file mode 100644
index 000000000000..3f8c5743202b
--- /dev/null
+++ b/include/linux/umd.h
@@ -0,0 +1,16 @@
+#ifndef __LINUX_UMD_H__
+#define __LINUX_UMD_H__
+
+#include <linux/umh.h>
+
+struct umh_info {
+	const char *cmdline;
+	struct file *pipe_to_umh;
+	struct file *pipe_from_umh;
+	struct list_head list;
+	void (*cleanup)(struct umh_info *info);
+	pid_t pid;
+};
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+
+#endif /* __LINUX_UMD_H__ */
diff --git a/include/linux/umh.h b/include/linux/umh.h
index de08af00c68a..73173c4a07e5 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,16 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct umh_info {
-	const char *cmdline;
-	struct file *pipe_to_umh;
-	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
-	pid_t pid;
-};
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
-
 extern int
 call_usermodehelper_exec(struct subprocess_info *info, int wait);
 
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..a81d7354323c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o
 
+obj-$(CONFIG_BPFILTER) += umd.o
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/umd.c b/kernel/umd.c
new file mode 100644
index 000000000000..8efaa84b6aa1
--- /dev/null
+++ b/kernel/umd.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * umd - User mode driver support
+ */
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/umd.h>
+
+static LIST_HEAD(umh_list);
+static DEFINE_MUTEX(umh_list_lock);
+
+static int umd_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct umh_info *umh_info = info->data;
+	struct file *from_umh[2];
+	struct file *to_umh[2];
+	int err;
+
+	/* create pipe to send data to umh */
+	err = create_pipe_files(to_umh, 0);
+	if (err)
+		return err;
+	err = replace_fd(0, to_umh[0], 0);
+	fput(to_umh[0]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		return err;
+	}
+
+	/* create pipe to receive data from umh */
+	err = create_pipe_files(from_umh, 0);
+	if (err) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		return err;
+	}
+	err = replace_fd(1, from_umh[1], 0);
+	fput(from_umh[1]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		fput(from_umh[0]);
+		return err;
+	}
+
+	umh_info->pipe_to_umh = to_umh[1];
+	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
+	return 0;
+}
+
+static void umd_cleanup(struct subprocess_info *info)
+{
+	struct umh_info *umh_info = info->data;
+
+	/* cleanup if umh_pipe_setup() was successful but exec failed */
+	if (info->retval) {
+		fput(umh_info->pipe_to_umh);
+		fput(umh_info->pipe_from_umh);
+	}
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * If info->cmdline is set it will be used as command line for the
+ * user process, else "usermodehelper" is used.
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
+	struct subprocess_info *sub_info;
+	char **argv = NULL;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+	int err;
+
+	file = shmem_kernel_file_setup("", len, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		goto out;
+	}
+
+	err = -ENOMEM;
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
+	if (!sub_info)
+		goto out;
+
+	sub_info->file = file;
+	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+	if (!err) {
+		mutex_lock(&umh_list_lock);
+		list_add(&info->list, &umh_list);
+		mutex_unlock(&umh_list_lock);
+	}
+out:
+	if (argv)
+		argv_free(argv);
+	fput(file);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
+void __exit_umh(struct task_struct *tsk)
+{
+	struct umh_info *info;
+	pid_t pid = tsk->pid;
+
+	mutex_lock(&umh_list_lock);
+	list_for_each_entry(info, &umh_list, list) {
+		if (info->pid == pid) {
+			list_del(&info->list);
+			mutex_unlock(&umh_list_lock);
+			goto out;
+		}
+	}
+	mutex_unlock(&umh_list_lock);
+	return;
+out:
+	if (info->cleanup)
+		info->cleanup(info);
+}
+
diff --git a/kernel/umh.c b/kernel/umh.c
index 14d63b5f29a7..3e4e453d45c8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -26,8 +26,6 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
 
 #include <trace/events/module.h>
 
@@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
 static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
 static DEFINE_SPINLOCK(umh_sysctl_lock);
 static DECLARE_RWSEM(umhelper_sem);
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
 
 static void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
@@ -402,121 +398,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-static int umd_setup(struct subprocess_info *info, struct cred *new)
-{
-	struct umh_info *umh_info = info->data;
-	struct file *from_umh[2];
-	struct file *to_umh[2];
-	int err;
-
-	/* create pipe to send data to umh */
-	err = create_pipe_files(to_umh, 0);
-	if (err)
-		return err;
-	err = replace_fd(0, to_umh[0], 0);
-	fput(to_umh[0]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		return err;
-	}
-
-	/* create pipe to receive data from umh */
-	err = create_pipe_files(from_umh, 0);
-	if (err) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		return err;
-	}
-	err = replace_fd(1, from_umh[1], 0);
-	fput(from_umh[1]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		fput(from_umh[0]);
-		return err;
-	}
-
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
-	current->flags |= PF_UMH;
-	return 0;
-}
-
-static void umd_cleanup(struct subprocess_info *info)
-{
-	struct umh_info *umh_info = info->data;
-
-	/* cleanup if umh_pipe_setup() was successful but exec failed */
-	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
-	}
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-	struct subprocess_info *sub_info;
-	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
-	int err;
-
-	file = shmem_kernel_file_setup("", len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
-	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!argv)
-		goto out;
-
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
-					     umd_setup, umd_cleanup, info);
-	if (!sub_info)
-		goto out;
-
-	sub_info->file = file;
-	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
-out:
-	if (argv)
-		argv_free(argv);
-	fput(file);
-	return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
-
 /**
  * call_usermodehelper_exec - start a usermode application
  * @sub_info: information about the subprocessa
@@ -678,26 +559,6 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umh_info *info;
-	pid_t pid = tsk->pid;
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
-
 struct ctl_table usermodehelper_table[] = {
 	{
 		.procname	= "bset",
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 06/14] umd: For clarity rename umh_info umd_info
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (4 preceding siblings ...)
  2020-06-26 12:55                                                                 ` [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
@ 2020-06-26 12:55                                                                 ` Eric W. Biederman
  2020-06-26 15:37                                                                   ` Kees Cook
  2020-06-26 12:56                                                                 ` [PATCH 07/14] umd: Rename umd_info.cmdline umd_info.driver_name Eric W. Biederman
                                                                                   ` (11 subsequent siblings)
  17 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:55 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


This structure is only used for user mode drivers so change
the prefix from umh to umd to make that clear.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h    |  2 +-
 include/linux/umd.h         |  6 +++---
 kernel/umd.c                | 20 ++++++++++----------
 net/ipv4/bpfilter/sockopt.c |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index b42e44e29033..4b43d2240172 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -11,7 +11,7 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 			    int __user *optlen);
 struct bpfilter_umh_ops {
-	struct umh_info info;
+	struct umd_info info;
 	/* since ip_getsockopt() can run in parallel, serialize access to umh */
 	struct mutex lock;
 	int (*sockopt)(struct sock *sk, int optname,
diff --git a/include/linux/umd.h b/include/linux/umd.h
index 3f8c5743202b..4f61849e2031 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -3,14 +3,14 @@
 
 #include <linux/umh.h>
 
-struct umh_info {
+struct umd_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
+	void (*cleanup)(struct umd_info *info);
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
 
 #endif /* __LINUX_UMD_H__ */
diff --git a/kernel/umd.c b/kernel/umd.c
index 8efaa84b6aa1..aa1215faa8a1 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -11,7 +11,7 @@ static DEFINE_MUTEX(umh_list_lock);
 
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 	struct file *from_umh[2];
 	struct file *to_umh[2];
 	int err;
@@ -43,21 +43,21 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
+	umd_info->pipe_to_umh = to_umh[1];
+	umd_info->pipe_from_umh = from_umh[0];
+	umd_info->pid = task_pid_nr(current);
 	current->flags |= PF_UMH;
 	return 0;
 }
 
 static void umd_cleanup(struct subprocess_info *info)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 
 	/* cleanup if umh_pipe_setup() was successful but exec failed */
 	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
+		fput(umd_info->pipe_to_umh);
+		fput(umd_info->pipe_from_umh);
 	}
 }
 
@@ -72,12 +72,12 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
+ * case 'struct umd_info *info' is populated with two pipes
  * and a pid of the process. The caller is responsible for health
  * check of the user process, killing it via pid, and closing the
  * pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
 	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
@@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
 
 void __exit_umh(struct task_struct *tsk)
 {
-	struct umh_info *info;
+	struct umd_info *info;
 	pid_t pid = tsk->pid;
 
 	mutex_lock(&umh_list_lock);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 0480918bfc7c..c0dbcc86fcdb 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -12,7 +12,7 @@
 struct bpfilter_umh_ops bpfilter_ops;
 EXPORT_SYMBOL_GPL(bpfilter_ops);
 
-static void bpfilter_umh_cleanup(struct umh_info *info)
+static void bpfilter_umh_cleanup(struct umd_info *info)
 {
 	mutex_lock(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 07/14] umd: Rename umd_info.cmdline umd_info.driver_name
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (5 preceding siblings ...)
  2020-06-26 12:55                                                                 ` [PATCH 06/14] umd: For clarity rename umh_info umd_info Eric W. Biederman
@ 2020-06-26 12:56                                                                 ` Eric W. Biederman
  2020-06-26 12:56                                                                 ` [PATCH 08/14] umd: Transform fork_usermode_blob into fork_usermode_driver Eric W. Biederman
                                                                                   ` (10 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


The only thing supplied in the cmdline today is the driver name so
rename the field to clarify the code.

As this value is always supplied stop trying to handle the case of
a NULL cmdline.

Additionally since we now have a name we can count on use the
driver_name any place where the code is looking for a name
of the binary.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umd.h         |  2 +-
 kernel/umd.c                | 11 ++++-------
 net/ipv4/bpfilter/sockopt.c |  2 +-
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/include/linux/umd.h b/include/linux/umd.h
index 4f61849e2031..6c3e00e0520b 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -4,7 +4,7 @@
 #include <linux/umh.h>
 
 struct umd_info {
-	const char *cmdline;
+	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
diff --git a/kernel/umd.c b/kernel/umd.c
index aa1215faa8a1..bad2e8da7f96 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -67,9 +67,6 @@ static void umd_cleanup(struct subprocess_info *info)
  * @len: length of the blob
  * @info: information about usermode process (shouldn't be NULL)
  *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
  * case 'struct umd_info *info' is populated with two pipes
@@ -79,7 +76,6 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
 	struct file *file;
@@ -87,7 +83,7 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup("", len, 0);
+	file = shmem_kernel_file_setup(info->driver_name, len, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -100,11 +96,12 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	}
 
 	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
 		goto out;
 
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+	sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL,
+					     GFP_KERNEL,
 					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index c0dbcc86fcdb..5050de28333d 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -70,7 +70,7 @@ static int __init bpfilter_sockopt_init(void)
 {
 	mutex_init(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-	bpfilter_ops.info.cmdline = "bpfilter_umh";
+	bpfilter_ops.info.driver_name = "bpfilter_umh";
 	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
 
 	return 0;
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 08/14] umd: Transform fork_usermode_blob into fork_usermode_driver
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (6 preceding siblings ...)
  2020-06-26 12:56                                                                 ` [PATCH 07/14] umd: Rename umd_info.cmdline umd_info.driver_name Eric W. Biederman
@ 2020-06-26 12:56                                                                 ` Eric W. Biederman
  2020-06-26 12:57                                                                 ` [PATCH 09/14] umh: Stop calling do_execve_file Eric W. Biederman
                                                                                   ` (9 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


Instead of loading a binary blob into a temporary file with
shmem_kernel_file_setup, load a binary blob into a temporary tmpfs
filesystem.  This means that the blob can be stored in an init section
and discarded, and it means the binary blob will have a filename so it
can be executed normally.

The only tricky thing about this code is that in the helper function
blob_to_mnt __fput_sync is used.  That is because a file can not be
executed if it is still open for write, and the ordinary delayed close
for kernel threads does not happen soon enough, which causes the
following exec to fail.  The function umd_load_blob is not called with
any locks so this should be safe.

Executing the blob normally winds up correcting several problems with
the user mode driver code discovered by Tetsuo Handa[1].  By passing
an ordinary filename into the exec, it is no longer necessary to
figure out how to turn an O_RDWR file descriptor into a properly
reference-counted O_EXEC file descriptor that forbids all writes.  For
path based LSMs there are no new special cases.

[1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umd.h          |   6 +-
 kernel/umd.c                 | 121 ++++++++++++++++++++++++++---------
 net/bpfilter/bpfilter_kern.c |  14 +++-
 3 files changed, 108 insertions(+), 33 deletions(-)

diff --git a/include/linux/umd.h b/include/linux/umd.h
index 6c3e00e0520b..d4a2e9e6f154 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -2,6 +2,7 @@
 #define __LINUX_UMD_H__
 
 #include <linux/umh.h>
+#include <linux/path.h>
 
 struct umd_info {
 	const char *driver_name;
@@ -9,8 +10,11 @@ struct umd_info {
 	struct file *pipe_from_umh;
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
+	struct path wd;
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
+int umd_load_blob(struct umd_info *info, const void *data, size_t len);
+int umd_unload_blob(struct umd_info *info);
+int fork_usermode_driver(struct umd_info *info);
 
 #endif /* __LINUX_UMD_H__ */
diff --git a/kernel/umd.c b/kernel/umd.c
index bad2e8da7f96..afb689d6bf35 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -4,11 +4,93 @@
  */
 #include <linux/shmem_fs.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
 #include <linux/umd.h>
 
 static LIST_HEAD(umh_list);
 static DEFINE_MUTEX(umh_list_lock);
 
+static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
+{
+	struct file_system_type *type;
+	struct vfsmount *mnt;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+
+	type = get_fs_type("tmpfs");
+	if (!type)
+		return ERR_PTR(-ENODEV);
+
+	mnt = kern_mount(type);
+	put_filesystem(type);
+	if (IS_ERR(mnt))
+		return mnt;
+
+	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+	if (IS_ERR(file)) {
+		mntput(mnt);
+		return ERR_CAST(file);
+	}
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		int err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		filp_close(file, NULL);
+		mntput(mnt);
+		return ERR_PTR(err);
+	}
+
+	__fput_sync(file);
+	return mnt;
+}
+
+/**
+ * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
+ * @info: information about usermode driver
+ * @data: a blob of bytes that can be executed as a file
+ * @len:  The lentgh of the blob
+ *
+ */
+int umd_load_blob(struct umd_info *info, const void *data, size_t len)
+{
+	struct vfsmount *mnt;
+
+	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+		return -EBUSY;
+
+	mnt = blob_to_mnt(data, len, info->driver_name);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	info->wd.mnt = mnt;
+	info->wd.dentry = mnt->mnt_root;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_load_blob);
+
+/**
+ * umd_unload_blob - Disassociate @info from a previously loaded blob
+ * @info: information about usermode driver
+ *
+ */
+int umd_unload_blob(struct umd_info *info)
+{
+	if (WARN_ON_ONCE(!info->wd.mnt ||
+			 !info->wd.dentry ||
+			 info->wd.mnt->mnt_root != info->wd.dentry))
+		return -EINVAL;
+
+	kern_unmount(info->wd.mnt);
+	info->wd.mnt = NULL;
+	info->wd.dentry = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_unload_blob);
+
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umd_info *umd_info = info->data;
@@ -43,6 +125,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
+	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->pid = task_pid_nr(current);
@@ -62,39 +145,21 @@ static void umd_cleanup(struct subprocess_info *info)
 }
 
 /**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
+ * fork_usermode_driver - fork a usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
  *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umd_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
+ * Returns either negative error or zero which indicates success in
+ * executing a usermode driver. In such case 'struct umd_info *info'
+ * is populated with two pipes and a pid of the process. The caller is
+ * responsible for health check of the user process, killing it via
+ * pid, and closing the pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
+int fork_usermode_driver(struct umd_info *info)
 {
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup(info->driver_name, len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -106,7 +171,6 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	if (!sub_info)
 		goto out;
 
-	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -116,10 +180,9 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 out:
 	if (argv)
 		argv_free(argv);
-	fput(file);
 	return err;
 }
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
+EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
 void __exit_umh(struct task_struct *tsk)
 {
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index c0f0990f30b6..28883b00609d 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -77,9 +77,7 @@ static int start_umh(void)
 	int err;
 
 	/* fork usermode process */
-	err = fork_usermode_blob(&bpfilter_umh_start,
-				 &bpfilter_umh_end - &bpfilter_umh_start,
-				 &bpfilter_ops.info);
+	err = fork_usermode_driver(&bpfilter_ops.info);
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
@@ -98,6 +96,12 @@ static int __init load_umh(void)
 {
 	int err;
 
+	err = umd_load_blob(&bpfilter_ops.info,
+			    &bpfilter_umh_start,
+			    &bpfilter_umh_end - &bpfilter_umh_start);
+	if (err)
+		return err;
+
 	mutex_lock(&bpfilter_ops.lock);
 	if (!bpfilter_ops.stop) {
 		err = -EFAULT;
@@ -110,6 +114,8 @@ static int __init load_umh(void)
 	}
 out:
 	mutex_unlock(&bpfilter_ops.lock);
+	if (err)
+		umd_unload_blob(&bpfilter_ops.info);
 	return err;
 }
 
@@ -122,6 +128,8 @@ static void __exit fini_umh(void)
 		bpfilter_ops.sockopt = NULL;
 	}
 	mutex_unlock(&bpfilter_ops.lock);
+
+	umd_unload_blob(&bpfilter_ops.info);
 }
 module_init(load_umh);
 module_exit(fini_umh);
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 09/14] umh: Stop calling do_execve_file
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (7 preceding siblings ...)
  2020-06-26 12:56                                                                 ` [PATCH 08/14] umd: Transform fork_usermode_blob into fork_usermode_driver Eric W. Biederman
@ 2020-06-26 12:57                                                                 ` Eric W. Biederman
  2020-06-26 12:57                                                                 ` [PATCH 10/14] exec: Remove do_execve_file Eric W. Biederman
                                                                                   ` (8 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


With the user mode driver code changed to not set subprocess_info.file
there are no more users of subprocess_info.file.  Remove this field
from struct subprocess_info and remove the only user in
call_usermodehelper_exec_async that would call do_execve_file instead
of do_execve if file was set.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  1 -
 kernel/umh.c        | 10 +++-------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 73173c4a07e5..244aff638220 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -22,7 +22,6 @@ struct subprocess_info {
 	const char *path;
 	char **argv;
 	char **envp;
-	struct file *file;
 	int wait;
 	int retval;
 	int (*init)(struct subprocess_info *info, struct cred *new);
diff --git a/kernel/umh.c b/kernel/umh.c
index 3e4e453d45c8..6ca2096298b9 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -98,13 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file)
-		retval = do_execve_file(sub_info->file,
-					sub_info->argv, sub_info->envp);
-	else
-		retval = do_execve(getname_kernel(sub_info->path),
-				   (const char __user *const __user *)sub_info->argv,
-				   (const char __user *const __user *)sub_info->envp);
+	retval = do_execve(getname_kernel(sub_info->path),
+			   (const char __user *const __user *)sub_info->argv,
+			   (const char __user *const __user *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 10/14] exec: Remove do_execve_file
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (8 preceding siblings ...)
  2020-06-26 12:57                                                                 ` [PATCH 09/14] umh: Stop calling do_execve_file Eric W. Biederman
@ 2020-06-26 12:57                                                                 ` Eric W. Biederman
  2020-06-26 12:58                                                                 ` [PATCH 11/14] bpfilter: Move bpfilter_umh back into init data Eric W. Biederman
                                                                                   ` (7 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


Now that the last caller has been removed, remove this code from exec.

For anyone thinking of resurrecting do_execve_file, please note that
the code was buggy in several fundamental ways.

- It did not ensure the file it was passed was read-only and that
  deny_write_access had been called on it, which subtly breaks
  invariants in exec.

- The caller of do_execve_file was expected to hold and put a
  reference to the file, but an extra reference for use by exec was
  not taken, so that when exec put its reference to the file an
  underflow occurred on the file reference count.

- The point of the interface was so that a pathname did not need to
  exist, which breaks pathname based LSMs.

Tetsuo Handa originally reported these issues[1].  While it was clear
that deny_write_access was missing, the fundamental incompatibility
with the passed-in O_RDWR filehandle was not immediately recognized.

All of these issues were fixed by modifying the usermode driver code
to have a path, so it did not need this hack.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c               | 38 +++++++++-----------------------------
 include/linux/binfmts.h |  1 -
 2 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..23dfbb820626 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1818,13 +1818,14 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int __do_execve_file(int fd, struct filename *filename,
-			    struct user_arg_ptr argv,
-			    struct user_arg_ptr envp,
-			    int flags, struct file *file)
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
 {
 	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
+	struct file *file;
 	struct files_struct *displaced;
 	int retval;
 
@@ -1863,8 +1864,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;
 
-	if (!file)
-		file = do_open_execat(fd, filename, flags);
+	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1872,9 +1872,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	sched_exec();
 
 	bprm->file = file;
-	if (!filename) {
-		bprm->filename = "none";
-	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
+	if (fd == AT_FDCWD || filename->name[0] == '/') {
 		bprm->filename = filename->name;
 	} else {
 		if (filename->name[0] == '\0')
@@ -1935,8 +1933,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	task_numa_free(current, false);
 	free_bprm(bprm);
 	kfree(pathbuf);
-	if (filename)
-		putname(filename);
+	putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
 	return retval;
@@ -1967,27 +1964,10 @@ static int __do_execve_file(int fd, struct filename *filename,
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
-	if (filename)
-		putname(filename);
+	putname(filename);
 	return retval;
 }
 
-static int do_execveat_common(int fd, struct filename *filename,
-			      struct user_arg_ptr argv,
-			      struct user_arg_ptr envp,
-			      int flags)
-{
-	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
-}
-
-int do_execve_file(struct file *file, void *__argv, void *__envp)
-{
-	struct user_arg_ptr argv = { .ptr.native = __argv };
-	struct user_arg_ptr envp = { .ptr.native = __envp };
-
-	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
-}
-
 int do_execve(struct filename *filename,
 	const char __user *const __user *__argv,
 	const char __user *const __user *__envp)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 4a20b7517dd0..7c27d7b57871 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -141,6 +141,5 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
-int do_execve_file(struct file *file, void *__argv, void *__envp);
 
 #endif /* _LINUX_BINFMTS_H */
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 11/14] bpfilter: Move bpfilter_umh back into init data
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (9 preceding siblings ...)
  2020-06-26 12:57                                                                 ` [PATCH 10/14] exec: Remove do_execve_file Eric W. Biederman
@ 2020-06-26 12:58                                                                 ` Eric W. Biederman
  2020-06-26 12:58                                                                 ` [PATCH 12/14] umd: Track user space drivers with struct pid Eric W. Biederman
                                                                                   ` (6 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


To allow for restarts 61fbf5933d42 ("net: bpfilter: restart
bpfilter_umh when error occurred") moved the blob holding the
userspace binary out of the init sections.

Now that loading the blob into a filesystem is separate from executing
the blob, the blob no longer needs to live in .rodata to allow for
restarting.  So move the blob back to .init.rodata.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 net/bpfilter/bpfilter_umh_blob.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
index 9ea6100dca87..40311d10d2f2 100644
--- a/net/bpfilter/bpfilter_umh_blob.S
+++ b/net/bpfilter/bpfilter_umh_blob.S
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-	.section .rodata, "a"
+	.section .init.rodata, "a"
 	.global bpfilter_umh_start
 bpfilter_umh_start:
 	.incbin "net/bpfilter/bpfilter_umh"
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 12/14] umd: Track user space drivers with struct pid
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (10 preceding siblings ...)
  2020-06-26 12:58                                                                 ` [PATCH 11/14] bpfilter: Move bpfilter_umh back into init data Eric W. Biederman
@ 2020-06-26 12:58                                                                 ` Eric W. Biederman
  2020-06-26 12:59                                                                 ` [PATCH 13/14] bpfilter: Take advantage of the facilities of " Eric W. Biederman
                                                                                   ` (5 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


Use struct pid instead of user space pid values that are prone to wrap
around.

In addition track the entire thread group instead of just the first
thread that is started by exec.  There are no multi-threaded user mode
drivers today but there is nothing precluding user drivers from being
multi-threaded, so it is just a good idea to track the entire process.

Take a reference count on the tgid's in question to make it possible
to remove exit_umh in a future change.

As a struct pid is available directly use kill_pid_info.

The prior process signalling code was iffy in using a userspace pid
known to be in the initial pid namespace and then looking up its task
in whatever the current pid namespace is.  It worked only because
kernel threads always run in the initial pid namespace.

As the tgid is now refcounted verify the tgid is NULL at the start of
fork_usermode_driver to avoid the possibility of silent pid leaks.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umd.h          |  2 +-
 kernel/exit.c                |  3 ++-
 kernel/umd.c                 | 15 ++++++++++-----
 net/bpfilter/bpfilter_kern.c | 13 +++++--------
 net/ipv4/bpfilter/sockopt.c  |  3 ++-
 5 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/linux/umd.h b/include/linux/umd.h
index d4a2e9e6f154..1c4579d79bce 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -11,7 +11,7 @@ struct umd_info {
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
 	struct path wd;
-	pid_t pid;
+	struct pid *tgid;
 };
 int umd_load_blob(struct umd_info *info, const void *data, size_t len);
 int umd_unload_blob(struct umd_info *info);
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..671d5066b399 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -804,7 +804,8 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	exit_umh(tsk);
+	if (group_dead)
+		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/umd.c b/kernel/umd.c
index afb689d6bf35..0db9ce3f56c9 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -128,7 +128,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
-	umd_info->pid = task_pid_nr(current);
+	umd_info->tgid = get_pid(task_tgid(current));
 	current->flags |= PF_UMH;
 	return 0;
 }
@@ -141,6 +141,8 @@ static void umd_cleanup(struct subprocess_info *info)
 	if (info->retval) {
 		fput(umd_info->pipe_to_umh);
 		fput(umd_info->pipe_from_umh);
+		put_pid(umd_info->tgid);
+		umd_info->tgid = NULL;
 	}
 }
 
@@ -150,9 +152,9 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success in
  * executing a usermode driver. In such case 'struct umd_info *info'
- * is populated with two pipes and a pid of the process. The caller is
+ * is populated with two pipes and a tgid of the process. The caller is
  * responsible for health check of the user process, killing it via
- * pid, and closing the pipes when user process is no longer needed.
+ * tgid, and closing the pipes when user process is no longer needed.
  */
 int fork_usermode_driver(struct umd_info *info)
 {
@@ -160,6 +162,9 @@ int fork_usermode_driver(struct umd_info *info)
 	char **argv = NULL;
 	int err;
 
+	if (WARN_ON_ONCE(info->tgid))
+		return -EBUSY;
+
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -187,11 +192,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_driver);
 void __exit_umh(struct task_struct *tsk)
 {
 	struct umd_info *info;
-	pid_t pid = tsk->pid;
+	struct pid *tgid = task_tgid(tsk);
 
 	mutex_lock(&umh_list_lock);
 	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
+		if (info->tgid == tgid) {
 			list_del(&info->list);
 			mutex_unlock(&umh_list_lock);
 			goto out;
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 28883b00609d..b73dedeb6dbf 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -15,16 +15,13 @@ extern char bpfilter_umh_end;
 
 static void shutdown_umh(void)
 {
-	struct task_struct *tsk;
+	struct umd_info *info = &bpfilter_ops.info;
+	struct pid *tgid = info->tgid;
 
 	if (bpfilter_ops.stop)
 		return;
 
-	tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID);
-	if (tsk) {
-		send_sig(SIGKILL, tsk, 1);
-		put_task_struct(tsk);
-	}
+	kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
 }
 
 static void __stop_umh(void)
@@ -48,7 +45,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname,
 	req.cmd = optname;
 	req.addr = (long __force __user)optval;
 	req.len = optlen;
-	if (!bpfilter_ops.info.pid)
+	if (!bpfilter_ops.info.tgid)
 		goto out;
 	n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req),
 			   &pos);
@@ -81,7 +78,7 @@ static int start_umh(void)
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
-	pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid);
+	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
 
 	/* health check that usermode process started correctly */
 	if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 5050de28333d..56cbc43145f6 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -18,7 +18,8 @@ static void bpfilter_umh_cleanup(struct umd_info *info)
 	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
-	info->pid = 0;
+	put_pid(info->tgid);
+	info->tgid = NULL;
 	mutex_unlock(&bpfilter_ops.lock);
 }
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 13/14] bpfilter: Take advantage of the facilities of struct pid
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (11 preceding siblings ...)
  2020-06-26 12:58                                                                 ` [PATCH 12/14] umd: Track user space drivers with struct pid Eric W. Biederman
@ 2020-06-26 12:59                                                                 ` Eric W. Biederman
  2020-06-26 12:59                                                                 ` [PATCH 14/14] umd: Remove exit_umh Eric W. Biederman
                                                                                   ` (4 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


Instead of relying on the exit_umh cleanup callback use the fact a
struct pid can be tested to see if a process still exists, and that
struct pid has a wait queue that notifies when the process dies.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h     |  3 ++-
 net/bpfilter/bpfilter_kern.c | 15 +++++----------
 net/ipv4/bpfilter/sockopt.c  | 15 ++++++++-------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index 4b43d2240172..8073ddce73b1 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -10,6 +10,8 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
 			    unsigned int optlen);
 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 			    int __user *optlen);
+void bpfilter_umh_cleanup(struct umd_info *info);
+
 struct bpfilter_umh_ops {
 	struct umd_info info;
 	/* since ip_getsockopt() can run in parallel, serialize access to umh */
@@ -18,7 +20,6 @@ struct bpfilter_umh_ops {
 		       char __user *optval,
 		       unsigned int optlen, bool is_set);
 	int (*start)(void);
-	bool stop;
 };
 extern struct bpfilter_umh_ops bpfilter_ops;
 #endif
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index b73dedeb6dbf..91474884ddb7 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -18,10 +18,11 @@ static void shutdown_umh(void)
 	struct umd_info *info = &bpfilter_ops.info;
 	struct pid *tgid = info->tgid;
 
-	if (bpfilter_ops.stop)
-		return;
-
-	kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
+	if (tgid) {
+		kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
+		wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
+		bpfilter_umh_cleanup(info);
+	}
 }
 
 static void __stop_umh(void)
@@ -77,7 +78,6 @@ static int start_umh(void)
 	err = fork_usermode_driver(&bpfilter_ops.info);
 	if (err)
 		return err;
-	bpfilter_ops.stop = false;
 	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
 
 	/* health check that usermode process started correctly */
@@ -100,16 +100,11 @@ static int __init load_umh(void)
 		return err;
 
 	mutex_lock(&bpfilter_ops.lock);
-	if (!bpfilter_ops.stop) {
-		err = -EFAULT;
-		goto out;
-	}
 	err = start_umh();
 	if (!err && IS_ENABLED(CONFIG_INET)) {
 		bpfilter_ops.sockopt = &__bpfilter_process_sockopt;
 		bpfilter_ops.start = &start_umh;
 	}
-out:
 	mutex_unlock(&bpfilter_ops.lock);
 	if (err)
 		umd_unload_blob(&bpfilter_ops.info);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 56cbc43145f6..9455eb9cec78 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -12,16 +12,14 @@
 struct bpfilter_umh_ops bpfilter_ops;
 EXPORT_SYMBOL_GPL(bpfilter_ops);
 
-static void bpfilter_umh_cleanup(struct umd_info *info)
+void bpfilter_umh_cleanup(struct umd_info *info)
 {
-	mutex_lock(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
 	put_pid(info->tgid);
 	info->tgid = NULL;
-	mutex_unlock(&bpfilter_ops.lock);
 }
+EXPORT_SYMBOL_GPL(bpfilter_umh_cleanup);
 
 static int bpfilter_mbox_request(struct sock *sk, int optname,
 				 char __user *optval,
@@ -39,7 +37,11 @@ static int bpfilter_mbox_request(struct sock *sk, int optname,
 			goto out;
 		}
 	}
-	if (bpfilter_ops.stop) {
+	if (bpfilter_ops.info.tgid &&
+	    !pid_has_task(bpfilter_ops.info.tgid, PIDTYPE_TGID))
+		bpfilter_umh_cleanup(&bpfilter_ops.info);
+
+	if (!bpfilter_ops.info.tgid) {
 		err = bpfilter_ops.start();
 		if (err)
 			goto out;
@@ -70,9 +72,8 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 static int __init bpfilter_sockopt_init(void)
 {
 	mutex_init(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
+	bpfilter_ops.info.tgid = NULL;
 	bpfilter_ops.info.driver_name = "bpfilter_umh";
-	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
 
 	return 0;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH 14/14] umd: Remove exit_umh
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (12 preceding siblings ...)
  2020-06-26 12:59                                                                 ` [PATCH 13/14] bpfilter: Take advantage of the facilities of " Eric W. Biederman
@ 2020-06-26 12:59                                                                 ` Eric W. Biederman
  2020-06-26 13:48                                                                 ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (3 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 12:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler


The bpfilter code no longer uses the umd_info.cleanup callback.  This
callback is what exit_umh exists to call.  So remove exit_umh and all
of its associated bookkeeping.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched.h |  9 ---------
 include/linux/umd.h   |  2 --
 kernel/exit.c         |  2 --
 kernel/umd.c          | 28 ----------------------------
 4 files changed, 41 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b62e6aaf28f0..edb2020875ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1511,7 +1511,6 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
-#define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
@@ -2020,14 +2019,6 @@ static inline void rseq_execve(struct task_struct *t)
 
 #endif
 
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-
 #ifdef CONFIG_DEBUG_RSEQ
 
 void rseq_syscall(struct pt_regs *regs);
diff --git a/include/linux/umd.h b/include/linux/umd.h
index 1c4579d79bce..71d8f4a41ad7 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -8,8 +8,6 @@ struct umd_info {
 	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umd_info *info);
 	struct path wd;
 	struct pid *tgid;
 };
diff --git a/kernel/exit.c b/kernel/exit.c
index 671d5066b399..42f079eb71e5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -804,8 +804,6 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	if (group_dead)
-		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/umd.c b/kernel/umd.c
index 0db9ce3f56c9..de2f542191e5 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -8,9 +8,6 @@
 #include <linux/fs_struct.h>
 #include <linux/umd.h>
 
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
-
 static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
 {
 	struct file_system_type *type;
@@ -129,7 +126,6 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->tgid = get_pid(task_tgid(current));
-	current->flags |= PF_UMH;
 	return 0;
 }
 
@@ -177,11 +173,6 @@ int fork_usermode_driver(struct umd_info *info)
 		goto out;
 
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
 out:
 	if (argv)
 		argv_free(argv);
@@ -189,23 +180,4 @@ int fork_usermode_driver(struct umd_info *info)
 }
 EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umd_info *info;
-	struct pid *tgid = task_tgid(tsk);
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->tgid == tgid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (13 preceding siblings ...)
  2020-06-26 12:59                                                                 ` [PATCH 14/14] umd: Remove exit_umh Eric W. Biederman
@ 2020-06-26 13:48                                                                 ` Eric W. Biederman
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
  2020-06-26 14:10                                                                 ` [PATCH 00/14] " Greg Kroah-Hartman
                                                                                   ` (2 subsequent siblings)
  17 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 13:48 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain


Adding Luis Chamberlain as he maintains the user mode helper code.

Just so everyone who is relevant is at least aware of what is going on.

ebiederm@xmission.com (Eric W. Biederman) writes:

> Asking for people to fix their bugs in this user mode driver code has
> been remarkably unproductive.  So here are my bug fixes.
>
> I have tested them by booting with the code compiled in and
> by killing "bpfilter_umh" and running iptables -vnL to restart
> the userspace driver.
>
> I have split the changes into small enough pieces so they should be
> easily readable and testable.  
>
> The changes lean into the preexisting interfaces in the kernel and
> remove special cases for user mode driver code in favor of solutions
> that don't need special cases.  This results in smaller code with
> fewer bugs.
>
> At a practical level this removes the maintenance burden of the
> user mode drivers from the user mode helper code and from exec as
> the special cases are removed.
>
> Similarly the LSM interaction bugs are fixed by not having unnecessary
> special cases for user mode drivers.
>
> Please let me know if you see any bugs.  Once the code review is
> finished I plan to take this through my tree.
>
> Eric W. Biederman (14):
>       umh: Capture the pid in umh_pipe_setup
>       umh: Move setting PF_UMH into umh_pipe_setup
>       umh: Rename the user mode driver helpers for clarity
>       umh: Remove call_usermodehelper_setup_file.
>       umh: Separate the user mode driver and the user mode helper support
>       umd: For clarity rename umh_info umd_info
>       umd: Rename umd_info.cmdline umd_info.driver_name
>       umd: Transform fork_usermode_blob into fork_usermode_driver
>       umh: Stop calling do_execve_file
>       exec: Remove do_execve_file
>       bpfilter: Move bpfilter_umh back into init data
>       umd: Track user space drivers with struct pid
>       bpfilter: Take advantage of the facilities of struct pid
>       umd: Remove exit_umh
>
>  fs/exec.c                        |  38 ++------
>  include/linux/binfmts.h          |   1 -
>  include/linux/bpfilter.h         |   7 +-
>  include/linux/sched.h            |   9 --
>  include/linux/umd.h              |  18 ++++
>  include/linux/umh.h              |  15 ----
>  kernel/Makefile                  |   1 +
>  kernel/exit.c                    |   1 -
>  kernel/umd.c                     | 183 +++++++++++++++++++++++++++++++++++++++
>  kernel/umh.c                     | 171 +-----------------------------------
>  net/bpfilter/bpfilter_kern.c     |  38 ++++----
>  net/bpfilter/bpfilter_umh_blob.S |   2 +-
>  net/ipv4/bpfilter/sockopt.c      |  20 +++--
>  13 files changed, 249 insertions(+), 255 deletions(-)
>
> Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (14 preceding siblings ...)
  2020-06-26 13:48                                                                 ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
@ 2020-06-26 14:10                                                                 ` Greg Kroah-Hartman
  2020-06-26 16:40                                                                 ` Alexei Starovoitov
  2020-06-27 11:38                                                                 ` Tetsuo Handa
  17 siblings, 0 replies; 194+ messages in thread
From: Greg Kroah-Hartman @ 2020-06-26 14:10 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Tetsuo Handa, Alexei Starovoitov,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler

On Fri, Jun 26, 2020 at 07:51:41AM -0500, Eric W. Biederman wrote:
> 
> Asking for people to fix their bugs in this user mode driver code has
> been remarkably unproductive.  So here are my bug fixes.
> 
> I have tested them by booting with the code compiled in and
> by killing "bpfilter_umh" and running iptables -vnL to restart
> the userspace driver.
> 
> I have split the changes into small enough pieces so they should be
> easily readable and testable.  
> 
> The changes lean into the preexisting interfaces in the kernel and
> remove special cases for user mode driver code in favor of solutions
> that don't need special cases.  This results in smaller code with
> fewer bugs.
> 
> At a practical level this removes the maintenance burden of the
> user mode drivers from the user mode helper code and from exec as
> the special cases are removed.
> 
> Similarly the LSM interaction bugs are fixed by not having unnecessary
> special cases for user mode drivers.
> 
> Please let me know if you see any bugs.  Once the code review is
> finished I plan to take this through my tree.
> 
> Eric W. Biederman (14):
>       umh: Capture the pid in umh_pipe_setup
>       umh: Move setting PF_UMH into umh_pipe_setup
>       umh: Rename the user mode driver helpers for clarity
>       umh: Remove call_usermodehelper_setup_file.
>       umh: Separate the user mode driver and the user mode helper support
>       umd: For clarity rename umh_info umd_info
>       umd: Rename umd_info.cmdline umd_info.driver_name
>       umd: Transform fork_usermode_blob into fork_usermode_driver
>       umh: Stop calling do_execve_file
>       exec: Remove do_execve_file
>       bpfilter: Move bpfilter_umh back into init data
>       umd: Track user space drivers with struct pid
>       bpfilter: Take advantage of the facilities of struct pid
>       umd: Remove exit_umh

After a quick read, all looks sane to me, nice cleanups!

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support
  2020-06-26 12:55                                                                 ` [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
@ 2020-06-26 14:17                                                                   ` kernel test robot
  2020-06-26 16:22                                                                   ` Tetsuo Handa
  1 sibling, 0 replies; 194+ messages in thread
From: kernel test robot @ 2020-06-26 14:17 UTC (permalink / raw)
  To: Eric W. Biederman, Linus Torvalds
  Cc: kbuild-all, LKML, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton,
	Linux Memory Management List, Al Viro, bpf


[-- Attachment #1: Type: text/plain, Size: 1354 bytes --]

Hi "Eric,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]
[also build test ERROR on bpf/master linux/master linus/master v5.8-rc2 next-20200626]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Eric-W-Biederman/Make-the-user-mode-driver-code-a-better-citizen/20200626-210513
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-tinyconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-13) 9.3.0
reproduce (this is a W=1 build):
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

Note: the linux-review/Eric-W-Biederman/Make-the-user-mode-driver-code-a-better-citizen/20200626-210513 HEAD 5c5f3ba486c1c4173950c91b97ce9f3b5f6c7403 builds fine.
      It only hurts bisectability.

All errors (new ones prefixed by >>):

   ld: kernel/exit.o: in function `do_exit':
>> exit.c:(.text+0xec0): undefined reference to `__exit_umh'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 7287 bytes --]

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 06/14] umd: For clarity rename umh_info umd_info
  2020-06-26 12:55                                                                 ` [PATCH 06/14] umd: For clarity rename umh_info umd_info Eric W. Biederman
@ 2020-06-26 15:37                                                                   ` Kees Cook
  2020-06-26 16:31                                                                     ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Kees Cook @ 2020-06-26 15:37 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Andrew Morton, Alexei Starovoitov, Al Viro,
	bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Fri, Jun 26, 2020 at 07:55:43AM -0500, Eric W. Biederman wrote:
> This structure is only used for user mode drivers so change
> the prefix from umh to umd to make that clear.

Should bpfilter_umh get renamed to bpfilter_umd at some point in this
series too?

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support
  2020-06-26 12:55                                                                 ` [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
  2020-06-26 14:17                                                                   ` kernel test robot
@ 2020-06-26 16:22                                                                   ` Tetsuo Handa
  2020-06-26 16:45                                                                     ` Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-26 16:22 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/26 21:55, Eric W. Biederman wrote:
> +static void umd_cleanup(struct subprocess_info *info)
> +{
> +	struct umh_info *umh_info = info->data;
> +
> +	/* cleanup if umh_pipe_setup() was successful but exec failed */

s/umh_pipe_setup/umd_setup/

> +	if (info->retval) {
> +		fput(umh_info->pipe_to_umh);
> +		fput(umh_info->pipe_from_umh);
> +	}
> +}

After this cleanup, I expect adding some protections/isolation which kernel threads
have (e.g. excluded from ptrace(), excluded from OOM victim selection, excluded from
SysRq-i, won't be terminated by SIGKILL from usermode processes, won't be stopped by
SIGSTOP from usermode processes, what else?). Doing it means giving up Alexei's

  It's nice to be able to compile that blob with -g and be able to 'gdb -p' into it.
  That works and very convenient when it comes to debugging. Compare that to debugging
  a kernel module!

but I think doing it is essential for keeping usermode blob processes as secure/robust
as kernel threads.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 06/14] umd: For clarity rename umh_info umd_info
  2020-06-26 15:37                                                                   ` Kees Cook
@ 2020-06-26 16:31                                                                     ` Eric W. Biederman
  0 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 16:31 UTC (permalink / raw)
  To: Kees Cook
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Andrew Morton, Alexei Starovoitov, Al Viro,
	bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

Kees Cook <keescook@chromium.org> writes:

> On Fri, Jun 26, 2020 at 07:55:43AM -0500, Eric W. Biederman wrote:
>> This structure is only used for user mode drivers so change
>> the prefix from umh to umd to make that clear.
>
> Should bpfilter_umh get renamed to bpfilter_umd at some point in this
> series too?

I think it would make a natural follow on, in a patches welcome sort of
way.

In this series I think it is important to draw a clear line between the
user mode driver infrastructure and the more general user mode helper
infrastructure.  As it fundamentally makes a difference when you are
skimming through the code trying to find the details you care about.

But that line, which removes the maintenance burden from everyone else
is where this series stops.

I will be reposting shortly to fix the build issue I overlooked.

Eric









^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (15 preceding siblings ...)
  2020-06-26 14:10                                                                 ` [PATCH 00/14] " Greg Kroah-Hartman
@ 2020-06-26 16:40                                                                 ` Alexei Starovoitov
  2020-06-26 17:17                                                                   ` Eric W. Biederman
  2020-06-27 11:38                                                                 ` Tetsuo Handa
  17 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-26 16:40 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler

On Fri, Jun 26, 2020 at 07:51:41AM -0500, Eric W. Biederman wrote:
> 
> Asking for people to fix their bugs in this user mode driver code has
> been remarkably unproductive.  So here are my bug fixes.
> 
> I have tested them by booting with the code compiled in and
> by killing "bpfilter_umh" and running iptables -vnL to restart
> the userspace driver.
> 
> I have split the changes into small enough pieces so they should be
> easily readable and testable.  
> 
> The changes lean into the preexisting interfaces in the kernel and
> remove special cases for user mode driver code in favor of solutions
> that don't need special cases.  This results in smaller code with
> fewer bugs.
> 
> At a practical level this removes the maintenance burden of the
> user mode drivers from the user mode helper code and from exec as
> the special cases are removed.
> 
> Similarly the LSM interaction bugs are fixed by not having unnecessary
> special cases for user mode drivers.
> 
> Please let me know if you see any bugs.  Once the code review is
> finished I plan to take this through my tree.

I did a quick look and I like the cleanup. Thanks! The end result looks good.
The only problem is that you keep breaking the build between patches,
so series will not be bisectable.
blob_to_mnt is a great idea. Much better than embedding fs you advocated earlier.

I'm swamped with other stuff today and will test the set Sunday/Monday
with other patches that I'm working on.
I'm not sure why you want to rename the interface. Seems pointless. But fine.

As far as routing trees. Do you mind I'll take it via bpf-next ?
As I said countless times we're working on bpf_iter using fork_blob.
If you take this set via your tree we would need to wait the whole kernel release.
Which is 8+ weeks before we can use the interface (due to renaming and overall changes).
I'd really like to avoid this huge delay.
Unless you can land it into 5.8-rc2 or rc3.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support
  2020-06-26 16:22                                                                   ` Tetsuo Handa
@ 2020-06-26 16:45                                                                     ` Eric W. Biederman
  2020-06-27  1:26                                                                       ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 16:45 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:

> On 2020/06/26 21:55, Eric W. Biederman wrote:
>> +static void umd_cleanup(struct subprocess_info *info)
>> +{
>> +	struct umh_info *umh_info = info->data;
>> +
>> +	/* cleanup if umh_pipe_setup() was successful but exec failed */
>
> s/umh_pipe_setup/umd_setup/

Good catch.  I will fix that when I respin.

>> +	if (info->retval) {
>> +		fput(umh_info->pipe_to_umh);
>> +		fput(umh_info->pipe_from_umh);
>> +	}
>> +}
>
> After this cleanup, I expect adding some protections/isolation which kernel threads
> have (e.g. excluded from ptrace(), excluded from OOM victim selection, excluded from
> SysRq-i, won't be terminated by SIGKILL from usermode processes, won't be stopped by
> SIGSTOP from usermode processes, what else?). Doing it means giving up Alexei's
>
>   It's nice to be able to compile that blob with -g and be able to 'gdb -p' into it.
>   That works and very convenient when it comes to debugging. Compare that to debugging
>   a kernel module!
>
> but I think doing it is essential for keeping usermode blob processes as secure/robust
> as kernel threads.

Do you have an application for a user mode driver?

I think concerns like that are best addressed in the context of a
specific driver/usecase.  Just to make certain we are solving the right
problems.

My sense is that an advantage of user mode drivers is that they can safely
be buggier than kernel drivers, and the freedom to kill them when the
drivers go wrong (knowing the drivers will restart) is important.

Does this series by using the normal path through exec solve your
concerns with LSMs being able to identify these processes (both
individually and as a class)?

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-26 16:40                                                                 ` Alexei Starovoitov
@ 2020-06-26 17:17                                                                   ` Eric W. Biederman
  2020-06-26 18:22                                                                     ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-26 17:17 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Fri, Jun 26, 2020 at 07:51:41AM -0500, Eric W. Biederman wrote:
>> 
>> Asking for people to fix their bugs in this user mode driver code has
>> been remarkably unproductive.  So here are my bug fixes.
>> 
>> I have tested them by booting with the code compiled in and
>> by killing "bpfilter_umh" and running iptables -vnL to restart
>> the userspace driver.
>> 
>> I have split the changes into small enough pieces so they should be
>> easily readable and testable.  
>> 
>> The changes lean into the preexisting interfaces in the kernel and
>> remove special cases for user mode driver code in favor of solutions
>> that don't need special cases.  This results in smaller code with
>> fewer bugs.
>> 
>> At a practical level this removes the maintenance burden of the
>> user mode drivers from the user mode helper code and from exec as
>> the special cases are removed.
>> 
>> Similarly the LSM interaction bugs are fixed by not having unnecessary
>> special cases for user mode drivers.
>> 
>> Please let me know if you see any bugs.  Once the code review is
>> finished I plan to take this through my tree.
>
> I did a quick look and I like the cleanup. Thanks!

Good then we have a path forward.

> The end result looks good.
> The only problem that you keep breaking the build between patches,
> so series will not be bisectable.

Keep breaking?  There is an issue with patch 5/14 where the build breaks
when bpfilter is not enabled.  Do you see any others? I know I tested
each patch individually.  But I was only testing with CONFIG_BPFILTER
enabled so I missed one.

So there should not be things that break
but things slip through occasionally.

I will resend this shortly with the fix and any others that I can find.

> blob_to_mnt is a great idea. Much better than embedding fs you advocated earlier.

I was lazy and not overdesigning but I still suspect the blob will
benefit from becoming a cpio in the future.

> I'm swamped with other stuff today and will test the set Sunday/Monday
> with other patches that I'm working on.
> I'm not sure why you want to rename the interface. Seems
> pointless. But fine.

For maintainability I think the code very much benefits from a clear
separation between the user mode driver code from the user mode helper
code.

> As far as routing trees. Do you mind I'll take it via bpf-next ?
> As I said countless times we're working on bpf_iter using fork_blob.
> If you take this set via your tree we would need to wait the whole kernel release.
> Which is 8+ weeks before we can use the interface (due to renaming and overall changes).
> I'd really like to avoid this huge delay.
> Unless you can land it into 5.8-rc2 or rc3.

I also want to build upon this code.

How about when the review is done I post a frozen branch based on
v5.8-rc1 that you can merge into the bpf-next tree, and I can merge into
my branch.  That way we both can build upon this code.  That is the way
conflicts like this are usually handled.

Further I will leave any further enhancements to the user mode driver
infrastructure that people have suggested to you.

I will probably replace do_execve with a kernel_execve that doesn't need
set_fs() to copy the command line argument.  I haven't seen Christoph
Hellwig address that yet, and it looks pretty straightforward at this
point.


Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-26 17:17                                                                   ` Eric W. Biederman
@ 2020-06-26 18:22                                                                     ` Alexei Starovoitov
  0 siblings, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-26 18:22 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler

On Fri, Jun 26, 2020 at 12:17:40PM -0500, Eric W. Biederman wrote:
> 
> > I'm swamped with other stuff today and will test the set Sunday/Monday
> > with other patches that I'm working on.
> > I'm not sure why you want to rename the interface. Seems
> > pointless. But fine.
> 
> For maintainability I think the code very much benefits from a clear
> separation between the user mode driver code from the user mode helper
> code.

you mean different name gives that separation? makes sense.

> > As far as routing trees. Do you mind I'll take it via bpf-next ?
> > As I said countless times we're working on bpf_iter using fork_blob.
> > If you take this set via your tree we would need to wait the whole kernel release.
> > Which is 8+ weeks before we can use the interface (due to renaming and overall changes).
> > I'd really like to avoid this huge delay.
> > Unless you can land it into 5.8-rc2 or rc3.
> 
> I also want to build upon this code.
> 
> How about when the review is done I post a frozen branch based on
> v5.8-rc1 that you can merge into the bpf-next tree, and I can merge into
> my branch.  That way we both can build upon this code.  That is the way
> conflicts like this are usually handled.

sure. that works too.

> Further I will leave any further enhancements to the user mode driver
> infrastructure that people have suggested to you.

ok

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support
  2020-06-26 16:45                                                                     ` Eric W. Biederman
@ 2020-06-27  1:26                                                                       ` Tetsuo Handa
  2020-06-27  4:21                                                                         ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-27  1:26 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/27 1:45, Eric W. Biederman wrote:
>> After this cleanup, I expect adding some protections/isolation which kernel threads
>> have (e.g. excluded from ptrace(), excluded from OOM victim selection, excluded from
>> SysRq-i, won't be terminated by SIGKILL from usermode processes, won't be stopped by
>> SIGSTOP from usermode processes, what else?). Doing it means giving up Alexei's
>>
>>   It's nice to be able to compile that blob with -g and be able to 'gdb -p' into it.
>>   That works and very convenient when it comes to debugging. Compare that to debugging
>>   a kernel module!
>>
>> but I think doing it is essential for keeping usermode blob processes as secure/robust
>> as kernel threads.
> 
> Do you have an application for a user mode driver?

No, I'm not a user of this interface.

> 
> I think concerns like that are best addressed in the context of a
> specific driver/usecase.  Just to make certain we are solving the right
> problems.
> 
> My sense is that an advantage of user mode drivers can safely be buggier
> than kernel drivers and the freedom to kill them when the drivers go
> wrong (knowing the drivers will restart) is important.

Right. Segmentation fault in user mode drivers does not cause a kernel oops
is an advantage of user mode drivers. But the freedom to disturb/kill user mode
drivers due to interference like ptrace()/signals from user mode processes,
SIGKILL from OOM-killer/SysRq-i etc. is a big disadvantage of user mode drivers.
I expect that user mode drivers should be killable only when the manager
interface detected that user mode drivers need to be stopped (or restarted).

One of advantages kernel mode drivers have is that their memory is not swapped
out/in. I don't know whether mlockall(MCL_FUTURE) should be automatically applied
to user mode drivers.

> 
> Does this series by using the normal path through exec solve your
> concerns with LSMs being able to identify these processes (both
> individually and as class)?.

I guess "yes" for pathname based LSMs. Though, TOMOYO wants to obtain both
AT_SYMLINK_NOFOLLOW "struct path" and !AT_SYMLINK_NOFOLLOW "struct path"
at do_open_execat() from do_execveat_common().

I guess "no" for inode based LSMs, for they want a chance to associate
security labels at blob_to_mnt().


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support
  2020-06-27  1:26                                                                       ` Tetsuo Handa
@ 2020-06-27  4:21                                                                         ` Eric W. Biederman
  2020-06-27  4:36                                                                           ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-27  4:21 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
> On 2020/06/27 1:45, Eric W. Biederman wrote:
>> Does this series by using the normal path through exec solve your
>> concerns with LSMs being able to identify these processes (both
>> individually and as class)?.
>
> I guess "yes" for pathname based LSMs. Though, TOMOYO wants to obtain both
> AT_SYMLINK_NOFOLLOW "struct path" and !AT_SYMLINK_NOFOLLOW "struct path"
> at do_open_execat() from do_execveat_common().

Is that a problem with the current do_execveat_common in general?

That does not sound like a problem in the user mode driver case as
there are no symlinks involved.

Eric




^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 05/14] umh: Separate the user mode driver and the user mode helper support
  2020-06-27  4:21                                                                         ` Eric W. Biederman
@ 2020-06-27  4:36                                                                           ` Tetsuo Handa
  0 siblings, 0 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-27  4:36 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/27 13:21, Eric W. Biederman wrote:
> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
>> On 2020/06/27 1:45, Eric W. Biederman wrote:
>>> Does this series by using the normal path through exec solve your
>>> concerns with LSMs being able to identify these processes (both
>>> individually and as class)?.
>>
>> I guess "yes" for pathname based LSMs. Though, TOMOYO wants to obtain both
>> AT_SYMLINK_NOFOLLOW "struct path" and !AT_SYMLINK_NOFOLLOW "struct path"
>> at do_open_execat() from do_execveat_common().
> 
> Is that a problem with the current do_execveat_common in general?

In general. Since LSM does not receive parameters needed for obtaining
AT_SYMLINK_NOFOLLOW "struct path" (and it is racy even if parameters were
passed to LSM), I want to obtain both paths in one place.

> 
> That does not sound like a problem in the user mode driver case as
> there are no symlinks involved.

Right.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-26 12:51                                                               ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
                                                                                   ` (16 preceding siblings ...)
  2020-06-26 16:40                                                                 ` Alexei Starovoitov
@ 2020-06-27 11:38                                                                 ` Tetsuo Handa
  2020-06-27 12:59                                                                   ` Eric W. Biederman
  17 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-27 11:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/26 21:51, Eric W. Biederman wrote:
> Please let me know if you see any bugs.  Once the code review is
> finished I plan to take this through my tree.

This series needs some sanity checks.

diff --git a/kernel/umd.c b/kernel/umd.c
index de2f542191e5..f3e0227a3012 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -47,15 +47,18 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 
 /**
  * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
- * @info: information about usermode driver
- * @data: a blob of bytes that can be executed as a file
- * @len:  The lentgh of the blob
+ * @info: information about usermode driver (shouldn't be NULL)
+ * @data: a blob of bytes that can be executed as a file (shouldn't be NULL)
+ * @len:  The lentgh of the blob (shouldn't be 0)
  *
  */
 int umd_load_blob(struct umd_info *info, const void *data, size_t len)
 {
 	struct vfsmount *mnt;
 
+	if (!info || !info->driver_name || !data || !len)
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
 		return -EBUSY;
 
@@ -158,6 +161,9 @@ int fork_usermode_driver(struct umd_info *info)
 	char **argv = NULL;
 	int err;
 
+	if (!info || !info->driver_name)
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(info->tgid))
 		return -EBUSY;
 
But loading

----- test.c -----
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/umd.h>

static int __init test_init(void)
{
	const char blob[464] = {
		"\x7f\x45\x4c\x46\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x02\x00\x3e\x00\x01\x00\x00\x00\x80\x00\x40\x00\x00\x00\x00\x00"
		"\x40\x00\x00\x00\x00\x00\x00\x00\xd0\x00\x00\x00\x00\x00\x00\x00"
		"\x00\x00\x00\x00\x40\x00\x38\x00\x01\x00\x40\x00\x04\x00\x03\x00"
		"\x01\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x00\x00\x40\x00\x00\x00\x00\x00\x00\x00\x40\x00\x00\x00\x00\x00"
		"\xb4\x00\x00\x00\x00\x00\x00\x00\xb4\x00\x00\x00\x00\x00\x00\x00"
		"\x00\x00\x20\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\xb8\x01\x00\x00\x00\xbf\x01\x00\x00\x00\x48\xbe\xa8\x00\x40\x00"
		"\x00\x00\x00\x00\xba\x0c\x00\x00\x00\x0f\x05\xb8\xe7\x00\x00\x00"
		"\xbf\x00\x00\x00\x00\x0f\x05\x00\x48\x65\x6c\x6c\x6f\x20\x77\x6f"
		"\x72\x6c\x64\x0a\x00\x2e\x73\x68\x73\x74\x72\x74\x61\x62\x00\x2e"
		"\x74\x65\x78\x74\x00\x2e\x72\x6f\x64\x61\x74\x61\x00\x00\x00\x00"
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x0b\x00\x00\x00\x01\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00"
		"\x80\x00\x40\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00"
		"\x27\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x11\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00"
		"\xa8\x00\x40\x00\x00\x00\x00\x00\xa8\x00\x00\x00\x00\x00\x00\x00"
		"\x0c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x01\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x00\x00\x00\x00\x00\x00\x00\x00\xb4\x00\x00\x00\x00\x00\x00\x00"
		"\x19\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
		"\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	};
	struct umd_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
	
	if (!info)
		return -ENOMEM;
	info->driver_name = kstrdup("my test driver", GFP_KERNEL);
	printk("umd_load_blob()=%d\n", umd_load_blob(info, blob, 464));
	//printk("fork_usermode_driver()=%d\n", fork_usermode_driver(info));
	return -EINVAL;
}

module_init(test_init);
MODULE_LICENSE("GPL");
----- test.c -----

causes

   BUG_ON(!(task->flags & PF_KTHREAD));

in __fput_sync(). Do we want to forbid umd_load_blob() from process context (e.g.
upon module initialization time) ?

Also, since umd_load_blob() uses info->driver_name as filename, info->driver_name has to
satisfy strchr(info->driver_name, '/') == NULL && strlen(info->driver_name) <= NAME_MAX
in order to avoid -ENOENT failure. On the other hand, since fork_usermode_driver() uses
info->driver_name as argv[], info->driver_name has to use ' ' within this constraint.
This might be inconvenient...

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-27 11:38                                                                 ` Tetsuo Handa
@ 2020-06-27 12:59                                                                   ` Eric W. Biederman
  2020-06-27 13:57                                                                     ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-27 12:59 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:

> On 2020/06/26 21:51, Eric W. Biederman wrote:
>> Please let me know if you see any bugs.  Once the code review is
>> finished I plan to take this through my tree.
>

[snipped example code]
> causes
>
>    BUG_ON(!(task->flags & PF_KTHREAD));
>
> in __fput_sync(). Do we want to forbid umd_load_blob() from process context (e.g.
> upon module initialization time) ?

Interesting.  I had not realized that fput_sync would not work from
module context.

Forcing the fput to finish is absolutely necessary.  Otherwise the file
will still be open for write and deny_write_access in execve will fail.

Can you try replacing the __fput_sync with:
	fput(file);
        flush_delayed_fput();
        task_work_run();


Given that there is a big requirement for the code to run before init
I don't necessarily think the __fput_sync restriction is a problem.
But it also seems silly to forbid modules if we can easily fix
the code.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-27 12:59                                                                   ` Eric W. Biederman
@ 2020-06-27 13:57                                                                     ` Tetsuo Handa
  2020-06-28 19:44                                                                       ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-27 13:57 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/27 21:59, Eric W. Biederman wrote:
> Can you try replacing the __fput_sync with:
> 	fput(file);
>         flush_delayed_fput();
>         task_work_run();

With below change, TOMOYO can obtain pathname like "tmpfs:/my\040test\040driver".

Please avoid WARN_ON() if printk() is sufficient (for friendliness to panic_on_warn=1 environments).
For argv[], I guess that fork_usermode_driver() should receive argv[] as argument rather than
trying to split info->driver_name, for somebody might want to pass meaningful argv[] (and
TOMOYO wants to use meaningful argv[] as a hint for identifying the intent).

diff --git a/kernel/umd.c b/kernel/umd.c
index de2f542191e5..ae6e85283f13 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -7,6 +7,7 @@
 #include <linux/mount.h>
 #include <linux/fs_struct.h>
 #include <linux/umd.h>
+#include <linux/task_work.h>
 
 static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
 {
@@ -25,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 	if (IS_ERR(mnt))
 		return mnt;
 
-	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY | O_EXCL, 0700);
 	if (IS_ERR(file)) {
 		mntput(mnt);
 		return ERR_CAST(file);
@@ -41,23 +42,33 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 		return ERR_PTR(err);
 	}
 
-	__fput_sync(file);
+	if (current->flags & PF_KTHREAD) {
+		__fput_sync(file);
+	} else {
+		fput(file);
+		flush_delayed_fput();
+		task_work_run();
+	}
 	return mnt;
 }
 
 /**
  * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
- * @info: information about usermode driver
- * @data: a blob of bytes that can be executed as a file
- * @len:  The lentgh of the blob
+ * @info: information about usermode driver (shouldn't be NULL)
+ * @data: a blob of bytes that can be executed as a file (shouldn't be NULL)
+ * @len:  The lentgh of the blob (shouldn't be 0)
  *
  */
 int umd_load_blob(struct umd_info *info, const void *data, size_t len)
 {
 	struct vfsmount *mnt;
 
-	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+	if (!info || !info->driver_name || !data || !len)
+		return -EINVAL;
+	if (info->wd.dentry || info->wd.mnt) {
+		pr_info("%s already loaded.\n", info->driver_name);
 		return -EBUSY;
+	}
 
 	mnt = blob_to_mnt(data, len, info->driver_name);
 	if (IS_ERR(mnt))
@@ -71,14 +82,14 @@ EXPORT_SYMBOL_GPL(umd_load_blob);
 
 /**
  * umd_unload_blob - Disassociate @info from a previously loaded blob
- * @info: information about usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
  *
  */
 int umd_unload_blob(struct umd_info *info)
 {
-	if (WARN_ON_ONCE(!info->wd.mnt ||
-			 !info->wd.dentry ||
-			 info->wd.mnt->mnt_root != info->wd.dentry))
+	if (!info || !info->driver_name || !info->wd.dentry || !info->wd.mnt)
+		return -EINVAL;
+	if (WARN_ON_ONCE(info->wd.mnt->mnt_root != info->wd.dentry))
 		return -EINVAL;
 
 	kern_unmount(info->wd.mnt);
@@ -158,8 +169,14 @@ int fork_usermode_driver(struct umd_info *info)
 	char **argv = NULL;
 	int err;
 
-	if (WARN_ON_ONCE(info->tgid))
+	if (!info || !info->driver_name || !info->wd.dentry || !info->wd.mnt)
+		return -EINVAL;
+	if (WARN_ON_ONCE(info->wd.mnt->mnt_root != info->wd.dentry))
+		return -EINVAL;
+	if (info->tgid) {
+		pr_info("%s already running.\n", info->driver_name);
 		return -EBUSY;
+	}
 
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);




^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-27 13:57                                                                     ` Tetsuo Handa
@ 2020-06-28 19:44                                                                       ` Alexei Starovoitov
  2020-06-29  2:20                                                                         ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-28 19:44 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Sat, Jun 27, 2020 at 10:57:10PM +0900, Tetsuo Handa wrote:
> On 2020/06/27 21:59, Eric W. Biederman wrote:
> > Can you try replacing the __fput_sync with:
> > 	fput(file);
> >         flush_delayed_fput();
> >         task_work_run();
> 
> With below change, TOMOYO can obtain pathname like "tmpfs:/my\040test\040driver".
> 
> Please avoid WARN_ON() if printk() is sufficient (for friendliness to panic_on_warn=1 environments).
> For argv[], I guess that fork_usermode_driver() should receive argv[] as argument rather than
> trying to split info->driver_name, for somebody might want to pass meaningful argv[] (and
> TOMOYO wants to use meaningful argv[] as a hint for identifying the intent).
> 
> diff --git a/kernel/umd.c b/kernel/umd.c
> index de2f542191e5..ae6e85283f13 100644
> --- a/kernel/umd.c
> +++ b/kernel/umd.c
> @@ -7,6 +7,7 @@
>  #include <linux/mount.h>
>  #include <linux/fs_struct.h>
>  #include <linux/umd.h>
> +#include <linux/task_work.h>
>  
>  static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
>  {
> @@ -25,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
>  	if (IS_ERR(mnt))
>  		return mnt;
>  
> -	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
> +	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY | O_EXCL, 0700);
>  	if (IS_ERR(file)) {
>  		mntput(mnt);
>  		return ERR_CAST(file);
> @@ -41,23 +42,33 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
>  		return ERR_PTR(err);
>  	}
>  
> -	__fput_sync(file);
> +	if (current->flags & PF_KTHREAD) {
> +		__fput_sync(file);
> +	} else {
> +		fput(file);
> +		flush_delayed_fput();
> +		task_work_run();
> +	}

Thanks. This makes sense to me.

>  	return mnt;
>  }
>  
>  /**
>   * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
> - * @info: information about usermode driver
> - * @data: a blob of bytes that can be executed as a file
> - * @len:  The lentgh of the blob
> + * @info: information about usermode driver (shouldn't be NULL)
> + * @data: a blob of bytes that can be executed as a file (shouldn't be NULL)
> + * @len:  The lentgh of the blob (shouldn't be 0)
>   *
>   */
>  int umd_load_blob(struct umd_info *info, const void *data, size_t len)
>  {
>  	struct vfsmount *mnt;
>  
> -	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
> +	if (!info || !info->driver_name || !data || !len)
> +		return -EINVAL;
> +	if (info->wd.dentry || info->wd.mnt) {
> +		pr_info("%s already loaded.\n", info->driver_name);
>  		return -EBUSY;
> +	}

But all the defensive programming kinda goes against general kernel style.
I wouldn't do it. Especially pr_info() ?!
Though I don't feel strongly about it.

I would like to generalize elf_header_check() a bit and call it
before doing blob_to_mnt() to make sure that all blobs are elf files only.
Supporting '#!/bin/bash' or other things as blobs seems wrong to me.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-28 19:44                                                                       ` Alexei Starovoitov
@ 2020-06-29  2:20                                                                         ` Tetsuo Handa
  2020-06-29 20:19                                                                           ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-29  2:20 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/29 4:44, Alexei Starovoitov wrote:
> But all the defensive programming kinda goes against general kernel style.
> I wouldn't do it. Especially pr_info() ?!
> Though I don't feel strongly about it.

Honestly speaking, caller should check for errors and print appropriate
messages. info->wd.mnt->mnt_root != info->wd.dentry indicates that something
went wrong (maybe memory corruption). But other conditions are not fatal.
That is, I consider even pr_info() here should be unnecessary.

> 
> I would like to generalize elf_header_check() a bit and call it
> before doing blob_to_mnt() to make sure that all blobs are elf files only.
> Supporting '#!/bin/bash' or other things as blobs seems wrong to me.

Why? There is no point in forbidding "#!", for users can use a wrapper
ELF binary which contains instructions including glibc's execv()/system()
functions even if "#!" cannot be used.

What is more important is what protection/isolation properties processes started
via fork_usermode_driver() should hold: an ELF binary can contain arbitrary
instructions, and these processes run as daemons (reading requests from stdin and
writing responses to stdout) but are hidden from "/usr/bin/pstree -p 1" (because
they are forked from the kthreadd kernel thread).


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-26 13:48                                                                 ` [PATCH 00/14] Make the user mode driver code a better citizen Eric W. Biederman
@ 2020-06-29 19:55                                                                   ` Eric W. Biederman
  2020-06-29 19:56                                                                     ` [PATCH v2 01/15] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
                                                                                       ` (17 more replies)
  0 siblings, 18 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 19:55 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


This is the second round of my changeset to split the user mode driver
code from the user mode helper code, and to make the code use common
facilities to get things done instead of recreating them just
for the user mode driver code.

I have split the changes into small enough pieces so they should be
easily readable and testable.

The changes lean into the preexisting interfaces in the kernel and
remove special cases for user mode driver code in favor of solutions
that don't need special cases.  This results in smaller code with fewer
bugs.

At a practical level this removes the maintenance burden of the user
mode drivers from the user mode helper code and from exec as the special
cases are removed.

Similarly the LSM interaction bugs are fixed by not having unnecessary
special cases for user mode drivers.

I have tested these changes by booting with the code compiled in and
by killing "bpfilter_umh" and running iptables -vnL to restart
the userspace driver.

I have compile tested each change with and without CONFIG_BPFILTER
enabled.

I made a few very small changes from v1 to v2:
- Updated the function name in a comment when the function is renamed
- Moved some more code so that the !CONFIG_BPFILTER case continues
  to compile when I moved the code into umd.c
- A fix for the module loading case to really flush the file descriptor.
- Removed split_argv entirely from fork_usermode_driver.
  There was nothing to split so it was just confusing.

Please let me know if you see any bugs.  Once the code review is
finished I plan to place the code in a non-rebasing branch
so I can pull it into my tree and so it can also be pulled into
the bpf-next tree.

Eric W. Biederman (15):
      umh: Capture the pid in umh_pipe_setup
      umh: Move setting PF_UMH into umh_pipe_setup
      umh: Rename the user mode driver helpers for clarity
      umh: Remove call_usermodehelper_setup_file.
      umh: Separate the user mode driver and the user mode helper support
      umd: For clarity rename umh_info umd_info
      umd: Rename umd_info.cmdline umd_info.driver_name
      umd: Transform fork_usermode_blob into fork_usermode_driver
      umh: Stop calling do_execve_file
      exec: Remove do_execve_file
      bpfilter: Move bpfilter_umh back into init data
      umd: Track user space drivers with struct pid
      bpfilter: Take advantage of the facilities of struct pid
      umd: Remove exit_umh
      umd: Stop using split_argv

 fs/exec.c                        |  38 ++------
 include/linux/binfmts.h          |   1 -
 include/linux/bpfilter.h         |   7 +-
 include/linux/sched.h            |   9 --
 include/linux/umd.h              |  18 ++++
 include/linux/umh.h              |  15 ----
 kernel/Makefile                  |   1 +
 kernel/exit.c                    |   1 -
 kernel/umd.c                     | 182 +++++++++++++++++++++++++++++++++++++++
 kernel/umh.c                     | 171 +-----------------------------------
 net/bpfilter/bpfilter_kern.c     |  38 ++++----
 net/bpfilter/bpfilter_umh_blob.S |   2 +-
 net/ipv4/bpfilter/sockopt.c      |  20 +++--
 13 files changed, 248 insertions(+), 255 deletions(-)

v1: https://lkml.kernel.org/r/87pn9mgfc2.fsf_-_@x220.int.ebiederm.org
---
git range-diff master v1 v2

 1:  2b76f9b3158d !  1:  d8fb851fa3d8 umh: Capture the pid in umh_pipe_setup
    @@ Commit message
         code that is specific to user mode drivers from the common user path of
         user mode helpers.
     
    +    Link: https://lkml.kernel.org/r/87h7uygf9i.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/umh.h ##
 2:  d853e933ae32 !  2:  b191c5df43ec umh: Move setting PF_UMH into umh_pipe_setup
    @@ Commit message
         Setting PF_UMH unconditionally is harmless as an action will only
         happen if it is paired with an entry on umh_list.
     
    +    Link: https://lkml.kernel.org/r/87bll6gf8t.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## kernel/umh.c ##
 3:  92d2550f0d6a !  3:  74e8c0bf3076 umh: Rename the user mode driver helpers for clarity
    @@ Commit message
         don't make much sense.  Instead name them  umd_setup and umd_cleanup
         for the functional role in setting up user mode drivers.
     
    +    Link: https://lkml.kernel.org/r/875zbegf82.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## kernel/umh.c ##
    @@ kernel/umh.c: static int umh_pipe_setup(struct subprocess_info *info, struct cre
      {
      	struct umh_info *umh_info = info->data;
      
    +-	/* cleanup if umh_pipe_setup() was successful but exec failed */
    ++	/* cleanup if umh_setup() was successful but exec failed */
    + 	if (info->retval) {
    + 		fput(umh_info->pipe_to_umh);
    + 		fput(umh_info->pipe_from_umh);
     @@ kernel/umh.c: int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
      	}
      
 4:  5a9cc2c6c64f !  4:  6652f7c0a909 umh: Remove call_usermodehelper_setup_file.
    @@ Commit message
         For this to work the argv_free is moved from umh_clean_and_save_pid
         to fork_usermode_blob.
     
    +    Link: https://lkml.kernel.org/r/87zh8qf0mp.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/umh.h ##
 5:  03ed13fa8eee !  5:  2a1ccb05cf9f umh: Separate the user mode driver and the user mode helper support
    @@ Commit message
         This makes the kernel smaller for everyone who does not use a usermode
         driver.
     
    +    v2: Moved exit_umh from sched.h to umd.h and handle the case when the
    +    code is compiled out.
    +
    +    Link: https://lkml.kernel.org/r/87tuyyf0ln.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/bpfilter.h ##
    @@ include/linux/bpfilter.h
      struct sock;
      int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
     
    + ## include/linux/sched.h ##
    +@@ include/linux/sched.h: static inline void rseq_execve(struct task_struct *t)
    + 
    + #endif
    + 
    +-void __exit_umh(struct task_struct *tsk);
    +-
    +-static inline void exit_umh(struct task_struct *tsk)
    +-{
    +-	if (unlikely(tsk->flags & PF_UMH))
    +-		__exit_umh(tsk);
    +-}
    +-
    + #ifdef CONFIG_DEBUG_RSEQ
    + 
    + void rseq_syscall(struct pt_regs *regs);
    +
      ## include/linux/umd.h (new) ##
     @@
     +#ifndef __LINUX_UMD_H__
    @@ include/linux/umd.h (new)
     +
     +#include <linux/umh.h>
     +
    ++#ifdef CONFIG_BPFILTER
    ++void __exit_umh(struct task_struct *tsk);
    ++
    ++static inline void exit_umh(struct task_struct *tsk)
    ++{
    ++	if (unlikely(tsk->flags & PF_UMH))
    ++		__exit_umh(tsk);
    ++}
    ++#else
    ++static inline void exit_umh(struct task_struct *tsk)
    ++{
    ++}
    ++#endif
    ++
     +struct umh_info {
     +	const char *cmdline;
     +	struct file *pipe_to_umh;
    @@ kernel/Makefile: obj-y     = fork.o exec_domain.o panic.o \
      obj-$(CONFIG_MULTIUSER) += groups.o
      
     
    + ## kernel/exit.c ##
    +@@
    + #include <linux/random.h>
    + #include <linux/rcuwait.h>
    + #include <linux/compat.h>
    ++#include <linux/umd.h>
    + 
    + #include <linux/uaccess.h>
    + #include <asm/unistd.h>
    +
      ## kernel/umd.c (new) ##
     @@
     +// SPDX-License-Identifier: GPL-2.0-only
    @@ kernel/umd.c (new)
     +{
     +	struct umh_info *umh_info = info->data;
     +
    -+	/* cleanup if umh_pipe_setup() was successful but exec failed */
    ++	/* cleanup if umh_setup() was successful but exec failed */
     +	if (info->retval) {
     +		fput(umh_info->pipe_to_umh);
     +		fput(umh_info->pipe_from_umh);
    @@ kernel/umh.c: struct subprocess_info *call_usermodehelper_setup(const char *path
     -{
     -	struct umh_info *umh_info = info->data;
     -
    --	/* cleanup if umh_pipe_setup() was successful but exec failed */
    +-	/* cleanup if umh_setup() was successful but exec failed */
     -	if (info->retval) {
     -		fput(umh_info->pipe_to_umh);
     -		fput(umh_info->pipe_from_umh);
 6:  698bfbcb6c7f !  6:  b16081fb8d92 umd: For clarity rename umh_info umd_info
    @@ Commit message
         This structure is only used for user mode drivers so change
         the prefix from umh to umd to make that clear.
     
    +    Link: https://lkml.kernel.org/r/87o8p6f0kw.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/bpfilter.h ##
    @@ include/linux/bpfilter.h: int bpfilter_ip_set_sockopt(struct sock *sk, int optna
      	int (*sockopt)(struct sock *sk, int optname,
     
      ## include/linux/umd.h ##
    -@@
    - 
    - #include <linux/umh.h>
    +@@ include/linux/umd.h: static inline void exit_umh(struct task_struct *tsk)
    + }
    + #endif
      
     -struct umh_info {
     +struct umd_info {
    @@ kernel/umd.c: static int umd_setup(struct subprocess_info *info, struct cred *ne
     -	struct umh_info *umh_info = info->data;
     +	struct umd_info *umd_info = info->data;
      
    - 	/* cleanup if umh_pipe_setup() was successful but exec failed */
    + 	/* cleanup if umh_setup() was successful but exec failed */
      	if (info->retval) {
     -		fput(umh_info->pipe_to_umh);
     -		fput(umh_info->pipe_from_umh);
 7:  9cdcb5e7fc61 !  7:  42c13aa9c526 umd: Rename umd_info.cmdline umd_info.driver_name
    @@ Commit message
         driver_name any place where the code is looking for a name
         of the binary.
     
    +    Link: https://lkml.kernel.org/r/87imfef0k3.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/umd.h ##
    -@@
    - #include <linux/umh.h>
    +@@ include/linux/umd.h: static inline void exit_umh(struct task_struct *tsk)
    + #endif
      
      struct umd_info {
     -	const char *cmdline;
 8:  5ada2f70ae21 !  8:  385ed14a025b umd: Transform fork_usermode_blob into fork_usermode_driver
    @@ Commit message
         path based LSMs there are no new special cases.
     
         [1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
    +    Link: https://lkml.kernel.org/r/87d05mf0j9.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/umd.h ##
    @@ include/linux/umd.h
      #include <linux/umh.h>
     +#include <linux/path.h>
      
    - struct umd_info {
    - 	const char *driver_name;
    + #ifdef CONFIG_BPFILTER
    + void __exit_umh(struct task_struct *tsk);
     @@ include/linux/umd.h: struct umd_info {
      	struct file *pipe_from_umh;
      	struct list_head list;
    @@ kernel/umd.c
      #include <linux/pipe_fs_i.h>
     +#include <linux/mount.h>
     +#include <linux/fs_struct.h>
    ++#include <linux/task_work.h>
      #include <linux/umd.h>
      
      static LIST_HEAD(umh_list);
    @@ kernel/umd.c
     +		return ERR_PTR(err);
     +	}
     +
    -+	__fput_sync(file);
    ++	fput(file);
    ++
    ++	/* Flush delayed fput so exec can open the file read-only */
    ++	flush_delayed_fput();
    ++	task_work_run();
     +	return mnt;
     +}
     +
 9:  e4ff478e77c9 !  9:  eeae92e3f0da umh: Stop calling do_execve_file
    @@ Commit message
         call_usermodehelper_exec_async that would call do_execve_file instead
         of do_execve if file was set.
     
    +    Link: https://lkml.kernel.org/r/877dvuf0i7.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/umh.h ##
10:  dc0a38f6bd51 ! 10:  c7fdaf5660b8 exec: Remove do_execve_file
    @@ Commit message
     
         Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
         [1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
    +    Link: https://lkml.kernel.org/r/871rm2f0hi.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## fs/exec.c ##
11:  d0c0c2ddf53b ! 11:  43d08e6986a7 bpfilter: Move bpfilter_umh back into init data
    @@ Commit message
         the blob the blob no longer needs to live .rodata to allow for restarting.
         So move the blob back to .init.rodata.
     
    +    Link: https://lkml.kernel.org/r/87sgeidlvq.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## net/bpfilter/bpfilter_umh_blob.S ##
12:  51b703ad75dd ! 12:  729ee744af46 umd: Track user space drivers with struct pid
    @@ Commit message
         As the tgid is now refcounted verify the tgid is NULL at the start of
         fork_usermode_driver to avoid the possibility of silent pid leaks.
     
    +    Link: https://lkml.kernel.org/r/87mu4qdlv2.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/umd.h ##
13:  cdadf89503c9 ! 13:  2d85b10b965e bpfilter: Take advantage of the facilities of struct pid
    @@ Commit message
         struct pid can be tested to see if a process still exists, and that
         struct pid has a wait queue that notifies when the process dies.
     
    +    Link: https://lkml.kernel.org/r/87h7uydlu9.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/bpfilter.h ##
14:  1d621649e144 ! 14:  6e7e8ddd2b44 umd: Remove exit_umh
    @@ Commit message
         callback is what exit_umh exists to call.  So remove exit_umh and all
         of it's associated booking.
     
    +    Link: https://lkml.kernel.org/r/87bll6dlte.fsf_-_@x220.int.ebiederm.org
    +    Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
         Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
     
      ## include/linux/sched.h ##
    @@ include/linux/sched.h: extern struct pid *cad_pid;
      #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
      #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
      #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
    -@@ include/linux/sched.h: static inline void rseq_execve(struct task_struct *t)
    - 
    - #endif
    +
    + ## include/linux/umd.h ##
    +@@
    + #include <linux/umh.h>
    + #include <linux/path.h>
      
    +-#ifdef CONFIG_BPFILTER
     -void __exit_umh(struct task_struct *tsk);
     -
     -static inline void exit_umh(struct task_struct *tsk)
    @@ include/linux/sched.h: static inline void rseq_execve(struct task_struct *t)
     -	if (unlikely(tsk->flags & PF_UMH))
     -		__exit_umh(tsk);
     -}
    +-#else
    +-static inline void exit_umh(struct task_struct *tsk)
    +-{
    +-}
    +-#endif
     -
    - #ifdef CONFIG_DEBUG_RSEQ
    - 
    - void rseq_syscall(struct pt_regs *regs);
    -
    - ## include/linux/umd.h ##
    -@@ include/linux/umd.h: struct umd_info {
    + struct umd_info {
      	const char *driver_name;
      	struct file *pipe_to_umh;
      	struct file *pipe_from_umh;
    @@ include/linux/umd.h: struct umd_info {
      };
     
      ## kernel/exit.c ##
    +@@
    + #include <linux/random.h>
    + #include <linux/rcuwait.h>
    + #include <linux/compat.h>
    +-#include <linux/umd.h>
    + 
    + #include <linux/uaccess.h>
    + #include <asm/unistd.h>
     @@ kernel/exit.c: void __noreturn do_exit(long code)
      	exit_task_namespaces(tsk);
      	exit_task_work(tsk);
    @@ kernel/exit.c: void __noreturn do_exit(long code)
     
      ## kernel/umd.c ##
     @@
    - #include <linux/fs_struct.h>
    + #include <linux/task_work.h>
      #include <linux/umd.h>
      
     -static LIST_HEAD(umh_list);
 -:  ------------ > 15:  662deff06d76 umd: Stop using split_argv




^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 01/15] umh: Capture the pid in umh_pipe_setup
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
@ 2020-06-29 19:56                                                                     ` Eric W. Biederman
  2020-06-29 19:57                                                                     ` [PATCH v2 02/15] umh: Move setting PF_UMH into umh_pipe_setup Eric W. Biederman
                                                                                       ` (16 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 19:56 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


The pid in struct subprocess_info is only used by umh_clean_and_save_pid to
write the pid into umh_info.

Instead always capture the pid on struct umh_info in umh_pipe_setup, removing
code that is specific to user mode drivers from the common user path of
user mode helpers.

Link: https://lkml.kernel.org/r/87h7uygf9i.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h | 1 -
 kernel/umh.c        | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 0c08de356d0d..aae16a0ebd0f 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -25,7 +25,6 @@ struct subprocess_info {
 	struct file *file;
 	int wait;
 	int retval;
-	pid_t pid;
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..c2a582b3a2bf 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,7 +102,6 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	sub_info->pid = task_pid_nr(current);
 	if (sub_info->file) {
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
@@ -468,6 +467,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
 	return 0;
 }
 
@@ -476,13 +476,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info)
 	struct umh_info *umh_info = info->data;
 
 	/* cleanup if umh_pipe_setup() was successful but exec failed */
-	if (info->pid && info->retval) {
+	if (info->retval) {
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
 
 	argv_free(info->argv);
-	umh_info->pid = info->pid;
 }
 
 /**
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 02/15] umh: Move setting PF_UMH into umh_pipe_setup
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
  2020-06-29 19:56                                                                     ` [PATCH v2 01/15] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
@ 2020-06-29 19:57                                                                     ` Eric W. Biederman
  2020-06-29 19:57                                                                     ` [PATCH v2 03/15] umh: Rename the user mode driver helpers for clarity Eric W. Biederman
                                                                                       ` (15 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 19:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


I am separating the code specific to user mode drivers from the code
for ordinary user space helpers.  Move setting of PF_UMH from
call_usermodehelper_exec_async which is core user mode helper code
into umh_pipe_setup which is user mode driver code.

The code is equally as easy to write in one location as the other and
the movement minimizes the impact of the user mode driver code on the
core of the user mode helper code.

Setting PF_UMH unconditionally is harmless as an action will only
happen if it is paired with an entry on umh_list.

Link: https://lkml.kernel.org/r/87bll6gf8t.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/umh.c b/kernel/umh.c
index c2a582b3a2bf..e6b9d6636850 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,12 +102,10 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file) {
+	if (sub_info->file)
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
-		if (!retval)
-			current->flags |= PF_UMH;
-	} else
+	else
 		retval = do_execve(getname_kernel(sub_info->path),
 				   (const char __user *const __user *)sub_info->argv,
 				   (const char __user *const __user *)sub_info->envp);
@@ -468,6 +466,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
 	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
 	return 0;
 }
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 03/15] umh: Rename the user mode driver helpers for clarity
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
  2020-06-29 19:56                                                                     ` [PATCH v2 01/15] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
  2020-06-29 19:57                                                                     ` [PATCH v2 02/15] umh: Move setting PF_UMH into umh_pipe_setup Eric W. Biederman
@ 2020-06-29 19:57                                                                     ` Eric W. Biederman
  2020-06-29 19:59                                                                     ` [PATCH v2 04/15] umh: Remove call_usermodehelper_setup_file Eric W. Biederman
                                                                                       ` (14 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 19:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


Now that the functionality of umh_pipe_setup and
umh_clean_and_save_pid has changed, their names are too specific and
don't make much sense.  Instead name them  umd_setup and umd_cleanup
for the functional role in setting up user mode drivers.

Link: https://lkml.kernel.org/r/875zbegf82.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/umh.c b/kernel/umh.c
index e6b9d6636850..26c3d493f168 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -429,7 +429,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
 	return sub_info;
 }
 
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
 	struct file *from_umh[2];
@@ -470,11 +470,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	return 0;
 }
 
-static void umh_clean_and_save_pid(struct subprocess_info *info)
+static void umd_cleanup(struct subprocess_info *info)
 {
 	struct umh_info *umh_info = info->data;
 
-	/* cleanup if umh_pipe_setup() was successful but exec failed */
+	/* cleanup if umh_setup() was successful but exec failed */
 	if (info->retval) {
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
@@ -520,8 +520,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
-						  umh_clean_and_save_pid, info);
+	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
+						  info);
 	if (!sub_info)
 		goto out;
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 04/15] umh: Remove call_usermodehelper_setup_file.
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (2 preceding siblings ...)
  2020-06-29 19:57                                                                     ` [PATCH v2 03/15] umh: Rename the user mode driver helpers for clarity Eric W. Biederman
@ 2020-06-29 19:59                                                                     ` Eric W. Biederman
  2020-06-29 20:00                                                                     ` [PATCH v2 05/15] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
                                                                                       ` (13 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 19:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


The only caller of call_usermodehelper_setup_file is fork_usermode_blob.
In fork_usermode_blob replace call_usermodehelper_setup_file with
call_usermodehelper_setup and delete call_usermodehelper_setup_file.

For this to work the argv_free is moved from umh_clean_and_save_pid
to fork_usermode_blob.

Link: https://lkml.kernel.org/r/87zh8qf0mp.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  3 ---
 kernel/umh.c        | 42 +++++++++++-------------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index aae16a0ebd0f..de08af00c68a 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,9 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-			  int (*init)(struct subprocess_info *info, struct cred *new),
-			  void (*cleanup)(struct subprocess_info *), void *data);
 struct umh_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
diff --git a/kernel/umh.c b/kernel/umh.c
index 26c3d493f168..b8fa9b99b366 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -402,33 +402,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-		int (*init)(struct subprocess_info *info, struct cred *new),
-		void (*cleanup)(struct subprocess_info *info), void *data)
-{
-	struct subprocess_info *sub_info;
-	struct umh_info *info = data;
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
-	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
-	if (!sub_info)
-		return NULL;
-
-	sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!sub_info->argv) {
-		kfree(sub_info);
-		return NULL;
-	}
-
-	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-	sub_info->path = "none";
-	sub_info->file = file;
-	sub_info->init = init;
-	sub_info->cleanup = cleanup;
-	sub_info->data = data;
-	return sub_info;
-}
-
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
@@ -479,8 +452,6 @@ static void umd_cleanup(struct subprocess_info *info)
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
-
-	argv_free(info->argv);
 }
 
 /**
@@ -501,7 +472,9 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 {
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
+	char **argv = NULL;
 	struct file *file;
 	ssize_t written;
 	loff_t pos = 0;
@@ -520,11 +493,16 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
-						  info);
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
 
+	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -532,6 +510,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 		mutex_unlock(&umh_list_lock);
 	}
 out:
+	if (argv)
+		argv_free(argv);
 	fput(file);
 	return err;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 05/15] umh: Separate the user mode driver and the user mode helper support
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (3 preceding siblings ...)
  2020-06-29 19:59                                                                     ` [PATCH v2 04/15] umh: Remove call_usermodehelper_setup_file Eric W. Biederman
@ 2020-06-29 20:00                                                                     ` Eric W. Biederman
  2020-06-30 16:58                                                                       ` Linus Torvalds
  2020-06-29 20:01                                                                     ` [PATCH v2 06/15] umd: For clarity rename umh_info umd_info Eric W. Biederman
                                                                                       ` (12 subsequent siblings)
  17 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


This makes it clear which code is part of the core user mode
helper support and which code is needed to implement user mode
drivers.

This makes the kernel smaller for everyone who does not use a usermode
driver.

v2: Moved exit_umh from sched.h to umd.h and handle the case when the
code is compiled out.

Link: https://lkml.kernel.org/r/87tuyyf0ln.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h |   2 +-
 include/linux/sched.h    |   8 ---
 include/linux/umd.h      |  30 ++++++++
 include/linux/umh.h      |  10 ---
 kernel/Makefile          |   1 +
 kernel/exit.c            |   1 +
 kernel/umd.c             | 146 +++++++++++++++++++++++++++++++++++++++
 kernel/umh.c             | 139 -------------------------------------
 8 files changed, 179 insertions(+), 158 deletions(-)
 create mode 100644 include/linux/umd.h
 create mode 100644 kernel/umd.c

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index d815622cd31e..b42e44e29033 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -3,7 +3,7 @@
 #define _LINUX_BPFILTER_H
 
 #include <uapi/linux/bpfilter.h>
-#include <linux/umh.h>
+#include <linux/umd.h>
 
 struct sock;
 int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b62e6aaf28f0..59d1e92bb88e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2020,14 +2020,6 @@ static inline void rseq_execve(struct task_struct *t)
 
 #endif
 
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-
 #ifdef CONFIG_DEBUG_RSEQ
 
 void rseq_syscall(struct pt_regs *regs);
diff --git a/include/linux/umd.h b/include/linux/umd.h
new file mode 100644
index 000000000000..ef40bee590c1
--- /dev/null
+++ b/include/linux/umd.h
@@ -0,0 +1,30 @@
+#ifndef __LINUX_UMD_H__
+#define __LINUX_UMD_H__
+
+#include <linux/umh.h>
+
+#ifdef CONFIG_BPFILTER
+void __exit_umh(struct task_struct *tsk);
+
+static inline void exit_umh(struct task_struct *tsk)
+{
+	if (unlikely(tsk->flags & PF_UMH))
+		__exit_umh(tsk);
+}
+#else
+static inline void exit_umh(struct task_struct *tsk)
+{
+}
+#endif
+
+struct umh_info {
+	const char *cmdline;
+	struct file *pipe_to_umh;
+	struct file *pipe_from_umh;
+	struct list_head list;
+	void (*cleanup)(struct umh_info *info);
+	pid_t pid;
+};
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+
+#endif /* __LINUX_UMD_H__ */
diff --git a/include/linux/umh.h b/include/linux/umh.h
index de08af00c68a..73173c4a07e5 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,16 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct umh_info {
-	const char *cmdline;
-	struct file *pipe_to_umh;
-	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
-	pid_t pid;
-};
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
-
 extern int
 call_usermodehelper_exec(struct subprocess_info *info, int wait);
 
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..a81d7354323c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o
 
+obj-$(CONFIG_BPFILTER) += umd.o
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..b94fe03e609c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,6 +63,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/umd.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/umd.c b/kernel/umd.c
new file mode 100644
index 000000000000..99af9d594eca
--- /dev/null
+++ b/kernel/umd.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * umd - User mode driver support
+ */
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/umd.h>
+
+static LIST_HEAD(umh_list);
+static DEFINE_MUTEX(umh_list_lock);
+
+static int umd_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct umh_info *umh_info = info->data;
+	struct file *from_umh[2];
+	struct file *to_umh[2];
+	int err;
+
+	/* create pipe to send data to umh */
+	err = create_pipe_files(to_umh, 0);
+	if (err)
+		return err;
+	err = replace_fd(0, to_umh[0], 0);
+	fput(to_umh[0]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		return err;
+	}
+
+	/* create pipe to receive data from umh */
+	err = create_pipe_files(from_umh, 0);
+	if (err) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		return err;
+	}
+	err = replace_fd(1, from_umh[1], 0);
+	fput(from_umh[1]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		fput(from_umh[0]);
+		return err;
+	}
+
+	umh_info->pipe_to_umh = to_umh[1];
+	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
+	return 0;
+}
+
+static void umd_cleanup(struct subprocess_info *info)
+{
+	struct umh_info *umh_info = info->data;
+
+	/* cleanup if umh_setup() was successful but exec failed */
+	if (info->retval) {
+		fput(umh_info->pipe_to_umh);
+		fput(umh_info->pipe_from_umh);
+	}
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * If info->cmdline is set it will be used as command line for the
+ * user process, else "usermodehelper" is used.
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
+	struct subprocess_info *sub_info;
+	char **argv = NULL;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+	int err;
+
+	file = shmem_kernel_file_setup("", len, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		goto out;
+	}
+
+	err = -ENOMEM;
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
+	if (!sub_info)
+		goto out;
+
+	sub_info->file = file;
+	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+	if (!err) {
+		mutex_lock(&umh_list_lock);
+		list_add(&info->list, &umh_list);
+		mutex_unlock(&umh_list_lock);
+	}
+out:
+	if (argv)
+		argv_free(argv);
+	fput(file);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
+void __exit_umh(struct task_struct *tsk)
+{
+	struct umh_info *info;
+	pid_t pid = tsk->pid;
+
+	mutex_lock(&umh_list_lock);
+	list_for_each_entry(info, &umh_list, list) {
+		if (info->pid == pid) {
+			list_del(&info->list);
+			mutex_unlock(&umh_list_lock);
+			goto out;
+		}
+	}
+	mutex_unlock(&umh_list_lock);
+	return;
+out:
+	if (info->cleanup)
+		info->cleanup(info);
+}
+
diff --git a/kernel/umh.c b/kernel/umh.c
index b8fa9b99b366..3e4e453d45c8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -26,8 +26,6 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
 
 #include <trace/events/module.h>
 
@@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
 static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
 static DEFINE_SPINLOCK(umh_sysctl_lock);
 static DECLARE_RWSEM(umhelper_sem);
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
 
 static void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
@@ -402,121 +398,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-static int umd_setup(struct subprocess_info *info, struct cred *new)
-{
-	struct umh_info *umh_info = info->data;
-	struct file *from_umh[2];
-	struct file *to_umh[2];
-	int err;
-
-	/* create pipe to send data to umh */
-	err = create_pipe_files(to_umh, 0);
-	if (err)
-		return err;
-	err = replace_fd(0, to_umh[0], 0);
-	fput(to_umh[0]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		return err;
-	}
-
-	/* create pipe to receive data from umh */
-	err = create_pipe_files(from_umh, 0);
-	if (err) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		return err;
-	}
-	err = replace_fd(1, from_umh[1], 0);
-	fput(from_umh[1]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		fput(from_umh[0]);
-		return err;
-	}
-
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
-	current->flags |= PF_UMH;
-	return 0;
-}
-
-static void umd_cleanup(struct subprocess_info *info)
-{
-	struct umh_info *umh_info = info->data;
-
-	/* cleanup if umh_setup() was successful but exec failed */
-	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
-	}
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-	struct subprocess_info *sub_info;
-	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
-	int err;
-
-	file = shmem_kernel_file_setup("", len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
-	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!argv)
-		goto out;
-
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
-					     umd_setup, umd_cleanup, info);
-	if (!sub_info)
-		goto out;
-
-	sub_info->file = file;
-	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
-out:
-	if (argv)
-		argv_free(argv);
-	fput(file);
-	return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
-
 /**
  * call_usermodehelper_exec - start a usermode application
  * @sub_info: information about the subprocessa
@@ -678,26 +559,6 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umh_info *info;
-	pid_t pid = tsk->pid;
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
-
 struct ctl_table usermodehelper_table[] = {
 	{
 		.procname	= "bset",
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 06/15] umd: For clarity rename umh_info umd_info
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (4 preceding siblings ...)
  2020-06-29 20:00                                                                     ` [PATCH v2 05/15] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
@ 2020-06-29 20:01                                                                     ` Eric W. Biederman
  2020-06-29 20:02                                                                     ` [PATCH v2 07/15] umd: Rename umd_info.cmdline umd_info.driver_name Eric W. Biederman
                                                                                       ` (11 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


This structure is only used for user mode drivers so change
the prefix from umh to umd to make that clear.

Link: https://lkml.kernel.org/r/87o8p6f0kw.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h    |  2 +-
 include/linux/umd.h         |  6 +++---
 kernel/umd.c                | 20 ++++++++++----------
 net/ipv4/bpfilter/sockopt.c |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index b42e44e29033..4b43d2240172 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -11,7 +11,7 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 			    int __user *optlen);
 struct bpfilter_umh_ops {
-	struct umh_info info;
+	struct umd_info info;
 	/* since ip_getsockopt() can run in parallel, serialize access to umh */
 	struct mutex lock;
 	int (*sockopt)(struct sock *sk, int optname,
diff --git a/include/linux/umd.h b/include/linux/umd.h
index ef40bee590c1..58a9c603c78d 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -17,14 +17,14 @@ static inline void exit_umh(struct task_struct *tsk)
 }
 #endif
 
-struct umh_info {
+struct umd_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
+	void (*cleanup)(struct umd_info *info);
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
 
 #endif /* __LINUX_UMD_H__ */
diff --git a/kernel/umd.c b/kernel/umd.c
index 99af9d594eca..f7dacb19c705 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -11,7 +11,7 @@ static DEFINE_MUTEX(umh_list_lock);
 
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 	struct file *from_umh[2];
 	struct file *to_umh[2];
 	int err;
@@ -43,21 +43,21 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
+	umd_info->pipe_to_umh = to_umh[1];
+	umd_info->pipe_from_umh = from_umh[0];
+	umd_info->pid = task_pid_nr(current);
 	current->flags |= PF_UMH;
 	return 0;
 }
 
 static void umd_cleanup(struct subprocess_info *info)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 
 	/* cleanup if umh_setup() was successful but exec failed */
 	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
+		fput(umd_info->pipe_to_umh);
+		fput(umd_info->pipe_from_umh);
 	}
 }
 
@@ -72,12 +72,12 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
+ * case 'struct umd_info *info' is populated with two pipes
  * and a pid of the process. The caller is responsible for health
  * check of the user process, killing it via pid, and closing the
  * pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
 	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
@@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
 
 void __exit_umh(struct task_struct *tsk)
 {
-	struct umh_info *info;
+	struct umd_info *info;
 	pid_t pid = tsk->pid;
 
 	mutex_lock(&umh_list_lock);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 0480918bfc7c..c0dbcc86fcdb 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -12,7 +12,7 @@
 struct bpfilter_umh_ops bpfilter_ops;
 EXPORT_SYMBOL_GPL(bpfilter_ops);
 
-static void bpfilter_umh_cleanup(struct umh_info *info)
+static void bpfilter_umh_cleanup(struct umd_info *info)
 {
 	mutex_lock(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 07/15] umd: Rename umd_info.cmdline umd_info.driver_name
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (5 preceding siblings ...)
  2020-06-29 20:01                                                                     ` [PATCH v2 06/15] umd: For clarity rename umh_info umd_info Eric W. Biederman
@ 2020-06-29 20:02                                                                     ` Eric W. Biederman
  2020-06-29 20:03                                                                     ` [PATCH v2 08/15] umd: Transform fork_usermode_blob into fork_usermode_driver Eric W. Biederman
                                                                                       ` (10 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


The only thing supplied in the cmdline today is the driver name so
rename the field to clarify the code.

As this value is always supplied stop trying to handle the case of
a NULL cmdline.

Additionally, since we now have a name we can count on, use the
driver_name any place where the code is looking for a name
of the binary.

Link: https://lkml.kernel.org/r/87imfef0k3.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umd.h         |  2 +-
 kernel/umd.c                | 11 ++++-------
 net/ipv4/bpfilter/sockopt.c |  2 +-
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/include/linux/umd.h b/include/linux/umd.h
index 58a9c603c78d..d827fb038d00 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -18,7 +18,7 @@ static inline void exit_umh(struct task_struct *tsk)
 #endif
 
 struct umd_info {
-	const char *cmdline;
+	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
diff --git a/kernel/umd.c b/kernel/umd.c
index f7dacb19c705..7fe08a8eb231 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -67,9 +67,6 @@ static void umd_cleanup(struct subprocess_info *info)
  * @len: length of the blob
  * @info: information about usermode process (shouldn't be NULL)
  *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
  * case 'struct umd_info *info' is populated with two pipes
@@ -79,7 +76,6 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
 	struct file *file;
@@ -87,7 +83,7 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup("", len, 0);
+	file = shmem_kernel_file_setup(info->driver_name, len, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -100,11 +96,12 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	}
 
 	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
 		goto out;
 
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+	sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL,
+					     GFP_KERNEL,
 					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index c0dbcc86fcdb..5050de28333d 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -70,7 +70,7 @@ static int __init bpfilter_sockopt_init(void)
 {
 	mutex_init(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-	bpfilter_ops.info.cmdline = "bpfilter_umh";
+	bpfilter_ops.info.driver_name = "bpfilter_umh";
 	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
 
 	return 0;
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 08/15] umd: Transform fork_usermode_blob into fork_usermode_driver
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (6 preceding siblings ...)
  2020-06-29 20:02                                                                     ` [PATCH v2 07/15] umd: Rename umd_info.cmdline umd_info.driver_name Eric W. Biederman
@ 2020-06-29 20:03                                                                     ` Eric W. Biederman
  2020-06-29 20:03                                                                     ` [PATCH v2 09/15] umh: Stop calling do_execve_file Eric W. Biederman
                                                                                       ` (9 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


Instead of loading a binary blob into a temporary file with
shmem_kernel_file_setup load a binary blob into a temporary tmpfs
filesystem.  This means that the blob can be stored in an init section
and discarded, and it means the binary blob will have a filename so it
can be executed normally.

The only tricky thing about this code is that the helper function
blob_to_mnt calls flush_delayed_fput and task_work_run after the final
fput.  That is because a file can not be executed if it is still open
for write, and the ordinary delayed close for kernel threads does not
happen soon enough, which causes the following exec to fail.  The
function umd_load_blob is not called with any locks so this should be
safe.

Executing the blob normally winds up correcting several problems with
the user mode driver code discovered by Tetsuo Handa[1].  By passing
an ordinary filename into the exec, it is no longer necessary to
figure out how to turn an O_RDWR file descriptor into a properly
reference counted O_EXEC file descriptor that forbids all writes.  For
path based LSMs there are no new special cases.

[1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
Link: https://lkml.kernel.org/r/87d05mf0j9.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umd.h          |   6 +-
 kernel/umd.c                 | 126 +++++++++++++++++++++++++++--------
 net/bpfilter/bpfilter_kern.c |  14 +++-
 3 files changed, 113 insertions(+), 33 deletions(-)

diff --git a/include/linux/umd.h b/include/linux/umd.h
index d827fb038d00..12ff8f753ea7 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -2,6 +2,7 @@
 #define __LINUX_UMD_H__
 
 #include <linux/umh.h>
+#include <linux/path.h>
 
 #ifdef CONFIG_BPFILTER
 void __exit_umh(struct task_struct *tsk);
@@ -23,8 +24,11 @@ struct umd_info {
 	struct file *pipe_from_umh;
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
+	struct path wd;
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
+int umd_load_blob(struct umd_info *info, const void *data, size_t len);
+int umd_unload_blob(struct umd_info *info);
+int fork_usermode_driver(struct umd_info *info);
 
 #endif /* __LINUX_UMD_H__ */
diff --git a/kernel/umd.c b/kernel/umd.c
index 7fe08a8eb231..aaa6f3142e52 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -4,11 +4,98 @@
  */
 #include <linux/shmem_fs.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
 #include <linux/umd.h>
 
 static LIST_HEAD(umh_list);
 static DEFINE_MUTEX(umh_list_lock);
 
+static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
+{
+	struct file_system_type *type;
+	struct vfsmount *mnt;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+
+	type = get_fs_type("tmpfs");
+	if (!type)
+		return ERR_PTR(-ENODEV);
+
+	mnt = kern_mount(type);
+	put_filesystem(type);
+	if (IS_ERR(mnt))
+		return mnt;
+
+	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+	if (IS_ERR(file)) {
+		mntput(mnt);
+		return ERR_CAST(file);
+	}
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		int err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		filp_close(file, NULL);
+		mntput(mnt);
+		return ERR_PTR(err);
+	}
+
+	fput(file);
+
+	/* Flush delayed fput so exec can open the file read-only */
+	flush_delayed_fput();
+	task_work_run();
+	return mnt;
+}
+
+/**
+ * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
+ * @info: information about usermode driver
+ * @data: a blob of bytes that can be executed as a file
+ * @len:  The lentgh of the blob
+ *
+ */
+int umd_load_blob(struct umd_info *info, const void *data, size_t len)
+{
+	struct vfsmount *mnt;
+
+	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+		return -EBUSY;
+
+	mnt = blob_to_mnt(data, len, info->driver_name);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	info->wd.mnt = mnt;
+	info->wd.dentry = mnt->mnt_root;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_load_blob);
+
+/**
+ * umd_unload_blob - Disassociate @info from a previously loaded blob
+ * @info: information about usermode driver
+ *
+ */
+int umd_unload_blob(struct umd_info *info)
+{
+	if (WARN_ON_ONCE(!info->wd.mnt ||
+			 !info->wd.dentry ||
+			 info->wd.mnt->mnt_root != info->wd.dentry))
+		return -EINVAL;
+
+	kern_unmount(info->wd.mnt);
+	info->wd.mnt = NULL;
+	info->wd.dentry = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_unload_blob);
+
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umd_info *umd_info = info->data;
@@ -43,6 +130,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
+	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->pid = task_pid_nr(current);
@@ -62,39 +150,21 @@ static void umd_cleanup(struct subprocess_info *info)
 }
 
 /**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
+ * fork_usermode_driver - fork a usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
  *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umd_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
+ * Returns either negative error or zero which indicates success in
+ * executing a usermode driver. In such case 'struct umd_info *info'
+ * is populated with two pipes and a pid of the process. The caller is
+ * responsible for health check of the user process, killing it via
+ * pid, and closing the pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
+int fork_usermode_driver(struct umd_info *info)
 {
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup(info->driver_name, len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -106,7 +176,6 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	if (!sub_info)
 		goto out;
 
-	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -116,10 +185,9 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 out:
 	if (argv)
 		argv_free(argv);
-	fput(file);
 	return err;
 }
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
+EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
 void __exit_umh(struct task_struct *tsk)
 {
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index c0f0990f30b6..28883b00609d 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -77,9 +77,7 @@ static int start_umh(void)
 	int err;
 
 	/* fork usermode process */
-	err = fork_usermode_blob(&bpfilter_umh_start,
-				 &bpfilter_umh_end - &bpfilter_umh_start,
-				 &bpfilter_ops.info);
+	err = fork_usermode_driver(&bpfilter_ops.info);
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
@@ -98,6 +96,12 @@ static int __init load_umh(void)
 {
 	int err;
 
+	err = umd_load_blob(&bpfilter_ops.info,
+			    &bpfilter_umh_start,
+			    &bpfilter_umh_end - &bpfilter_umh_start);
+	if (err)
+		return err;
+
 	mutex_lock(&bpfilter_ops.lock);
 	if (!bpfilter_ops.stop) {
 		err = -EFAULT;
@@ -110,6 +114,8 @@ static int __init load_umh(void)
 	}
 out:
 	mutex_unlock(&bpfilter_ops.lock);
+	if (err)
+		umd_unload_blob(&bpfilter_ops.info);
 	return err;
 }
 
@@ -122,6 +128,8 @@ static void __exit fini_umh(void)
 		bpfilter_ops.sockopt = NULL;
 	}
 	mutex_unlock(&bpfilter_ops.lock);
+
+	umd_unload_blob(&bpfilter_ops.info);
 }
 module_init(load_umh);
 module_exit(fini_umh);
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 09/15] umh: Stop calling do_execve_file
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (7 preceding siblings ...)
  2020-06-29 20:03                                                                     ` [PATCH v2 08/15] umd: Transform fork_usermode_blob into fork_usermode_driver Eric W. Biederman
@ 2020-06-29 20:03                                                                     ` Eric W. Biederman
  2020-06-29 20:04                                                                     ` [PATCH v2 10/15] exec: Remove do_execve_file Eric W. Biederman
                                                                                       ` (8 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


With the user mode driver code changed to not set subprocess_info.file
there are no more users of subprocess_info.file.  Remove this field
from struct subprocess_info and remove the only user in
call_usermodehelper_exec_async that would call do_execve_file instead
of do_execve if file was set.

Link: https://lkml.kernel.org/r/877dvuf0i7.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  1 -
 kernel/umh.c        | 10 +++-------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 73173c4a07e5..244aff638220 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -22,7 +22,6 @@ struct subprocess_info {
 	const char *path;
 	char **argv;
 	char **envp;
-	struct file *file;
 	int wait;
 	int retval;
 	int (*init)(struct subprocess_info *info, struct cred *new);
diff --git a/kernel/umh.c b/kernel/umh.c
index 3e4e453d45c8..6ca2096298b9 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -98,13 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file)
-		retval = do_execve_file(sub_info->file,
-					sub_info->argv, sub_info->envp);
-	else
-		retval = do_execve(getname_kernel(sub_info->path),
-				   (const char __user *const __user *)sub_info->argv,
-				   (const char __user *const __user *)sub_info->envp);
+	retval = do_execve(getname_kernel(sub_info->path),
+			   (const char __user *const __user *)sub_info->argv,
+			   (const char __user *const __user *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 10/15] exec: Remove do_execve_file
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (8 preceding siblings ...)
  2020-06-29 20:03                                                                     ` [PATCH v2 09/15] umh: Stop calling do_execve_file Eric W. Biederman
@ 2020-06-29 20:04                                                                     ` Eric W. Biederman
  2020-06-30  5:43                                                                       ` Christoph Hellwig
  2020-06-29 20:05                                                                     ` [PATCH v2 11/15] bpfilter: Move bpfilter_umh back into init data Eric W. Biederman
                                                                                       ` (7 subsequent siblings)
  17 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


Now that the last caller has been removed, remove this code from exec.

For anyone thinking of resurrecting do_execve_file please note that
the code was buggy in several fundamental ways.

- It did not ensure the file it was passed was read-only and that
  deny_write_access had been called on it.  Which subtly breaks
  invariants in exec.

- The caller of do_execve_file was expected to hold and put a
  reference to the file, but an extra reference for use by exec was
  not taken so that when exec put its reference to the file an
  underflow occurred on the file reference count.

- The point of the interface was so that a pathname did not need to
  exist.  Which breaks pathname based LSMs.

Tetsuo Handa originally reported these issues[1].  While it was clear
that deny_write_access was missing, the fundamental incompatibility
with the passed in O_RDWR filehandle was not immediately recognized.

All of these issues were fixed by modifying the usermode driver code
to have a path, so it did not need this hack.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
Link: https://lkml.kernel.org/r/871rm2f0hi.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c               | 38 +++++++++-----------------------------
 include/linux/binfmts.h |  1 -
 2 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..23dfbb820626 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1818,13 +1818,14 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int __do_execve_file(int fd, struct filename *filename,
-			    struct user_arg_ptr argv,
-			    struct user_arg_ptr envp,
-			    int flags, struct file *file)
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
 {
 	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
+	struct file *file;
 	struct files_struct *displaced;
 	int retval;
 
@@ -1863,8 +1864,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;
 
-	if (!file)
-		file = do_open_execat(fd, filename, flags);
+	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1872,9 +1872,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	sched_exec();
 
 	bprm->file = file;
-	if (!filename) {
-		bprm->filename = "none";
-	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
+	if (fd == AT_FDCWD || filename->name[0] == '/') {
 		bprm->filename = filename->name;
 	} else {
 		if (filename->name[0] == '\0')
@@ -1935,8 +1933,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	task_numa_free(current, false);
 	free_bprm(bprm);
 	kfree(pathbuf);
-	if (filename)
-		putname(filename);
+	putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
 	return retval;
@@ -1967,27 +1964,10 @@ static int __do_execve_file(int fd, struct filename *filename,
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
-	if (filename)
-		putname(filename);
+	putname(filename);
 	return retval;
 }
 
-static int do_execveat_common(int fd, struct filename *filename,
-			      struct user_arg_ptr argv,
-			      struct user_arg_ptr envp,
-			      int flags)
-{
-	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
-}
-
-int do_execve_file(struct file *file, void *__argv, void *__envp)
-{
-	struct user_arg_ptr argv = { .ptr.native = __argv };
-	struct user_arg_ptr envp = { .ptr.native = __envp };
-
-	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
-}
-
 int do_execve(struct filename *filename,
 	const char __user *const __user *__argv,
 	const char __user *const __user *__envp)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 4a20b7517dd0..7c27d7b57871 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -141,6 +141,5 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
-int do_execve_file(struct file *file, void *__argv, void *__envp);
 
 #endif /* _LINUX_BINFMTS_H */
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 11/15] bpfilter: Move bpfilter_umh back into init data
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (9 preceding siblings ...)
  2020-06-29 20:04                                                                     ` [PATCH v2 10/15] exec: Remove do_execve_file Eric W. Biederman
@ 2020-06-29 20:05                                                                     ` Eric W. Biederman
  2020-06-29 20:06                                                                     ` [PATCH v2 12/15] umd: Track user space drivers with struct pid Eric W. Biederman
                                                                                       ` (6 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:05 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


To allow for restarts 61fbf5933d42 ("net: bpfilter: restart
bpfilter_umh when error occurred") moved the blob holding the
userspace binary out of the init sections.

Now that loading the blob into a filesystem is separate from executing
the blob, the blob no longer needs to live in .rodata to allow for restarting.
So move the blob back to .init.rodata.

Link: https://lkml.kernel.org/r/87sgeidlvq.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 net/bpfilter/bpfilter_umh_blob.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
index 9ea6100dca87..40311d10d2f2 100644
--- a/net/bpfilter/bpfilter_umh_blob.S
+++ b/net/bpfilter/bpfilter_umh_blob.S
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-	.section .rodata, "a"
+	.section .init.rodata, "a"
 	.global bpfilter_umh_start
 bpfilter_umh_start:
 	.incbin "net/bpfilter/bpfilter_umh"
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 12/15] umd: Track user space drivers with struct pid
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (10 preceding siblings ...)
  2020-06-29 20:05                                                                     ` [PATCH v2 11/15] bpfilter: Move bpfilter_umh back into init data Eric W. Biederman
@ 2020-06-29 20:06                                                                     ` Eric W. Biederman
  2020-06-29 20:06                                                                     ` [PATCH v2 13/15] bpfilter: Take advantage of the facilities of " Eric W. Biederman
                                                                                       ` (5 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


Use struct pid instead of user space pid values that are prone to wrap
around.

In addition track the entire thread group instead of just the first
thread that is started by exec.  There are no multi-threaded user mode
drivers today but there is nothing precluding user drivers from being
multi-threaded, so it is just a good idea to track the entire process.

Take a reference count on the tgid's in question to make it possible
to remove exit_umh in a future change.

As a struct pid is available directly use kill_pid_info.

The prior process signalling code was iffy in using a userspace pid
known to be in the initial pid namespace and then looking up its task
in whatever the current pid namespace is.  It worked only because
kernel threads always run in the initial pid namespace.

As the tgid is now refcounted verify the tgid is NULL at the start of
fork_usermode_driver to avoid the possibility of silent pid leaks.

Link: https://lkml.kernel.org/r/87mu4qdlv2.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umd.h          |  2 +-
 kernel/exit.c                |  3 ++-
 kernel/umd.c                 | 15 ++++++++++-----
 net/bpfilter/bpfilter_kern.c | 13 +++++--------
 net/ipv4/bpfilter/sockopt.c  |  3 ++-
 5 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/linux/umd.h b/include/linux/umd.h
index 12ff8f753ea7..edb1c62c62f4 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -25,7 +25,7 @@ struct umd_info {
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
 	struct path wd;
-	pid_t pid;
+	struct pid *tgid;
 };
 int umd_load_blob(struct umd_info *info, const void *data, size_t len);
 int umd_unload_blob(struct umd_info *info);
diff --git a/kernel/exit.c b/kernel/exit.c
index b94fe03e609c..b53107abdd31 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -805,7 +805,8 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	exit_umh(tsk);
+	if (group_dead)
+		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/umd.c b/kernel/umd.c
index aaa6f3142e52..c1e8eccaee76 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -133,7 +133,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
-	umd_info->pid = task_pid_nr(current);
+	umd_info->tgid = get_pid(task_tgid(current));
 	current->flags |= PF_UMH;
 	return 0;
 }
@@ -146,6 +146,8 @@ static void umd_cleanup(struct subprocess_info *info)
 	if (info->retval) {
 		fput(umd_info->pipe_to_umh);
 		fput(umd_info->pipe_from_umh);
+		put_pid(umd_info->tgid);
+		umd_info->tgid = NULL;
 	}
 }
 
@@ -155,9 +157,9 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success in
  * executing a usermode driver. In such case 'struct umd_info *info'
- * is populated with two pipes and a pid of the process. The caller is
+ * is populated with two pipes and a tgid of the process. The caller is
  * responsible for health check of the user process, killing it via
- * pid, and closing the pipes when user process is no longer needed.
+ * tgid, and closing the pipes when user process is no longer needed.
  */
 int fork_usermode_driver(struct umd_info *info)
 {
@@ -165,6 +167,9 @@ int fork_usermode_driver(struct umd_info *info)
 	char **argv = NULL;
 	int err;
 
+	if (WARN_ON_ONCE(info->tgid))
+		return -EBUSY;
+
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -192,11 +197,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_driver);
 void __exit_umh(struct task_struct *tsk)
 {
 	struct umd_info *info;
-	pid_t pid = tsk->pid;
+	struct pid *tgid = task_tgid(tsk);
 
 	mutex_lock(&umh_list_lock);
 	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
+		if (info->tgid == tgid) {
 			list_del(&info->list);
 			mutex_unlock(&umh_list_lock);
 			goto out;
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 28883b00609d..b73dedeb6dbf 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -15,16 +15,13 @@ extern char bpfilter_umh_end;
 
 static void shutdown_umh(void)
 {
-	struct task_struct *tsk;
+	struct umd_info *info = &bpfilter_ops.info;
+	struct pid *tgid = info->tgid;
 
 	if (bpfilter_ops.stop)
 		return;
 
-	tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID);
-	if (tsk) {
-		send_sig(SIGKILL, tsk, 1);
-		put_task_struct(tsk);
-	}
+	kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
 }
 
 static void __stop_umh(void)
@@ -48,7 +45,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname,
 	req.cmd = optname;
 	req.addr = (long __force __user)optval;
 	req.len = optlen;
-	if (!bpfilter_ops.info.pid)
+	if (!bpfilter_ops.info.tgid)
 		goto out;
 	n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req),
 			   &pos);
@@ -81,7 +78,7 @@ static int start_umh(void)
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
-	pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid);
+	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
 
 	/* health check that usermode process started correctly */
 	if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 5050de28333d..56cbc43145f6 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -18,7 +18,8 @@ static void bpfilter_umh_cleanup(struct umd_info *info)
 	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
-	info->pid = 0;
+	put_pid(info->tgid);
+	info->tgid = NULL;
 	mutex_unlock(&bpfilter_ops.lock);
 }
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 13/15] bpfilter: Take advantage of the facilities of struct pid
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (11 preceding siblings ...)
  2020-06-29 20:06                                                                     ` [PATCH v2 12/15] umd: Track user space drivers with struct pid Eric W. Biederman
@ 2020-06-29 20:06                                                                     ` Eric W. Biederman
  2020-06-29 20:07                                                                     ` [PATCH v2 14/15] umd: Remove exit_umh Eric W. Biederman
                                                                                       ` (4 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


Instead of relying on the exit_umh cleanup callback use the fact a
struct pid can be tested to see if a process still exists, and that
struct pid has a wait queue that notifies when the process dies.

Link: https://lkml.kernel.org/r/87h7uydlu9.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h     |  3 ++-
 net/bpfilter/bpfilter_kern.c | 15 +++++----------
 net/ipv4/bpfilter/sockopt.c  | 15 ++++++++-------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index 4b43d2240172..8073ddce73b1 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -10,6 +10,8 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
 			    unsigned int optlen);
 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 			    int __user *optlen);
+void bpfilter_umh_cleanup(struct umd_info *info);
+
 struct bpfilter_umh_ops {
 	struct umd_info info;
 	/* since ip_getsockopt() can run in parallel, serialize access to umh */
@@ -18,7 +20,6 @@ struct bpfilter_umh_ops {
 		       char __user *optval,
 		       unsigned int optlen, bool is_set);
 	int (*start)(void);
-	bool stop;
 };
 extern struct bpfilter_umh_ops bpfilter_ops;
 #endif
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index b73dedeb6dbf..91474884ddb7 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -18,10 +18,11 @@ static void shutdown_umh(void)
 	struct umd_info *info = &bpfilter_ops.info;
 	struct pid *tgid = info->tgid;
 
-	if (bpfilter_ops.stop)
-		return;
-
-	kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
+	if (tgid) {
+		kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
+		wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
+		bpfilter_umh_cleanup(info);
+	}
 }
 
 static void __stop_umh(void)
@@ -77,7 +78,6 @@ static int start_umh(void)
 	err = fork_usermode_driver(&bpfilter_ops.info);
 	if (err)
 		return err;
-	bpfilter_ops.stop = false;
 	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
 
 	/* health check that usermode process started correctly */
@@ -100,16 +100,11 @@ static int __init load_umh(void)
 		return err;
 
 	mutex_lock(&bpfilter_ops.lock);
-	if (!bpfilter_ops.stop) {
-		err = -EFAULT;
-		goto out;
-	}
 	err = start_umh();
 	if (!err && IS_ENABLED(CONFIG_INET)) {
 		bpfilter_ops.sockopt = &__bpfilter_process_sockopt;
 		bpfilter_ops.start = &start_umh;
 	}
-out:
 	mutex_unlock(&bpfilter_ops.lock);
 	if (err)
 		umd_unload_blob(&bpfilter_ops.info);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 56cbc43145f6..9455eb9cec78 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -12,16 +12,14 @@
 struct bpfilter_umh_ops bpfilter_ops;
 EXPORT_SYMBOL_GPL(bpfilter_ops);
 
-static void bpfilter_umh_cleanup(struct umd_info *info)
+void bpfilter_umh_cleanup(struct umd_info *info)
 {
-	mutex_lock(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
 	put_pid(info->tgid);
 	info->tgid = NULL;
-	mutex_unlock(&bpfilter_ops.lock);
 }
+EXPORT_SYMBOL_GPL(bpfilter_umh_cleanup);
 
 static int bpfilter_mbox_request(struct sock *sk, int optname,
 				 char __user *optval,
@@ -39,7 +37,11 @@ static int bpfilter_mbox_request(struct sock *sk, int optname,
 			goto out;
 		}
 	}
-	if (bpfilter_ops.stop) {
+	if (bpfilter_ops.info.tgid &&
+	    !pid_has_task(bpfilter_ops.info.tgid, PIDTYPE_TGID))
+		bpfilter_umh_cleanup(&bpfilter_ops.info);
+
+	if (!bpfilter_ops.info.tgid) {
 		err = bpfilter_ops.start();
 		if (err)
 			goto out;
@@ -70,9 +72,8 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 static int __init bpfilter_sockopt_init(void)
 {
 	mutex_init(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
+	bpfilter_ops.info.tgid = NULL;
 	bpfilter_ops.info.driver_name = "bpfilter_umh";
-	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
 
 	return 0;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 14/15] umd: Remove exit_umh
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (12 preceding siblings ...)
  2020-06-29 20:06                                                                     ` [PATCH v2 13/15] bpfilter: Take advantage of the facilities of " Eric W. Biederman
@ 2020-06-29 20:07                                                                     ` Eric W. Biederman
  2020-06-29 20:08                                                                     ` [PATCH v2 15/15] umd: Stop using split_argv Eric W. Biederman
                                                                                       ` (3 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:07 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


The bpfilter code no longer uses the umd_info.cleanup callback.  This
callback is what exit_umh exists to call.  So remove exit_umh and all
of its associated bookkeeping.

Link: https://lkml.kernel.org/r/87bll6dlte.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched.h |  1 -
 include/linux/umd.h   | 16 ----------------
 kernel/exit.c         |  3 ---
 kernel/umd.c          | 28 ----------------------------
 4 files changed, 48 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 59d1e92bb88e..edb2020875ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1511,7 +1511,6 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
-#define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
diff --git a/include/linux/umd.h b/include/linux/umd.h
index edb1c62c62f4..71d8f4a41ad7 100644
--- a/include/linux/umd.h
+++ b/include/linux/umd.h
@@ -4,26 +4,10 @@
 #include <linux/umh.h>
 #include <linux/path.h>
 
-#ifdef CONFIG_BPFILTER
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-#else
-static inline void exit_umh(struct task_struct *tsk)
-{
-}
-#endif
-
 struct umd_info {
 	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umd_info *info);
 	struct path wd;
 	struct pid *tgid;
 };
diff --git a/kernel/exit.c b/kernel/exit.c
index b53107abdd31..42f079eb71e5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,7 +63,6 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
-#include <linux/umd.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -805,8 +804,6 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	if (group_dead)
-		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/umd.c b/kernel/umd.c
index c1e8eccaee76..4188b71de267 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -9,9 +9,6 @@
 #include <linux/task_work.h>
 #include <linux/umd.h>
 
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
-
 static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
 {
 	struct file_system_type *type;
@@ -134,7 +131,6 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->tgid = get_pid(task_tgid(current));
-	current->flags |= PF_UMH;
 	return 0;
 }
 
@@ -182,11 +178,6 @@ int fork_usermode_driver(struct umd_info *info)
 		goto out;
 
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
 out:
 	if (argv)
 		argv_free(argv);
@@ -194,23 +185,4 @@ int fork_usermode_driver(struct umd_info *info)
 }
 EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umd_info *info;
-	struct pid *tgid = task_tgid(tsk);
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->tgid == tgid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v2 15/15] umd: Stop using split_argv
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (13 preceding siblings ...)
  2020-06-29 20:07                                                                     ` [PATCH v2 14/15] umd: Remove exit_umh Eric W. Biederman
@ 2020-06-29 20:08                                                                     ` Eric W. Biederman
  2020-06-29 22:12                                                                     ` [PATCH v2 00/15] Make the user mode driver code a better citizen Alexei Starovoitov
                                                                                       ` (2 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds


There is exactly one argument so there is nothing to split.  All
argv_split does now is cause confusion and avoid the need for a cast
when passing a "const char *" string to call_usermodehelper_setup.

So avoid confusion and the possibility of an odd driver name causing
problems by just using a fixed argv array with a cast in the call to
call_usermodehelper_setup.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umd.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/kernel/umd.c b/kernel/umd.c
index 4188b71de267..ff79fb16d738 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -160,27 +160,21 @@ static void umd_cleanup(struct subprocess_info *info)
 int fork_usermode_driver(struct umd_info *info)
 {
 	struct subprocess_info *sub_info;
-	char **argv = NULL;
+	const char *argv[] = { info->driver_name, NULL };
 	int err;
 
 	if (WARN_ON_ONCE(info->tgid))
 		return -EBUSY;
 
 	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
-	if (!argv)
-		goto out;
-
-	sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL,
-					     GFP_KERNEL,
+	sub_info = call_usermodehelper_setup(info->driver_name,
+					     (char **)argv, NULL, GFP_KERNEL,
 					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
 
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 out:
-	if (argv)
-		argv_free(argv);
 	return err;
 }
 EXPORT_SYMBOL_GPL(fork_usermode_driver);
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-29  2:20                                                                         ` Tetsuo Handa
@ 2020-06-29 20:19                                                                           ` Eric W. Biederman
  2020-06-30  6:28                                                                             ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-29 20:19 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Alexei Starovoitov, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:

> On 2020/06/29 4:44, Alexei Starovoitov wrote:
>> But all the defensive programming kinda goes against general kernel style.
>> I wouldn't do it. Especially pr_info() ?!
>> Though I don't feel strongly about it.
>
> Honestly speaking, caller should check for errors and print appropriate
> messages. info->wd.mnt->mnt_root != info->wd.dentry indicates that something
> went wrong (maybe memory corruption). But other conditions are not fatal.
> That is, I consider even pr_info() here should be unnecessary.

They were all should-never-happen cases.  Which is why my patches do:
if (WARN_ON_ONCE(...))

That lets the caller know they messed up very clearly while still
providing a chance to continue.

If they were clearly corruption after which no one's kernel should ever
continue, BUG_ON would be appropriate.

>> I would like to generalize elf_header_check() a bit and call it
>> before doing blob_to_mnt() to make sure that all blobs are elf files only.
>> Supporting '#!/bin/bash' or other things as blobs seems wrong to me.

I vote for not worrying about things that have never happened, and are
obviously incorrect.

The only point of checks like that is to catch cases where other
developers misunderstand the interface.  When you get to something like
sysfs with lots and lots of users where it is hard to audit, there
is real value in sanity checks.  In something like this with very few
users, just making the code clear should be enough for people not to do
ridiculous things.


In any case Tetsuo, I will leave further sanity checks for you and Alexei
to work out.  It is beyond the scope of my patchset, and they are easy
enough to add as follow-on patches.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (14 preceding siblings ...)
  2020-06-29 20:08                                                                     ` [PATCH v2 15/15] umd: Stop using split_argv Eric W. Biederman
@ 2020-06-29 22:12                                                                     ` Alexei Starovoitov
  2020-06-30  1:13                                                                       ` Eric W. Biederman
  2020-06-30 12:29                                                                       ` Eric W. Biederman
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
  2020-07-08  5:20                                                                     ` [PATCH v2 00/15] " Luis Chamberlain
  17 siblings, 2 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-29 22:12 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds

On Mon, Jun 29, 2020 at 02:55:05PM -0500, Eric W. Biederman wrote:
> 
> I have tested thes changes by booting with the code compiled in and
> by killing "bpfilter_umh" and running iptables -vnL to restart
> the userspace driver.
> 
> I have compiled tested each change with and without CONFIG_BPFILTER
> enabled.

With
CONFIG_BPFILTER=y
CONFIG_BPFILTER_UMH=m
it doesn't build:

ERROR: modpost: "kill_pid_info" [net/bpfilter/bpfilter.ko] undefined!

I've added:
+EXPORT_SYMBOL(kill_pid_info);
to continue testing...

And then did:
while true; do iptables -L;rmmod bpfilter; done
 
Unfortunately sometimes 'rmmod bpfilter' hangs in wait_event().

I suspect patch 13 is somehow responsible:
+	if (tgid) {
+		kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
+		wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
+		bpfilter_umh_cleanup(info);
+	}

I cannot figure out why it hangs. Some sort of race ?
Since adding short delay between kill and wait makes it work.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-29 22:12                                                                     ` [PATCH v2 00/15] Make the user mode driver code a better citizen Alexei Starovoitov
@ 2020-06-30  1:13                                                                       ` Eric W. Biederman
  2020-06-30  6:16                                                                         ` Tetsuo Handa
  2020-06-30 12:29                                                                       ` Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-30  1:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Mon, Jun 29, 2020 at 02:55:05PM -0500, Eric W. Biederman wrote:
>> 
>> I have tested thes changes by booting with the code compiled in and
>> by killing "bpfilter_umh" and running iptables -vnL to restart
>> the userspace driver.
>> 
>> I have compiled tested each change with and without CONFIG_BPFILTER
>> enabled.
>
> With
> CONFIG_BPFILTER=y
> CONFIG_BPFILTER_UMH=m
> it doesn't build:
>
> ERROR: modpost: "kill_pid_info" [net/bpfilter/bpfilter.ko] undefined!
>
> I've added:
> +EXPORT_SYMBOL(kill_pid_info);
> to continue testing...
>
> And then did:
> while true; do iptables -L;rmmod bpfilter; done
>  
> Unfortunately sometimes 'rmmod bpfilter' hangs in wait_event().
>
> I suspect patch 13 is somehow responsible:
> +	if (tgid) {
> +		kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
> +		wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
> +		bpfilter_umh_cleanup(info);
> +	}
>
> I cannot figure out why it hangs. Some sort of race ?
> Since adding short delay between kill and wait makes it work.

Thanks.  I will take a look.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 10/15] exec: Remove do_execve_file
  2020-06-29 20:04                                                                     ` [PATCH v2 10/15] exec: Remove do_execve_file Eric W. Biederman
@ 2020-06-30  5:43                                                                       ` Christoph Hellwig
  2020-06-30 12:14                                                                         ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Christoph Hellwig @ 2020-06-30  5:43 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

FYI, this clashes badly with my exec rework.  I'd suggest you
drop everything touching exec here for now, and I can then
add the final file based exec removal to the end of my series.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-30  1:13                                                                       ` Eric W. Biederman
@ 2020-06-30  6:16                                                                         ` Tetsuo Handa
  0 siblings, 0 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-30  6:16 UTC (permalink / raw)
  To: Eric W. Biederman, Alexei Starovoitov
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Kees Cook,
	Andrew Morton, Alexei Starovoitov, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, LSM List, Casey Schaufler, Luis Chamberlain,
	Linus Torvalds

On 2020/06/30 10:13, Eric W. Biederman wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
>> On Mon, Jun 29, 2020 at 02:55:05PM -0500, Eric W. Biederman wrote:
>>>
>>> I have tested thes changes by booting with the code compiled in and
>>> by killing "bpfilter_umh" and running iptables -vnL to restart
>>> the userspace driver.
>>>
>>> I have compiled tested each change with and without CONFIG_BPFILTER
>>> enabled.
>>
>> With
>> CONFIG_BPFILTER=y
>> CONFIG_BPFILTER_UMH=m
>> it doesn't build:
>>
>> ERROR: modpost: "kill_pid_info" [net/bpfilter/bpfilter.ko] undefined!
>>
>> I've added:
>> +EXPORT_SYMBOL(kill_pid_info);
>> to continue testing...

kill_pid() is already exported.

>>
>> And then did:
>> while true; do iptables -L;rmmod bpfilter; done
>>  
>> Unfortunately sometimes 'rmmod bpfilter' hangs in wait_event().
>>
>> I suspect patch 13 is somehow responsible:
>> +	if (tgid) {
>> +		kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
>> +		wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
>> +		bpfilter_umh_cleanup(info);
>> +	}
>>
>> I cannot figure out why it hangs. Some sort of race ?
>> Since adding short delay between kill and wait makes it work.

Isn't it because there is a race window?  In do_exit(), exit_notify() first
calls do_notify_parent() -> do_notify_pidfd() -> wake_up_all(&pid->wait_pidfd),
and only some time after that calls release_task() -> __exit_signal() ->
__unhash_process() -> detach_pid().  In other words, we can't use
pid->wait_pidfd when pid_task() is the condition tested by wait_event().

Below are changes I suggest.

diff --git a/kernel/umd.c b/kernel/umd.c
index ff79fb16d738..f688813b8830 100644
--- a/kernel/umd.c
+++ b/kernel/umd.c
@@ -26,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 	if (IS_ERR(mnt))
 		return mnt;
 
-	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY | O_EXCL, 0700);
 	if (IS_ERR(file)) {
 		mntput(mnt);
 		return ERR_CAST(file);
@@ -52,16 +52,18 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 
 /**
  * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
- * @info: information about usermode driver
- * @data: a blob of bytes that can be executed as a file
- * @len:  The lentgh of the blob
+ * @info: information about usermode driver (shouldn't be NULL)
+ * @data: a blob of bytes that can be executed as a file (shouldn't be NULL)
+ * @len:  The lentgh of the blob (shouldn't be 0)
  *
  */
 int umd_load_blob(struct umd_info *info, const void *data, size_t len)
 {
 	struct vfsmount *mnt;
 
-	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+	if (!info || !info->driver_name || !data || !len)
+		return -EINVAL;
+	if (info->wd.dentry || info->wd.mnt)
 		return -EBUSY;
 
 	mnt = blob_to_mnt(data, len, info->driver_name);
@@ -76,15 +78,14 @@ EXPORT_SYMBOL_GPL(umd_load_blob);
 
 /**
  * umd_unload_blob - Disassociate @info from a previously loaded blob
- * @info: information about usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
  *
  */
 int umd_unload_blob(struct umd_info *info)
 {
-	if (WARN_ON_ONCE(!info->wd.mnt ||
-			 !info->wd.dentry ||
-			 info->wd.mnt->mnt_root != info->wd.dentry))
+	if (!info || !info->driver_name || !info->wd.dentry || !info->wd.mnt)
 		return -EINVAL;
+	BUG_ON(info->wd.mnt->mnt_root != info->wd.dentry);
 
 	kern_unmount(info->wd.mnt);
 	info->wd.mnt = NULL;
@@ -138,7 +139,7 @@ static void umd_cleanup(struct subprocess_info *info)
 {
 	struct umd_info *umd_info = info->data;
 
-	/* cleanup if umh_setup() was successful but exec failed */
+	/* cleanup if umd_setup() was successful but exec failed */
 	if (info->retval) {
 		fput(umd_info->pipe_to_umh);
 		fput(umd_info->pipe_from_umh);
@@ -163,7 +164,10 @@ int fork_usermode_driver(struct umd_info *info)
 	const char *argv[] = { info->driver_name, NULL };
 	int err;
 
-	if (WARN_ON_ONCE(info->tgid))
+	if (!info || !info->driver_name || !info->wd.dentry || !info->wd.mnt)
+		return -EINVAL;
+	BUG_ON(info->wd.mnt->mnt_root != info->wd.dentry);
+	if (info->tgid)
 		return -EBUSY;
 
 	err = -ENOMEM;
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 91474884ddb7..9dd70aacb81a 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -19,8 +19,13 @@ static void shutdown_umh(void)
 	struct pid *tgid = info->tgid;
 
 	if (tgid) {
-		kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
-		wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
+		kill_pid(tgid, SIGKILL, 1);
+		while (({ bool done;
+			  rcu_read_lock();
+			  done = !pid_task(tgid, PIDTYPE_TGID);
+			  rcu_read_unlock();
+			  done; }))
+			schedule_timeout_uninterruptible(1);
 		bpfilter_umh_cleanup(info);
 	}
 }


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-29 20:19                                                                           ` Eric W. Biederman
@ 2020-06-30  6:28                                                                             ` Tetsuo Handa
  2020-06-30 12:32                                                                               ` Eric W. Biederman
  2020-06-30 16:48                                                                               ` Alexei Starovoitov
  0 siblings, 2 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-30  6:28 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexei Starovoitov, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/06/30 5:19, Eric W. Biederman wrote:
> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
> 
>> On 2020/06/29 4:44, Alexei Starovoitov wrote:
>>> But all the defensive programming kinda goes against general kernel style.
>>> I wouldn't do it. Especially pr_info() ?!
>>> Though I don't feel strongly about it.
>>
>> Honestly speaking, caller should check for errors and print appropriate
>> messages. info->wd.mnt->mnt_root != info->wd.dentry indicates that something
>> went wrong (maybe memory corruption). But other conditions are not fatal.
>> That is, I consider even pr_info() here should be unnecessary.
> 
> They were all should never happen cases.  Which is why my patches do:
> if (WARN_ON_ONCE(...))

No. Fuzz testing (which uses panic_on_warn=1) will trivially hit them.
This bug was unfortunately not found by syzkaller because this path is
not easily reachable via syscall interface.

> 
> That let's the caller know the messed up very clearly while still
> providing a change to continue.
> 
> If they were clearly corruption no ones kernel should ever continue
> BUG_ON would be appropriate.

Please use BUG_ON() (to only corruption case) like I suggested in my updated diff.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 10/15] exec: Remove do_execve_file
  2020-06-30  5:43                                                                       ` Christoph Hellwig
@ 2020-06-30 12:14                                                                         ` Eric W. Biederman
  2020-06-30 13:38                                                                           ` Christoph Hellwig
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-30 12:14 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

Christoph Hellwig <hch@infradead.org> writes:

> FYI, this clashes badly with my exec rework.  I'd suggest you
> drop everything touching exec here for now, and I can then
> add the final file based exec removal to the end of my series.

I have looked and I haven't even seen any exec work.  Where can it be
found?

I have been working on and cleaning up exec for what, 3 cycles now.  There
is still quite a ways to go before it becomes possible to fix some of the
deep problems in exec.  Removing all of these broken exec special cases
is quite frankly the entire point of this patchset.

Sight unseen I suggest you send me your exec work and I can merge it
into my branch if we are going to conflict badly.

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-29 22:12                                                                     ` [PATCH v2 00/15] Make the user mode driver code a better citizen Alexei Starovoitov
  2020-06-30  1:13                                                                       ` Eric W. Biederman
@ 2020-06-30 12:29                                                                       ` Eric W. Biederman
  2020-06-30 13:21                                                                         ` Tetsuo Handa
  2020-06-30 16:52                                                                         ` Alexei Starovoitov
  1 sibling, 2 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-30 12:29 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Mon, Jun 29, 2020 at 02:55:05PM -0500, Eric W. Biederman wrote:
>> 
>> I have tested thes changes by booting with the code compiled in and
>> by killing "bpfilter_umh" and running iptables -vnL to restart
>> the userspace driver.
>> 
>> I have compiled tested each change with and without CONFIG_BPFILTER
>> enabled.
>
> With
> CONFIG_BPFILTER=y
> CONFIG_BPFILTER_UMH=m
> it doesn't build:
>
> ERROR: modpost: "kill_pid_info" [net/bpfilter/bpfilter.ko] undefined!
>
> I've added:
> +EXPORT_SYMBOL(kill_pid_info);
> to continue testing...

I am rather surprised; I thought Tetsuo had already compile tested
modules.


> I suspect patch 13 is somehow responsible:
> +	if (tgid) {
> +		kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
> +		wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
> +		bpfilter_umh_cleanup(info);
> +	}
>
> I cannot figure out why it hangs. Some sort of race ?
> Since adding short delay between kill and wait makes it work.

Having had a chance to sleep: kill_pid_info was a thinko, as was
!pid_task.  It should have been !pid_has_task as that takes the proper
rcu locking.

I don't know if that is going to be enough to fix the wait_event
but those are obvious bugs that need to be fixed.

diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 91474884ddb7..3e1874030daa 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -19,8 +19,8 @@ static void shutdown_umh(void)
        struct pid *tgid = info->tgid;
 
        if (tgid) {
-               kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
-               wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
+               kill_pid(tgid, SIGKILL, 1);
+               wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
                bpfilter_umh_cleanup(info);
        }
 }

> And then did:
> while true; do iptables -L;rmmod bpfilter; done
>  
> Unfortunately sometimes 'rmmod bpfilter' hangs in wait_event().

Hmm.  The wake up of tgid->wait_pidfd happens just before
release_task is called, so there is a race: it is possible to wake
up and then go back to sleep before pid_has_task becomes false.

So I think I need a friendly helper that does:

bool task_has_exited(struct pid *tgid)
{
	bool exited = false;

	rcu_read_lock();
        tsk = pid_task(tgid, PIDTYPE_TGID);
        exited = !!tsk;
        if (tsk) {
        	exited = !!tsk->exit_state;
out:
	rcu_unlock();
	return exited;
}

There should be a sensible way to do that.

Eric



^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-30  6:28                                                                             ` Tetsuo Handa
@ 2020-06-30 12:32                                                                               ` Eric W. Biederman
  2020-06-30 16:48                                                                               ` Alexei Starovoitov
  1 sibling, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-30 12:32 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Alexei Starovoitov, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:

> On 2020/06/30 5:19, Eric W. Biederman wrote:
>> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
>> 
>>> On 2020/06/29 4:44, Alexei Starovoitov wrote:
>>>> But all the defensive programming kinda goes against general kernel style.
>>>> I wouldn't do it. Especially pr_info() ?!
>>>> Though I don't feel strongly about it.
>>>
>>> Honestly speaking, caller should check for errors and print appropriate
>>> messages. info->wd.mnt->mnt_root != info->wd.dentry indicates that something
>>> went wrong (maybe memory corruption). But other conditions are not fatal.
>>> That is, I consider even pr_info() here should be unnecessary.
>> 
>> They were all should never happen cases.  Which is why my patches do:
>> if (WARN_ON_ONCE(...))
>
> No. Fuzz testing (which uses panic_on_warn=1) will trivially hit them.
> This bug was unfortunately not found by syzkaller because this path is
> not easily reachable via syscall interface.

Absolutely yes.  These are cases that should never happen.
They should never be reachable by userspace.

It is absolutely a bug if these are hit by userspace.

Now if fuzzers want horrible cases to be even more horrible and change a
nice friendly warn into a panic, that is their problem.  The issue is
whether they capture the information the rest of us need to fix it.

Eric



^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-30 12:29                                                                       ` Eric W. Biederman
@ 2020-06-30 13:21                                                                         ` Tetsuo Handa
  2020-07-02 13:08                                                                           ` Eric W. Biederman
  2020-06-30 16:52                                                                         ` Alexei Starovoitov
  1 sibling, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-30 13:21 UTC (permalink / raw)
  To: Eric W. Biederman, Alexei Starovoitov
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Kees Cook,
	Andrew Morton, Alexei Starovoitov, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, LSM List, Casey Schaufler, Luis Chamberlain,
	Linus Torvalds

On 2020/06/30 21:29, Eric W. Biederman wrote:
> Hmm.  The wake up happens just of tgid->wait_pidfd happens just before
> release_task is called so there is a race.  As it is possible to wake
> up and then go back to sleep before pid_has_task becomes false.

What is the reason we want to wait until pid_has_task() becomes false?

- wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
+ while (!wait_event_timeout(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID), 1));




By the way, commit 4a9d4b024a3102fc ("switch fput to task_work_add") says
that use of flush_delayed_fput() has to be careful. Al, is it safe to call
flush_delayed_fput() from blob_to_mnt() from umd_load_blob() (which might be
called from both kernel thread and from process context (e.g. init_module()
syscall by /sbin/insmod )) ?

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 10/15] exec: Remove do_execve_file
  2020-06-30 12:14                                                                         ` Eric W. Biederman
@ 2020-06-30 13:38                                                                           ` Christoph Hellwig
  2020-06-30 14:28                                                                             ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Christoph Hellwig @ 2020-06-30 13:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Christoph Hellwig, linux-kernel, David Miller,
	Greg Kroah-Hartman, Tetsuo Handa, Alexei Starovoitov, Kees Cook,
	Andrew Morton, Alexei Starovoitov, Al Viro, bpf, linux-fsdevel,
	Daniel Borkmann, Jakub Kicinski, Masahiro Yamada, Gary Lin,
	Bruno Meneguele, LSM List, Casey Schaufler, Luis Chamberlain,
	Linus Torvalds

On Tue, Jun 30, 2020 at 07:14:23AM -0500, Eric W. Biederman wrote:
> Christoph Hellwig <hch@infradead.org> writes:
> 
> > FYI, this clashes badly with my exec rework.  I'd suggest you
> > drop everything touching exec here for now, and I can then
> > add the final file based exec removal to the end of my series.
> 
> I have looked and I haven't even seen any exec work.  Where can it be
> found?
> 
> I have working and cleaning up exec for what 3 cycles now.  There is
> still quite a ways to go before it becomes possible to fix some of the
> deep problems in exec.  Removing all of these broken exec special cases
> is quite frankly the entire point of this patchset.
> 
> Sight unseen I suggest you send me your exec work and I can merge it
> into my branch if we are going to conflict badly.

https://lore.kernel.org/linux-fsdevel/20200627072704.2447163-1-hch@lst.de/T/#t

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 10/15] exec: Remove do_execve_file
  2020-06-30 13:38                                                                           ` Christoph Hellwig
@ 2020-06-30 14:28                                                                             ` Eric W. Biederman
  2020-06-30 16:55                                                                               ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-06-30 14:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

Christoph Hellwig <hch@infradead.org> writes:

> On Tue, Jun 30, 2020 at 07:14:23AM -0500, Eric W. Biederman wrote:
>> Christoph Hellwig <hch@infradead.org> writes:
>> 
>> > FYI, this clashes badly with my exec rework.  I'd suggest you
>> > drop everything touching exec here for now, and I can then
>> > add the final file based exec removal to the end of my series.
>> 
>> I have looked and I haven't even seen any exec work.  Where can it be
>> found?
>> 
>> I have working and cleaning up exec for what 3 cycles now.  There is
>> still quite a ways to go before it becomes possible to fix some of the
>> deep problems in exec.  Removing all of these broken exec special cases
>> is quite frankly the entire point of this patchset.
>> 
>> Sight unseen I suggest you send me your exec work and I can merge it
>> into my branch if we are going to conflict badly.
>
> https://lore.kernel.org/linux-fsdevel/20200627072704.2447163-1-hch@lst.de/T/#t


Looking at your final patch I do not like the construct.

static int __do_execveat(int fd, struct filename *filename,
 		const char __user *const __user *argv,
 		const char __user *const __user *envp,
		const char *const *kernel_argv,
		const char *const *kernel_envp,
 		int flags, struct file *file);


It results in a function that is full of:
	if (kernel_argv) {
        	// For kernel_exeveat 
		...
	} else {
        	// For ordinary exeveat
        	
        }

Which, while understandable, I do not think results in good long-term
maintainable code.

The current file parameter that I am getting rid of in my patchset is
a stark example of that.  Because of all of the ifs no one realized
that the code had its file reference counting wrong (among other
bugs).

I think this is important to address as exec has already passed
the point where people can fix all of the bugs in exec because
the code is so hairy.

I think to be maintainable and clear the exec code is going to
need to look something like:

static int bprm_execveat(int fd, struct filename *filename,
			struct bprm *bprm, int flags);

int kernel_execve(const char *filename,
		  const char *const *argv, const char *const *envp, int flags)
{
	bprm = kzalloc(sizeof(*pbrm), GFP_KERNEL);
        bprm->argc = count_kernel_strings(argv);
        bprm->envc = count_kernel_strings(envp);
        prepare_arg_pages(bprm);
        copy_strings_kernel(bprm->envc, envp, bprm);
        copy_strings_kernel(bprm->argc, argc, bprm);
	ret = bprm_execveat(AT_FDCWD, filename, bprm);
        free_bprm(bprm);
        return ret;
}

int do_exeveat(int fd, const char *filename,
		const char __user *const __user *argv,
                const char __user *const __user *envp, int flags)
{
	bprm = kzalloc(sizeof(*pbrm), GFP_KERNEL);
        bprm->argc = count_strings(argv);
        bprm->envc = count_strings(envp);
        prepare_arg_pages(bprm);
        copy_strings(bprm->envc, envp, bprm);
        copy_strings(bprm->argc, argc, bprm);
	ret = bprm_execveat(fd, filename, bprm);
        free_bprm(bprm);
        return ret;
}

More work is required obviously to make the code above really work but
when the dust clears a structure like that doesn't have funny edge cases
that can hide bugs and make it tricky to change the code.

Eric




^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-30  6:28                                                                             ` Tetsuo Handa
  2020-06-30 12:32                                                                               ` Eric W. Biederman
@ 2020-06-30 16:48                                                                               ` Alexei Starovoitov
  2020-06-30 21:54                                                                                 ` Tetsuo Handa
  1 sibling, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-30 16:48 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Tue, Jun 30, 2020 at 03:28:49PM +0900, Tetsuo Handa wrote:
> On 2020/06/30 5:19, Eric W. Biederman wrote:
> > Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
> > 
> >> On 2020/06/29 4:44, Alexei Starovoitov wrote:
> >>> But all the defensive programming kinda goes against general kernel style.
> >>> I wouldn't do it. Especially pr_info() ?!
> >>> Though I don't feel strongly about it.
> >>
> >> Honestly speaking, caller should check for errors and print appropriate
> >> messages. info->wd.mnt->mnt_root != info->wd.dentry indicates that something
> >> went wrong (maybe memory corruption). But other conditions are not fatal.
> >> That is, I consider even pr_info() here should be unnecessary.
> > 
> > They were all should never happen cases.  Which is why my patches do:
> > if (WARN_ON_ONCE(...))
> 
> No. Fuzz testing (which uses panic_on_warn=1) will trivially hit them.

I don't believe that's true.
Please show fuzzing stack trace to prove your point.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-30 12:29                                                                       ` Eric W. Biederman
  2020-06-30 13:21                                                                         ` Tetsuo Handa
@ 2020-06-30 16:52                                                                         ` Alexei Starovoitov
  2020-07-01 17:12                                                                           ` Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-30 16:52 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds

On Tue, Jun 30, 2020 at 07:29:34AM -0500, Eric W. Biederman wrote:
> 
> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
> index 91474884ddb7..3e1874030daa 100644
> --- a/net/bpfilter/bpfilter_kern.c
> +++ b/net/bpfilter/bpfilter_kern.c
> @@ -19,8 +19,8 @@ static void shutdown_umh(void)
>         struct pid *tgid = info->tgid;
>  
>         if (tgid) {
> -               kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
> -               wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
> +               kill_pid(tgid, SIGKILL, 1);
> +               wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
>                 bpfilter_umh_cleanup(info);
>         }
>  }
> 
> > And then did:
> > while true; do iptables -L;rmmod bpfilter; done
> >  
> > Unfortunately sometimes 'rmmod bpfilter' hangs in wait_event().
> 
> Hmm.  The wake up happens just of tgid->wait_pidfd happens just before
> release_task is called so there is a race.  As it is possible to wake
> up and then go back to sleep before pid_has_task becomes false.
> 
> So I think I need a friendly helper that does:
> 
> bool task_has_exited(struct pid *tgid)
> {
> 	bool exited = false;
> 
> 	rcu_read_lock();
>         tsk = pid_task(tgid, PIDTYPE_TGID);
>         exited = !!tsk;
>         if (tsk) {
>         	exited = !!tsk->exit_state;
> out:
> 	rcu_unlock();
> 	return exited;
> }

All makes sense to me.
If I understood the race condition such helper should indeed solve it.
Are you going to add such patch to your series?
I'll proceed with my work on top of your series and will ignore this
race for now, but I think it should be fixed before we land this set
into multiple trees.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 10/15] exec: Remove do_execve_file
  2020-06-30 14:28                                                                             ` Eric W. Biederman
@ 2020-06-30 16:55                                                                               ` Alexei Starovoitov
  0 siblings, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-30 16:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Christoph Hellwig, linux-kernel, David Miller,
	Greg Kroah-Hartman, Tetsuo Handa, Kees Cook, Andrew Morton,
	Alexei Starovoitov, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	LSM List, Casey Schaufler, Luis Chamberlain, Linus Torvalds

On Tue, Jun 30, 2020 at 09:28:10AM -0500, Eric W. Biederman wrote:
> Christoph Hellwig <hch@infradead.org> writes:
> 
> > On Tue, Jun 30, 2020 at 07:14:23AM -0500, Eric W. Biederman wrote:
> >> Christoph Hellwig <hch@infradead.org> writes:
> >> 
> >> > FYI, this clashes badly with my exec rework.  I'd suggest you
> >> > drop everything touching exec here for now, and I can then
> >> > add the final file based exec removal to the end of my series.
> >> 
> >> I have looked and I haven't even seen any exec work.  Where can it be
> >> found?
> >> 
> >> I have working and cleaning up exec for what 3 cycles now.  There is
> >> still quite a ways to go before it becomes possible to fix some of the
> >> deep problems in exec.  Removing all of these broken exec special cases
> >> is quite frankly the entire point of this patchset.
> >> 
> >> Sight unseen I suggest you send me your exec work and I can merge it
> >> into my branch if we are going to conflict badly.
> >
> > https://lore.kernel.org/linux-fsdevel/20200627072704.2447163-1-hch@lst.de/T/#t
> 
> 
> Looking at your final patch I do not like the construct.
> 
> static int __do_execveat(int fd, struct filename *filename,
>  		const char __user *const __user *argv,
>  		const char __user *const __user *envp,
> 		const char *const *kernel_argv,
> 		const char *const *kernel_envp,
>  		int flags, struct file *file);
> 
> 
> It results in a function that is full of:
> 	if (kernel_argv) {
>         	// For kernel_exeveat 
> 		...
> 	} else {
>         	// For ordinary exeveat
>         	
>         }
> 
> Which while understandable.  I do not think results in good long term
> maintainble code.
> 
> The current file paramter that I am getting rid of in my patchset is
> a stark example of that.  Because of all of the if's no one realized
> that the code had it's file reference counting wrong (amoung other
> bugs).
> 
> I think this is important to address as exec has already passed
> the point where people can fix all of the bugs in exec because
> the code is so hairy.
> 
> I think to be maintainable and clear the code exec code is going to
> need to look something like:
> 
> static int bprm_execveat(int fd, struct filename *filename,
> 			struct bprm *bprm, int flags);
> 
> int kernel_execve(const char *filename,
> 		  const char *const *argv, const char *const *envp, int flags)
> {
> 	bprm = kzalloc(sizeof(*pbrm), GFP_KERNEL);
>         bprm->argc = count_kernel_strings(argv);
>         bprm->envc = count_kernel_strings(envp);
>         prepare_arg_pages(bprm);
>         copy_strings_kernel(bprm->envc, envp, bprm);
>         copy_strings_kernel(bprm->argc, argc, bprm);
> 	ret = bprm_execveat(AT_FDCWD, filename, bprm);
>         free_bprm(bprm);
>         return ret;
> }
> 
> int do_exeveat(int fd, const char *filename,
> 		const char __user *const __user *argv,
>                 const char __user *const __user *envp, int flags)
> {
> 	bprm = kzalloc(sizeof(*pbrm), GFP_KERNEL);
>         bprm->argc = count_strings(argv);
>         bprm->envc = count_strings(envp);
>         prepare_arg_pages(bprm);
>         copy_strings(bprm->envc, envp, bprm);
>         copy_strings(bprm->argc, argc, bprm);
> 	ret = bprm_execveat(fd, filename, bprm);
>         free_bprm(bprm);
>         return ret;
> }
> 
> More work is required obviously to make the code above really work but
> when the dust clears a structure like that doesn't have funny edge cases
> that can hide bugs and make it tricky to change the code.

+1 to the approach.
I think Christoph's work needs to be on top of Eric's.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 05/15] umh: Separate the user mode driver and the user mode helper support
  2020-06-29 20:00                                                                     ` [PATCH v2 05/15] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
@ 2020-06-30 16:58                                                                       ` Linus Torvalds
  2020-07-01 17:18                                                                         ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Linus Torvalds @ 2020-06-30 16:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux Kernel Mailing List, David Miller, Greg Kroah-Hartman,
	Tetsuo Handa, Alexei Starovoitov, Kees Cook, Andrew Morton,
	Alexei Starovoitov, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	LSM List, Casey Schaufler, Luis Chamberlain

On Mon, Jun 29, 2020 at 1:05 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> This makes it clear which code is part of the core user mode
> helper support and which code is needed to implement user mode
> drivers.
>
>  kernel/umd.c             | 146 +++++++++++++++++++++++++++++++++++++++
>  kernel/umh.c             | 139 -------------------------------------

I certainly don't object to the split, but I hate the name.

We have uml, umd and umh for user mode {linux, drivers, helper}
respectively. And honestly, I don't see the point in using an obscure
and unreadable TLA for something like this.

I really don't think it would hurt to write out even the full name
with "usermode_driver.c" or something like that, would it?

Then "umd" could continue to be used as a prefix for the helper
functions, by all means, but if we start renaming files, can we do it
properly?

                   Linus

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-30 16:48                                                                               ` Alexei Starovoitov
@ 2020-06-30 21:54                                                                                 ` Tetsuo Handa
  2020-06-30 21:57                                                                                   ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-30 21:54 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/07/01 1:48, Alexei Starovoitov wrote:
> On Tue, Jun 30, 2020 at 03:28:49PM +0900, Tetsuo Handa wrote:
>> On 2020/06/30 5:19, Eric W. Biederman wrote:
>>> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
>>>
>>>> On 2020/06/29 4:44, Alexei Starovoitov wrote:
>>>>> But all the defensive programming kinda goes against general kernel style.
>>>>> I wouldn't do it. Especially pr_info() ?!
>>>>> Though I don't feel strongly about it.
>>>>
>>>> Honestly speaking, caller should check for errors and print appropriate
>>>> messages. info->wd.mnt->mnt_root != info->wd.dentry indicates that something
>>>> went wrong (maybe memory corruption). But other conditions are not fatal.
>>>> That is, I consider even pr_info() here should be unnecessary.
>>>
>>> They were all should never happen cases.  Which is why my patches do:
>>> if (WARN_ON_ONCE(...))
>>
>> No. Fuzz testing (which uses panic_on_warn=1) will trivially hit them.
> 
> I don't believe that's true.
> Please show fuzzing stack trace to prove your point.
> 

Please find links containing "WARNING" from https://syzkaller.appspot.com/upstream . ;-)

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-30 21:54                                                                                 ` Tetsuo Handa
@ 2020-06-30 21:57                                                                                   ` Alexei Starovoitov
  2020-06-30 22:58                                                                                     ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-06-30 21:57 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Eric W. Biederman, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On Tue, Jun 30, 2020 at 2:55 PM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2020/07/01 1:48, Alexei Starovoitov wrote:
> > On Tue, Jun 30, 2020 at 03:28:49PM +0900, Tetsuo Handa wrote:
> >> On 2020/06/30 5:19, Eric W. Biederman wrote:
> >>> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
> >>>
> >>>> On 2020/06/29 4:44, Alexei Starovoitov wrote:
> >>>>> But all the defensive programming kinda goes against general kernel style.
> >>>>> I wouldn't do it. Especially pr_info() ?!
> >>>>> Though I don't feel strongly about it.
> >>>>
> >>>> Honestly speaking, caller should check for errors and print appropriate
> >>>> messages. info->wd.mnt->mnt_root != info->wd.dentry indicates that something
> >>>> went wrong (maybe memory corruption). But other conditions are not fatal.
> >>>> That is, I consider even pr_info() here should be unnecessary.
> >>>
> >>> They were all should never happen cases.  Which is why my patches do:
> >>> if (WARN_ON_ONCE(...))
> >>
> >> No. Fuzz testing (which uses panic_on_warn=1) will trivially hit them.
> >
> > I don't believe that's true.
> > Please show fuzzing stack trace to prove your point.
> >
>
> Please find links containing "WARNING" from https://syzkaller.appspot.com/upstream . ;-)

Is it a joke? Do you understand how syzbot works?
If so, please explain how it can invoke umd_* interface.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH 00/14] Make the user mode driver code a better citizen
  2020-06-30 21:57                                                                                   ` Alexei Starovoitov
@ 2020-06-30 22:58                                                                                     ` Tetsuo Handa
  0 siblings, 0 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-06-30 22:58 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Linus Torvalds, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler

On 2020/07/01 6:57, Alexei Starovoitov wrote:
>>>>> They were all should never happen cases.  Which is why my patches do:
>>>>> if (WARN_ON_ONCE(...))
>>>>
>>>> No. Fuzz testing (which uses panic_on_warn=1) will trivially hit them.
>>>
>>> I don't believe that's true.
>>> Please show fuzzing stack trace to prove your point.
>>>
>>
>> Please find links containing "WARNING" from https://syzkaller.appspot.com/upstream . ;-)
> 
> Is it a joke? Do you understand how syzbot works?
> If so, please explain how it can invoke umd_* interface.
> 

Currently syzkaller can't invoke the umd_* interface because this interface is used only by
the bpfilter_umh module. But I can imagine that someone starts using this interface in a way
that syzkaller can somehow invoke. Thus, how can it be a joke? I don't understand your question.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-30 16:52                                                                         ` Alexei Starovoitov
@ 2020-07-01 17:12                                                                           ` Eric W. Biederman
  0 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-01 17:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Tue, Jun 30, 2020 at 07:29:34AM -0500, Eric W. Biederman wrote:
>> 
>> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
>> index 91474884ddb7..3e1874030daa 100644
>> --- a/net/bpfilter/bpfilter_kern.c
>> +++ b/net/bpfilter/bpfilter_kern.c
>> @@ -19,8 +19,8 @@ static void shutdown_umh(void)
>>         struct pid *tgid = info->tgid;
>>  
>>         if (tgid) {
>> -               kill_pid_info(SIGKILL, SEND_SIG_PRIV, tgid);
>> -               wait_event(tgid->wait_pidfd, !pid_task(tgid, PIDTYPE_TGID));
>> +               kill_pid(tgid, SIGKILL, 1);
>> +               wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
>>                 bpfilter_umh_cleanup(info);
>>         }
>>  }
>> 
>> > And then did:
>> > while true; do iptables -L;rmmod bpfilter; done
>> >  
>> > Unfortunately sometimes 'rmmod bpfilter' hangs in wait_event().
>> 
>> Hmm.  The wake up happens just of tgid->wait_pidfd happens just before
>> release_task is called so there is a race.  As it is possible to wake
>> up and then go back to sleep before pid_has_task becomes false.
>> 
>> So I think I need a friendly helper that does:
>> 
>> bool task_has_exited(struct pid *tgid)
>> {
>> 	bool exited = false;
>> 
>> 	rcu_read_lock();
>>         tsk = pid_task(tgid, PIDTYPE_TGID);
>>         exited = !!tsk;
>>         if (tsk) {
>>         	exited = !!tsk->exit_state;
>> out:
>> 	rcu_unlock();
>> 	return exited;
>> }
>
> All makes sense to me.
> If I understood the race condition such helper should indeed solve it.
> Are you going to add such patch to your series?
> I'll proceed with my work on top of your series and will ignore this
> race for now, but I think it should be fixed before we land this set
> into multiple trees.

Yes. I am just finishing it up now.

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 05/15] umh: Separate the user mode driver and the user mode helper support
  2020-06-30 16:58                                                                       ` Linus Torvalds
@ 2020-07-01 17:18                                                                         ` Eric W. Biederman
  2020-07-01 17:42                                                                           ` Alexei Starovoitov
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-01 17:18 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, David Miller, Greg Kroah-Hartman,
	Tetsuo Handa, Alexei Starovoitov, Kees Cook, Andrew Morton,
	Alexei Starovoitov, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	LSM List, Casey Schaufler, Luis Chamberlain

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Mon, Jun 29, 2020 at 1:05 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> This makes it clear which code is part of the core user mode
>> helper support and which code is needed to implement user mode
>> drivers.
>>
>>  kernel/umd.c             | 146 +++++++++++++++++++++++++++++++++++++++
>>  kernel/umh.c             | 139 -------------------------------------
>
> I certainly don't object to the split, but I hate the name.
>
> We have uml, umd and umh for user mode {linux, drivers, helper}
> respectively.And honestly, I don't see the point in using an obscure
> and unreadable TLA for something like this.
>
> I really don't think it would hurt to write out even the full name
> with "usermode_driver.c" or something like that, would it?
>
> Then "umd" could be continued to be used as a prefix for the helper
> functions, by all means, but if we startv renaming files, can we do it
> properly?

I will take care of it.  I have to respin the patchset for a silly bug anyways.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 05/15] umh: Separate the user mode driver and the user mode helper support
  2020-07-01 17:18                                                                         ` Eric W. Biederman
@ 2020-07-01 17:42                                                                           ` Alexei Starovoitov
  0 siblings, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-07-01 17:42 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, Linux Kernel Mailing List, David Miller,
	Greg Kroah-Hartman, Tetsuo Handa, Kees Cook, Andrew Morton,
	Alexei Starovoitov, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	LSM List, Casey Schaufler, Luis Chamberlain

On Wed, Jul 1, 2020 at 10:23 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Linus Torvalds <torvalds@linux-foundation.org> writes:
>
> > On Mon, Jun 29, 2020 at 1:05 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> This makes it clear which code is part of the core user mode
> >> helper support and which code is needed to implement user mode
> >> drivers.
> >>
> >>  kernel/umd.c             | 146 +++++++++++++++++++++++++++++++++++++++
> >>  kernel/umh.c             | 139 -------------------------------------
> >
> > I certainly don't object to the split, but I hate the name.
> >
> > We have uml, umd and umh for user mode {linux, drivers, helper}
> > respectively.And honestly, I don't see the point in using an obscure
> > and unreadable TLA for something like this.
> >
> > I really don't think it would hurt to write out even the full name
> > with "usermode_driver.c" or something like that, would it?
> >
> > Then "umd" could be continued to be used as a prefix for the helper
> > functions, by all means, but if we startv renaming files, can we do it
> > properly?
>
> I will take care of it.  I have to respin the patchset for a silly bug anyways.

I guess with the header name too: umd.h -> usermode_driver.h ?

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-06-30 13:21                                                                         ` Tetsuo Handa
@ 2020-07-02 13:08                                                                           ` Eric W. Biederman
  2020-07-02 13:40                                                                             ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 13:08 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:

> On 2020/06/30 21:29, Eric W. Biederman wrote:
>> Hmm.  The wake up happens just of tgid->wait_pidfd happens just before
>> release_task is called so there is a race.  As it is possible to wake
>> up and then go back to sleep before pid_has_task becomes false.
>
> What is the reason we want to wait until pid_has_task() becomes false?
>
> - wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
> + while (!wait_event_timeout(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID), 1));

So that it is safe to call bpfilter_umh_cleanup.  The previous code
performed the wait by having a callback in do_exit.

It might be possible to call bpfilter_umh_cleanup early but I have not done
that analysis.

To perform the test correctly what I have right now is:

bool thread_group_exited(struct pid *pid)
{
	struct task_struct *tsk;
	bool exited;

	rcu_read_lock();
	tsk = pid_task(pid, PIDTYPE_PID);
	exited = !tsk || (READ_ONCE(tsk->exit_state) && thread_group_empty(tsk));
	rcu_read_unlock();

	return exited;
}

Which is factored out of pidfd_poll.  Which means that this won't be
something that the bpfilter code has to maintain.  That seems to be a
fundamentally good facility to have regardless of bpfilter.

I will post the whole thing in a bit once I have a chance to dot my i's
and cross my t's.

> By the way, commit 4a9d4b024a3102fc ("switch fput to task_work_add") says
> that use of flush_delayed_fput() has to be careful. Al, is it safe to call
> flush_delayed_fput() from blob_to_mnt() from umd_load_blob() (which might be
> called from both kernel thread and from process context (e.g. init_module()
> syscall by /sbin/insmod )) ?

And __fput_sync needs to be even more careful.
umd_load_blob is called in these changes without any locks held.

We fundamentally (that is, in any correct version of this code) need to flush
the file descriptor before we call exec, or exec cannot open it
read-only, denying all writes from any other opens.

The use case of flush_delayed_fput is exactly the same as that used
when loading the initramfs.

Eric





^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-07-02 13:08                                                                           ` Eric W. Biederman
@ 2020-07-02 13:40                                                                             ` Tetsuo Handa
  2020-07-02 16:02                                                                               ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-07-02 13:40 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

On 2020/07/02 22:08, Eric W. Biederman wrote:
> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
> 
>> On 2020/06/30 21:29, Eric W. Biederman wrote:
>>> Hmm.  The wake up happens just of tgid->wait_pidfd happens just before
>>> release_task is called so there is a race.  As it is possible to wake
>>> up and then go back to sleep before pid_has_task becomes false.
>>
>> What is the reason we want to wait until pid_has_task() becomes false?
>>
>> - wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
>> + while (!wait_event_timeout(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID), 1));
> 
> So that it is safe to call bpfilter_umh_cleanup.  The previous code
> performed the wait by having a callback in do_exit.

But bpfilter_umh_cleanup() does only

	fput(info->pipe_to_umh);
	fput(info->pipe_from_umh);
	put_pid(info->tgid);
	info->tgid = NULL;

which is (I think) already safe regardless of the usermode process because
bpfilter_umh_cleanup() merely closes one side of two pipes used between
two processes and forgets about the usermode process.

> 
> It might be possible to call bpf_umh_cleanup early but I have not done
> that analysis.
> 
> To perform the test correctly what I have right now is:

Waiting for the termination of a SIGKILLed usermode process is not
that simple. If a usermode process was killed by the OOM killer, it
might take minutes for the killed process to reach do_exit() due to
invisible memory allocation dependency chain. Since the OOM killer
kicks the OOM reaper, and the OOM reaper forgets about the killed
process after one second if mmap_sem could not be held (in order to
avoid OOM deadlock), the OOM situation will be eventually solved; but
there is no guarantee that the killed process can reach do_exit()
in a short period.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-07-02 13:40                                                                             ` Tetsuo Handa
@ 2020-07-02 16:02                                                                               ` Eric W. Biederman
  2020-07-03 13:19                                                                                 ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:02 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:

> On 2020/07/02 22:08, Eric W. Biederman wrote:
>> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
>> 
>>> On 2020/06/30 21:29, Eric W. Biederman wrote:
>>>> Hmm.  The wake up happens just of tgid->wait_pidfd happens just before
>>>> release_task is called so there is a race.  As it is possible to wake
>>>> up and then go back to sleep before pid_has_task becomes false.
>>>
>>> What is the reason we want to wait until pid_has_task() becomes false?
>>>
>>> - wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
>>> + while (!wait_event_timeout(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID), 1));
>> 
>> So that it is safe to call bpfilter_umh_cleanup.  The previous code
>> performed the wait by having a callback in do_exit.
>
> But bpfilter_umh_cleanup() does only
>
> 	fput(info->pipe_to_umh);
> 	fput(info->pipe_from_umh);
> 	put_pid(info->tgid);
> 	info->tgid = NULL;
>
> which is (I think) already safe regardless of the usermode process because
> bpfilter_umh_cleanup() merely closes one side of two pipes used between
> two processes and forgets about the usermode process.

It is not safe.

Barring bugs there is only one use of shutdown_umh that matters.  The one
in fini_umh.  The use of the file by the mm must be finished before
umd_unload_blob.  AKA unmount.  Which completely frees the filesystem.

>> It might be possible to call bpf_umh_cleanup early but I have not done
>> that analysis.
>> 
>> To perform the test correctly what I have right now is:
>
> Waiting for the termination of a SIGKILLed usermode process is not
> such simple.

The waiting is that simple.

You are correct it might not be a quick process.

A good general principle is to start with something simple and correct
for what it does, and then to make it more complicated when real world
cases show up, and it can be understood what the real challenges are.

I am not going to merge known broken code but I am also not going to
overcomplicate it.

Dealing with very rare and pathological cases that are not handled or
considered today is out of scope for my patchset.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 00/16] Make the user mode driver code a better citizen
  2020-06-29 19:55                                                                   ` [PATCH v2 00/15] " Eric W. Biederman
                                                                                       ` (15 preceding siblings ...)
  2020-06-29 22:12                                                                     ` [PATCH v2 00/15] Make the user mode driver code a better citizen Alexei Starovoitov
@ 2020-07-02 16:40                                                                     ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 01/16] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
                                                                                         ` (17 more replies)
  2020-07-08  5:20                                                                     ` [PATCH v2 00/15] " Luis Chamberlain
  17 siblings, 18 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:40 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner


This is the third round of my changeset to split the user mode driver
code from the user mode helper code, and to make the code use common
facilities to get things done instead of recreating them just
for the user mode driver code.

I have split the changes into small enough pieces so they should be
easily readable and testable.

The changes lean into the preexisting interfaces in the kernel and
remove special cases for user mode driver code in favor of solutions
that don't need special cases.  This results in smaller code with fewer
bugs.

At a practical level this removes the maintenance burden of the user
mode drivers from the user mode helper code and from exec as the special
cases are removed.

Similarly the LSM interaction bugs are fixed by not having unnecessary
special cases for user mode drivers.

I have tested these changes by booting with the code compiled in and
by killing "bpfilter_umh" and running "iptables -vnL" to restart
the userspace driver, also by running "while true; do iptables -L;rmmod
bpfilter; done" to verify the module load and unload work properly.

I have compile tested each change with and without CONFIG_BPFILTER
enabled.

From v2 to v3 I have made two significant changes.
- I factored thread_group_exited out of pidfd_poll to allow the test
  to be used by the bpfilter code.
- I renamed umd.c and umd.h to usermode_driver.c and usermode_driver.h
  respectively.

I made a few very small changes from v1 to v2:
- Updated the function name in a comment when the function is renamed
- Moved some more code so that the !CONFIG_BPFILTER case continues
  to compile when I moved the code into umd.c
- A fix for the module loading case to really flush the file descriptor.
- Removed split_argv entirely from fork_usermode_driver.
  There was nothing to split so it was just confusing.

Please let me know if you see any bugs.  Once the code review is
finished I plan to place the code in a non-rebasing branch
so I can pull it into my tree and so it can also be pulled into
the bpf-next tree.

v1: https://lkml.kernel.org/r/87pn9mgfc2.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87bll17ili.fsf_-_@x220.int.ebiederm.org

Eric W. Biederman (16):
      umh: Capture the pid in umh_pipe_setup
      umh: Move setting PF_UMH into umh_pipe_setup
      umh: Rename the user mode driver helpers for clarity
      umh: Remove call_usermodehelper_setup_file.
      umh: Separate the user mode driver and the user mode helper support
      umd: For clarity rename umh_info umd_info
      umd: Rename umd_info.cmdline umd_info.driver_name
      umd: Transform fork_usermode_blob into fork_usermode_driver
      umh: Stop calling do_execve_file
      exec: Remove do_execve_file
      bpfilter: Move bpfilter_umh back into init data
      umd: Track user space drivers with struct pid
      exit: Factor thread_group_exited out of pidfd_poll
      bpfilter: Take advantage of the facilities of struct pid
      umd: Remove exit_umh
      umd: Stop using split_argv

 fs/exec.c                        |  38 ++------
 include/linux/binfmts.h          |   1 -
 include/linux/bpfilter.h         |   7 +-
 include/linux/sched.h            |   9 --
 include/linux/sched/signal.h     |   2 +
 include/linux/umh.h              |  15 ----
 include/linux/usermode_driver.h  |  18 ++++
 kernel/Makefile                  |   1 +
 kernel/exit.c                    |  25 +++++-
 kernel/fork.c                    |   6 +-
 kernel/umh.c                     | 171 +-----------------------------------
 kernel/usermode_driver.c         | 182 +++++++++++++++++++++++++++++++++++++++
 net/bpfilter/bpfilter_kern.c     |  38 ++++----
 net/bpfilter/bpfilter_umh_blob.S |   2 +-
 net/ipv4/bpfilter/sockopt.c      |  20 +++--
 15 files changed, 275 insertions(+), 260 deletions(-)


Eric W. Biederman (15):
      umh: Capture the pid in umh_pipe_setup
      umh: Move setting PF_UMH into umh_pipe_setup
      umh: Rename the user mode driver helpers for clarity
      umh: Remove call_usermodehelper_setup_file.
      umh: Separate the user mode driver and the user mode helper support
      umd: For clarity rename umh_info umd_info
      umd: Rename umd_info.cmdline umd_info.driver_name
      umd: Transform fork_usermode_blob into fork_usermode_driver
      umh: Stop calling do_execve_file
      exec: Remove do_execve_file
      bpfilter: Move bpfilter_umh back into init data
      umd: Track user space drivers with struct pid
      bpfilter: Take advantage of the facilities of struct pid
      umd: Remove exit_umh
      umd: Stop using split_argv

 fs/exec.c                        |  38 ++------
 include/linux/binfmts.h          |   1 -
 include/linux/bpfilter.h         |   7 +-
 include/linux/sched.h            |   9 --
 include/linux/umd.h              |  18 ++++
 include/linux/umh.h              |  15 ----
 kernel/Makefile                  |   1 +
 kernel/exit.c                    |   1 -
 kernel/umd.c                     | 182 +++++++++++++++++++++++++++++++++++++++
 kernel/umh.c                     | 171 +-----------------------------------
 net/bpfilter/bpfilter_kern.c     |  38 ++++----
 net/bpfilter/bpfilter_umh_blob.S |   2 +-
 net/ipv4/bpfilter/sockopt.c      |  20 +++--
 13 files changed, 248 insertions(+), 255 deletions(-)


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 01/16] umh: Capture the pid in umh_pipe_setup
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 02/16] umh: Move setting PF_UMH into umh_pipe_setup Eric W. Biederman
                                                                                         ` (16 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

The pid in struct subprocess_info is only used by umh_clean_and_save_pid to
write the pid into umh_info.

Instead always capture the pid on struct umh_info in umh_pipe_setup, removing
code that is specific to user mode drivers from the common user path of
user mode helpers.

v1: https://lkml.kernel.org/r/87h7uygf9i.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/875zb97iix.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h | 1 -
 kernel/umh.c        | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 0c08de356d0d..aae16a0ebd0f 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -25,7 +25,6 @@ struct subprocess_info {
 	struct file *file;
 	int wait;
 	int retval;
-	pid_t pid;
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..c2a582b3a2bf 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,7 +102,6 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	sub_info->pid = task_pid_nr(current);
 	if (sub_info->file) {
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
@@ -468,6 +467,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
 	return 0;
 }
 
@@ -476,13 +476,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info)
 	struct umh_info *umh_info = info->data;
 
 	/* cleanup if umh_pipe_setup() was successful but exec failed */
-	if (info->pid && info->retval) {
+	if (info->retval) {
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
 
 	argv_free(info->argv);
-	umh_info->pid = info->pid;
 }
 
 /**
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 02/16] umh: Move setting PF_UMH into umh_pipe_setup
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 01/16] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 03/16] umh: Rename the user mode driver helpers for clarity Eric W. Biederman
                                                                                         ` (15 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

I am separating the code specific to user mode drivers from the code
for ordinary user space helpers.  Move setting of PF_UMH from
call_usermodehelper_exec_async which is core user mode helper code
into umh_pipe_setup which is user mode driver code.

The code is equally as easy to write in one location as the other and
the movement minimizes the impact of the user mode driver code on the
core of the user mode helper code.

Setting PF_UMH unconditionally is harmless as an action will only
happen if it is paired with an entry on umh_list.

v1: https://lkml.kernel.org/r/87bll6gf8t.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87zh8l63xs.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/umh.c b/kernel/umh.c
index c2a582b3a2bf..e6b9d6636850 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,12 +102,10 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file) {
+	if (sub_info->file)
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
-		if (!retval)
-			current->flags |= PF_UMH;
-	} else
+	else
 		retval = do_execve(getname_kernel(sub_info->path),
 				   (const char __user *const __user *)sub_info->argv,
 				   (const char __user *const __user *)sub_info->envp);
@@ -468,6 +466,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
 	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
 	return 0;
 }
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 03/16] umh: Rename the user mode driver helpers for clarity
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 01/16] umh: Capture the pid in umh_pipe_setup Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 02/16] umh: Move setting PF_UMH into umh_pipe_setup Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 04/16] umh: Remove call_usermodehelper_setup_file Eric W. Biederman
                                                                                         ` (14 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

Now that the functionality of umh_setup_pipe and
umh_clean_and_save_pid has changed their names are too specific and
don't make much sense.  Instead name them  umd_setup and umd_cleanup
for the functional role in setting up user mode drivers.

v1: https://lkml.kernel.org/r/875zbegf82.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87tuyt63x3.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/umh.c b/kernel/umh.c
index e6b9d6636850..26c3d493f168 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -429,7 +429,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
 	return sub_info;
 }
 
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
 	struct file *from_umh[2];
@@ -470,11 +470,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	return 0;
 }
 
-static void umh_clean_and_save_pid(struct subprocess_info *info)
+static void umd_cleanup(struct subprocess_info *info)
 {
 	struct umh_info *umh_info = info->data;
 
-	/* cleanup if umh_pipe_setup() was successful but exec failed */
+	/* cleanup if umh_setup() was successful but exec failed */
 	if (info->retval) {
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
@@ -520,8 +520,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
-						  umh_clean_and_save_pid, info);
+	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
+						  info);
 	if (!sub_info)
 		goto out;
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 04/16] umh: Remove call_usermodehelper_setup_file.
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (2 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 03/16] umh: Rename the user mode driver helpers for clarity Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 05/16] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
                                                                                         ` (13 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

The only caller of call_usermodehelper_setup_file is fork_usermode_blob.
In fork_usermode_blob replace call_usermodehelper_setup_file with
call_usermodehelper_setup and delete call_usermodehelper_setup_file.

For this to work the argv_free is moved from umd_cleanup
to fork_usermode_blob.

v1: https://lkml.kernel.org/r/87zh8qf0mp.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87o8p163u1.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  3 ---
 kernel/umh.c        | 42 +++++++++++-------------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index aae16a0ebd0f..de08af00c68a 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,9 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-			  int (*init)(struct subprocess_info *info, struct cred *new),
-			  void (*cleanup)(struct subprocess_info *), void *data);
 struct umh_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
diff --git a/kernel/umh.c b/kernel/umh.c
index 26c3d493f168..b8fa9b99b366 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -402,33 +402,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-		int (*init)(struct subprocess_info *info, struct cred *new),
-		void (*cleanup)(struct subprocess_info *info), void *data)
-{
-	struct subprocess_info *sub_info;
-	struct umh_info *info = data;
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
-	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
-	if (!sub_info)
-		return NULL;
-
-	sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!sub_info->argv) {
-		kfree(sub_info);
-		return NULL;
-	}
-
-	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-	sub_info->path = "none";
-	sub_info->file = file;
-	sub_info->init = init;
-	sub_info->cleanup = cleanup;
-	sub_info->data = data;
-	return sub_info;
-}
-
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
@@ -479,8 +452,6 @@ static void umd_cleanup(struct subprocess_info *info)
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
-
-	argv_free(info->argv);
 }
 
 /**
@@ -501,7 +472,9 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 {
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
+	char **argv = NULL;
 	struct file *file;
 	ssize_t written;
 	loff_t pos = 0;
@@ -520,11 +493,16 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
-						  info);
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
 
+	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -532,6 +510,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 		mutex_unlock(&umh_list_lock);
 	}
 out:
+	if (argv)
+		argv_free(argv);
 	fput(file);
 	return err;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 05/16] umh: Separate the user mode driver and the user mode helper support
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (3 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 04/16] umh: Remove call_usermodehelper_setup_file Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 06/16] umd: For clarity rename umh_info umd_info Eric W. Biederman
                                                                                         ` (12 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

This makes it clear which code is part of the core user mode
helper support and which code is needed to implement user mode
drivers.

This makes the kernel smaller for everyone who does not use a usermode
driver.

v1: https://lkml.kernel.org/r/87tuyyf0ln.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87imf963s6.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h        |   2 +-
 include/linux/sched.h           |   8 --
 include/linux/umh.h             |  10 ---
 include/linux/usermode_driver.h |  30 +++++++
 kernel/Makefile                 |   1 +
 kernel/exit.c                   |   1 +
 kernel/umh.c                    | 139 ------------------------------
 kernel/usermode_driver.c        | 146 ++++++++++++++++++++++++++++++++
 8 files changed, 179 insertions(+), 158 deletions(-)
 create mode 100644 include/linux/usermode_driver.h
 create mode 100644 kernel/usermode_driver.c

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index d815622cd31e..d6d6206052a6 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -3,7 +3,7 @@
 #define _LINUX_BPFILTER_H
 
 #include <uapi/linux/bpfilter.h>
-#include <linux/umh.h>
+#include <linux/usermode_driver.h>
 
 struct sock;
 int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b62e6aaf28f0..59d1e92bb88e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2020,14 +2020,6 @@ static inline void rseq_execve(struct task_struct *t)
 
 #endif
 
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-
 #ifdef CONFIG_DEBUG_RSEQ
 
 void rseq_syscall(struct pt_regs *regs);
diff --git a/include/linux/umh.h b/include/linux/umh.h
index de08af00c68a..73173c4a07e5 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,16 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct umh_info {
-	const char *cmdline;
-	struct file *pipe_to_umh;
-	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
-	pid_t pid;
-};
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
-
 extern int
 call_usermodehelper_exec(struct subprocess_info *info, int wait);
 
diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
new file mode 100644
index 000000000000..c5f6dc950227
--- /dev/null
+++ b/include/linux/usermode_driver.h
@@ -0,0 +1,30 @@
+#ifndef __LINUX_USERMODE_DRIVER_H__
+#define __LINUX_USERMODE_DRIVER_H__
+
+#include <linux/umh.h>
+
+#ifdef CONFIG_BPFILTER
+void __exit_umh(struct task_struct *tsk);
+
+static inline void exit_umh(struct task_struct *tsk)
+{
+	if (unlikely(tsk->flags & PF_UMH))
+		__exit_umh(tsk);
+}
+#else
+static inline void exit_umh(struct task_struct *tsk)
+{
+}
+#endif
+
+struct umh_info {
+	const char *cmdline;
+	struct file *pipe_to_umh;
+	struct file *pipe_from_umh;
+	struct list_head list;
+	void (*cleanup)(struct umh_info *info);
+	pid_t pid;
+};
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+
+#endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..43928759893a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o
 
+obj-$(CONFIG_BPFILTER) += usermode_driver.o
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..a081deea52ca 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,6 +63,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/usermode_driver.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/umh.c b/kernel/umh.c
index b8fa9b99b366..3e4e453d45c8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -26,8 +26,6 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
 
 #include <trace/events/module.h>
 
@@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
 static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
 static DEFINE_SPINLOCK(umh_sysctl_lock);
 static DECLARE_RWSEM(umhelper_sem);
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
 
 static void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
@@ -402,121 +398,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-static int umd_setup(struct subprocess_info *info, struct cred *new)
-{
-	struct umh_info *umh_info = info->data;
-	struct file *from_umh[2];
-	struct file *to_umh[2];
-	int err;
-
-	/* create pipe to send data to umh */
-	err = create_pipe_files(to_umh, 0);
-	if (err)
-		return err;
-	err = replace_fd(0, to_umh[0], 0);
-	fput(to_umh[0]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		return err;
-	}
-
-	/* create pipe to receive data from umh */
-	err = create_pipe_files(from_umh, 0);
-	if (err) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		return err;
-	}
-	err = replace_fd(1, from_umh[1], 0);
-	fput(from_umh[1]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		fput(from_umh[0]);
-		return err;
-	}
-
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
-	current->flags |= PF_UMH;
-	return 0;
-}
-
-static void umd_cleanup(struct subprocess_info *info)
-{
-	struct umh_info *umh_info = info->data;
-
-	/* cleanup if umh_setup() was successful but exec failed */
-	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
-	}
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-	struct subprocess_info *sub_info;
-	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
-	int err;
-
-	file = shmem_kernel_file_setup("", len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
-	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!argv)
-		goto out;
-
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
-					     umd_setup, umd_cleanup, info);
-	if (!sub_info)
-		goto out;
-
-	sub_info->file = file;
-	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
-out:
-	if (argv)
-		argv_free(argv);
-	fput(file);
-	return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
-
 /**
  * call_usermodehelper_exec - start a usermode application
  * @sub_info: information about the subprocessa
@@ -678,26 +559,6 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umh_info *info;
-	pid_t pid = tsk->pid;
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
-
 struct ctl_table usermodehelper_table[] = {
 	{
 		.procname	= "bset",
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
new file mode 100644
index 000000000000..5b05863af855
--- /dev/null
+++ b/kernel/usermode_driver.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * umd - User mode driver support
+ */
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/usermode_driver.h>
+
+static LIST_HEAD(umh_list);
+static DEFINE_MUTEX(umh_list_lock);
+
+static int umd_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct umh_info *umh_info = info->data;
+	struct file *from_umh[2];
+	struct file *to_umh[2];
+	int err;
+
+	/* create pipe to send data to umh */
+	err = create_pipe_files(to_umh, 0);
+	if (err)
+		return err;
+	err = replace_fd(0, to_umh[0], 0);
+	fput(to_umh[0]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		return err;
+	}
+
+	/* create pipe to receive data from umh */
+	err = create_pipe_files(from_umh, 0);
+	if (err) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		return err;
+	}
+	err = replace_fd(1, from_umh[1], 0);
+	fput(from_umh[1]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		fput(from_umh[0]);
+		return err;
+	}
+
+	umh_info->pipe_to_umh = to_umh[1];
+	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
+	return 0;
+}
+
+static void umd_cleanup(struct subprocess_info *info)
+{
+	struct umh_info *umh_info = info->data;
+
+	/* cleanup if umh_setup() was successful but exec failed */
+	if (info->retval) {
+		fput(umh_info->pipe_to_umh);
+		fput(umh_info->pipe_from_umh);
+	}
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * If info->cmdline is set it will be used as command line for the
+ * user process, else "usermodehelper" is used.
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
+	struct subprocess_info *sub_info;
+	char **argv = NULL;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+	int err;
+
+	file = shmem_kernel_file_setup("", len, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		goto out;
+	}
+
+	err = -ENOMEM;
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
+	if (!sub_info)
+		goto out;
+
+	sub_info->file = file;
+	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+	if (!err) {
+		mutex_lock(&umh_list_lock);
+		list_add(&info->list, &umh_list);
+		mutex_unlock(&umh_list_lock);
+	}
+out:
+	if (argv)
+		argv_free(argv);
+	fput(file);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
+void __exit_umh(struct task_struct *tsk)
+{
+	struct umh_info *info;
+	pid_t pid = tsk->pid;
+
+	mutex_lock(&umh_list_lock);
+	list_for_each_entry(info, &umh_list, list) {
+		if (info->pid == pid) {
+			list_del(&info->list);
+			mutex_unlock(&umh_list_lock);
+			goto out;
+		}
+	}
+	mutex_unlock(&umh_list_lock);
+	return;
+out:
+	if (info->cleanup)
+		info->cleanup(info);
+}
+
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 06/16] umd: For clarity rename umh_info umd_info
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (4 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 05/16] umh: Separate the user mode driver and the user mode helper support Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 07/16] umd: Rename umd_info.cmdline umd_info.driver_name Eric W. Biederman
                                                                                         ` (11 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

This structure is only used for user mode drivers so change
the prefix from umh to umd to make that clear.

v1: https://lkml.kernel.org/r/87o8p6f0kw.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/878sg563po.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h        |  2 +-
 include/linux/usermode_driver.h |  6 +++---
 kernel/usermode_driver.c        | 20 ++++++++++----------
 net/ipv4/bpfilter/sockopt.c     |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index d6d6206052a6..ec9972d822e0 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -11,7 +11,7 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 			    int __user *optlen);
 struct bpfilter_umh_ops {
-	struct umh_info info;
+	struct umd_info info;
 	/* since ip_getsockopt() can run in parallel, serialize access to umh */
 	struct mutex lock;
 	int (*sockopt)(struct sock *sk, int optname,
diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index c5f6dc950227..7131ea611bab 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -17,14 +17,14 @@ static inline void exit_umh(struct task_struct *tsk)
 }
 #endif
 
-struct umh_info {
+struct umd_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
+	void (*cleanup)(struct umd_info *info);
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
 
 #endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 5b05863af855..e73550e946d6 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -11,7 +11,7 @@ static DEFINE_MUTEX(umh_list_lock);
 
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 	struct file *from_umh[2];
 	struct file *to_umh[2];
 	int err;
@@ -43,21 +43,21 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
+	umd_info->pipe_to_umh = to_umh[1];
+	umd_info->pipe_from_umh = from_umh[0];
+	umd_info->pid = task_pid_nr(current);
 	current->flags |= PF_UMH;
 	return 0;
 }
 
 static void umd_cleanup(struct subprocess_info *info)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 
 	/* cleanup if umh_setup() was successful but exec failed */
 	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
+		fput(umd_info->pipe_to_umh);
+		fput(umd_info->pipe_from_umh);
 	}
 }
 
@@ -72,12 +72,12 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
+ * case 'struct umd_info *info' is populated with two pipes
  * and a pid of the process. The caller is responsible for health
  * check of the user process, killing it via pid, and closing the
  * pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
 	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
@@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
 
 void __exit_umh(struct task_struct *tsk)
 {
-	struct umh_info *info;
+	struct umd_info *info;
 	pid_t pid = tsk->pid;
 
 	mutex_lock(&umh_list_lock);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 0480918bfc7c..c0dbcc86fcdb 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -12,7 +12,7 @@
 struct bpfilter_umh_ops bpfilter_ops;
 EXPORT_SYMBOL_GPL(bpfilter_ops);
 
-static void bpfilter_umh_cleanup(struct umh_info *info)
+static void bpfilter_umh_cleanup(struct umd_info *info)
 {
 	mutex_lock(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 07/16] umd: Rename umd_info.cmdline umd_info.driver_name
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (5 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 06/16] umd: For clarity rename umh_info umd_info Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 08/16] umd: Transform fork_usermode_blob into fork_usermode_driver Eric W. Biederman
                                                                                         ` (10 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

The only thing supplied in the cmdline today is the driver name so
rename the field to clarify the code.

As this value is always supplied stop trying to handle the case of
a NULL cmdline.

Additionally since we now have a name we can count on use the
driver_name any place where the code is looking for a name
of the binary.

v1: https://lkml.kernel.org/r/87imfef0k3.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87366d63os.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/usermode_driver.h |  2 +-
 kernel/usermode_driver.c        | 11 ++++-------
 net/ipv4/bpfilter/sockopt.c     |  2 +-
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 7131ea611bab..48cf25e3145d 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -18,7 +18,7 @@ static inline void exit_umh(struct task_struct *tsk)
 #endif
 
 struct umd_info {
-	const char *cmdline;
+	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index e73550e946d6..46d60d855e93 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -67,9 +67,6 @@ static void umd_cleanup(struct subprocess_info *info)
  * @len: length of the blob
  * @info: information about usermode process (shouldn't be NULL)
  *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
  * case 'struct umd_info *info' is populated with two pipes
@@ -79,7 +76,6 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
 	struct file *file;
@@ -87,7 +83,7 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup("", len, 0);
+	file = shmem_kernel_file_setup(info->driver_name, len, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -100,11 +96,12 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	}
 
 	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
 		goto out;
 
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+	sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL,
+					     GFP_KERNEL,
 					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index c0dbcc86fcdb..5050de28333d 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -70,7 +70,7 @@ static int __init bpfilter_sockopt_init(void)
 {
 	mutex_init(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-	bpfilter_ops.info.cmdline = "bpfilter_umh";
+	bpfilter_ops.info.driver_name = "bpfilter_umh";
 	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
 
 	return 0;
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 08/16] umd: Transform fork_usermode_blob into fork_usermode_driver
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (6 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 07/16] umd: Rename umd_info.cmdline umd_info.driver_name Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 09/16] umh: Stop calling do_execve_file Eric W. Biederman
                                                                                         ` (9 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

Instead of loading a binary blob into a temporary file with
shmem_kernel_file_setup, load a binary blob into a temporary tmpfs
filesystem.  This means that the blob can be stored in an init section
and discarded, and it means the binary blob will have a filename so can
be executed normally.

The only tricky thing about this code is that in the helper function
blob_to_mnt __fput_sync is used.  That is because a file can not be
executed if it is still open for write, and the ordinary delayed close
for kernel threads does not happen soon enough, which causes the
following exec to fail.  The function umd_load_blob is not called with
any locks so this should be safe.

Executing the blob normally winds up correcting several problems with
the user mode driver code discovered by Tetsuo Handa[1].  By passing
an ordinary filename into the exec, it is no longer necessary to
figure out how to turn a O_RDWR file descriptor into a properly
reference counted O_EXEC file descriptor that forbids all writes.  For
path based LSMs there are no new special cases.

[1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
v1: https://lkml.kernel.org/r/87d05mf0j9.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87wo3p4p35.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/usermode_driver.h |   6 +-
 kernel/usermode_driver.c        | 126 ++++++++++++++++++++++++--------
 net/bpfilter/bpfilter_kern.c    |  14 +++-
 3 files changed, 113 insertions(+), 33 deletions(-)

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 48cf25e3145d..97c919b7147c 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -2,6 +2,7 @@
 #define __LINUX_USERMODE_DRIVER_H__
 
 #include <linux/umh.h>
+#include <linux/path.h>
 
 #ifdef CONFIG_BPFILTER
 void __exit_umh(struct task_struct *tsk);
@@ -23,8 +24,11 @@ struct umd_info {
 	struct file *pipe_from_umh;
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
+	struct path wd;
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
+int umd_load_blob(struct umd_info *info, const void *data, size_t len);
+int umd_unload_blob(struct umd_info *info);
+int fork_usermode_driver(struct umd_info *info);
 
 #endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 46d60d855e93..a86798759f83 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -4,11 +4,98 @@
  */
 #include <linux/shmem_fs.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
 #include <linux/usermode_driver.h>
 
 static LIST_HEAD(umh_list);
 static DEFINE_MUTEX(umh_list_lock);
 
+static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
+{
+	struct file_system_type *type;
+	struct vfsmount *mnt;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+
+	type = get_fs_type("tmpfs");
+	if (!type)
+		return ERR_PTR(-ENODEV);
+
+	mnt = kern_mount(type);
+	put_filesystem(type);
+	if (IS_ERR(mnt))
+		return mnt;
+
+	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+	if (IS_ERR(file)) {
+		mntput(mnt);
+		return ERR_CAST(file);
+	}
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		int err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		filp_close(file, NULL);
+		mntput(mnt);
+		return ERR_PTR(err);
+	}
+
+	fput(file);
+
+	/* Flush delayed fput so exec can open the file read-only */
+	flush_delayed_fput();
+	task_work_run();
+	return mnt;
+}
+
+/**
+ * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
+ * @info: information about usermode driver
+ * @data: a blob of bytes that can be executed as a file
+ * @len:  The length of the blob
+ *
+ */
+int umd_load_blob(struct umd_info *info, const void *data, size_t len)
+{
+	struct vfsmount *mnt;
+
+	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+		return -EBUSY;
+
+	mnt = blob_to_mnt(data, len, info->driver_name);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	info->wd.mnt = mnt;
+	info->wd.dentry = mnt->mnt_root;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_load_blob);
+
+/**
+ * umd_unload_blob - Disassociate @info from a previously loaded blob
+ * @info: information about usermode driver
+ *
+ */
+int umd_unload_blob(struct umd_info *info)
+{
+	if (WARN_ON_ONCE(!info->wd.mnt ||
+			 !info->wd.dentry ||
+			 info->wd.mnt->mnt_root != info->wd.dentry))
+		return -EINVAL;
+
+	kern_unmount(info->wd.mnt);
+	info->wd.mnt = NULL;
+	info->wd.dentry = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_unload_blob);
+
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umd_info *umd_info = info->data;
@@ -43,6 +130,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
+	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->pid = task_pid_nr(current);
@@ -62,39 +150,21 @@ static void umd_cleanup(struct subprocess_info *info)
 }
 
 /**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
+ * fork_usermode_driver - fork a usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
  *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umd_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
+ * Returns either negative error or zero which indicates success in
+ * executing a usermode driver. In such case 'struct umd_info *info'
+ * is populated with two pipes and a pid of the process. The caller is
+ * responsible for health check of the user process, killing it via
+ * pid, and closing the pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
+int fork_usermode_driver(struct umd_info *info)
 {
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup(info->driver_name, len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -106,7 +176,6 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	if (!sub_info)
 		goto out;
 
-	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -116,10 +185,9 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 out:
 	if (argv)
 		argv_free(argv);
-	fput(file);
 	return err;
 }
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
+EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
 void __exit_umh(struct task_struct *tsk)
 {
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index c0f0990f30b6..28883b00609d 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -77,9 +77,7 @@ static int start_umh(void)
 	int err;
 
 	/* fork usermode process */
-	err = fork_usermode_blob(&bpfilter_umh_start,
-				 &bpfilter_umh_end - &bpfilter_umh_start,
-				 &bpfilter_ops.info);
+	err = fork_usermode_driver(&bpfilter_ops.info);
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
@@ -98,6 +96,12 @@ static int __init load_umh(void)
 {
 	int err;
 
+	err = umd_load_blob(&bpfilter_ops.info,
+			    &bpfilter_umh_start,
+			    &bpfilter_umh_end - &bpfilter_umh_start);
+	if (err)
+		return err;
+
 	mutex_lock(&bpfilter_ops.lock);
 	if (!bpfilter_ops.stop) {
 		err = -EFAULT;
@@ -110,6 +114,8 @@ static int __init load_umh(void)
 	}
 out:
 	mutex_unlock(&bpfilter_ops.lock);
+	if (err)
+		umd_unload_blob(&bpfilter_ops.info);
 	return err;
 }
 
@@ -122,6 +128,8 @@ static void __exit fini_umh(void)
 		bpfilter_ops.sockopt = NULL;
 	}
 	mutex_unlock(&bpfilter_ops.lock);
+
+	umd_unload_blob(&bpfilter_ops.info);
 }
 module_init(load_umh);
 module_exit(fini_umh);
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 09/16] umh: Stop calling do_execve_file
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (7 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 08/16] umd: Transform fork_usermode_blob into fork_usermode_driver Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 10/16] exec: Remove do_execve_file Eric W. Biederman
                                                                                         ` (8 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

With the user mode driver code changed to not set subprocess_info.file
there are no more users of subprocess_info.file.  Remove this field
from struct subprocess_info and remove the only user in
call_usermodehelper_exec_async that would call do_execve_file instead
of do_execve if file was set.

v1: https://lkml.kernel.org/r/877dvuf0i7.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87r1tx4p2a.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  1 -
 kernel/umh.c        | 10 +++-------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 73173c4a07e5..244aff638220 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -22,7 +22,6 @@ struct subprocess_info {
 	const char *path;
 	char **argv;
 	char **envp;
-	struct file *file;
 	int wait;
 	int retval;
 	int (*init)(struct subprocess_info *info, struct cred *new);
diff --git a/kernel/umh.c b/kernel/umh.c
index 3e4e453d45c8..6ca2096298b9 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -98,13 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file)
-		retval = do_execve_file(sub_info->file,
-					sub_info->argv, sub_info->envp);
-	else
-		retval = do_execve(getname_kernel(sub_info->path),
-				   (const char __user *const __user *)sub_info->argv,
-				   (const char __user *const __user *)sub_info->envp);
+	retval = do_execve(getname_kernel(sub_info->path),
+			   (const char __user *const __user *)sub_info->argv,
+			   (const char __user *const __user *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 10/16] exec: Remove do_execve_file
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (8 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 09/16] umh: Stop calling do_execve_file Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-08  6:35                                                                         ` Luis Chamberlain
  2020-07-12 21:02                                                                         ` Pavel Machek
  2020-07-02 16:41                                                                       ` [PATCH v3 11/16] bpfilter: Move bpfilter_umh back into init data Eric W. Biederman
                                                                                         ` (7 subsequent siblings)
  17 siblings, 2 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Tetsuo Handa,
	Greg Kroah-Hartman

Now that the last caller has been removed, remove this code from exec.

For anyone thinking of resurrecting do_execve_file please note that
the code was buggy in several fundamental ways.

- It did not ensure the file it was passed was read-only and that
  deny_write_access had been called on it.  Which subtly breaks
  invariants in exec.

- The caller of do_execve_file was expected to hold and put a
  reference to the file, but an extra reference for use by exec was
  not taken so that when exec put its reference to the file an
  underflow occurred on the file reference count.

- The point of the interface was so that a pathname did not need to
  exist.  Which breaks pathname based LSMs.

Tetsuo Handa originally reported these issues[1].  While it was clear
that deny_write_access was missing the fundamental incompatibility
with the passed in O_RDWR filehandle was not immediately recognized.

All of these issues were fixed by modifying the usermode driver code
to have a path, so it did not need this hack.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
v1: https://lkml.kernel.org/r/871rm2f0hi.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87lfk54p0m.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c               | 38 +++++++++-----------------------------
 include/linux/binfmts.h |  1 -
 2 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..23dfbb820626 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1818,13 +1818,14 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int __do_execve_file(int fd, struct filename *filename,
-			    struct user_arg_ptr argv,
-			    struct user_arg_ptr envp,
-			    int flags, struct file *file)
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
 {
 	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
+	struct file *file;
 	struct files_struct *displaced;
 	int retval;
 
@@ -1863,8 +1864,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;
 
-	if (!file)
-		file = do_open_execat(fd, filename, flags);
+	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1872,9 +1872,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	sched_exec();
 
 	bprm->file = file;
-	if (!filename) {
-		bprm->filename = "none";
-	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
+	if (fd == AT_FDCWD || filename->name[0] == '/') {
 		bprm->filename = filename->name;
 	} else {
 		if (filename->name[0] == '\0')
@@ -1935,8 +1933,7 @@ static int __do_execve_file(int fd, struct filename *filename,
 	task_numa_free(current, false);
 	free_bprm(bprm);
 	kfree(pathbuf);
-	if (filename)
-		putname(filename);
+	putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
 	return retval;
@@ -1967,27 +1964,10 @@ static int __do_execve_file(int fd, struct filename *filename,
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
-	if (filename)
-		putname(filename);
+	putname(filename);
 	return retval;
 }
 
-static int do_execveat_common(int fd, struct filename *filename,
-			      struct user_arg_ptr argv,
-			      struct user_arg_ptr envp,
-			      int flags)
-{
-	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
-}
-
-int do_execve_file(struct file *file, void *__argv, void *__envp)
-{
-	struct user_arg_ptr argv = { .ptr.native = __argv };
-	struct user_arg_ptr envp = { .ptr.native = __envp };
-
-	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
-}
-
 int do_execve(struct filename *filename,
 	const char __user *const __user *__argv,
 	const char __user *const __user *__envp)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 4a20b7517dd0..7c27d7b57871 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -141,6 +141,5 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
-int do_execve_file(struct file *file, void *__argv, void *__envp);
 
 #endif /* _LINUX_BINFMTS_H */
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 11/16] bpfilter: Move bpfilter_umh back into init data
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (9 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 10/16] exec: Remove do_execve_file Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 12/16] umd: Track user space drivers with struct pid Eric W. Biederman
                                                                                         ` (6 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

To allow for restarts 61fbf5933d42 ("net: bpfilter: restart
bpfilter_umh when error occurred") moved the blob holding the
userspace binary out of the init sections.

Now that loading the blob into a filesystem is separate from executing
the blob, the blob no longer needs to live in .rodata to allow for restarting.
So move the blob back to .init.rodata.

v1: https://lkml.kernel.org/r/87sgeidlvq.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87ftad4ozc.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 net/bpfilter/bpfilter_umh_blob.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
index 9ea6100dca87..40311d10d2f2 100644
--- a/net/bpfilter/bpfilter_umh_blob.S
+++ b/net/bpfilter/bpfilter_umh_blob.S
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-	.section .rodata, "a"
+	.section .init.rodata, "a"
 	.global bpfilter_umh_start
 bpfilter_umh_start:
 	.incbin "net/bpfilter/bpfilter_umh"
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 12/16] umd: Track user space drivers with struct pid
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (10 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 11/16] bpfilter: Move bpfilter_umh back into init data Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll Eric W. Biederman
                                                                                         ` (5 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

Use struct pid instead of user space pid values that are prone to wrap
around.

In addition track the entire thread group instead of just the first
thread that is started by exec.  There are no multi-threaded user mode
drivers today but there is nothing precluding user drivers from being
multi-threaded, so it is just a good idea to track the entire process.

Take a reference count on the tgid's in question to make it possible
to remove exit_umh in a future change.

As a struct pid is available directly use kill_pid_info.

The prior process signalling code was iffy in using a userspace pid
known to be in the initial pid namespace and then looking up its task
in whatever the current pid namespace is.  It worked only because
kernel threads always run in the initial pid namespace.

As the tgid is now refcounted verify the tgid is NULL at the start of
fork_usermode_driver to avoid the possibility of silent pid leaks.

v1: https://lkml.kernel.org/r/87mu4qdlv2.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/a70l4oy8.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/usermode_driver.h |  2 +-
 kernel/exit.c                   |  3 ++-
 kernel/usermode_driver.c        | 15 ++++++++++-----
 net/bpfilter/bpfilter_kern.c    | 13 +++++--------
 net/ipv4/bpfilter/sockopt.c     |  3 ++-
 5 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 97c919b7147c..45adbffb31d9 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -25,7 +25,7 @@ struct umd_info {
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
 	struct path wd;
-	pid_t pid;
+	struct pid *tgid;
 };
 int umd_load_blob(struct umd_info *info, const void *data, size_t len);
 int umd_unload_blob(struct umd_info *info);
diff --git a/kernel/exit.c b/kernel/exit.c
index a081deea52ca..d3294b611df1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -805,7 +805,8 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	exit_umh(tsk);
+	if (group_dead)
+		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index a86798759f83..f77f8d7ce9e3 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -133,7 +133,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
-	umd_info->pid = task_pid_nr(current);
+	umd_info->tgid = get_pid(task_tgid(current));
 	current->flags |= PF_UMH;
 	return 0;
 }
@@ -146,6 +146,8 @@ static void umd_cleanup(struct subprocess_info *info)
 	if (info->retval) {
 		fput(umd_info->pipe_to_umh);
 		fput(umd_info->pipe_from_umh);
+		put_pid(umd_info->tgid);
+		umd_info->tgid = NULL;
 	}
 }
 
@@ -155,9 +157,9 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success in
  * executing a usermode driver. In such case 'struct umd_info *info'
- * is populated with two pipes and a pid of the process. The caller is
+ * is populated with two pipes and a tgid of the process. The caller is
  * responsible for health check of the user process, killing it via
- * pid, and closing the pipes when user process is no longer needed.
+ * tgid, and closing the pipes when user process is no longer needed.
  */
 int fork_usermode_driver(struct umd_info *info)
 {
@@ -165,6 +167,9 @@ int fork_usermode_driver(struct umd_info *info)
 	char **argv = NULL;
 	int err;
 
+	if (WARN_ON_ONCE(info->tgid))
+		return -EBUSY;
+
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -192,11 +197,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_driver);
 void __exit_umh(struct task_struct *tsk)
 {
 	struct umd_info *info;
-	pid_t pid = tsk->pid;
+	struct pid *tgid = task_tgid(tsk);
 
 	mutex_lock(&umh_list_lock);
 	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
+		if (info->tgid == tgid) {
 			list_del(&info->list);
 			mutex_unlock(&umh_list_lock);
 			goto out;
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 28883b00609d..08ea77c2b137 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -15,16 +15,13 @@ extern char bpfilter_umh_end;
 
 static void shutdown_umh(void)
 {
-	struct task_struct *tsk;
+	struct umd_info *info = &bpfilter_ops.info;
+	struct pid *tgid = info->tgid;
 
 	if (bpfilter_ops.stop)
 		return;
 
-	tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID);
-	if (tsk) {
-		send_sig(SIGKILL, tsk, 1);
-		put_task_struct(tsk);
-	}
+	kill_pid(tgid, SIGKILL, 1);
 }
 
 static void __stop_umh(void)
@@ -48,7 +45,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname,
 	req.cmd = optname;
 	req.addr = (long __force __user)optval;
 	req.len = optlen;
-	if (!bpfilter_ops.info.pid)
+	if (!bpfilter_ops.info.tgid)
 		goto out;
 	n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req),
 			   &pos);
@@ -81,7 +78,7 @@ static int start_umh(void)
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
-	pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid);
+	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
 
 	/* health check that usermode process started correctly */
 	if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 5050de28333d..56cbc43145f6 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -18,7 +18,8 @@ static void bpfilter_umh_cleanup(struct umd_info *info)
 	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
-	info->pid = 0;
+	put_pid(info->tgid);
+	info->tgid = NULL;
 	mutex_unlock(&bpfilter_ops.lock);
 }
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (11 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 12/16] umd: Track user space drivers with struct pid Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-03 20:30                                                                         ` Alexei Starovoitov
  2020-07-04 16:00                                                                         ` Christian Brauner
  2020-07-02 16:41                                                                       ` [PATCH v3 14/16] bpfilter: Take advantage of the facilities of struct pid Eric W. Biederman
                                                                                         ` (4 subsequent siblings)
  17 siblings, 2 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman

Create an independent helper thread_group_exited that returns true
when all threads have passed exit_notify in do_exit.  AKA all of the
threads are at least zombies and might be dead or completely gone.

Create this helper by taking the logic out of pidfd_poll where
it is already tested, and adding a missing READ_ONCE on
the read of task->exit_state.

I will be changing the user mode driver code to use this same logic
to know when a user mode driver needs to be restarted.

Place the new helper thread_group_exited in kernel/exit.c and
EXPORT it so it can be used by modules.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h |  2 ++
 kernel/exit.c                | 24 ++++++++++++++++++++++++
 kernel/fork.c                |  6 +-----
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0ee5e696c5d8..1bad18a1d8ba 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
+extern bool thread_group_exited(struct pid *pid);
+
 extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
 							unsigned long *flags);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index d3294b611df1..a7f112feb0f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 }
 #endif
 
+/**
+ * thread_group_exited - check that a thread group has exited
+ * @pid: tgid of thread group to be checked.
+ *
+ * Test if the thread group has exited (all threads are zombies, dead
+ * or completely gone).
+ *
+ * Return: true if the thread group has exited. false otherwise.
+ */
+bool thread_group_exited(struct pid *pid)
+{
+	struct task_struct *task;
+	bool exited;
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	exited = !task ||
+		(READ_ONCE(task->exit_state) && thread_group_empty(task));
+	rcu_read_unlock();
+
+	return exited;
+}
+EXPORT_SYMBOL(thread_group_exited);
+
 __weak void abort(void)
 {
 	BUG();
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..bf215af7a904 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1787,22 +1787,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
-	struct task_struct *task;
 	struct pid *pid = file->private_data;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
 
-	rcu_read_lock();
-	task = pid_task(pid, PIDTYPE_PID);
 	/*
 	 * Inform pollers only when the whole thread group exits.
 	 * If the thread group leader exits before all other threads in the
 	 * group, then poll(2) should block, similar to the wait(2) family.
 	 */
-	if (!task || (task->exit_state && thread_group_empty(task)))
+	if (thread_group_exited(pid))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
-	rcu_read_unlock();
 
 	return poll_flags;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 14/16] bpfilter: Take advantage of the facilities of struct pid
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (12 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 15/16] umd: Remove exit_umh Eric W. Biederman
                                                                                         ` (3 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

Instead of relying on the exit_umh cleanup callback use the fact a
struct pid can be tested to see if a process still exists, and that
struct pid has a wait queue that notifies when the process dies.

v1: https://lkml.kernel.org/r/87h7uydlu9.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/874kqt4owu.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h     |  3 ++-
 net/bpfilter/bpfilter_kern.c | 15 +++++----------
 net/ipv4/bpfilter/sockopt.c  | 15 ++++++++-------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index ec9972d822e0..9b114c718a76 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -10,6 +10,8 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
 			    unsigned int optlen);
 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 			    int __user *optlen);
+void bpfilter_umh_cleanup(struct umd_info *info);
+
 struct bpfilter_umh_ops {
 	struct umd_info info;
 	/* since ip_getsockopt() can run in parallel, serialize access to umh */
@@ -18,7 +20,6 @@ struct bpfilter_umh_ops {
 		       char __user *optval,
 		       unsigned int optlen, bool is_set);
 	int (*start)(void);
-	bool stop;
 };
 extern struct bpfilter_umh_ops bpfilter_ops;
 #endif
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 08ea77c2b137..9616fb7defeb 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -18,10 +18,11 @@ static void shutdown_umh(void)
 	struct umd_info *info = &bpfilter_ops.info;
 	struct pid *tgid = info->tgid;
 
-	if (bpfilter_ops.stop)
-		return;
-
-	kill_pid(tgid, SIGKILL, 1);
+	if (tgid) {
+		kill_pid(tgid, SIGKILL, 1);
+		wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+		bpfilter_umh_cleanup(info);
+	}
 }
 
 static void __stop_umh(void)
@@ -77,7 +78,6 @@ static int start_umh(void)
 	err = fork_usermode_driver(&bpfilter_ops.info);
 	if (err)
 		return err;
-	bpfilter_ops.stop = false;
 	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
 
 	/* health check that usermode process started correctly */
@@ -100,16 +100,11 @@ static int __init load_umh(void)
 		return err;
 
 	mutex_lock(&bpfilter_ops.lock);
-	if (!bpfilter_ops.stop) {
-		err = -EFAULT;
-		goto out;
-	}
 	err = start_umh();
 	if (!err && IS_ENABLED(CONFIG_INET)) {
 		bpfilter_ops.sockopt = &__bpfilter_process_sockopt;
 		bpfilter_ops.start = &start_umh;
 	}
-out:
 	mutex_unlock(&bpfilter_ops.lock);
 	if (err)
 		umd_unload_blob(&bpfilter_ops.info);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 56cbc43145f6..9063c6767d34 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -12,16 +12,14 @@
 struct bpfilter_umh_ops bpfilter_ops;
 EXPORT_SYMBOL_GPL(bpfilter_ops);
 
-static void bpfilter_umh_cleanup(struct umd_info *info)
+void bpfilter_umh_cleanup(struct umd_info *info)
 {
-	mutex_lock(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
 	put_pid(info->tgid);
 	info->tgid = NULL;
-	mutex_unlock(&bpfilter_ops.lock);
 }
+EXPORT_SYMBOL_GPL(bpfilter_umh_cleanup);
 
 static int bpfilter_mbox_request(struct sock *sk, int optname,
 				 char __user *optval,
@@ -39,7 +37,11 @@ static int bpfilter_mbox_request(struct sock *sk, int optname,
 			goto out;
 		}
 	}
-	if (bpfilter_ops.stop) {
+	if (bpfilter_ops.info.tgid &&
+	    thread_group_exited(bpfilter_ops.info.tgid))
+		bpfilter_umh_cleanup(&bpfilter_ops.info);
+
+	if (!bpfilter_ops.info.tgid) {
 		err = bpfilter_ops.start();
 		if (err)
 			goto out;
@@ -70,9 +72,8 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 static int __init bpfilter_sockopt_init(void)
 {
 	mutex_init(&bpfilter_ops.lock);
-	bpfilter_ops.stop = true;
+	bpfilter_ops.info.tgid = NULL;
 	bpfilter_ops.info.driver_name = "bpfilter_umh";
-	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
 
 	return 0;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 15/16] umd: Remove exit_umh
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (13 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 14/16] bpfilter: Take advantage of the facilities of struct pid Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 16:41                                                                       ` [PATCH v3 16/16] umd: Stop using split_argv Eric W. Biederman
                                                                                         ` (2 subsequent siblings)
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman, Greg Kroah-Hartman

The bpfilter code no longer uses the umd_info.cleanup callback.  This
callback is what exit_umh exists to call.  So remove exit_umh and all
of its associated bookkeeping.

v1: https://lkml.kernel.org/r/87bll6dlte.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87y2o53abg.fsf_-_@x220.int.ebiederm.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched.h           |  1 -
 include/linux/usermode_driver.h | 16 ----------------
 kernel/exit.c                   |  3 ---
 kernel/usermode_driver.c        | 28 ----------------------------
 4 files changed, 48 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 59d1e92bb88e..edb2020875ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1511,7 +1511,6 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
-#define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 45adbffb31d9..073a9e0ec07d 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -4,26 +4,10 @@
 #include <linux/umh.h>
 #include <linux/path.h>
 
-#ifdef CONFIG_BPFILTER
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-#else
-static inline void exit_umh(struct task_struct *tsk)
-{
-}
-#endif
-
 struct umd_info {
 	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umd_info *info);
 	struct path wd;
 	struct pid *tgid;
 };
diff --git a/kernel/exit.c b/kernel/exit.c
index a7f112feb0f6..4ec82859bfe5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,7 +63,6 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
-#include <linux/usermode_driver.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -805,8 +804,6 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	if (group_dead)
-		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index f77f8d7ce9e3..cd136f86f799 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -9,9 +9,6 @@
 #include <linux/task_work.h>
 #include <linux/usermode_driver.h>
 
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
-
 static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
 {
 	struct file_system_type *type;
@@ -134,7 +131,6 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->tgid = get_pid(task_tgid(current));
-	current->flags |= PF_UMH;
 	return 0;
 }
 
@@ -182,11 +178,6 @@ int fork_usermode_driver(struct umd_info *info)
 		goto out;
 
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
 out:
 	if (argv)
 		argv_free(argv);
@@ -194,23 +185,4 @@ int fork_usermode_driver(struct umd_info *info)
 }
 EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umd_info *info;
-	struct pid *tgid = task_tgid(tsk);
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->tgid == tgid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
 
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* [PATCH v3 16/16] umd: Stop using split_argv
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (14 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 15/16] umd: Remove exit_umh Eric W. Biederman
@ 2020-07-02 16:41                                                                       ` Eric W. Biederman
  2020-07-02 23:51                                                                       ` [PATCH v3 00/16] Make the user mode driver code a better citizen Tetsuo Handa
  2020-07-09 22:05                                                                       ` [merged][PATCH " Eric W. Biederman
  17 siblings, 0 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-02 16:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner, Eric W. Biederman

There is exactly one argument so there is nothing to split.  All
argv_split does now is cause confusion and avoid the need for a cast
when passing a "const char *" string to call_usermodehelper_setup.

So avoid confusion and the possibility of an odd driver name causing
problems by just using a fixed argv array with a cast in the call to
call_usermodehelper_setup.

v1: https://lkml.kernel.org/r/87sged3a9n.fsf_-_@x220.int.ebiederm.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/usermode_driver.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index cd136f86f799..0b35212ffc3d 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -160,27 +160,21 @@ static void umd_cleanup(struct subprocess_info *info)
 int fork_usermode_driver(struct umd_info *info)
 {
 	struct subprocess_info *sub_info;
-	char **argv = NULL;
+	const char *argv[] = { info->driver_name, NULL };
 	int err;
 
 	if (WARN_ON_ONCE(info->tgid))
 		return -EBUSY;
 
 	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
-	if (!argv)
-		goto out;
-
-	sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL,
-					     GFP_KERNEL,
+	sub_info = call_usermodehelper_setup(info->driver_name,
+					     (char **)argv, NULL, GFP_KERNEL,
 					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
 
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 out:
-	if (argv)
-		argv_free(argv);
 	return err;
 }
 EXPORT_SYMBOL_GPL(fork_usermode_driver);
-- 
2.25.0


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 00/16] Make the user mode driver code a better citizen
  2020-07-02 16:40                                                                     ` [PATCH v3 00/16] " Eric W. Biederman
                                                                                         ` (15 preceding siblings ...)
  2020-07-02 16:41                                                                       ` [PATCH v3 16/16] umd: Stop using split_argv Eric W. Biederman
@ 2020-07-02 23:51                                                                       ` Tetsuo Handa
  2020-07-09 22:05                                                                       ` [merged][PATCH " Eric W. Biederman
  17 siblings, 0 replies; 194+ messages in thread
From: Tetsuo Handa @ 2020-07-02 23:51 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds,
	Christian Brauner

On 2020/07/03 1:40, Eric W. Biederman wrote:
> 
> This is the third round of my changeset to split the user mode driver
> code from the user mode helper code, and to make the code use common
> facilities to get things done instead of recreating them just
> for the user mode driver code.

I won't test this version, for you are ignoring my comments.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-07-02 16:02                                                                               ` Eric W. Biederman
@ 2020-07-03 13:19                                                                                 ` Tetsuo Handa
  2020-07-03 22:25                                                                                   ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-07-03 13:19 UTC (permalink / raw)
  To: Eric W. Biederman, Al Viro, Casey Schaufler
  Cc: Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Luis Chamberlain, Linus Torvalds

On 2020/07/02 22:08, Eric W. Biederman wrote:
>> By the way, commit 4a9d4b024a3102fc ("switch fput to task_work_add") says
>> that use of flush_delayed_fput() has to be careful. Al, is it safe to call
>> flush_delayed_fput() from blob_to_mnt() from umd_load_blob() (which might be
>> called from both kernel thread and from process context (e.g. init_module()
>> syscall by /sbin/insmod )) ?
> 
> And __fput_sync needs to be even more careful.
> umd_load_blob is called in these changes without any locks held.

But where is the guarantee that a thread which called flush_delayed_fput() waits for
the completion of processing _all_ "struct file" linked into delayed_fput_list ?
If some other thread or delayed_fput_work (scheduled by fput_many()) called
flush_delayed_fput() between blob_to_mnt()'s fput(file) and flush_delayed_fput()
sequence? blob_to_mnt()'s flush_delayed_fput() can miss the "struct file" which
needs to be processed before execve(), can't it?

Also, I don't know how convoluted the dependency of all "struct file" linked into
delayed_fput_list might be, for there can be "struct file" which will not be a
simple close of tmpfs file created by blob_to_mnt()'s file_open_root() request.

On the other hand, although __fput_sync() cannot be called from !PF_KTHREAD threads,
there is a guarantee that __fput_sync() waits for the completion of "struct file"
which needs to be flushed before execve(), isn't there?

> 
> We fundamentally AKA in any correct version of this code need to flush
> the file descriptor before we call exec, or exec cannot open it
> read-only, denying all writes from any other opens.
> 
> The use case of flush_delayed_fput is exactly the same as that used
> when loading the initramfs.

When loading the initramfs, the number of threads is quite few (which
means that the possibility of hitting the race window and convoluted
dependency is small).

But like EXPORT_SYMBOL_GPL(umd_load_blob) indicates, blob_to_mnt()'s
flush_delayed_fput() might be called after a large number of threads have
already started running.



On 2020/07/03 1:02, Eric W. Biederman wrote:
>>>> On 2020/06/30 21:29, Eric W. Biederman wrote:
>>>>> Hmm.  The wake up of tgid->wait_pidfd happens just before
>>>>> release_task is called so there is a race.  As it is possible to wake
>>>>> up and then go back to sleep before pid_has_task becomes false.
>>>>
>>>> What is the reason we want to wait until pid_has_task() becomes false?
>>>>
>>>> - wait_event(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID));
>>>> + while (!wait_event_timeout(tgid->wait_pidfd, !pid_has_task(tgid, PIDTYPE_TGID), 1));
>>>
>>> So that it is safe to call bpfilter_umh_cleanup.  The previous code
>>> performed the wait by having a callback in do_exit.
>>
>> But bpfilter_umh_cleanup() does only
>>
>> 	fput(info->pipe_to_umh);
>> 	fput(info->pipe_from_umh);
>> 	put_pid(info->tgid);
>> 	info->tgid = NULL;
>>
>> which is (I think) already safe regardless of the usermode process because
>> bpfilter_umh_cleanup() merely closes one side of two pipes used between
>> two processes and forgets about the usermode process.
> 
> It is not safe.
> 
> Barring bugs there is only one use of shutdown_umh that matters.  The one
> in fini_umh.  The use of the file by the mm must be finished before
> umd_unload_blob.  AKA unmount.  Which completely frees the filesystem.

Do we really need to mount upon umd_load_blob() and unmount upon umd_unload_blob() ?
LSM modules might prefer only one instance of filesystem for umd blobs.

For pathname based LSMs, since that filesystem is not visible from mount tree, only
info->driver_name can be used for distinction. Therefore, one instance of filesystem
with files created with file_open_root(O_CREAT | O_WRONLY | O_EXCL) might be preferable.

For inode based LSMs, reusing one instance of filesystem created upon early boot might
be convenient for labeling.

Also, we might want a dedicated filesystem (say, "umdfs") instead of regular tmpfs in
order to implement protections without labeling files. Then, we might also be able to
implement minimal protections without LSMs.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-02 16:41                                                                       ` [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll Eric W. Biederman
@ 2020-07-03 20:30                                                                         ` Alexei Starovoitov
  2020-07-03 21:37                                                                           ` Eric W. Biederman
  2020-07-04 16:00                                                                         ` Christian Brauner
  1 sibling, 1 reply; 194+ messages in thread
From: Alexei Starovoitov @ 2020-07-03 20:30 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds, Christian Brauner

On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
> Create an independent helper thread_group_exited which returns true
> when all threads have passed exit_notify in do_exit.  AKA all of the
> threads are at least zombies and might be dead or completely gone.
> 
> Create this helper by taking the logic out of pidfd_poll where
> it is already tested, and adding a missing READ_ONCE on
> the read of task->exit_state.
> 
> I will be changing the user mode driver code to use this same logic
> to know when a user mode driver needs to be restarted.
> 
> Place the new helper thread_group_exited in kernel/exit.c and
> EXPORT it so it can be used by modules.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  include/linux/sched/signal.h |  2 ++
>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>  kernel/fork.c                |  6 +-----
>  3 files changed, 27 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 0ee5e696c5d8..1bad18a1d8ba 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>  #define delay_group_leader(p) \
>  		(thread_group_leader(p) && !thread_group_empty(p))
>  
> +extern bool thread_group_exited(struct pid *pid);
> +
>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>  							unsigned long *flags);
>  
> diff --git a/kernel/exit.c b/kernel/exit.c
> index d3294b611df1..a7f112feb0f6 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>  }
>  #endif
>  
> +/**
> + * thread_group_exited - check that a thread group has exited
> + * @pid: tgid of thread group to be checked.
> + *
> + * Test if thread group is has exited (all threads are zombies, dead
> + * or completely gone).
> + *
> + * Return: true if the thread group has exited. false otherwise.
> + */
> +bool thread_group_exited(struct pid *pid)
> +{
> +	struct task_struct *task;
> +	bool exited;
> +
> +	rcu_read_lock();
> +	task = pid_task(pid, PIDTYPE_PID);
> +	exited = !task ||
> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
> +	rcu_read_unlock();
> +
> +	return exited;
> +}

I'm not sure why you think READ_ONCE was missing.
It's different in wait_consider_task() where READ_ONCE is needed because
of multiple checks. Here it's done once.

The rest all looks good to me. Tested with and without bpf_preload patches.
Feel free to create a frozen branch with this set.

btw I'll be offline starting tomorrow for a week.
Will catch up with threads afterwards.

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-03 20:30                                                                         ` Alexei Starovoitov
@ 2020-07-03 21:37                                                                           ` Eric W. Biederman
  2020-07-04  0:03                                                                             ` Alexei Starovoitov
  2020-07-04 15:50                                                                             ` Christian Brauner
  0 siblings, 2 replies; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-03 21:37 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds, Christian Brauner

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
>> Create an independent helper thread_group_exited which returns true
>> when all threads have passed exit_notify in do_exit.  AKA all of the
>> threads are at least zombies and might be dead or completely gone.
>> 
>> Create this helper by taking the logic out of pidfd_poll where
>> it is already tested, and adding a missing READ_ONCE on
>> the read of task->exit_state.
>> 
>> I will be changing the user mode driver code to use this same logic
>> to know when a user mode driver needs to be restarted.
>> 
>> Place the new helper thread_group_exited in kernel/exit.c and
>> EXPORT it so it can be used by modules.
>> 
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  include/linux/sched/signal.h |  2 ++
>>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>>  kernel/fork.c                |  6 +-----
>>  3 files changed, 27 insertions(+), 5 deletions(-)
>> 
>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> index 0ee5e696c5d8..1bad18a1d8ba 100644
>> --- a/include/linux/sched/signal.h
>> +++ b/include/linux/sched/signal.h
>> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>>  #define delay_group_leader(p) \
>>  		(thread_group_leader(p) && !thread_group_empty(p))
>>  
>> +extern bool thread_group_exited(struct pid *pid);
>> +
>>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>>  							unsigned long *flags);
>>  
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index d3294b611df1..a7f112feb0f6 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>>  }
>>  #endif
>>  
>> +/**
>> + * thread_group_exited - check that a thread group has exited
>> + * @pid: tgid of thread group to be checked.
>> + *
>> + * Test if thread group is has exited (all threads are zombies, dead
>> + * or completely gone).
>> + *
>> + * Return: true if the thread group has exited. false otherwise.
>> + */
>> +bool thread_group_exited(struct pid *pid)
>> +{
>> +	struct task_struct *task;
>> +	bool exited;
>> +
>> +	rcu_read_lock();
>> +	task = pid_task(pid, PIDTYPE_PID);
>> +	exited = !task ||
>> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
>> +	rcu_read_unlock();
>> +
>> +	return exited;
>> +}
>
> I'm not sure why you think READ_ONCE was missing.
> It's different in wait_consider_task() where READ_ONCE is needed because
> of multiple checks. Here it's done once.

In practice it probably has no effect on the generated code.  But
READ_ONCE is about telling the compiler not to be clever.  Don't use
tearing loads or stores etc.  When all of the other readers are using
READ_ONCE I just get nervous if we have a case that doesn't.

> The rest all looks good to me. Tested with and without bpf_preload patches.
> Feel free to create a frozen branch with this set.

Can I have your Tested-by and Acked-by?

> btw I'll be offline starting tomorrow for a week.
> Will catch up with threads afterwards.

Eric


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-07-03 13:19                                                                                 ` Tetsuo Handa
@ 2020-07-03 22:25                                                                                   ` Eric W. Biederman
  2020-07-04  6:57                                                                                     ` Tetsuo Handa
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-03 22:25 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Al Viro, Casey Schaufler, Alexei Starovoitov, linux-kernel,
	David Miller, Greg Kroah-Hartman, Kees Cook, Andrew Morton,
	Alexei Starovoitov, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	LSM List, Luis Chamberlain, Linus Torvalds

Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:

> On 2020/07/02 22:08, Eric W. Biederman wrote:
>>> By the way, commit 4a9d4b024a3102fc ("switch fput to task_work_add") says
>>> that use of flush_delayed_fput() has to be careful. Al, is it safe to call
>>> flush_delayed_fput() from blob_to_mnt() from umd_load_blob() (which might be
>>> called from both kernel thread and from process context (e.g. init_module()
>>> syscall by /sbin/insmod )) ?
>> 
>> And __fput_sync needs to be even more careful.
>> umd_load_blob is called in these changes without any locks held.
>
> But where is the guarantee that a thread which called flush_delayed_fput() waits for
> the completion of processing _all_ "struct file" linked into delayed_fput_list ?
> If some other thread or delayed_fput_work (scheduled by fput_many()) called
> flush_delayed_fput() between blob_to_mnt()'s fput(file) and flush_delayed_fput()
> sequence? blob_to_mnt()'s flush_delayed_fput() can miss the "struct file" which
> needs to be processed before execve(), can't it?

As a module the guarantee is we call task_work_run.
Built into the kernel the guarantee as best I can trace it is that
kthreadd hasn't started, and as such nothing that is scheduled has run
yet.

> Also, I don't know how convoluted the dependency of all "struct file" linked into
> delayed_fput_list might be, for there can be "struct file" which will not be a
> simple close of tmpfs file created by blob_to_mnt()'s file_open_root() request.
>
> On the other hand, although __fput_sync() cannot be called from !PF_KTHREAD threads,
> there is a guarantee that __fput_sync() waits for the completion of "struct file"
> which needs to be flushed before execve(), isn't there?

There is really not a good helper or set of helpers, and this code suggests
we should have something better.  Right now I have used the existing helpers
to the best of my ability.  If you or someone else wants to write a better
version of flushing so that exec can happen, be my guest.

As far as I can tell what I have is good enough.

>> We fundamentally AKA in any correct version of this code need to flush
>> the file descriptor before we call exec, or exec cannot open it
>> read-only, denying all writes from any other opens.
>> 
>> The use case of flush_delayed_fput is exactly the same as that used
>> when loading the initramfs.
>
> When loading the initramfs, the number of threads is quite few (which
> means that the possibility of hitting the race window and convoluted
> dependency is small).

But the reality is the code runs very early, before the initramfs is
initialized in practice.

> But like EXPORT_SYMBOL_GPL(umd_load_blob) indicates, blob_to_mnt()'s
> flush_delayed_fput() might be called after a large number of threads have
> already started running.

At which point the code probably won't be running from a kernel thread
but instead will be running in a thread where task_work_run is relevant.

At worst it is a very small race, where someone else in another thread
starts flushing the file.  Which means the file could still be
completely close before exec.   Even that is not necessarily fatal,
as the usermode driver code has a respawn capability.

Code that is used enough that it hits that race sounds like a very
good problem to have from the perspective of the usermode driver code.

> Do we really need to mount upon umd_load_blob() and unmount upon umd_unload_blob() ?
> LSM modules might prefer only one instance of filesystem for umd
> blobs.

It is simple. People are free to change it, but a single filesystem
seems like a very good place to start with this functionality.

> For pathname based LSMs, since that filesystem is not visible from mount tree, only
> info->driver_name can be used for distinction. Therefore, one instance of filesystem
> with files created with file_open_root(O_CREAT | O_WRONLY | O_EXCL)
> might be preferable.

I took a quick look and the creation and removal of files with the
in-kernel helpers is not particularly easy.  Certainly it is more work
and thus a higher likelihood of bugs than what I have done.

A directory per driver does sound tempting.  Just more work than I am
willing to do.

> For inode based LSMs, reusing one instance of filesystem created upon early boot might
> be convenient for labeling.
>
> Also, we might want a dedicated filesystem (say, "umdfs") instead of regular tmpfs in
> order to implement protections without labeling files. Then, we might also be able to
> implement minimal protections without LSMs.

All valid points.  Nothing sets this design in stone.
Nothing says this is the endpoint of the evolution of this code.

The entire point of this patchset for me is that I remove the
unnecessary special cases from exec and do_exit, so I don't have to deal
with the usermode driver code anymore.

Eric

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-03 21:37                                                                           ` Eric W. Biederman
@ 2020-07-04  0:03                                                                             ` Alexei Starovoitov
  2020-07-04 15:50                                                                             ` Christian Brauner
  1 sibling, 0 replies; 194+ messages in thread
From: Alexei Starovoitov @ 2020-07-04  0:03 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Kees Cook, Andrew Morton, Alexei Starovoitov, Al Viro, bpf,
	linux-fsdevel, Daniel Borkmann, Jakub Kicinski, Masahiro Yamada,
	Gary Lin, Bruno Meneguele, LSM List, Casey Schaufler,
	Luis Chamberlain, Linus Torvalds, Christian Brauner

On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
> 
> > The rest all looks good to me. Tested with and without bpf_preload patches.
> > Feel free to create a frozen branch with this set.
> 
> Can I have your Tested-by and Acked-by?

For the set:
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v2 00/15] Make the user mode driver code a better citizen
  2020-07-03 22:25                                                                                   ` Eric W. Biederman
@ 2020-07-04  6:57                                                                                     ` Tetsuo Handa
  2020-07-08  4:46                                                                                       ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Tetsuo Handa @ 2020-07-04  6:57 UTC (permalink / raw)
  To: Eric W. Biederman, Al Viro
  Cc: Casey Schaufler, Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Kees Cook, Andrew Morton, Alexei Starovoitov,
	bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Luis Chamberlain, Linus Torvalds

On 2020/07/04 7:25, Eric W. Biederman wrote:
> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> writes:
> 
>> On 2020/07/02 22:08, Eric W. Biederman wrote:
>>>> By the way, commit 4a9d4b024a3102fc ("switch fput to task_work_add") says
>>>> that use of flush_delayed_fput() has to be careful. Al, is it safe to call
>>>> flush_delayed_fput() from blob_to_mnt() from umd_load_blob() (which might be
>>>> called from both kernel thread and from process context (e.g. init_module()
>>>> syscall by /sbin/insmod )) ?
>>>
>>> And __fput_sync needs to be even more careful.
>>> umd_load_blob is called in these changes without any locks held.
>>
>> But where is the guarantee that a thread which called flush_delayed_fput() waits for
>> the completion of processing _all_ "struct file" linked into delayed_fput_list ?
>> If some other thread or delayed_fput_work (scheduled by fput_many()) called
>> flush_delayed_fput() between blob_to_mnt()'s fput(file) and flush_delayed_fput()
>> sequence? blob_to_mnt()'s flush_delayed_fput() can miss the "struct file" which
>> needs to be processed before execve(), can't it?
> 
> As a module the guarantee is we call task_work_run.

No. It is possible that blob_to_mnt() is called by a kernel thread which was
started by init_module() syscall by /sbin/insmod .

> Built into the kernel the guarantee as best I can trace it is that
> kthreadd hasn't started, and as such nothing that is scheduled has run
> yet.

Have you ever checked how early the kthreadd (PID=2) gets started?

----------
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2306,6 +2306,7 @@ static __latent_entropy struct task_struct *copy_process(
        trace_task_newtask(p, clone_flags);
        uprobe_copy_process(p, clone_flags);

+       printk(KERN_INFO "Created PID: %u Comm: %s\n", p->pid, p->comm);
        return p;

 bad_fork_cancel_cgroup:
----------

----------
[    0.090757][    T0] pid_max: default: 65536 minimum: 512
[    0.090890][    T0] LSM: Security Framework initializing
[    0.090890][    T0] Mount-cache hash table entries: 8192 (order: 4, 65536 bytes, linear)
[    0.090890][    T0] Mountpoint-cache hash table entries: 8192 (order: 4, 65536 bytes, linear)
[    0.090890][    T0] Disabled fast string operations
[    0.090890][    T0] Last level iTLB entries: 4KB 1024, 2MB 1024, 4MB 1024
[    0.090890][    T0] Last level dTLB entries: 4KB 1024, 2MB 1024, 4MB 1024, 1GB 4
[    0.090890][    T0] Spectre V1 : Mitigation: usercopy/swapgs barriers and __user pointer sanitization
[    0.090890][    T0] Spectre V2 : Spectre mitigation: kernel not compiled with retpoline; no mitigation available!
[    0.090890][    T0] Speculative Store Bypass: Mitigation: Speculative Store Bypass disabled via prctl and seccomp
[    0.090890][    T0] SRBDS: Unknown: Dependent on hypervisor status
[    0.090890][    T0] MDS: Mitigation: Clear CPU buffers
[    0.090890][    T0] Freeing SMP alternatives memory: 24K
[    0.090890][    T0] Created PID: 1 Comm: swapper/0
[    0.090890][    T0] Created PID: 2 Comm: swapper/0
[    0.090890][    T1] smpboot: CPU0: Intel(R) Core(TM) i5-4440S CPU @ 2.80GHz (family: 0x6, model: 0x3c, stepping: 0x3)
[    0.091000][    T2] Created PID: 3 Comm: kthreadd
[    0.091995][    T2] Created PID: 4 Comm: kthreadd
[    0.093028][    T2] Created PID: 5 Comm: kthreadd
[    0.093997][    T2] Created PID: 6 Comm: kthreadd
[    0.094995][    T2] Created PID: 7 Comm: kthreadd
[    0.096037][    T2] Created PID: 8 Comm: kthreadd
(...snipped...)
[    0.135716][    T2] Created PID: 13 Comm: kthreadd
[    0.135716][    T1] smp: Bringing up secondary CPUs ...
[    0.135716][    T2] Created PID: 14 Comm: kthreadd
[    0.135716][    T2] Created PID: 15 Comm: kthreadd
[    0.135716][    T2] Created PID: 16 Comm: kthreadd
[    0.135716][    T2] Created PID: 17 Comm: kthreadd
[    0.135716][    T2] Created PID: 18 Comm: kthreadd
[    0.135716][    T1] x86: Booting SMP configuration:
(...snipped...)
[    0.901990][    T1] pci 0000:00:00.0: Limiting direct PCI/PCI transfers
[    0.902145][    T1] pci 0000:00:0f.0: Video device with shadowed ROM at [mem 0x000c0000-0x000dffff]
[    0.902213][    T1] pci 0000:02:00.0: CLS mismatch (32 != 64), using 64 bytes
[    0.902224][    T1] Trying to unpack rootfs image as initramfs...
[    1.107993][    T1] Freeing initrd memory: 18876K
[    1.109049][    T1] PCI-DMA: Using software bounce buffering for IO (SWIOTLB)
[    1.111003][    T1] software IO TLB: mapped [mem 0xab000000-0xaf000000] (64MB)
[    1.112136][    T1] check: Scanning for low memory corruption every 60 seconds
[    1.115040][    T2] Created PID: 52 Comm: kthreadd
[    1.116110][    T1] workingset: timestamp_bits=46 max_order=20 bucket_order=0
[    1.120936][    T1] SGI XFS with ACLs, security attributes, verbose warnings, quota, no debug enabled
[    1.129626][    T2] Created PID: 53 Comm: kthreadd
[    1.131403][    T2] Created PID: 54 Comm: kthreadd
----------

kthreadd (PID=2) is created by swapper/0 (PID=0) immediately after init (PID=1) was created by
swapper/0 (PID=0). It is even before secondary CPUs are brought up, and far earlier than unpacking
initramfs.

And how can we prove that blob_to_mnt() is only called by a kernel thread before some kernel
thread that interferes with fput() starts running? blob_to_mnt() needs to be prepared for being
called after many processes have already started running.

> 
>> Also, I don't know how convoluted the dependency of all "struct file" linked into
>> delayed_fput_list might be, for there can be "struct file" which will not be a
>> simple close of tmpfs file created by blob_to_mnt()'s file_open_root() request.
>>
>> On the other hand, although __fput_sync() cannot be called from !PF_KTHREAD threads,
>> there is a guarantee that __fput_sync() waits for the completion of "struct file"
>> which needs to be flushed before execve(), isn't there?
> 
> There is really not a good helper or helpers, and this code suggests we
> have something better.  Right now I have used the existing helpers to
> the best of my ability.  If you or someone else wants to write a better
> version of flushing so that exec can happen be my guest.
> 
> As far as I can tell what I have is good enough.

Just saying what you think is not a "review". I'm waiting for an answer from Al Viro
because I consider that Al will be the most familiar with fput()'s behavior.
At least I consider that

	if (current->flags & PF_KTHREAD) {
		__fput_sync(file);
	} else {
		fput(file);
		task_work_run();
	}

is a candidate for closing the race window. And depending on Al's answer,
removing

	BUG_ON(!(task->flags & PF_KTHREAD));

 from __fput_sync() and unconditionally using

	__fput_sync(file);

 from blob_to_mnt() might be the better choice. Anyway, I consider that
Al's response is important for this "review".

> 
>>> We fundamentally AKA in any correct version of this code need to flush
>>> the file descriptor before we call exec or exec can not open it a
>>> read-only denying all writes from any other opens.
>>>
>>> The use case of flush_delayed_fput is exactly the same as that used
>>> when loading the initramfs.
>>
>> When loading the initramfs, the number of threads is quite few (which
>> means that the possibility of hitting the race window and convoluted
>> dependency is small).
> 
> But the reality is the code run very early, before the initramfs is
> initialized in practice.

Such an expectation does not match reality.

> 
>> But like EXPORT_SYMBOL_GPL(umd_load_blob) indicates, blob_to_mnt()'s
>> flush_delayed_fput() might be called after many number of threads already
>> started running.
> 
> At which point the code probably won't be runnig from a kernel thread
> but instead will be running in a thread where task_work_run is relevant.

No. It is possible that blob_to_mnt() is called by a kernel thread which was
started by init_module() syscall by /sbin/insmod .

> 
> At worst it is a very small race, where someone else in another thread
> starts flushing the file.  Which means the file could still be
> completely close before exec.   Even that is not necessarily fatal,
> as the usermode driver code has a respawn capability.
> 
> Code that is used enough that it hits that race sounds like a very
> good problem to have from the perspective of the usermode driver code.

In general, unconditionally retrying call_usermodehelper() when it returned
a negative value (e.g. -ENOENT, -ENOMEM, -EBUSY) is bad. I don't know which
code is an implementation of "a respawn capability"; I'd like to check where
that code is and whether that code is checking -ETXTBSY.


^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-03 21:37                                                                           ` Eric W. Biederman
  2020-07-04  0:03                                                                             ` Alexei Starovoitov
@ 2020-07-04 15:50                                                                             ` Christian Brauner
  2020-07-07 17:09                                                                               ` Eric W. Biederman
  1 sibling, 1 reply; 194+ messages in thread
From: Christian Brauner @ 2020-07-04 15:50 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Tetsuo Handa, Kees Cook, Andrew Morton,
	Alexei Starovoitov, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	LSM List, Casey Schaufler, Luis Chamberlain, Linus Torvalds

On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
> >> Create an independent helper thread_group_exited report return true
> >> when all threads have passed exit_notify in do_exit.  AKA all of the
> >> threads are at least zombies and might be dead or completely gone.
> >> 
> >> Create this helper by taking the logic out of pidfd_poll where
> >> it is already tested, and adding a missing READ_ONCE on
> >> the read of task->exit_state.
> >> 
> >> I will be changing the user mode driver code to use this same logic
> >> to know when a user mode driver needs to be restarted.
> >> 
> >> Place the new helper thread_group_exited in kernel/exit.c and
> >> EXPORT it so it can be used by modules.
> >> 
> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >> ---
> >>  include/linux/sched/signal.h |  2 ++
> >>  kernel/exit.c                | 24 ++++++++++++++++++++++++
> >>  kernel/fork.c                |  6 +-----
> >>  3 files changed, 27 insertions(+), 5 deletions(-)
> >> 
> >> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> >> index 0ee5e696c5d8..1bad18a1d8ba 100644
> >> --- a/include/linux/sched/signal.h
> >> +++ b/include/linux/sched/signal.h
> >> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
> >>  #define delay_group_leader(p) \
> >>  		(thread_group_leader(p) && !thread_group_empty(p))
> >>  
> >> +extern bool thread_group_exited(struct pid *pid);
> >> +
> >>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
> >>  							unsigned long *flags);
> >>  
> >> diff --git a/kernel/exit.c b/kernel/exit.c
> >> index d3294b611df1..a7f112feb0f6 100644
> >> --- a/kernel/exit.c
> >> +++ b/kernel/exit.c
> >> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
> >>  }
> >>  #endif
> >>  
> >> +/**
> >> + * thread_group_exited - check that a thread group has exited
> >> + * @pid: tgid of thread group to be checked.
> >> + *
> >> + * Test if thread group is has exited (all threads are zombies, dead
> >> + * or completely gone).
> >> + *
> >> + * Return: true if the thread group has exited. false otherwise.
> >> + */
> >> +bool thread_group_exited(struct pid *pid)
> >> +{
> >> +	struct task_struct *task;
> >> +	bool exited;
> >> +
> >> +	rcu_read_lock();
> >> +	task = pid_task(pid, PIDTYPE_PID);
> >> +	exited = !task ||
> >> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
> >> +	rcu_read_unlock();
> >> +
> >> +	return exited;
> >> +}
> >
> > I'm not sure why you think READ_ONCE was missing.
> > It's different in wait_consider_task() where READ_ONCE is needed because
> > of multiple checks. Here it's done once.
> 
> In practice it probably has no effect on the generated code.  But
> READ_ONCE is about telling the compiler not to be clever.  Don't use
> tearing loads or stores etc.  When all of the other readers are using
> READ_ONCE I just get nervous if we have a case that doesn't.

That's not true. The only place where READ_ONCE(->exit_state) is used is
in wait_consider_task() and nowhere else. We had that discussion a while
ago where I or someone proposed to simply place a READ_ONCE() around all
accesses to exit_state for the sake of kcsan and we agreed that it's
unnecessary and not to do this.
But it obviously doesn't hurt to have it.

Christian

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-02 16:41                                                                       ` [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll Eric W. Biederman
  2020-07-03 20:30                                                                         ` Alexei Starovoitov
@ 2020-07-04 16:00                                                                         ` Christian Brauner
  1 sibling, 0 replies; 194+ messages in thread
From: Christian Brauner @ 2020-07-04 16:00 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, David Miller, Greg Kroah-Hartman, Tetsuo Handa,
	Alexei Starovoitov, Kees Cook, Andrew Morton, Alexei Starovoitov,
	Al Viro, bpf, linux-fsdevel, Daniel Borkmann, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
> Create an independent helper thread_group_exited report return true

s/report return/which reports/

> when all threads have passed exit_notify in do_exit.  AKA all of the
> threads are at least zombies and might be dead or completely gone.
> 
> Create this helper by taking the logic out of pidfd_poll where
> it is already tested, and adding a missing READ_ONCE on
> the read of task->exit_state.

I would prefer to have this comment dropped as this read_once() is not
missing as you can see from the comments elsewhere in this thread.

> 
> I will be changing the user mode driver code to use this same logic
> to know when a user mode driver needs to be restarted.
> 
> Place the new helper thread_group_exited in kernel/exit.c and
> EXPORT it so it can be used by modules.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---

Minus the typos above and below, this looks good and passes the pidfd
and process test-suite.
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>

Thanks!
Christian

>  include/linux/sched/signal.h |  2 ++
>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>  kernel/fork.c                |  6 +-----
>  3 files changed, 27 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 0ee5e696c5d8..1bad18a1d8ba 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>  #define delay_group_leader(p) \
>  		(thread_group_leader(p) && !thread_group_empty(p))
>  
> +extern bool thread_group_exited(struct pid *pid);
> +
>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>  							unsigned long *flags);
>  
> diff --git a/kernel/exit.c b/kernel/exit.c
> index d3294b611df1..a7f112feb0f6 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>  }
>  #endif
>  
> +/**
> + * thread_group_exited - check that a thread group has exited
> + * @pid: tgid of thread group to be checked.
> + *
> + * Test if thread group is has exited (all threads are zombies, dead

s/is has exited/has exited/

> + * or completely gone).
> + *
> + * Return: true if the thread group has exited. false otherwise.
> + */
> +bool thread_group_exited(struct pid *pid)
> +{
> +	struct task_struct *task;
> +	bool exited;
> +
> +	rcu_read_lock();
> +	task = pid_task(pid, PIDTYPE_PID);
> +	exited = !task ||
> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
> +	rcu_read_unlock();
> +
> +	return exited;
> +}
> +EXPORT_SYMBOL(thread_group_exited);
> +
>  __weak void abort(void)
>  {
>  	BUG();
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 142b23645d82..bf215af7a904 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1787,22 +1787,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>   */
>  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
>  {
> -	struct task_struct *task;
>  	struct pid *pid = file->private_data;
>  	__poll_t poll_flags = 0;
>  
>  	poll_wait(file, &pid->wait_pidfd, pts);
>  
> -	rcu_read_lock();
> -	task = pid_task(pid, PIDTYPE_PID);
>  	/*
>  	 * Inform pollers only when the whole thread group exits.
>  	 * If the thread group leader exits before all other threads in the
>  	 * group, then poll(2) should block, similar to the wait(2) family.
>  	 */
> -	if (!task || (task->exit_state && thread_group_empty(task)))
> +	if (thread_group_exited(pid))
>  		poll_flags = EPOLLIN | EPOLLRDNORM;
> -	rcu_read_unlock();
>  
>  	return poll_flags;
>  }
> -- 
> 2.25.0
> 

^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-04 15:50                                                                             ` Christian Brauner
@ 2020-07-07 17:09                                                                               ` Eric W. Biederman
  2020-07-08  0:05                                                                                 ` Daniel Borkmann
  0 siblings, 1 reply; 194+ messages in thread
From: Eric W. Biederman @ 2020-07-07 17:09 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Tetsuo Handa, Kees Cook, Andrew Morton,
	Alexei Starovoitov, Al Viro, bpf, linux-fsdevel, Daniel Borkmann,
	Jakub Kicinski, Masahiro Yamada, Gary Lin, Bruno Meneguele,
	LSM List, Casey Schaufler, Luis Chamberlain, Linus Torvalds

Christian Brauner <christian.brauner@ubuntu.com> writes:

> On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
>> >> Create an independent helper thread_group_exited report return true
>> >> when all threads have passed exit_notify in do_exit.  AKA all of the
>> >> threads are at least zombies and might be dead or completely gone.
>> >> 
>> >> Create this helper by taking the logic out of pidfd_poll where
>> >> it is already tested, and adding a missing READ_ONCE on
>> >> the read of task->exit_state.
>> >> 
>> >> I will be changing the user mode driver code to use this same logic
>> >> to know when a user mode driver needs to be restarted.
>> >> 
>> >> Place the new helper thread_group_exited in kernel/exit.c and
>> >> EXPORT it so it can be used by modules.
>> >> 
>> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> >> ---
>> >>  include/linux/sched/signal.h |  2 ++
>> >>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>> >>  kernel/fork.c                |  6 +-----
>> >>  3 files changed, 27 insertions(+), 5 deletions(-)
>> >> 
>> >> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> >> index 0ee5e696c5d8..1bad18a1d8ba 100644
>> >> --- a/include/linux/sched/signal.h
>> >> +++ b/include/linux/sched/signal.h
>> >> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>> >>  #define delay_group_leader(p) \
>> >>  		(thread_group_leader(p) && !thread_group_empty(p))
>> >>  
>> >> +extern bool thread_group_exited(struct pid *pid);
>> >> +
>> >>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>> >>  							unsigned long *flags);
>> >>  
>> >> diff --git a/kernel/exit.c b/kernel/exit.c
>> >> index d3294b611df1..a7f112feb0f6 100644
>> >> --- a/kernel/exit.c
>> >> +++ b/kernel/exit.c
>> >> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>> >>  }
>> >>  #endif
>> >>  
>> >> +/**
>> >> + * thread_group_exited - check that a thread group has exited
>> >> + * @pid: tgid of thread group to be checked.
>> >> + *
>> >> + * Test if thread group is has exited (all threads are zombies, dead
>> >> + * or completely gone).
>> >> + *
>> >> + * Return: true if the thread group has exited. false otherwise.
>> >> + */
>> >> +bool thread_group_exited(struct pid *pid)
>> >> +{
>> >> +	struct task_struct *task;
>> >> +	bool exited;
>> >> +
>> >> +	rcu_read_lock();
>> >> +	task = pid_task(pid, PIDTYPE_PID);
>> >> +	exited = !task ||
>> >> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
>> >> +	rcu_read_unlock();
>> >> +
>> >> +	return exited;
>> >> +}
>> >
>> > I'm not sure why you think READ_ONCE was missing.
>> > It's different in wait_consider_task() where READ_ONCE is needed because
>> > of multiple checks. Here it's done once.
>> 
>> In practice it probably has no effect on the generated code.  But
>> READ_ONCE is about telling the compiler not to be clever.  Don't use
>> tearing loads or stores etc.  When all of the other readers are using
>> READ_ONCE I just get nervous if we have a case that doesn't.
>
> That's not true. The only place where READ_ONCE(->exit_state) is used is
> in wait_consider_task() and nowhere else. We had that discussion a while
> ago where I or someone proposed to simply place a READ_ONCE() around all
> accesses to exit_state for the sake of kcsan and we agreed that it's
> unnecessary and not to do this.
> But it obviously doesn't hurt to have it.

There is a larger discussion to be had around the proper handling of
exit_state.

In this particular case because we are accessing exit_state with
only rcu_read_lock protection, because the outcome of the read
is about correctness, and because the compiler has nothing else
telling it not to re-read exit_state, I believe we actually need
the READ_ONCE.

At the same time it would take a pretty special compiler to want to
reaccess that field in thread_group_exited.

I have looked through and I don't find any other accesses of
exit_state where the result is about correctness (so that we care)
and we don't hold tasklist_lock.

But I have removed the wording in question from the commit comment.

There is a much larger discussion to be had about what to do with
exit_state, because I think I found about half the accesses were
slightly buggy in one form or another.

Eric



^ permalink raw reply	[flat|nested] 194+ messages in thread

* Re: [PATCH v3 13/16] exit: Factor thread_group_exited out of pidfd_poll
  2020-07-07 17:09                                                                               ` Eric W. Biederman
@ 2020-07-08  0:05                                                                                 ` Daniel Borkmann
  2020-07-08  3:50                                                                                   ` Eric W. Biederman
  0 siblings, 1 reply; 194+ messages in thread
From: Daniel Borkmann @ 2020-07-08  0:05 UTC (permalink / raw)
  To: Eric W. Biederman, Christian Brauner
  Cc: Alexei Starovoitov, linux-kernel, David Miller,
	Greg Kroah-Hartman, Tetsuo Handa, Kees Cook, Andrew Morton,
	Alexei Starovoitov, Al Viro, bpf, linux-fsdevel, Jakub Kicinski,
	Masahiro Yamada, Gary Lin, Bruno Meneguele, LSM List,
	Casey Schaufler, Luis Chamberlain, Linus Torvalds

On 7/7/20 7:09 PM, Eric W. Biederman wrote:
> Christian Brauner <christian.brauner@ubuntu.com> writes:
>> On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
>>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>>>
>>>> On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
>>>>> Create an independent helper thread_group_exited report return true
>>>>> when all threads have passed exit_notify in do_exit.  AKA all of the
>>>>> threads are at least zombies and might be dead or completely gone.
>>>>>
>>>>> Create this helper by taking the logic out of pidfd_poll where
>>>>> it is already tested, and adding a missing READ_ONCE on
>>>>> the read of task->exit_state.
>>>>>
>>>>> I will be changing the user mode driver code to use this same logic
>>>>> to know when a user mode driver needs to be restarted.
>>>>>
>>>>> Place the new helper thread_group_exited in kernel/exit.c and
>>>>> EXPORT it so it can be used by modules.
>>>>>
>>>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>>>> ---
>>>>>   include/linux/sched/signal.h |  2 ++
>>>>>   kernel/exit.c                | 24 ++++++++++++++++++++++++
>>>>>   kernel/fork.c                |  6 +-----
>>>>>   3 files changed, 27 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>>>>> index 0ee5e696c5d8..1bad18a1d8ba 100644
>>>>> --- a/include/linux/sched/signal.h
>>>>> +++ b/include/linux/sched/signal.h
>>>>> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>>>>>   #define delay_group_leader(p) \
>>>>>   		(thread_group_leader(p) && !thread_group_empty(p))
>>>>>   
>>>>> +extern bool thread_group_exited(struct pid *pid);
>>>>> +
>>>>>   extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>>>>>   							unsigned long *flags);
>>>>>   
>>>>> diff --git a/kernel/exit.c b/kernel/exit.c
>>>>> index d3294b611df1..a7f112feb0f6 100644
>>>>> --- a/kernel/exit.c
>>>>> +++ b/kernel/exit.c
>>>>> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>>>>>   }
>>>>>   #endif
>>>>>   
>>>>> +/**
>>>>> + * thread_group_exited - check that a thread group has exited
>>>>> + * @pid: tgid of thread group to be checked.
>>>>> + *
>>>>> + * Test if thread group is has exited (all threads are zombies, dead
>>>>> + * or completely gone).
>>>>> + *
>>>>> + * Return: true if the thread group has exited. false otherwise.
>>>>> + */
>>>>> +bool thread_group_exited(struct pid *pid)
>>>>> +{
>>>>> +	struct task_struct *task;
>>>>> +	bool exited;
>>>>> +
>>>>> +	rcu_read_lock();
>>>>> +	task = pid_task(pid, PIDTYPE_PID);
>>>>> +	exited = !task ||
>>>>> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
>>>>> +	rcu_read_unlock();
>>>>> +
>>>>> +	return exited;
>>>>> +}
>>>>
>>>> I'm not sure why you think READ_ONCE was missing.
>>>> It's different in wait_consider_task() where READ_ONCE is needed because
>>>> of multiple checks. Here it's done once.
>>>
>>> In practice it probably has no effect on the generated code.  But
>>> READ_ONCE is about telling the compiler not to be clever.  Don't use
>>> tearing loads or stores etc.  When all of the other readers are using
>>> READ_ONCE I just get nervous if we have a case that doesn't.
>>
>> That's not true. The only place where READ_ONCE(->exit_state) is used is
>> in wait_consider_task() and nowhere else. We had that discussion a while
>> ago where I or someone proposed to simply place a READ_ONCE() around all
>> accesses to exit_state for the sake of kcsan and we agreed that it's
>> unnecessary and not to do this.
>> But it obviously doesn't hurt to have it.
> 
> There is a larger discussion to be had around the proper handling of
> exit_state.
> 
> In this particular case because we are accessing exit_state with
> only rcu_read_lock protection, because the outcome of the read
> is about correctness, and because the compiler has nothing else
> telling it not to re-read exit_state, I believe we actually need
> the READ_ONCE.
> 
> At the same time it would take a pretty special compiler to want to
> reaccess that field in thread_group_exited.
> 
> I have looked through and I don't find any of the other access of
> exit_state where the result is about correctness (so that we care)
> and we don't