Re: [PATCH] exit: Retain nsproxy for exit_task_work() work entries

From: Tejun Heo <tj@kernel.org>
To: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>,
	"Michal Koutný" <mkoutny@suse.com>,
	"Linux Kernel Mailing List" <linux-kernel@vger.kernel.org>,
	"Jens Axboe" <axboe@kernel.dk>,
	"Kees Cook" <keescook@chromium.org>,
	"Oleg Nesterov" <oleg@redhat.com>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Thomas Gleixner" <tglx@linutronix.de>,
	"Jim Newsome" <jnewsome@torproject.org>,
	"Alexey Gladkov" <legion@kernel.org>,
	"Security Officers" <security@kernel.org>,
	"Andy Lutomirski" <luto@amacapital.net>,
	"Jann Horn" <jannh@google.com>
Subject: Re: [PATCH] exit: Retain nsproxy for exit_task_work() work entries
Date: Wed, 8 Dec 2021 13:07:54 -1000	[thread overview]
Message-ID: <YbE6yvMav5Xtp5HO@slm.duckdns.org> (raw)
In-Reply-To: <YbEMPal0sKkk0+Tl@slm.duckdns.org>

Hello,

On Wed, Dec 08, 2021 at 09:49:17AM -1000, Tejun Heo wrote:
> >  (b) alternatively, go ahead and do the permission check at IO time,
> > but do it using "file->f_cred" (ie the open-time permission), not the
> > current process ones.

So, I have sth like the following. It builds and euid-open test case behaves
as expected (can't evade delegation restrictions if the fd was opened with
lesser euid). The namespace part is a bit more involved as f_cred doesn't
capture them on open. I made cgroup file open path capture it and use that
for all permission checks. Please let me know if anything looks weird.
Otherwise, I'm gonna add selftests and prep the patchset.

Thanks.

Index: work/kernel/cgroup/cgroup-v1.c
===================================================================

--- work.orig/kernel/cgroup/cgroup-v1.c
+++ work/kernel/cgroup/cgroup-v1.c
@@ -504,10 +504,11 @@ static ssize_t __cgroup1_procs_write(str
 		goto out_unlock;
 
 	/*
-	 * Even if we're attaching all tasks in the thread group, we only
-	 * need to check permissions on one of them.
+	 * Even if we're attaching all tasks in the thread group, we only need
+	 * to check permissions on one of them. Check permissions using the
+	 * credentials from file open to protect against inherited fd attacks.
 	 */
-	cred = current_cred();
+	cred = of->file->f_cred;
 	tcred = get_task_cred(task);
 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 	    !uid_eq(cred->euid, tcred->uid) &&
Index: work/kernel/cgroup/cgroup.c
===================================================================
--- work.orig/kernel/cgroup/cgroup.c
+++ work/kernel/cgroup/cgroup.c
@@ -3630,6 +3630,7 @@ static int cgroup_cpu_pressure_show(stru
 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
 					  size_t nbytes, enum psi_res res)
 {
+	struct cgroup_file_ctx *ctx = of->priv;
 	struct psi_trigger *new;
 	struct cgroup *cgrp;
 	struct psi_group *psi;
@@ -3648,7 +3649,7 @@ static ssize_t cgroup_pressure_write(str
 		return PTR_ERR(new);
 	}
 
-	psi_trigger_replace(&of->priv, new);
+	psi_trigger_replace(&ctx->psi.trigger, new);
 
 	cgroup_put(cgrp);
 
@@ -3679,12 +3680,16 @@ static ssize_t cgroup_cpu_pressure_write
 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
 					  poll_table *pt)
 {
-	return psi_trigger_poll(&of->priv, of->file, pt);
+	struct cgroup_file_ctx *ctx = of->priv;
+
+	return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
 }
 
 static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
-	psi_trigger_replace(&of->priv, NULL);
+	struct cgroup_file_ctx *ctx = of->priv;
+
+	psi_trigger_replace(&ctx->psi.trigger, NULL);
 }
 
 bool cgroup_psi_enabled(void)
@@ -3811,24 +3816,42 @@ static ssize_t cgroup_kill_write(struct
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of_cft(of);
+	struct cgroup_file_ctx *ctx;
+	int ret;
 
-	if (cft->open)
-		return cft->open(of);
-	return 0;
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->ns = current->nsproxy->cgroup_ns;
+	get_cgroup_ns(ctx->ns);
+
+	of->priv = ctx;
+
+	if (!cft->open)
+		return 0;
+
+	ret = cft->open(of);
+	if (ret)
+		kfree(ctx);
+	return ret;
 }
 
 static void cgroup_file_release(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of_cft(of);
+	struct cgroup_file_ctx *ctx = of->priv;
 
 	if (cft->release)
 		cft->release(of);
+	put_cgroup_ns(ctx->ns);
+	kfree(ctx);
 }
 
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
-	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct cgroup_file_ctx *ctx = of->priv;
 	struct cgroup *cgrp = of->kn->parent->priv;
 	struct cftype *cft = of_cft(of);
 	struct cgroup_subsys_state *css;
@@ -3845,7 +3868,7 @@ static ssize_t cgroup_file_write(struct
 	 */
 	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
 	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
-	    ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+	    ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
 		return -EPERM;
 
 	if (cft->write)
@@ -4751,21 +4774,23 @@ void css_task_iter_end(struct css_task_i
 
 static void cgroup_procs_release(struct kernfs_open_file *of)
 {
-	if (of->priv) {
-		css_task_iter_end(of->priv);
-		kfree(of->priv);
+	struct cgroup_file_ctx *ctx = of->priv;
+
+	if (ctx->procs.it) {
+		css_task_iter_end(ctx->procs.it);
+		kfree(ctx->procs.it);
 	}
 }
 
 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
 {
 	struct kernfs_open_file *of = s->private;
-	struct css_task_iter *it = of->priv;
+	struct cgroup_file_ctx *ctx = of->priv;
 
 	if (pos)
 		(*pos)++;
 
-	return css_task_iter_next(it);
+	return css_task_iter_next(ctx->procs.it);
 }
 
 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4773,7 +4798,8 @@ static void *__cgroup_procs_start(struct
 {
 	struct kernfs_open_file *of = s->private;
 	struct cgroup *cgrp = seq_css(s)->cgroup;
-	struct css_task_iter *it = of->priv;
+	struct cgroup_file_ctx *ctx = of->priv;
+	struct css_task_iter *it = ctx->procs.it;
 
 	/*
 	 * When a seq_file is seeked, it's always traversed sequentially
@@ -4786,7 +4812,7 @@ static void *__cgroup_procs_start(struct
 		it = kzalloc(sizeof(*it), GFP_KERNEL);
 		if (!it)
 			return ERR_PTR(-ENOMEM);
-		of->priv = it;
+		ctx->procs.it = it;
 		css_task_iter_start(&cgrp->self, iter_flags, it);
 	} else if (!(*pos)) {
 		css_task_iter_end(it);
@@ -4838,9 +4864,9 @@ static int cgroup_may_write(const struct
 
 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 					 struct cgroup *dst_cgrp,
-					 struct super_block *sb)
+					 struct super_block *sb,
+					 struct cgroup_namespace *ns)
 {
-	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup *com_cgrp = src_cgrp;
 	int ret;
 
@@ -4869,11 +4895,12 @@ static int cgroup_procs_write_permission
 
 static int cgroup_attach_permissions(struct cgroup *src_cgrp,
 				     struct cgroup *dst_cgrp,
-				     struct super_block *sb, bool threadgroup)
+				     struct super_block *sb, bool threadgroup,
+				     struct cgroup_namespace *ns)
 {
 	int ret = 0;
 
-	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
 	if (ret)
 		return ret;
 
@@ -4890,8 +4917,10 @@ static int cgroup_attach_permissions(str
 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 				    bool threadgroup)
 {
+	struct cgroup_file_ctx *ctx = of->priv;
 	struct cgroup *src_cgrp, *dst_cgrp;
 	struct task_struct *task;
+	const struct cred *saved_cred;
 	ssize_t ret;
 	bool locked;
 
@@ -4909,9 +4938,16 @@ static ssize_t __cgroup_procs_write(stru
 	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 	spin_unlock_irq(&css_set_lock);
 
-	/* process and thread migrations follow same delegation rule */
+	/*
+	 * Process and thread migrations follow same delegation rule. Check
+	 * permissions using the credentials from file open to protect against
+	 * inherited fd attacks.
+	 */
+	saved_cred = override_creds(of->file->f_cred);
 	ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
-					of->file->f_path.dentry->d_sb, threadgroup);
+					of->file->f_path.dentry->d_sb,
+					threadgroup, ctx->ns);
+	revert_creds(saved_cred);
 	if (ret)
 		goto out_finish;
 
@@ -6130,7 +6166,8 @@ static int cgroup_css_set_fork(struct ke
 		goto err;
 
 	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
-					!(kargs->flags & CLONE_THREAD));
+					!(kargs->flags & CLONE_THREAD),
+					current->nsproxy->cgroup_ns);
 	if (ret)
 		goto err;
 
Index: work/kernel/cgroup/cgroup-internal.h
===================================================================
--- work.orig/kernel/cgroup/cgroup-internal.h
+++ work/kernel/cgroup/cgroup-internal.h
@@ -65,6 +65,20 @@ static inline struct cgroup_fs_context *
 	return container_of(kfc, struct cgroup_fs_context, kfc);
 }
 
+struct cgroup_file_ctx {
+	struct cgroup_namespace	*ns;
+
+	union {
+		struct {
+			struct css_task_iter	*it;
+		} procs;
+
+		struct {
+			void			*trigger;
+		} psi;
+	};
+};
+
 /*
  * A cgroup can be associated with multiple css_sets as different tasks may
  * belong to different cgroups on different hierarchies.  In the other