linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: joel@joelfernandes.org, chris.hyser@oracle.com,
	joshdon@google.com, mingo@kernel.org, vincent.guittot@linaro.org,
	valentin.schneider@arm.com, mgorman@suse.de
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org,
	tj@kernel.org, tglx@linutronix.de
Subject: [PATCH 9/9] sched: prctl() and cgroup interaction
Date: Thu, 01 Apr 2021 15:10:21 +0200	[thread overview]
Message-ID: <20210401133917.590280797@infradead.org> (raw)
In-Reply-To: <20210401131012.395311786@infradead.org>

All the nasty bits that manage prctl() and cgroup core-sched
interaction.

In order to manage this; a 'fat' cookie is introduced, this is
basically a nested cookie that links to two other cookies. Any unique
combination of task and cgroup cookies will get _one_ fat cookie.

Uniqueness of fat cookies is ensured by use of a global tree.

Due to the locking rules for cookies, the need for fat cookies is not
apparent up-front, nor can they be allocated in-situ, therefore
pre-allocate them agressively and mostly free them instantly when not
used.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h     |    1 
 kernel/sched/core.c       |   25 +++
 kernel/sched/core_sched.c |  329 ++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h      |   11 +
 4 files changed, 337 insertions(+), 29 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -704,6 +704,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_CORE
 	struct rb_node			core_node;
 	unsigned long			core_cookie;
+	void				*core_spare_fat;
 	unsigned int			core_occupation;
 #endif
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9487,7 +9487,7 @@ void sched_move_task(struct task_struct
 	task_rq_unlock(rq, tsk, &rf);
 
 	cookie = sched_core_cgroup_cookie(tsk->sched_task_group);
-	cookie = sched_core_update_cookie(tsk, cookie);
+	cookie = sched_core_update_cookie(tsk, cookie | GROUP_COOKIE);
 	sched_core_put_cookie(cookie);
 }
 
@@ -9592,6 +9592,10 @@ static int cpu_cgroup_can_attach(struct
 
 		if (ret)
 			break;
+
+		ret = sched_core_prealloc_fat(task);
+		if (ret)
+			break;
 	}
 	return ret;
 }
@@ -10122,13 +10126,28 @@ int cpu_sched_core_write_u64(struct cgro
 
 	old_cookie = tg->core_cookie;
 	if (val) {
-		cookie = sched_core_alloc_cookie();
+		cookie = sched_core_alloc_cookie(GROUP_COOKIE);
 		if (!cookie) {
 			ret = -ENOMEM;
 			goto unlock;
 		}
 		WARN_ON_ONCE(old_cookie);
 
+		css_for_each_descendant_pre(cssi, css) {
+			struct css_task_iter it;
+			struct task_struct *p;
+
+			css_task_iter_start(cssi, 0, &it);
+			while ((p = css_task_iter_next(&it))) {
+				ret = sched_core_prealloc_fat(p);
+				if (ret) {
+					css_task_iter_end(&it);
+					goto unlock;
+				}
+			}
+			css_task_iter_end(&it);
+		}
+
 	} else if (tg->parent) {
 		if (tg->parent->core_parent)
 			parent = tg->parent->core_parent;
@@ -10164,7 +10183,7 @@ int cpu_sched_core_write_u64(struct cgro
 			unsigned long p_cookie;
 
 			cookie = sched_core_get_cookie(cookie);
-			p_cookie = sched_core_update_cookie(p, cookie);
+			p_cookie = sched_core_update_cookie(p, cookie | GROUP_COOKIE);
 			sched_core_put_cookie(p_cookie);
 		}
 		css_task_iter_end(&it);
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/prctl.h>
+#include <linux/rbtree.h>
+#include <linux/cgroup.h>
 #include "sched.h"
 
 /*
@@ -8,26 +10,243 @@
  * address is used to compute the cookie of the task.
  */
 struct sched_core_cookie {
-	refcount_t refcnt;
+	refcount_t	refcnt;
+	unsigned int	type;
 };
 
-unsigned long sched_core_alloc_cookie(void)
+static inline void *cookie_ptr(unsigned long cookie)
+{
+	return (void *)(cookie & ~3UL);
+}
+
+static inline int cookie_type(unsigned long cookie)
+{
+	return cookie & 3;
+}
+
+static inline void sched_core_init_cookie(struct sched_core_cookie *ck, unsigned int type)
+{
+	refcount_set(&ck->refcnt, 1);
+	ck->type = type;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+
+#define FAT_COOKIE	0x03
+
+struct sched_core_fat_cookie {
+	struct sched_core_cookie	cookie;
+	unsigned long			task_cookie;
+	unsigned long			group_cookie;
+	struct rb_node			node;
+};
+
+static DEFINE_RAW_SPINLOCK(fat_lock);
+static struct rb_root fat_root;
+
+static void fat_mutex_lock(void)
+{
+	/*
+	 * { ss->can_attach(), ss->attach() } vs prctl() for p->core_spare_fat
+	 */
+	mutex_lock(&cgroup_mutex);
+}
+
+static void fat_mutex_unlock(void)
+{
+	mutex_unlock(&cgroup_mutex);
+}
+
+static void sched_core_put_fat(struct sched_core_fat_cookie *fat)
+{
+	unsigned long flags;
+
+	if (fat->cookie.type != FAT_COOKIE)
+		return;
+
+	sched_core_put_cookie(fat->task_cookie);
+	sched_core_put_cookie(fat->group_cookie);
+
+	if (!RB_EMPTY_NODE(&fat->node)) {
+		raw_spin_lock_irqsave(&fat_lock, flags);
+		rb_erase(&fat->node, &fat_root);
+		raw_spin_unlock_irqrestore(&fat_lock, flags);
+	}
+}
+
+static void *node_2_fat(struct rb_node *n)
+{
+	return rb_entry(n, struct sched_core_fat_cookie, node);
+}
+
+static int fat_cmp(struct rb_node *a, struct rb_node *b)
+{
+	struct sched_core_fat_cookie *ca = node_2_fat(a);
+	struct sched_core_fat_cookie *cb = node_2_fat(b);
+
+	if (ca->group_cookie < cb->group_cookie)
+		return -1;
+	if (ca->group_cookie > cb->group_cookie)
+		return 1;
+
+	if (ca->task_cookie < cb->task_cookie)
+		return -1;
+	if (ca->task_cookie > cb->task_cookie)
+		return 1;
+
+	if (refcount_inc_not_zero(&cb->cookie.refcnt))
+		return 0;
+
+	return 1;
+}
+
+static unsigned long __sched_core_fat_cookie(struct task_struct *p,
+					     void **spare_fat,
+					     unsigned long cookie)
+{
+	unsigned long task_cookie, group_cookie;
+	unsigned int p_type = cookie_type(p->core_cookie);
+	unsigned int c_type = cookie_type(cookie);
+	struct sched_core_fat_cookie *fat;
+	unsigned long flags;
+	struct rb_node *n;
+
+	if (WARN_ON_ONCE(c_type == FAT_COOKIE))
+		return cookie;
+
+	if (!p_type || p_type == c_type)
+		return cookie;
+
+	if (p_type == FAT_COOKIE) {
+		fat = cookie_ptr(p->core_cookie);
+
+		/* loose fat */
+		if (!cookie_ptr(cookie)) {
+			if (c_type == TASK_COOKIE)
+				cookie = fat->group_cookie;
+			else
+				cookie = fat->task_cookie;
+
+			WARN_ON_ONCE(!cookie_ptr(cookie));
+			return sched_core_get_cookie(cookie);
+		}
+
+		/* other fat */
+		if (c_type == TASK_COOKIE)
+			group_cookie = fat->group_cookie;
+		else
+			task_cookie = fat->task_cookie;
+
+	} else {
+
+		/* new fat */
+		if (p_type == TASK_COOKIE)
+			task_cookie = p->core_cookie;
+		else
+			group_cookie = p->core_cookie;
+	}
+
+	if (c_type == TASK_COOKIE)
+		task_cookie = cookie;
+	else
+		group_cookie = cookie;
+
+	fat = *spare_fat;
+	if (WARN_ON_ONCE(!fat))
+		return cookie;
+
+	sched_core_init_cookie(&fat->cookie, FAT_COOKIE);
+	fat->task_cookie = sched_core_get_cookie(task_cookie);
+	fat->group_cookie = sched_core_get_cookie(group_cookie);
+	RB_CLEAR_NODE(&fat->node);
+
+	raw_spin_lock_irqsave(&fat_lock, flags);
+	n = rb_find_add(&fat->node, &fat_root, fat_cmp);
+	raw_spin_unlock_irqrestore(&fat_lock, flags);
+
+	if (n) {
+		sched_core_put_fat(fat);
+		fat = node_2_fat(n);
+	} else {
+		*spare_fat = NULL;
+	}
+
+	return (unsigned long)fat | FAT_COOKIE;
+}
+
+static int __sched_core_alloc_fat(void **spare_fat)
+{
+	if (*spare_fat)
+		return 0;
+
+	*spare_fat = kmalloc(sizeof(struct sched_core_fat_cookie), GFP_KERNEL);
+	if (!*spare_fat)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int sched_core_prealloc_fat(struct task_struct *p)
+{
+	lockdep_assert_held(&cgroup_mutex);
+	return __sched_core_alloc_fat(&p->core_spare_fat);
+}
+
+static inline unsigned long __sched_core_task_cookie(struct task_struct *p)
+{
+	unsigned long cookie = p->core_cookie;
+	unsigned int c_type = cookie_type(cookie);
+
+	if (!(c_type & TASK_COOKIE))
+		return 0;
+
+	if (c_type == FAT_COOKIE)
+		cookie = ((struct sched_core_fat_cookie *)cookie_ptr(cookie))->task_cookie;
+
+	return cookie;
+}
+
+#else
+
+static inline void fat_mutex_lock(void) { }
+static inline void fat_mutex_unlock(void) { }
+
+static inline void sched_core_put_fat(void *ptr) { }
+static inline int __sched_core_alloc_fat(void **spare_fat) { return 0; }
+
+static inline unsigned long __sched_core_fat_cookie(struct task_struct *p,
+						    void **spare_fat,
+						    unsigned long cookie)
+{
+	return cookie;
+}
+
+static inline unsigned long __sched_core_task_cookie(struct task_struct *p)
+{
+	return p->core_cookie;
+}
+
+#endif /* CGROUP_SCHED */
+
+unsigned long sched_core_alloc_cookie(unsigned int type)
 {
 	struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
 	if (!ck)
 		return 0;
 
-	refcount_set(&ck->refcnt, 1);
+	WARN_ON_ONCE(type > GROUP_COOKIE);
+	sched_core_init_cookie(ck, type);
 	sched_core_get();
 
-	return (unsigned long)ck;
+	return (unsigned long)ck | type;
 }
 
 void sched_core_put_cookie(unsigned long cookie)
 {
-	struct sched_core_cookie *ptr = (void *)cookie;
+	struct sched_core_cookie *ptr = cookie_ptr(cookie);
 
 	if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+		sched_core_put_fat((void *)ptr);
 		kfree(ptr);
 		sched_core_put();
 	}
@@ -35,7 +254,7 @@ void sched_core_put_cookie(unsigned long
 
 unsigned long sched_core_get_cookie(unsigned long cookie)
 {
-	struct sched_core_cookie *ptr = (void *)cookie;
+	struct sched_core_cookie *ptr = cookie_ptr(cookie);
 
 	if (ptr)
 		refcount_inc(&ptr->refcnt);
@@ -50,14 +269,22 @@ unsigned long sched_core_get_cookie(unsi
  * @cookie: The new cookie.
  * @cookie_type: The cookie field to which the cookie corresponds.
  */
-unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+static unsigned long __sched_core_update_cookie(struct task_struct *p,
+						void **spare_fat,
+						unsigned long cookie)
 {
 	unsigned long old_cookie;
 	struct rq_flags rf;
 	struct rq *rq;
 	bool enqueued;
 
-	rq = task_rq_lock(p, &rf);
+	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+	cookie = __sched_core_fat_cookie(p, spare_fat, cookie);
+	if (!cookie_ptr(cookie))
+		cookie = 0UL;
+
+	rq = __task_rq_lock(p, &rf);
 
 	/*
 	 * Since creating a cookie implies sched_core_get(), and we cannot set
@@ -90,9 +317,19 @@ unsigned long sched_core_update_cookie(s
 	return old_cookie;
 }
 
+unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+{
+	cookie =  __sched_core_update_cookie(p, &p->core_spare_fat, cookie);
+	if (p->core_spare_fat) {
+		kfree(p->core_spare_fat);
+		p->core_spare_fat = NULL;
+	}
+	return cookie;
+}
+
 static unsigned long sched_core_clone_cookie(struct task_struct *p)
 {
-	unsigned long cookie, flags;
+	unsigned long flags, cookie;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	cookie = sched_core_get_cookie(p->core_cookie);
@@ -101,26 +338,47 @@ static unsigned long sched_core_clone_co
 	return cookie;
 }
 
+static unsigned long sched_core_clone_task_cookie(struct task_struct *p)
+{
+	unsigned long flags, cookie;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	cookie = sched_core_get_cookie(__sched_core_task_cookie(p));
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	return cookie;
+}
+
 void sched_core_fork(struct task_struct *p)
 {
 	RB_CLEAR_NODE(&p->core_node);
 	p->core_cookie = sched_core_clone_cookie(current);
+	p->core_spare_fat = NULL;
 }
 
 void sched_core_free(struct task_struct *p)
 {
 	sched_core_put_cookie(p->core_cookie);
+	kfree(p->core_spare_fat);
 }
 
 int sched_core_exec(void)
 {
 	/* absent a policy mech, if task had a cookie, give it a new one */
-	if (current->core_cookie) {
-		unsigned long cookie = sched_core_alloc_cookie();
+	if (current->core_cookie & TASK_COOKIE) {
+		void *spare_fat = NULL;
+		unsigned long cookie;
+
+		if (__sched_core_alloc_fat(&spare_fat))
+			return -ENOMEM;
+
+		cookie = sched_core_alloc_cookie(TASK_COOKIE);
 		if (!cookie)
 			return -ENOMEM;
-		cookie = sched_core_update_cookie(current, cookie);
+
+		cookie = __sched_core_update_cookie(current, &spare_fat, cookie);
 		sched_core_put_cookie(cookie);
+		kfree(spare_fat);
 	}
 
 	return 0;
@@ -129,7 +387,7 @@ int sched_core_exec(void)
 static void __sched_core_set(struct task_struct *p, unsigned long cookie)
 {
 	cookie = sched_core_get_cookie(cookie);
-	cookie = sched_core_update_cookie(p, cookie);
+	cookie = sched_core_update_cookie(p, cookie | TASK_COOKIE);
 	sched_core_put_cookie(cookie);
 }
 
@@ -171,55 +429,62 @@ int sched_core_share_pid(unsigned int cm
 		goto out;
 	}
 
+	fat_mutex_lock();
+
+	err = sched_core_prealloc_fat(task);
+	if (err)
+		goto out_unlock;
+
 	switch (cmd) {
 	case PR_SCHED_CORE_GET:
 		if (type != PIDTYPE_PID || uaddr & 7) {
 			err = -EINVAL;
-			goto out;
+			goto out_unlock;
 		}
-		cookie = sched_core_clone_cookie(task);
-		if (cookie) {
+		cookie = sched_core_clone_task_cookie(task);
+		if (cookie_ptr(cookie)) {
 			/* XXX improve ? */
 			ptr_to_hashval((void *)cookie, &id);
 		}
 		err = put_user(id, (u64 __user *)uaddr);
-		goto out;
+		goto out_unlock;
 
 	case PR_SCHED_CORE_CLEAR:
 		cookie = 0;
 		break;
 
 	case PR_SCHED_CORE_CREATE:
-		cookie = sched_core_alloc_cookie();
+		cookie = sched_core_alloc_cookie(TASK_COOKIE);
 		if (!cookie) {
 			err = -ENOMEM;
-			goto out;
+			goto out_unlock;
 		}
 		break;
 
 	case PR_SCHED_CORE_SHARE_TO:
-		cookie = sched_core_clone_cookie(current);
+		cookie = sched_core_clone_task_cookie(current);
 		break;
 
 	case PR_SCHED_CORE_SHARE_FROM:
 		if (type != PIDTYPE_PID) {
 			err = -EINVAL;
-			goto out;
+			goto out_unlock;
 		}
-		cookie = sched_core_clone_cookie(task);
+		cookie = sched_core_clone_task_cookie(task);
 		__sched_core_set(current, cookie);
-		goto out;
+		goto out_unlock;
 
 	default:
 		err = -EINVAL;
-		goto out;
+		goto out_unlock;
 	};
 
 	if (type == PIDTYPE_PID) {
 		__sched_core_set(task, cookie);
-		goto out;
+		goto out_unlock;
 	}
 
+again:
 	read_lock(&tasklist_lock);
 	grp = task_pid_type(task, type);
 
@@ -228,6 +493,18 @@ int sched_core_share_pid(unsigned int cm
 			err = -EPERM;
 			goto out_tasklist;
 		}
+
+		if (IS_ENABLED(CONFIG_CGROUP_SCHED) && !p->core_spare_fat) {
+			get_task_struct(p);
+			read_unlock(&tasklist_lock);
+
+			err = sched_core_prealloc_fat(p);
+			put_task_struct(p);
+			if (err)
+				goto out_unlock;
+
+			goto again;
+		}
 	} while_each_pid_thread(grp, type, p);
 
 	do_each_pid_thread(grp, type, p) {
@@ -236,6 +513,8 @@ int sched_core_share_pid(unsigned int cm
 out_tasklist:
 	read_unlock(&tasklist_lock);
 
+out_unlock:
+	fat_mutex_unlock();
 out:
 	sched_core_put_cookie(cookie);
 	put_task_struct(task);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1240,10 +1240,14 @@ extern void sched_core_dequeue(struct rq
 extern void sched_core_get(void);
 extern void sched_core_put(void);
 
-extern unsigned long sched_core_alloc_cookie(void);
+#define TASK_COOKIE	0x01
+#define GROUP_COOKIE	0x02
+
+extern unsigned long sched_core_alloc_cookie(unsigned int type);
 extern void sched_core_put_cookie(unsigned long cookie);
 extern unsigned long sched_core_get_cookie(unsigned long cookie);
 extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie);
+extern int sched_core_prealloc_fat(struct task_struct *p);
 
 #else /* !CONFIG_SCHED_CORE */
 
@@ -1257,6 +1261,11 @@ static inline bool sched_core_disabled(v
 	return true;
 }
 
+static inline int sched_core_prealloc_fat(struct task_struct *p)
+{
+	return 0;
+}
+
 static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 {
 	return &rq->__lock;



  parent reply	other threads:[~2021-04-01 18:43 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-01 13:10 [PATCH 0/9] sched: Core scheduling interfaces Peter Zijlstra
2021-04-01 13:10 ` [PATCH 1/9] sched: Allow sched_core_put() from atomic context Peter Zijlstra
2021-04-01 13:10 ` [PATCH 2/9] sched: Implement core-sched assertions Peter Zijlstra
2021-04-01 13:10 ` [PATCH 3/9] sched: Trivial core scheduling cookie management Peter Zijlstra
2021-04-01 20:04   ` Josh Don
2021-04-02  7:13     ` Peter Zijlstra
2021-04-01 13:10 ` [PATCH 4/9] sched: Default core-sched policy Peter Zijlstra
2021-04-21 13:33   ` Peter Zijlstra
2021-04-21 14:31     ` Chris Hyser
2021-04-01 13:10 ` [PATCH 5/9] sched: prctl() core-scheduling interface Peter Zijlstra
2021-04-07 17:00   ` Peter Zijlstra
2021-04-18  3:52     ` Joel Fernandes
2021-04-01 13:10 ` [PATCH 6/9] kselftest: Add test for core sched prctl interface Peter Zijlstra
2021-04-01 13:10 ` [PATCH 7/9] sched: Cgroup core-scheduling interface Peter Zijlstra
2021-04-02  0:34   ` Josh Don
2021-04-01 13:10 ` [PATCH 8/9] rbtree: Remove const from the rb_find_add() comparator Peter Zijlstra
2021-04-01 13:10 ` Peter Zijlstra [this message]
2021-04-03  1:30   ` [PATCH 9/9] sched: prctl() and cgroup interaction Josh Don
2021-04-06 15:12     ` Peter Zijlstra
2021-04-04 23:39 ` [PATCH 0/9] sched: Core scheduling interfaces Tejun Heo
2021-04-05 18:46   ` Joel Fernandes
2021-04-06 14:16     ` Tejun Heo
2021-04-18  1:35       ` Joel Fernandes
2021-04-19  9:00         ` Peter Zijlstra
2021-04-21 13:35           ` Peter Zijlstra
2021-04-21 14:45             ` Chris Hyser
2021-04-06 15:32   ` Peter Zijlstra
2021-04-06 16:08     ` Tejun Heo
2021-04-07 18:39       ` Peter Zijlstra
2021-04-07 16:50   ` Michal Koutný
2021-04-07 18:34     ` Peter Zijlstra
2021-04-08 13:25       ` Michal Koutný
2021-04-08 15:02         ` Peter Zijlstra
2021-04-09  0:16           ` Josh Don
2021-04-19 11:30       ` Tejun Heo
2021-04-20  1:17         ` Josh Don

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210401133917.590280797@infradead.org \
    --to=peterz@infradead.org \
    --cc=chris.hyser@oracle.com \
    --cc=joel@joelfernandes.org \
    --cc=joshdon@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    --cc=valentin.schneider@arm.com \
    --cc=vincent.guittot@linaro.org \
    --subject='Re: [PATCH 9/9] sched: prctl() and cgroup interaction' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
on how to clone and mirror all data and code used for this inbox