* [PATCH RFC 5.13] io_uring: add IORING_REGISTER_PRIORITY
From: Hao Xu @ 2021-05-06 14:33 UTC
  To: Jens Axboe; +Cc: io-uring, Pavel Begunkov, Joseph Qi

Users may want a higher priority for sq_thread or io-worker. Provide a
way to change the nice value (for SCHED_NORMAL) or scheduling policy.

Signed-off-by: Hao Xu <haoxu@linux.alibaba.com>
---

Currently this supports sqthread only, while uapi space is reserved for
the io-worker case.
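
For reference, a userspace caller would use the new opcode roughly like
this (a minimal sketch against this RFC's uapi; the raw syscall is used
since liburing has no wrapper for the new opcode yet):

	struct io_uring_priority prio = {
		.target	= IORING_PRIORITY_TARGET_SQ,
		.mode	= IORING_PRIORITY_MODE_NICE,
		.nice	= -10,	/* clamped to [MIN_NICE, MAX_NICE] */
	};

	/* nr_args must be 1, see __io_uring_register() */
	int ret = syscall(__NR_io_uring_register, ring_fd,
			  IORING_REGISTER_PRIORITY, &prio, 1);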

 fs/io_uring.c                 | 116 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  29 +++++++++++
 2 files changed, 145 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 63ff70587d4f..a77b7a05c058 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -83,6 +83,7 @@
 #include <trace/events/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
+#include <uapi/linux/sched/types.h>
 
 #include "internal.h"
 #include "io-wq.h"
@@ -9908,6 +9909,114 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 	return -EINVAL;
 }
 
+/*
+ * Returns true if current's euid is the same as p's uid or euid,
+ * or if current has CAP_SYS_NICE in p's user_ns.
+ *
+ * Called with the RCU read lock held, so the creds are safe.
+ */
+static bool set_one_prio_perm(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
+
+	if (uid_eq(pcred->uid,  cred->euid) ||
+	    uid_eq(pcred->euid, cred->euid))
+		return true;
+	if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
+		return true;
+	return false;
+}
+
+/*
+ * set the priority of a task
+ * - the caller must hold the RCU read lock
+ */
+static int set_one_prio(struct task_struct *p, int niceval, int error)
+{
+	int no_nice;
+
+	if (!set_one_prio_perm(p)) {
+		error = -EPERM;
+		goto out;
+	}
+	if (niceval < task_nice(p) && !can_nice(p, niceval)) {
+		error = -EACCES;
+		goto out;
+	}
+	no_nice = security_task_setnice(p, niceval);
+	if (no_nice) {
+		error = no_nice;
+		goto out;
+	}
+	if (error == -ESRCH)
+		error = 0;
+	set_user_nice(p, niceval);
+out:
+	return error;
+}
+
+static int io_register_priority(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_priority prio;
+	int error = -ESRCH;
+
+	if (copy_from_user(&prio, arg, sizeof(struct io_uring_priority)))
+		return -EFAULT;
+
+	if (prio.target >= IORING_PRIORITY_TARGET_LAST)
+		return -EINVAL;
+
+	if (prio.mode >= IORING_PRIORITY_MODE_LAST)
+		return -EINVAL;
+
+	switch (prio.mode) {
+	case IORING_PRIORITY_MODE_NICE: {
+		int niceval = prio.nice;
+
+		if (niceval < MIN_NICE)
+			niceval = MIN_NICE;
+		if (niceval > MAX_NICE)
+			niceval = MAX_NICE;
+
+		if (prio.target == IORING_PRIORITY_TARGET_SQ) {
+			struct io_sq_data *sqd = ctx->sq_data;
+
+			if (!sqd)
+				return error;
+
+			io_sq_thread_park(sqd);
+			if (sqd->thread)
+				error = set_one_prio(sqd->thread, niceval, -ESRCH);
+			io_sq_thread_unpark(sqd);
+		}
+		break;
+	}
+	case IORING_PRIORITY_MODE_SCHEDULER: {
+		int policy = prio.param.policy;
+		int sched_priority = prio.param.sched_priority;
+		struct sched_param lparam = { .sched_priority = sched_priority };
+
+		if (policy < 0)
+			return -EINVAL;
+
+		if (prio.target == IORING_PRIORITY_TARGET_SQ) {
+			struct io_sq_data *sqd = ctx->sq_data;
+
+			if (!sqd)
+				return error;
+
+			io_sq_thread_park(sqd);
+			if (sqd->thread)
+				error = sched_setscheduler(sqd->thread, policy, &lparam);
+			io_sq_thread_unpark(sqd);
+		}
+		break;
+	}
+	}
+
+	return error;
+}
+
 static bool io_register_op_must_quiesce(int op)
 {
 	switch (op) {
@@ -9921,6 +10031,7 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_UNREGISTER_PERSONALITY:
 	case IORING_REGISTER_RSRC:
 	case IORING_REGISTER_RSRC_UPDATE:
+	case IORING_REGISTER_PRIORITY:
 		return false;
 	default:
 		return true;
@@ -10052,6 +10163,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_RSRC_UPDATE:
 		ret = io_register_rsrc_update(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_PRIORITY:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_priority(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e1ae46683301..4788a62bf3a1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -300,6 +300,7 @@ enum {
 	IORING_REGISTER_ENABLE_RINGS		= 12,
 	IORING_REGISTER_RSRC			= 13,
 	IORING_REGISTER_RSRC_UPDATE		= 14,
+	IORING_REGISTER_PRIORITY		= 15,
 
 	/* this goes last */
 	IORING_REGISTER_LAST
@@ -396,4 +397,32 @@ struct io_uring_getevents_arg {
 	__u64	ts;
 };
 
+enum {
+	IORING_PRIORITY_TARGET_SQ,	/* update priority of sqthread */
+	IORING_PRIORITY_TARGET_WQ,	/* update priority of io worker */
+
+	IORING_PRIORITY_TARGET_LAST,
+};
+
+enum {
+	IORING_PRIORITY_MODE_NICE,
+	IORING_PRIORITY_MODE_SCHEDULER,
+
+	IORING_PRIORITY_MODE_LAST,
+};
+
+struct io_uring_sched_param {
+	__s32 policy;
+	__s32 sched_priority;
+};
+
+struct io_uring_priority {
+	__u32 target;
+	__u32 mode;
+	union {
+		__s32 nice;
+		struct io_uring_sched_param param;
+	};
+};
+
 #endif
-- 
1.8.3.1


* Re: [PATCH RFC 5.13] io_uring: add IORING_REGISTER_PRIORITY
From: Jens Axboe @ 2021-05-06 17:10 UTC
  To: Hao Xu; +Cc: io-uring, Pavel Begunkov, Joseph Qi

On 5/6/21 8:33 AM, Hao Xu wrote:
> Users may want a higher priority for sq_thread or io-worker. Provide a
> way to change the nice value (for SCHED_NORMAL) or scheduling policy.

Silly question - why is this needed for sqpoll? With the threads now
being essentially user threads, why can't we just modify nice and
scheduler class from userspace instead? That should work now. I think
this is especially true for sqpoll where it's persistent, and argument
could be made for the io-wq worker threads that we'd need io_uring
support for that, as they come and go and there's no reliable way to
find and tweak the thread scheduler settings for that particular use
case.

It may be more convenient to support this through io_uring, and that is
a valid argument. I do think that the better way would then be to simply
pass back the sqpoll pid after ring setup, because then it'd almost be
as simple to do it from the app itself using the regular system call
interfaces for that.
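
Something like the following already works today, given the sqpoll tid
(a rough sketch, sqpoll_tid is assumed to have been obtained already;
error checking omitted):

	#include <sched.h>
	#include <sys/resource.h>

	/* same effect as IORING_PRIORITY_MODE_NICE */
	setpriority(PRIO_PROCESS, sqpoll_tid, -10);

	/* same effect as IORING_PRIORITY_MODE_SCHEDULER */
	struct sched_param sp = { .sched_priority = 10 };
	sched_setscheduler(sqpoll_tid, SCHED_FIFO, &sp);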

In summary, I do think this _may_ make sense for the worker threads,
being able to pass in this information and have io-wq worker thread
setup perform the necessary tweaks when a thread is created, but it does
seem a bit silly to add this for sqpoll where it could just as easily be
achieved from the application itself without needing to add this
support.

What do you think?

-- 
Jens Axboe


* Re: [PATCH RFC 5.13] io_uring: add IORING_REGISTER_PRIORITY
From: Hao Xu @ 2021-05-06 19:20 UTC
  To: Jens Axboe; +Cc: io-uring, Pavel Begunkov, Joseph Qi

On 2021/5/7 1:10 AM, Jens Axboe wrote:
> On 5/6/21 8:33 AM, Hao Xu wrote:
>> Users may want a higher priority for sq_thread or io-worker. Provide a
>> way to change the nice value (for SCHED_NORMAL) or scheduling policy.
> 
> Silly question - why is this needed for sqpoll? With the threads now
> being essentially user threads, why can't we just modify nice and
> scheduler class from userspace instead? That should work now. I think
> this is especially true for sqpoll where it's persistent, and argument
> could be made for the io-wq worker threads that we'd need io_uring
> support for that, as they come and go and there's no reliable way to
> find and tweak the thread scheduler settings for that particular use
> case.
> 
> It may be more convenient to support this through io_uring, and that is
> a valid argument. I do think that the better way would then be to simply
> pass back the sqpoll pid after ring setup, because then it'd almost be
> as simple to do it from the app itself using the regular system call
> interfaces for that.
> 
Hi Jens,
It's my bad. I didn't realize this until I had almost completed the
patch; then I looked into io_uring_params and found that just
__u32 resv[3] could be leveraged. Not sure if it's necessary to occupy
one of those for this, so I still sent this patch for comments.
> In summary, I do think this _may_ make sense for the worker threads,
> being able to pass in this information and have io-wq worker thread
> setup perform the necessary tweaks when a thread is created, but it does
I'm working on this (for the io-wq worker); part of it is done.
> seem a bit silly to add this for sqpoll where it could just as easily be
> achieved from the application itself without needing to add this
It's beyond my knowledge, correct me if I'm wrong: if we do
it from the application, we have to search for the pid of sq_thread by
its name, which is iou-sqp-`sqd->task_pid`, and that may be cut off
because of TASK_COMM_LEN (could this macro value change in the
future?). And set_task_comm() is called when sq_thread runs, so there is
a very small chance (but there is one) that set_task_comm() hasn't been
called yet when the application tries to get the command name of
sq_thread. Based on this (if it is not wrong...) I think returning the
pid of sq_thread at the io_uring level may be a better choice.
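
For context, the lookup an application has to do today looks roughly
like the sketch below (illustrative only, error handling omitted; it
scans /proc/self/task for the iou-sqp-<pid> comm, which shows the
fragility described above):

	#include <dirent.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	static pid_t find_sq_tid(void)
	{
		char want[32], path[64], comm[32];
		struct dirent *de;
		pid_t tid = -1;
		DIR *dir = opendir("/proc/self/task");

		/* the comm may be truncated to TASK_COMM_LEN - 1 chars */
		snprintf(want, sizeof(want), "iou-sqp-%d", getpid());
		while (dir && (de = readdir(dir))) {
			FILE *f;

			snprintf(path, sizeof(path),
				 "/proc/self/task/%s/comm", de->d_name);
			f = fopen(path, "r");
			if (f && fgets(comm, sizeof(comm), f) &&
			    !strncmp(comm, want, strlen(want)))
				tid = (pid_t)atoi(de->d_name);
			if (f)
				fclose(f);
		}
		if (dir)
			closedir(dir);
		return tid;
	}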

Regards,
Hao
> support.
> 
> What do you think?
> 


* Re: [PATCH RFC 5.13] io_uring: add IORING_REGISTER_PRIORITY
From: Pavel Begunkov @ 2021-05-07 15:19 UTC
  To: Hao Xu, Jens Axboe; +Cc: io-uring, Joseph Qi

On 5/6/21 8:20 PM, Hao Xu wrote:
> On 2021/5/7 1:10 AM, Jens Axboe wrote:
>> On 5/6/21 8:33 AM, Hao Xu wrote:
>>> Users may want a higher priority for sq_thread or io-worker. Provide a
>>> way to change the nice value (for SCHED_NORMAL) or scheduling policy.
>>
>> Silly question - why is this needed for sqpoll? With the threads now
>> being essentially user threads, why can't we just modify nice and
>> scheduler class from userspace instead? That should work now. I think
>> this is especially true for sqpoll where it's persistent, and argument
>> could be made for the io-wq worker threads that we'd need io_uring
>> support for that, as they come and go and there's no reliable way to
>> find and tweak the thread scheduler settings for that particular use
>> case.
>>
>> It may be more convenient to support this through io_uring, and that is
>> a valid argument. I do think that the better way would then be to simply
>> pass back the sqpoll pid after ring setup, because then it'd almost be
>> as simple to do it from the app itself using the regular system call
>> interfaces for that.
> It's my bad. I didn't realize this until I had almost completed the
> patch; then I looked into io_uring_params and found that just
> __u32 resv[3] could be leveraged. Not sure if it's necessary to occupy
> one of those for this, so I still sent this patch for comments.

io_uring_params is not a problem, it can be extended.

>> In summary, I do think this _may_ make sense for the worker threads,
>> being able to pass in this information and have io-wq worker thread
>> setup perform the necessary tweaks when a thread is created, but it does
> I'm working on this (for the io-wq worker); part of it is done.

I'm not sure the io-wq part makes much sense:

1) they are per thread, so an instance is not tied to some particular
ring, and so should not be controlled by one. E.g. what if a thread
has two different rings and sets different schedulers?

2) io-wq is the slow path in any case; I don't think it's worth
tinkering with it.

>> seem a bit silly to add this for sqpoll where it could just as easily be
>> achieved from the application itself without needing to add this
> It's beyond my knowledge, correct me if I'm wrong: if we do
> it from the application, we have to search for the pid of sq_thread by
> its name, which is iou-sqp-`sqd->task_pid`, and that may be cut off
> because of TASK_COMM_LEN (could this macro value change in the
> future?). And set_task_comm() is called when sq_thread runs, so there is
> a very small chance (but there is one) that set_task_comm() hasn't been
> called yet when the application tries to get the command name of
> sq_thread. Based on this (if it is not wrong...) I think returning the
> pid of sq_thread at the io_uring level may be a better choice.

Right, we may return some id of the sqpoll task back in io_uring_params,
though we need to be careful with namespaces.

-- 
Pavel Begunkov

* Re: [PATCH RFC 5.13] io_uring: add IORING_REGISTER_PRIORITY
From: Jens Axboe @ 2021-05-07 15:22 UTC
  To: Hao Xu; +Cc: io-uring, Pavel Begunkov, Joseph Qi

On 5/6/21 1:20 PM, Hao Xu wrote:
> On 2021/5/7 1:10 AM, Jens Axboe wrote:
>> On 5/6/21 8:33 AM, Hao Xu wrote:
>>> Users may want a higher priority for sq_thread or io-worker. Provide a
>>> way to change the nice value (for SCHED_NORMAL) or scheduling policy.
>>
>> Silly question - why is this needed for sqpoll? With the threads now
>> being essentially user threads, why can't we just modify nice and
>> scheduler class from userspace instead? That should work now. I think
>> this is especially true for sqpoll where it's persistent, and argument
>> could be made for the io-wq worker threads that we'd need io_uring
>> support for that, as they come and go and there's no reliable way to
>> find and tweak the thread scheduler settings for that particular use
>> case.
>>
>> It may be more convenient to support this through io_uring, and that is
>> a valid argument. I do think that the better way would then be to simply
>> pass back the sqpoll pid after ring setup, because then it'd almost be
>> as simple to do it from the app itself using the regular system call
>> interfaces for that.
>>
> Hi Jens,
> It's my bad. I didn't realize this until I had almost completed the
> patch; then I looked into io_uring_params and found that just
> __u32 resv[3] could be leveraged. Not sure if it's necessary to occupy
> one of those for this, so I still sent this patch for comments.
>> In summary, I do think this _may_ make sense for the worker threads,
>> being able to pass in this information and have io-wq worker thread
>> setup perform the necessary tweaks when a thread is created, but it does
> I'm working on this (for the io-wq worker); part of it is done.
>> seem a bit silly to add this for sqpoll where it could just as easily be
>> achieved from the application itself without needing to add this
> It's beyond my knowledge, correct me if I'm wrong: if we do
> it from the application, we have to search for the pid of sq_thread by
> its name, which is iou-sqp-`sqd->task_pid`, and that may be cut off
> because of TASK_COMM_LEN (could this macro value change in the
> future?). And set_task_comm() is called when sq_thread runs, so there is
> a very small chance (but there is one) that set_task_comm() hasn't been
> called yet when the application tries to get the command name of
> sq_thread. Based on this (if it is not wrong...) I think returning the
> pid of sq_thread at the io_uring level may be a better choice.

Right, as mentioned in my email, we'd want to return the pid to make it
both easier and more reliable to perform this action for the sqpoll
thread. Otherwise it's a bit of a mess in the application, which has to
look it up, even if we ensure we set the task comm before the thread is
live. Having to look it up through that is very ugly, and I would not
want the application to do that.

But returning the pid would be a trivial change... We already copy the
params back, so we can just stuff it in there, either as a new member or
by reusing one of the existing sqthread members, as they don't return
any information currently.
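
For illustration, setup could then look something like this (a sketch
only; sq_thread_tid is a hypothetical copied-back member, not something
this RFC or mainline provides):

	struct io_uring_params p = {
		.flags		= IORING_SETUP_SQPOLL,
		.sq_thread_idle	= 1000,
	};
	int fd = syscall(__NR_io_uring_setup, 256, &p);

	/* hypothetical: the kernel filled in the sqpoll tid on return */
	setpriority(PRIO_PROCESS, p.sq_thread_tid, -10);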

-- 
Jens Axboe


* Re: [PATCH RFC 5.13] io_uring: add IORING_REGISTER_PRIORITY
From: Jens Axboe @ 2021-05-07 15:23 UTC
  To: Pavel Begunkov, Hao Xu; +Cc: io-uring, Joseph Qi

On 5/7/21 9:19 AM, Pavel Begunkov wrote:
> On 5/6/21 8:20 PM, Hao Xu wrote:
>> On 2021/5/7 1:10 AM, Jens Axboe wrote:
>>> On 5/6/21 8:33 AM, Hao Xu wrote:
>>>> Users may want a higher priority for sq_thread or io-worker. Provide a
>>>> way to change the nice value (for SCHED_NORMAL) or scheduling policy.
>>>
>>> Silly question - why is this needed for sqpoll? With the threads now
>>> being essentially user threads, why can't we just modify nice and
>>> scheduler class from userspace instead? That should work now. I think
>>> this is especially true for sqpoll where it's persistent, and argument
>>> could be made for the io-wq worker threads that we'd need io_uring
>>> support for that, as they come and go and there's no reliable way to
>>> find and tweak the thread scheduler settings for that particular use
>>> case.
>>>
>>> It may be more convenient to support this through io_uring, and that is
>>> a valid argument. I do think that the better way would then be to simply
>>> pass back the sqpoll pid after ring setup, because then it'd almost be
>>> as simple to do it from the app itself using the regular system call
>>> interfaces for that.
>> It's my bad. I didn't realize this until I had almost completed the
>> patch; then I looked into io_uring_params and found that just
>> __u32 resv[3] could be leveraged. Not sure if it's necessary to occupy
>> one of those for this, so I still sent this patch for comments.
> 
> io_uring_params is not a problem, it can be extended.
> 
>>> In summary, I do think this _may_ make sense for the worker threads,
>>> being able to pass in this information and have io-wq worker thread
>>> setup perform the necessary tweaks when a thread is created, but it does
>> I'm working on this (for the io-wq worker); part of it is done.
> 
> I'm not sure the io-wq part makes much sense:
> 
> 1) they are per thread, so an instance is not tied to some particular
> ring, and so should not be controlled by one. E.g. what if a thread
> has two different rings and sets different schedulers?
> 
> 2) io-wq is the slow path in any case; I don't think it's worth
> tinkering with it.

Right, it's normally the slow path, so perhaps you'd want to nice it
down or use a different scheduler class for it. I'm not personally
seeing a strong need, but I'm willing to entertain use cases if they're
valid.

>>> seem a bit silly to add this for sqpoll where it could just as easily be
>>> achieved from the application itself without needing to add this
>> It's beyond my knowledge, correct me if I'm wrong: if we do
>> it from the application, we have to search for the pid of sq_thread by
>> its name, which is iou-sqp-`sqd->task_pid`, and that may be cut off
>> because of TASK_COMM_LEN (could this macro value change in the
>> future?). And set_task_comm() is called when sq_thread runs, so there is
>> a very small chance (but there is one) that set_task_comm() hasn't been
>> called yet when the application tries to get the command name of
>> sq_thread. Based on this (if it is not wrong...) I think returning the
>> pid of sq_thread at the io_uring level may be a better choice.
> 
> Right, we may return some id of the sqpoll task back in io_uring_params,
> though we need to be careful with namespaces.

Yep

-- 
Jens Axboe

