From: Konstantin Khebnikov <khlebnikov@yandex-team.ru>
To: linux-mm@kvack.org, cgroups@vger.kernel.org
Cc: Roman Gushchin <klamm@yandex-team.ru>, Jan Kara <jack@suse.cz>,
Dave Chinner <david@fromorbit.com>,
linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>,
linux-fsdevel@vger.kernel.org, koct9i@gmail.com
Subject: [PATCH 5/6] delay-injection: resource management via procrastination
Date: Thu, 15 Jan 2015 21:49:17 +0300 [thread overview]
Message-ID: <20150115184917.10450.38284.stgit@buzz> (raw)
In-Reply-To: <20150115180242.10450.92.stgit@buzz>
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
inject_delay() allows pausing the current task just before it returns
to userspace, at a point where the kernel doesn't hold any locks, so
the wait cannot introduce any priority-inversion problems.
This code abuses the existing task-work mechanism and the 'TASK_PARKED'
state. Parked tasks are killable and don't contribute to the CPU load.
Together with percpu_ratelimit this could be used in this manner:
if (percpu_ratelimit_charge(&ratelimit, events))
inject_delay(percpu_ratelimit_target(&ratelimit));
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
include/linux/sched.h | 7 ++++
include/trace/events/sched.h | 7 ++++
kernel/sched/core.c | 66 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 12 ++++++++
4 files changed, 92 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..2363918 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1132,6 +1132,7 @@ struct sched_statistics {
u64 iowait_sum;
u64 sleep_start;
+ u64 delay_start;
u64 sleep_max;
s64 sum_sleep_runtime;
@@ -1662,6 +1663,10 @@ struct task_struct {
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;
+ /* Pause task till this time before returning into userspace */
+ ktime_t delay_injection_target;
+ struct callback_head delay_injection_work;
+
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
@@ -2277,6 +2282,8 @@ extern void set_curr_task(int cpu, struct task_struct *p);
void yield(void);
+extern void inject_delay(ktime_t target);
+
/*
* The default (Linux) execution domain.
*/
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf..d35154e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -365,6 +365,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
TP_ARGS(tsk, delay));
/*
+ * Tracepoint for accounting delay-injection
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_delayed,
+ TP_PROTO(struct task_struct *tsk, u64 delay),
+ TP_ARGS(tsk, delay));
+
+/*
* Tracepoint for accounting runtime (time the task is executing
* on a CPU).
*/
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc0..7a9d6a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,6 +65,7 @@
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
+#include <linux/task_work.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
@@ -8377,3 +8378,68 @@ void dump_cpu_task(int cpu)
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
+
+#define DELAY_INJECTION_SLACK_NS (NSEC_PER_SEC / 50)
+/* hrtimer callback: wakes the task parked by delay_injection_sleep(). */
+static enum hrtimer_restart delay_injection_wakeup(struct hrtimer *timer)
+{
+ struct hrtimer_sleeper *t =
+ container_of(timer, struct hrtimer_sleeper, timer);
+ struct task_struct *task = t->task;
+
+ t->task = NULL; /* the sleeper skips schedule() when it sees NULL */
+ if (task)
+ wake_up_state(task, TASK_PARKED);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Task-work callback: parks current (killable) until delay_injection_target.
+ */
+static void delay_injection_sleep(struct callback_head *head)
+{
+ struct task_struct *task = current;
+ struct hrtimer_sleeper t;
+
+ head->func = NULL; /* inject_delay() may queue this work again */
+ __set_task_state(task, TASK_WAKEKILL | TASK_PARKED);
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_set_expires_range_ns(&t.timer, current->delay_injection_target,
+ DELAY_INJECTION_SLACK_NS);
+
+ t.timer.function = delay_injection_wakeup;
+ t.task = task;
+
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&t.timer)) /* timer not pending: skip the sleep */
+ t.task = NULL;
+
+ if (likely(t.task)) /* NULL also once delay_injection_wakeup() has run */
+ schedule();
+
+ hrtimer_cancel(&t.timer); /* t is on this stack: stop it before return */
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_task_state(task, TASK_RUNNING);
+}
+
+/*
+ * inject_delay - make current sleep before it returns to userspace
+ * @target: absolute CLOCK_MONOTONIC time to sleep until; it only ever moves
+ * the stored deadline later, a @target that is not later is ignored
+ */
+void inject_delay(ktime_t target)
+{
+ struct task_struct *task = current;
+
+ if (ktime_after(target, task->delay_injection_target)) {
+ task->delay_injection_target = target;
+ if (!task->delay_injection_work.func) { /* work not queued yet */
+ init_task_work(&task->delay_injection_work,
+ delay_injection_sleep);
+ task_work_add(task, &task->delay_injection_work, true);
+ }
+ }
+}
+EXPORT_SYMBOL(inject_delay);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cb..2e3269b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2944,6 +2944,15 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
+ if (se->statistics.delay_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.delay_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ se->statistics.delay_start = 0;
+ trace_sched_stat_delayed(tsk, delta);
+ }
#endif
}
@@ -3095,6 +3104,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ if ((tsk->state & TASK_PARKED) &&
+ tsk->delay_injection_target.tv64)
+ se->statistics.delay_start = rq_clock(rq_of(cfs_rq));
}
#endif
}
next prev parent reply other threads:[~2015-01-15 18:56 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-01-15 18:49 [PATCHSET RFC 0/6] memcg: inode-based dirty-set controller Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 1/6] memcg: inode-based dirty and writeback pages accounting Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 2/6] memcg: dirty-set limiting and filtered writeback Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 3/6] memcg: track shared inodes with dirty pages Konstantin Khebnikov
2015-01-15 18:55 ` Tejun Heo
2015-01-15 19:04 ` Konstantin Khlebnikov
2015-01-15 19:08 ` Tejun Heo
2015-01-15 18:49 ` [PATCH 4/6] percpu_ratelimit: high-performance ratelimiting counter Konstantin Khebnikov
2015-01-15 18:49 ` Konstantin Khebnikov [this message]
2015-01-15 18:49 ` [PATCH 6/6] memcg: filesystem bandwidth controller Konstantin Khebnikov
2015-01-16 9:37 ` [PATCHSET RFC 0/6] memcg: inode-based dirty-set controller Jan Kara
2015-01-16 12:33 ` Konstantin Khlebnikov
2015-01-16 14:25 ` Jan Kara
2015-01-29 1:21 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150115184917.10450.38284.stgit@buzz \
--to=khlebnikov@yandex-team.ru \
--cc=cgroups@vger.kernel.org \
--cc=david@fromorbit.com \
--cc=jack@suse.cz \
--cc=klamm@yandex-team.ru \
--cc=koct9i@gmail.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).