linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Konstantin Khebnikov <khlebnikov@yandex-team.ru>
To: linux-mm@kvack.org, cgroups@vger.kernel.org
Cc: Roman Gushchin <klamm@yandex-team.ru>, Jan Kara <jack@suse.cz>,
	Dave Chinner <david@fromorbit.com>,
	linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>,
	linux-fsdevel@vger.kernel.org, koct9i@gmail.com
Subject: [PATCH 5/6] delay-injection: resource management via procrastination
Date: Thu, 15 Jan 2015 21:49:17 +0300	[thread overview]
Message-ID: <20150115184917.10450.38284.stgit@buzz> (raw)
In-Reply-To: <20150115180242.10450.92.stgit@buzz>

From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

inject_delay() allows pausing the current task just before it returns
to userspace, at a point where the kernel doesn't hold any locks, so
the wait doesn't introduce any priority-inversion problems.

This code abuses the existing task-work mechanism and the 'TASK_PARKED'
state. Parked tasks are killable and don't contribute to CPU load.

Together with percpu_ratelimit, this can be used as follows:

if (percpu_ratelimit_charge(&ratelimit, events))
        inject_delay(percpu_ratelimit_target(&ratelimit));

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 include/linux/sched.h        |    7 ++++
 include/trace/events/sched.h |    7 ++++
 kernel/sched/core.c          |   66 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   12 ++++++++
 4 files changed, 92 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..2363918 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1132,6 +1132,7 @@ struct sched_statistics {
 	u64			iowait_sum;
 
 	u64			sleep_start;
+	u64			delay_start;
 	u64			sleep_max;
 	s64			sum_sleep_runtime;
 
@@ -1662,6 +1663,10 @@ struct task_struct {
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
+	/* Pause task till this time before returning into userspace */
+	ktime_t delay_injection_target;
+	struct callback_head delay_injection_work;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
@@ -2277,6 +2282,8 @@ extern void set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
 
+extern void inject_delay(ktime_t target);
+
 /*
  * The default (Linux) execution domain.
  */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf..d35154e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -365,6 +365,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
 	     TP_ARGS(tsk, delay));
 
 /*
+ * Tracepoint for accounting delay-injection
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_delayed,
+	     TP_PROTO(struct task_struct *tsk, u64 delay),
+	     TP_ARGS(tsk, delay));
+
+/*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc0..7a9d6a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,6 +65,7 @@
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
+#include <linux/task_work.h>
 #include <linux/tick.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
@@ -8377,3 +8378,68 @@ void dump_cpu_task(int cpu)
 	pr_info("Task dump for CPU %d:\n", cpu);
 	sched_show_task(cpu_curr(cpu));
 }
+
+#define DELAY_INJECTION_SLACK_NS	(NSEC_PER_SEC / 50)
+
+static enum hrtimer_restart delay_injection_wakeup(struct hrtimer *timer)
+{
+	struct hrtimer_sleeper *t =
+		container_of(timer, struct hrtimer_sleeper, timer);
+	struct task_struct *task = t->task;
+
+	t->task = NULL;
+	if (task)
+		wake_up_state(task, TASK_PARKED);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Here the delayed task sleeps in the 'P'arked state.
+ */
+static void delay_injection_sleep(struct callback_head *head)
+{
+	struct task_struct *task = current;
+	struct hrtimer_sleeper t;
+
+	head->func = NULL;
+	__set_task_state(task, TASK_WAKEKILL | TASK_PARKED);
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_set_expires_range_ns(&t.timer, current->delay_injection_target,
+				     DELAY_INJECTION_SLACK_NS);
+
+	t.timer.function = delay_injection_wakeup;
+	t.task = task;
+
+	hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+	if (!hrtimer_active(&t.timer))
+		t.task = NULL;
+
+	if (likely(t.task))
+		schedule();
+
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+
+	__set_task_state(task, TASK_RUNNING);
+}
+
+/*
+ * inject_delay - injects delay before returning into userspace
+ * @target: absolute monotonic timestamp to sleep until,
+ *	    task will not return into userspace before this time
+ */
+void inject_delay(ktime_t target)
+{
+	struct task_struct *task = current;
+
+	if (ktime_after(target, task->delay_injection_target)) {
+		task->delay_injection_target = target;
+		if (!task->delay_injection_work.func) {
+			init_task_work(&task->delay_injection_work,
+					delay_injection_sleep);
+			task_work_add(task, &task->delay_injection_work, true);
+		}
+	}
+}
+EXPORT_SYMBOL(inject_delay);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cb..2e3269b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2944,6 +2944,15 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			account_scheduler_latency(tsk, delta >> 10, 0);
 		}
 	}
+	if (se->statistics.delay_start) {
+		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.delay_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		se->statistics.delay_start = 0;
+		trace_sched_stat_delayed(tsk, delta);
+	}
 #endif
 }
 
@@ -3095,6 +3104,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
 			if (tsk->state & TASK_UNINTERRUPTIBLE)
 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+			if ((tsk->state & TASK_PARKED) &&
+			    tsk->delay_injection_target.tv64)
+				se->statistics.delay_start = rq_clock(rq_of(cfs_rq));
 		}
 #endif
 	}


  parent reply	other threads:[~2015-01-15 18:56 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-01-15 18:49 [PATCHSET RFC 0/6] memcg: inode-based dirty-set controller Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 1/6] memcg: inode-based dirty and writeback pages accounting Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 2/6] memcg: dirty-set limiting and filtered writeback Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 3/6] memcg: track shared inodes with dirty pages Konstantin Khebnikov
2015-01-15 18:55   ` Tejun Heo
2015-01-15 19:04     ` Konstantin Khlebnikov
2015-01-15 19:08       ` Tejun Heo
2015-01-15 18:49 ` [PATCH 4/6] percpu_ratelimit: high-performance ratelimiting counter Konstantin Khebnikov
2015-01-15 18:49 ` Konstantin Khebnikov [this message]
2015-01-15 18:49 ` [PATCH 6/6] memcg: filesystem bandwidth controller Konstantin Khebnikov
2015-01-16  9:37 ` [PATCHSET RFC 0/6] memcg: inode-based dirty-set controller Jan Kara
2015-01-16 12:33   ` Konstantin Khlebnikov
2015-01-16 14:25     ` Jan Kara
2015-01-29  1:21 ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150115184917.10450.38284.stgit@buzz \
    --to=khlebnikov@yandex-team.ru \
    --cc=cgroups@vger.kernel.org \
    --cc=david@fromorbit.com \
    --cc=jack@suse.cz \
    --cc=klamm@yandex-team.ru \
    --cc=koct9i@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).