[PATCH 4/4] sched,time: only call account_{user,sys,guest,idle}_time once a jiffy

From: riel@redhat.com
To: linux-kernel@vger.kernel.org
Cc: tglx@linutronix.de, mingo@kernel.org, luto@amacapital.net,
	fweisbec@gmail.com, peterz@infradead.org, clark@redhat.com
Subject: [PATCH 4/4] sched,time: only call account_{user,sys,guest,idle}_time once a jiffy
Date: Fri, 29 Jan 2016 22:36:05 -0500	[thread overview]
Message-ID: <1454124965-13974-5-git-send-email-riel@redhat.com> (raw)
In-Reply-To: <1454124965-13974-1-git-send-email-riel@redhat.com>

From: Rik van Riel <riel@redhat.com>

After removing __acct_update_integrals from the profile,
native_sched_clock remains as the top CPU user. This can be
reduced by only calling account_{user,sys,guest,idle}_time
once per jiffy for long running tasks on nohz_full CPUs.

This will reduce timing accuracy on nohz_full CPUs to jiffy
based sampling, just like on normal CPUs. It results in
totally removing native_sched_clock from the profile, and
significantly speeding up the syscall entry and exit path,
as well as irq entry and exit, and kvm guest entry & exit.

This code relies on another CPU advancing jiffies when the
system is busy. On a nohz_full system, this is done by a
housekeeping CPU.

A microbenchmark calling an invalid syscall number 10 million
times in a row speeds up an additional 30% over the numbers
with just the previous patches, for a total speedup of about
40% over 4.4 and 4.5-rc1.

Run times for the microbenchmark:

4.4				3.8 seconds
4.5-rc1				3.7 seconds
4.5-rc1 + first patch		3.3 seconds
4.5-rc1 + first 3 patches	3.1 seconds
4.5-rc1 + all patches		2.3 seconds

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 include/linux/sched.h  |  1 +
 kernel/sched/cputime.c | 35 +++++++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a94cc3..019c3af98503 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1532,6 +1532,7 @@ struct task_struct {
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqcount_t vtime_seqcount;
+	unsigned long vtime_jiffies;
 	unsigned long long vtime_snap;
 	enum {
 		/* Task is sleeping or running in a CPU with VTIME inactive */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b2ab2ffb1adc..923c110319b1 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -668,6 +668,15 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static bool vtime_jiffies_changed(struct task_struct *tsk, unsigned long now)
+{
+	if (tsk->vtime_jiffies == jiffies)
+		return false;
+
+	tsk->vtime_jiffies = jiffies;
+	return true;
+}
+
 static unsigned long long vtime_delta(struct task_struct *tsk)
 {
 	unsigned long long clock;
@@ -699,6 +708,9 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
+	if (!vtime_jiffies_changed(tsk, jiffies))
+		return;
+
 	write_seqcount_begin(&tsk->vtime_seqcount);
 	__vtime_account_system(tsk);
 	write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +719,8 @@ void vtime_account_system(struct task_struct *tsk)
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	if (vtime_jiffies_changed(tsk, jiffies))
+		__vtime_account_system(tsk);
 	if (context_tracking_in_user())
 		tsk->vtime_snap_whence = VTIME_USER;
 	write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +731,19 @@ void vtime_account_user(struct task_struct *tsk)
 	cputime_t delta_cpu;
 
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	delta_cpu = get_vtime_delta(tsk);
 	tsk->vtime_snap_whence = VTIME_SYS;
-	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+	if (vtime_jiffies_changed(tsk, jiffies)) {
+		delta_cpu = get_vtime_delta(tsk);
+		account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+	}
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	if (vtime_jiffies_changed(tsk, jiffies))
+		__vtime_account_system(tsk);
 	tsk->vtime_snap_whence = VTIME_USER;
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -742,7 +758,8 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * that can thus safely catch up with a tickless delta.
 	 */
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	if (vtime_jiffies_changed(tsk, jiffies))
+		__vtime_account_system(tsk);
 	current->flags |= PF_VCPU;
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -759,8 +776,12 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-	cputime_t delta_cpu = get_vtime_delta(tsk);
+	cputime_t delta_cpu;
+
+	if (!vtime_jiffies_changed(tsk, jiffies))
+		return;
 
+	delta_cpu = get_vtime_delta(tsk);
 	account_idle_time(delta_cpu);
 }
 
@@ -773,6 +794,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 	write_seqcount_begin(&current->vtime_seqcount);
 	current->vtime_snap_whence = VTIME_SYS;
 	current->vtime_snap = sched_clock_cpu(smp_processor_id());
+	current->vtime_jiffies = jiffies;
 	write_seqcount_end(&current->vtime_seqcount);
 }
 
@@ -784,6 +806,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	write_seqcount_begin(&t->vtime_seqcount);
 	t->vtime_snap_whence = VTIME_SYS;
 	t->vtime_snap = sched_clock_cpu(cpu);
+	t->vtime_jiffies = jiffies;
 	write_seqcount_end(&t->vtime_seqcount);
 	local_irq_restore(flags);
 }
-- 
2.5.0