linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: root <yang.zhang.wz@gmail.com>
To: tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com, pbonzini@redhat.com
Cc: x86@kernel.org, corbet@lwn.net, tony.luck@intel.com,
	bp@alien8.de, peterz@infradead.org, mchehab@kernel.org,
	akpm@linux-foundation.org, krzk@kernel.org, jpoimboe@redhat.com,
	luto@kernel.org, borntraeger@de.ibm.com, thgarnie@google.com,
	rgerst@gmail.com, minipli@googlemail.com,
	douly.fnst@cn.fujitsu.com, nicstange@gmail.com,
	fweisbec@gmail.com, dvlasenk@redhat.com, bristot@redhat.com,
	yamada.masahiro@socionext.com, mika.westerberg@linux.intel.com,
	yu.c.chen@intel.com, aaron.lu@intel.com, rostedt@goodmis.org,
	me@kylehuey.com, len.brown@intel.com, prarit@redhat.com,
	hidehiro.kawai.ez@hitachi.com, fengtiantian@huawei.com,
	pmladek@suse.com, jeyu@redhat.com, Larry.Finger@lwfinger.net,
	zijun_hu@htc.com, luisbg@osg.samsung.com,
	johannes.berg@intel.com, niklas.soderlund+renesas@ragnatech.se,
	zlpnobody@gmail.com, adobriyan@gmail.com,
	fgao@48lvckh6395k16k5.yundunddos.com, ebiederm@xmission.com,
	subashab@codeaurora.org, arnd@arndb.de, matt@codeblueprint.co.uk,
	mgorman@techsingularity.net, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-edac@vger.kernel.org,
	kvm@vger.kernel.org, Yang Zhang <yang.zhang.wz@gmail.com>
Subject: [PATCH 1/2] x86/idle: add halt poll for halt idle
Date: Thu, 22 Jun 2017 11:22:13 +0000	[thread overview]
Message-ID: <1498130534-26568-2-git-send-email-root@ip-172-31-39-62.us-west-2.compute.internal> (raw)
In-Reply-To: <1498130534-26568-1-git-send-email-root@ip-172-31-39-62.us-west-2.compute.internal>

From: Yang Zhang <yang.zhang.wz@gmail.com>

This patch introduces a new mechanism to poll for a while before
entering the idle state.

David gave a talk at KVM Forum describing the problem that current KVM VMs
have when running message-passing workloads. There has also been some work
to improve performance in KVM, such as halt polling in KVM.
But we still have 4 MSR writes and a HLT vmexit when going into halt idle,
which introduces a lot of latency.

Halt polling in KVM provides the capability to not schedule out a VCPU when
it is the only task on its pCPU. In contrast, this patch lets the VCPU poll
for a while if there is no work inside the VCPU, to eliminate the heavy
vmexits when entering/exiting idle. The potential impact is that it costs
more CPU cycles, since we are polling, and it may affect other tasks waiting
on the same physical CPU in the host.

Here is the data I got when running the contextswitch benchmark
(https://github.com/tsuna/contextswitch)

before patch:
2000000 process context switches in 4822613801ns (2411.3ns/ctxsw)

after patch:
2000000 process context switches in 3584098241ns (1792.0ns/ctxsw)

Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
---
 Documentation/sysctl/kernel.txt | 10 ++++++++++
 arch/x86/kernel/process.c       | 21 +++++++++++++++++++++
 include/linux/kernel.h          |  3 +++
 kernel/sched/idle.c             |  3 +++
 kernel/sysctl.c                 |  9 +++++++++
 5 files changed, 46 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index bac23c1..4e71bfe 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -63,6 +63,7 @@ show up in /proc/sys/kernel:
 - perf_event_max_stack
 - perf_event_max_contexts_per_stack
 - pid_max
+- poll_threshold_ns        [ X86 only ]
 - powersave-nap               [ PPC only ]
 - printk
 - printk_delay
@@ -702,6 +703,15 @@ kernel tries to allocate a number starting from this one.
 
 ==============================================================
 
+poll_threshold_ns: (X86 only)
+
+This parameter controls the maximum time (in nanoseconds) to poll before
+going into the real idle state. The default value is 0, which means don't
+poll. It is recommended to set a non-zero value when running latency-bound
+workloads in a VM.
+
+==============================================================
+
 powersave-nap: (PPC only)
 
 If set, Linux-PPC will use the 'nap' mode of powersaving,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb8842..6361783 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,6 +39,10 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+unsigned long poll_threshold_ns;
+#endif
+
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's. The TSS size is kept cacheline-aligned
@@ -313,6 +317,23 @@ static inline void play_dead(void)
 }
 #endif
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+void arch_cpu_idle_poll(void)
+{
+	ktime_t start, cur, stop;
+
+	if (poll_threshold_ns) {
+		start = cur = ktime_get();
+		stop = ktime_add_ns(ktime_get(), poll_threshold_ns);
+		do {
+			if (need_resched())
+				break;
+			cur = ktime_get();
+		} while (ktime_before(cur, stop));
+	}
+}
+#endif
+
 void arch_cpu_idle_enter(void)
 {
 	tsc_verify_tsc_adjust(false);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 13bc08a..04cf774 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -460,6 +460,9 @@ extern __scanf(2, 0)
 extern int sysctl_panic_on_stackoverflow;
 
 extern bool crash_kexec_post_notifiers;
+#ifdef CONFIG_HYPERVISOR_GUEST
+extern unsigned long poll_threshold_ns;
+#endif
 
 /*
  * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2a25a9e..e789f99 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -74,6 +74,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
 }
 
 /* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_poll(void) { }
 void __weak arch_cpu_idle_prepare(void) { }
 void __weak arch_cpu_idle_enter(void) { }
 void __weak arch_cpu_idle_exit(void) { }
@@ -219,6 +220,8 @@ static void do_idle(void)
 	 */
 
 	__current_set_polling();
+	arch_cpu_idle_poll();
+
 	tick_nohz_idle_enter();
 
 	while (!need_resched()) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a..9174d57 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1203,6 +1203,15 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_HYPERVISOR_GUEST
+	{
+		.procname	= "halt_poll_threshold",
+		.data		= &poll_threshold_ns,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{ }
 };
 
-- 
1.8.3.1

  reply	other threads:[~2017-06-22 11:22 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-22 11:22 [PATCH 0/2] x86/idle: add halt poll support root
2017-06-22 11:22 ` root [this message]
2017-06-22 14:23   ` [PATCH 1/2] x86/idle: add halt poll for halt idle Thomas Gleixner
2017-06-23  4:05     ` Yang Zhang
2017-08-16  4:04   ` Michael S. Tsirkin
2017-08-17  7:29     ` Yang Zhang
2017-06-22 11:22 ` [PATCH 2/2] x86/idle: use dynamic halt poll root
2017-06-22 11:51   ` Paolo Bonzini
2017-06-23  3:58     ` Yang Zhang
2017-06-27 11:22       ` Yang Zhang
2017-06-27 12:07         ` Paolo Bonzini
2017-06-27 12:23           ` Wanpeng Li
2017-06-27 12:28             ` Paolo Bonzini
2017-06-27 13:40               ` Radim Krčmář
2017-06-27 13:56                 ` Paolo Bonzini
2017-06-27 14:22                   ` Radim Krčmář
2017-06-27 14:26                     ` Paolo Bonzini
2017-07-03  9:28                     ` Yang Zhang
2017-07-03 10:06                       ` Thomas Gleixner
2017-07-04  2:19                         ` Yang Zhang
2017-07-04 14:13                       ` Radim Krčmář
2017-07-04 14:50                         ` Thomas Gleixner
2017-07-13 11:49                         ` Yang Zhang
2017-07-14  9:37                           ` Alexander Graf
2017-07-17  9:26                             ` Yang Zhang
2017-07-17  9:54                               ` Alexander Graf
2017-07-17 12:50                                 ` Yang Zhang
2017-07-04 22:28                       ` Wanpeng Li
2017-06-22 14:32   ` Thomas Gleixner
2017-06-23  4:04     ` Yang Zhang
2017-06-22 22:46   ` kbuild test robot
2017-06-22 11:32 ` [PATCH 0/2] x86/idle: add halt poll support Yang Zhang
2017-06-22 11:50 ` Wanpeng Li
2017-06-23  4:08   ` Yang Zhang
2017-06-23  4:35     ` Wanpeng Li
2017-06-23  6:49       ` Yang Zhang
2017-06-27 14:00         ` Radim Krčmář

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1498130534-26568-2-git-send-email-root@ip-172-31-39-62.us-west-2.compute.internal \
    --to=yang.zhang.wz@gmail.com \
    --cc=Larry.Finger@lwfinger.net \
    --cc=aaron.lu@intel.com \
    --cc=adobriyan@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=arnd@arndb.de \
    --cc=borntraeger@de.ibm.com \
    --cc=bp@alien8.de \
    --cc=bristot@redhat.com \
    --cc=corbet@lwn.net \
    --cc=douly.fnst@cn.fujitsu.com \
    --cc=dvlasenk@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=fengtiantian@huawei.com \
    --cc=fgao@48lvckh6395k16k5.yundunddos.com \
    --cc=fweisbec@gmail.com \
    --cc=hidehiro.kawai.ez@hitachi.com \
    --cc=hpa@zytor.com \
    --cc=jeyu@redhat.com \
    --cc=johannes.berg@intel.com \
    --cc=jpoimboe@redhat.com \
    --cc=krzk@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=len.brown@intel.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luisbg@osg.samsung.com \
    --cc=luto@kernel.org \
    --cc=matt@codeblueprint.co.uk \
    --cc=mchehab@kernel.org \
    --cc=me@kylehuey.com \
    --cc=mgorman@techsingularity.net \
    --cc=mika.westerberg@linux.intel.com \
    --cc=mingo@redhat.com \
    --cc=minipli@googlemail.com \
    --cc=nicstange@gmail.com \
    --cc=niklas.soderlund+renesas@ragnatech.se \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pmladek@suse.com \
    --cc=prarit@redhat.com \
    --cc=rgerst@gmail.com \
    --cc=rostedt@goodmis.org \
    --cc=subashab@codeaurora.org \
    --cc=tglx@linutronix.de \
    --cc=thgarnie@google.com \
    --cc=tony.luck@intel.com \
    --cc=x86@kernel.org \
    --cc=yamada.masahiro@socionext.com \
    --cc=yu.c.chen@intel.com \
    --cc=zijun_hu@htc.com \
    --cc=zlpnobody@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).