linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] sched: introduce configurable delay before entering idle
@ 2019-05-07 18:56 Marcelo Tosatti
  2019-05-07 22:15 ` Peter Zijlstra
  2019-05-13  9:20 ` Wanpeng Li
  0 siblings, 2 replies; 19+ messages in thread
From: Marcelo Tosatti @ 2019-05-07 18:56 UTC (permalink / raw)
  To: kvm-devel, linux-kernel
  Cc: Thomas Gleixner, Ingo Molnar, Andrea Arcangeli, Bandan Das,
	Paolo Bonzini

[-- Attachment #1: Type: text/plain, Size: 3259 bytes --]


Certain workloads perform poorly on KVM compared to baremetal
due to baremetal's ability to perform mwait on NEED_RESCHED
bit of task flags (therefore skipping the IPI).

This patch introduces a configurable busy-wait delay before entering the
architecture delay routine, allowing wakeup IPIs to be skipped 
(if the IPI happens in that window).

The real-life workload which this patch improves performance
is SAP HANA (by 5-10%) (for which case setting idle_spin to 30 
is sufficient).

This patch improves the attached server.py and client.py example 
as follows:

Host:                           31.814230202231556
Guest:                          38.17718765199993       (83 %)
Guest, idle_spin=50us:          33.317709898000004      (95 %)
Guest, idle_spin=220us:         32.27826551499999       (98 %)

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 kernel/sched/idle.c |   86 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..bca7656a7ea0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
 	rcu_idle_exit();
 }
 
+static unsigned int spin_before_idle_us;
+
+static void do_spin_before_idle(void)
+{
+	ktime_t now, end_spin;
+
+	now = ktime_get();
+	end_spin = ktime_add_ns(now, spin_before_idle_us*1000);
+
+	rcu_idle_enter();
+	local_irq_enable();
+	stop_critical_timings();
+
+	do {
+		cpu_relax();
+		now = ktime_get();
+	} while (!tif_need_resched() && ktime_before(now, end_spin));
+
+	start_critical_timings();
+	rcu_idle_exit();
+	local_irq_disable();
+}
+
 /*
  * Generic idle loop implementation
  *
@@ -259,6 +282,8 @@ static void do_idle(void)
 			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
 		} else {
+			if (spin_before_idle_us)
+				do_spin_before_idle();
 			cpuidle_idle_call();
 		}
 		arch_cpu_idle_exit();
@@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
 	.switched_to		= switched_to_idle,
 	.update_curr		= update_curr_idle,
 };
+
+
+static ssize_t store_idle_spin(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (kstrtouint(buf, 10, &val) < 0)
+		return -EINVAL;
+
+	if (val > USEC_PER_SEC)
+		return -EINVAL;
+
+	spin_before_idle_us = val;
+	return count;
+}
+
+static ssize_t show_idle_spin(struct kobject *kobj,
+			      struct kobj_attribute *attr,
+			      char *buf)
+{
+	ssize_t ret;
+
+	ret = sprintf(buf, "%d\n", spin_before_idle_us);
+
+	return ret;
+}
+
+static struct kobj_attribute idle_spin_attr =
+	__ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
+
+static struct attribute *sched_attrs[] = {
+	&idle_spin_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group sched_attr_group = {
+	.attrs = sched_attrs,
+};
+
+static struct kobject *sched_kobj;
+
+static int __init sched_sysfs_init(void)
+{
+	int error;
+
+	sched_kobj = kobject_create_and_add("sched", kernel_kobj);
+	if (!sched_kobj)
+		return -ENOMEM;
+
+	error = sysfs_create_group(sched_kobj, &sched_attr_group);
+	if (error)
+		goto err;
+	return 0;
+
+err:
+	kobject_put(sched_kobj);
+	return error;
+}
+postcore_initcall(sched_sysfs_init);

[-- Attachment #2: client.py --]
[-- Type: text/plain, Size: 619 bytes --]

#!/bin/python3
# Benchmark client: connect to the local echo server on 127.0.0.1:999,
# bounce each received buffer back after a short pause, and print the
# elapsed monotonic wall-clock time for 90000 round trips.

import socket
import sys
import struct, fcntl, os
import os, errno, time
import time

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

print ("connecting to 127.0.0.1")
client.connect(('127.0.0.1', 999))

t_begin = time.clock_gettime(time.CLOCK_MONOTONIC)

for _ in range(90000):
	payload = client.recv(4096)
	if not payload:
		print("connection closed!\n")
		exit(0)
	# sleep 20us
	time.sleep(20/1000000)
	client.send(payload)

t_end = time.clock_gettime(time.CLOCK_MONOTONIC)
print(t_end - t_begin)

[-- Attachment #3: server.py --]
[-- Type: text/plain, Size: 417 bytes --]

#!/usr/bin/env python3
# Benchmark server: accept a single connection on 127.0.0.1:999 and
# exchange a fixed line of text with the client, sleeping 200us per
# round trip until the client closes the connection.
#
# NOTE(review): binding to port 999 (< 1024) requires root privileges —
# presumably intentional for this benchmark; confirm before reuse.
#
# Fixes over the posted attachment:
#  - the "break" line mixed 8 spaces with a tab while the enclosing
#    "if" used a tab, which raises TabError under Python 3; the
#    indentation is now consistent (tabs throughout, matching the file)
#  - /usr/bin/env shebang so the script runs on systems where python3
#    is not installed at /bin/python3
#  - SO_REUSEADDR so the benchmark can be rerun immediately without
#    waiting out TIME_WAIT on the listening address

import socket
import sys
import struct, fcntl, os
import os, errno, time
import time

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(('127.0.0.1', 999))
sock.listen(10)
conn, addr = sock.accept()

nr_written = 0
while True:
	conn.sendall(b"a response line of text")
	data = conn.recv(1024)
	# empty recv() means the peer closed the connection
	if not data:
		break
	# sleep 200us
	time.sleep(200/1000000)
	nr_written = nr_written + 1

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-07 18:56 [PATCH] sched: introduce configurable delay before entering idle Marcelo Tosatti
@ 2019-05-07 22:15 ` Peter Zijlstra
  2019-05-07 23:44   ` Marcelo Tosatti
  2019-05-13  9:20 ` Wanpeng Li
  1 sibling, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2019-05-07 22:15 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm-devel, linux-kernel, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Tue, May 07, 2019 at 03:56:49PM -0300, Marcelo Tosatti wrote:
> 
> Certain workloads perform poorly on KVM compared to baremetal
> due to baremetal's ability to perform mwait on NEED_RESCHED
> bit of task flags (therefore skipping the IPI).
> 
> This patch introduces a configurable busy-wait delay before entering the
> architecture delay routine, allowing wakeup IPIs to be skipped 
> (if the IPI happens in that window).
> 
> The real-life workload which this patch improves performance
> is SAP HANA (by 5-10%) (for which case setting idle_spin to 30 
> is sufficient).
> 
> This patch improves the attached server.py and client.py example 
> as follows:
> 
> Host:                           31.814230202231556
> Guest:                          38.17718765199993       (83 %)
> Guest, idle_spin=50us:          33.317709898000004      (95 %)
> Guest, idle_spin=220us:         32.27826551499999       (98 %)
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Thanks for the CC..

NAK, this is something that should live in a virt idle governor or
something along those lines.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-07 22:15 ` Peter Zijlstra
@ 2019-05-07 23:44   ` Marcelo Tosatti
  0 siblings, 0 replies; 19+ messages in thread
From: Marcelo Tosatti @ 2019-05-07 23:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kvm-devel, linux-kernel, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Wed, May 08, 2019 at 12:15:19AM +0200, Peter Zijlstra wrote:
> On Tue, May 07, 2019 at 03:56:49PM -0300, Marcelo Tosatti wrote:
> > 
> > Certain workloads perform poorly on KVM compared to baremetal
> > due to baremetal's ability to perform mwait on NEED_RESCHED
> > bit of task flags (therefore skipping the IPI).
> > 
> > This patch introduces a configurable busy-wait delay before entering the
> > architecture delay routine, allowing wakeup IPIs to be skipped 
> > (if the IPI happens in that window).
> > 
> > The real-life workload which this patch improves performance
> > is SAP HANA (by 5-10%) (for which case setting idle_spin to 30 
> > is sufficient).
> > 
> > This patch improves the attached server.py and client.py example 
> > as follows:
> > 
> > Host:                           31.814230202231556
> > Guest:                          38.17718765199993       (83 %)
> > Guest, idle_spin=50us:          33.317709898000004      (95 %)
> > Guest, idle_spin=220us:         32.27826551499999       (98 %)
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Thanks for the CC..
> 
> NAK, this is something that should live in a virt idle governor or
> something along those lines.

Ok, makes sense, will rework the patch!



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-07 18:56 [PATCH] sched: introduce configurable delay before entering idle Marcelo Tosatti
  2019-05-07 22:15 ` Peter Zijlstra
@ 2019-05-13  9:20 ` Wanpeng Li
  2019-05-13 11:31   ` Konrad Rzeszutek Wilk
  2019-05-14 13:50   ` Marcelo Tosatti
  1 sibling, 2 replies; 19+ messages in thread
From: Wanpeng Li @ 2019-05-13  9:20 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm-devel, LKML, Thomas Gleixner, Ingo Molnar, Andrea Arcangeli,
	Bandan Das, Paolo Bonzini

On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
>
> Certain workloads perform poorly on KVM compared to baremetal
> due to baremetal's ability to perform mwait on NEED_RESCHED
> bit of task flags (therefore skipping the IPI).

KVM supports expose mwait to the guest, if it can solve this?

Regards,
Wanpeng Li

>
> This patch introduces a configurable busy-wait delay before entering the
> architecture delay routine, allowing wakeup IPIs to be skipped
> (if the IPI happens in that window).
>
> The real-life workload which this patch improves performance
> is SAP HANA (by 5-10%) (for which case setting idle_spin to 30
> is sufficient).
>
> This patch improves the attached server.py and client.py example
> as follows:
>
> Host:                           31.814230202231556
> Guest:                          38.17718765199993       (83 %)
> Guest, idle_spin=50us:          33.317709898000004      (95 %)
> Guest, idle_spin=220us:         32.27826551499999       (98 %)
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> ---
>  kernel/sched/idle.c |   86 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 86 insertions(+)
>
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index f5516bae0c1b..bca7656a7ea0 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
>         rcu_idle_exit();
>  }
>
> +static unsigned int spin_before_idle_us;
>
> +static void do_spin_before_idle(void)
> +{
> +       ktime_t now, end_spin;
> +
> +       now = ktime_get();
> +       end_spin = ktime_add_ns(now, spin_before_idle_us*1000);
> +
> +       rcu_idle_enter();
> +       local_irq_enable();
> +       stop_critical_timings();
> +
> +       do {
> +               cpu_relax();
> +               now = ktime_get();
> +       } while (!tif_need_resched() && ktime_before(now, end_spin));
> +
> +       start_critical_timings();
> +       rcu_idle_exit();
> +       local_irq_disable();
> +}
> +
>  /*
>   * Generic idle loop implementation
>   *
> @@ -259,6 +282,8 @@ static void do_idle(void)
>                         tick_nohz_idle_restart_tick();
>                         cpu_idle_poll();
>                 } else {
> +                       if (spin_before_idle_us)
> +                               do_spin_before_idle();
>                         cpuidle_idle_call();
>                 }
>                 arch_cpu_idle_exit();
> @@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
>         .switched_to            = switched_to_idle,
>         .update_curr            = update_curr_idle,
>  };
> +
> +
> +static ssize_t store_idle_spin(struct kobject *kobj,
> +                              struct kobj_attribute *attr,
> +                              const char *buf, size_t count)
> +{
> +       unsigned int val;
> +
> +       if (kstrtouint(buf, 10, &val) < 0)
> +               return -EINVAL;
> +
> +       if (val > USEC_PER_SEC)
> +               return -EINVAL;
> +
> +       spin_before_idle_us = val;
> +       return count;
> +}
> +
> +static ssize_t show_idle_spin(struct kobject *kobj,
> +                             struct kobj_attribute *attr,
> +                             char *buf)
> +{
> +       ssize_t ret;
> +
> +       ret = sprintf(buf, "%d\n", spin_before_idle_us);
> +
> +       return ret;
> +}
> +
> +static struct kobj_attribute idle_spin_attr =
> +       __ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
> +
> +static struct attribute *sched_attrs[] = {
> +       &idle_spin_attr.attr,
> +       NULL,
> +};
> +
> +static const struct attribute_group sched_attr_group = {
> +       .attrs = sched_attrs,
> +};
> +
> +static struct kobject *sched_kobj;
> +
> +static int __init sched_sysfs_init(void)
> +{
> +       int error;
> +
> +       sched_kobj = kobject_create_and_add("sched", kernel_kobj);
> +       if (!sched_kobj)
> +               return -ENOMEM;
> +
> +       error = sysfs_create_group(sched_kobj, &sched_attr_group);
> +       if (error)
> +               goto err;
> +       return 0;
> +
> +err:
> +       kobject_put(sched_kobj);
> +       return error;
> +}
> +postcore_initcall(sched_sysfs_init);

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-13  9:20 ` Wanpeng Li
@ 2019-05-13 11:31   ` Konrad Rzeszutek Wilk
  2019-05-13 11:51     ` Raslan, KarimAllah
  2019-05-14 13:50   ` Marcelo Tosatti
  1 sibling, 1 reply; 19+ messages in thread
From: Konrad Rzeszutek Wilk @ 2019-05-13 11:31 UTC (permalink / raw)
  To: Wanpeng Li, Marcelo Tosatti
  Cc: kvm-devel, LKML, Thomas Gleixner, Ingo Molnar, Andrea Arcangeli,
	Bandan Das, Paolo Bonzini, ankur.a.arora, Boris Ostrovsky

On May 13, 2019 5:20:37 AM EDT, Wanpeng Li <kernellwp@gmail.com> wrote:
>On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com>
>wrote:
>>
>>
>> Certain workloads perform poorly on KVM compared to baremetal
>> due to baremetal's ability to perform mwait on NEED_RESCHED
>> bit of task flags (therefore skipping the IPI).
>
>KVM supports expose mwait to the guest, if it can solve this?
>


There is a bit of problem with that. The host will see 100% CPU utilization even if the guest is idle and taking long naps..

Which depending on your dashboard can look like the machine is on fire.

CCing Ankur and Boris

>Regards,
>Wanpeng Li
>
>>
>> This patch introduces a configurable busy-wait delay before entering
>the
>> architecture delay routine, allowing wakeup IPIs to be skipped
>> (if the IPI happens in that window).
>>
>> The real-life workload which this patch improves performance
>> is SAP HANA (by 5-10%) (for which case setting idle_spin to 30
>> is sufficient).
>>
>> This patch improves the attached server.py and client.py example
>> as follows:
>>
>> Host:                           31.814230202231556
>> Guest:                          38.17718765199993       (83 %)
>> Guest, idle_spin=50us:          33.317709898000004      (95 %)
>> Guest, idle_spin=220us:         32.27826551499999       (98 %)
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>>
>> ---
>>  kernel/sched/idle.c |   86
>++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 86 insertions(+)
>>
>> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
>> index f5516bae0c1b..bca7656a7ea0 100644
>> --- a/kernel/sched/idle.c
>> +++ b/kernel/sched/idle.c
>> @@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
>>         rcu_idle_exit();
>>  }
>>
>> +static unsigned int spin_before_idle_us;
>>
>> +static void do_spin_before_idle(void)
>> +{
>> +       ktime_t now, end_spin;
>> +
>> +       now = ktime_get();
>> +       end_spin = ktime_add_ns(now, spin_before_idle_us*1000);
>> +
>> +       rcu_idle_enter();
>> +       local_irq_enable();
>> +       stop_critical_timings();
>> +
>> +       do {
>> +               cpu_relax();
>> +               now = ktime_get();
>> +       } while (!tif_need_resched() && ktime_before(now, end_spin));
>> +
>> +       start_critical_timings();
>> +       rcu_idle_exit();
>> +       local_irq_disable();
>> +}
>> +
>>  /*
>>   * Generic idle loop implementation
>>   *
>> @@ -259,6 +282,8 @@ static void do_idle(void)
>>                         tick_nohz_idle_restart_tick();
>>                         cpu_idle_poll();
>>                 } else {
>> +                       if (spin_before_idle_us)
>> +                               do_spin_before_idle();
>>                         cpuidle_idle_call();
>>                 }
>>                 arch_cpu_idle_exit();
>> @@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
>>         .switched_to            = switched_to_idle,
>>         .update_curr            = update_curr_idle,
>>  };
>> +
>> +
>> +static ssize_t store_idle_spin(struct kobject *kobj,
>> +                              struct kobj_attribute *attr,
>> +                              const char *buf, size_t count)
>> +{
>> +       unsigned int val;
>> +
>> +       if (kstrtouint(buf, 10, &val) < 0)
>> +               return -EINVAL;
>> +
>> +       if (val > USEC_PER_SEC)
>> +               return -EINVAL;
>> +
>> +       spin_before_idle_us = val;
>> +       return count;
>> +}
>> +
>> +static ssize_t show_idle_spin(struct kobject *kobj,
>> +                             struct kobj_attribute *attr,
>> +                             char *buf)
>> +{
>> +       ssize_t ret;
>> +
>> +       ret = sprintf(buf, "%d\n", spin_before_idle_us);
>> +
>> +       return ret;
>> +}
>> +
>> +static struct kobj_attribute idle_spin_attr =
>> +       __ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
>> +
>> +static struct attribute *sched_attrs[] = {
>> +       &idle_spin_attr.attr,
>> +       NULL,
>> +};
>> +
>> +static const struct attribute_group sched_attr_group = {
>> +       .attrs = sched_attrs,
>> +};
>> +
>> +static struct kobject *sched_kobj;
>> +
>> +static int __init sched_sysfs_init(void)
>> +{
>> +       int error;
>> +
>> +       sched_kobj = kobject_create_and_add("sched", kernel_kobj);
>> +       if (!sched_kobj)
>> +               return -ENOMEM;
>> +
>> +       error = sysfs_create_group(sched_kobj, &sched_attr_group);
>> +       if (error)
>> +               goto err;
>> +       return 0;
>> +
>> +err:
>> +       kobject_put(sched_kobj);
>> +       return error;
>> +}
>> +postcore_initcall(sched_sysfs_init);


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-13 11:31   ` Konrad Rzeszutek Wilk
@ 2019-05-13 11:51     ` Raslan, KarimAllah
  2019-05-13 12:30       ` Boris Ostrovsky
  2019-05-15  1:45       ` Wanpeng Li
  0 siblings, 2 replies; 19+ messages in thread
From: Raslan, KarimAllah @ 2019-05-13 11:51 UTC (permalink / raw)
  To: mtosatti, konrad.wilk, kernellwp
  Cc: kvm, linux-kernel, mingo, boris.ostrovsky, tglx, bsd, aarcange,
	pbonzini, ankur.a.arora

On Mon, 2019-05-13 at 07:31 -0400, Konrad Rzeszutek Wilk wrote:
> On May 13, 2019 5:20:37 AM EDT, Wanpeng Li <kernellwp@gmail.com> wrote:
> > 
> > On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com>
> > wrote:
> > > 
> > > 
> > > 
> > > Certain workloads perform poorly on KVM compared to baremetal
> > > due to baremetal's ability to perform mwait on NEED_RESCHED
> > > bit of task flags (therefore skipping the IPI).
> > 
> > KVM supports expose mwait to the guest, if it can solve this?
> > 
> 
> 
> There is a bit of problem with that. The host will see 100% CPU utilization even if the guest is idle and taking long naps..
> 
> Which depending on your dashboard can look like the machine is on fire.

This can also be fixed. I have a patch that kind of expose proper information 
about the *real* utilization here if that would be help.

> 
> CCing Ankur and Boris
> 
> > 
> > Regards,
> > Wanpeng Li
> > 
> > > 
> > > 
> > > This patch introduces a configurable busy-wait delay before entering
> > the
> > > 
> > > architecture delay routine, allowing wakeup IPIs to be skipped
> > > (if the IPI happens in that window).
> > > 
> > > The real-life workload which this patch improves performance
> > > is SAP HANA (by 5-10%) (for which case setting idle_spin to 30
> > > is sufficient).
> > > 
> > > This patch improves the attached server.py and client.py example
> > > as follows:
> > > 
> > > Host:                           31.814230202231556
> > > Guest:                          38.17718765199993       (83 %)
> > > Guest, idle_spin=50us:          33.317709898000004      (95 %)
> > > Guest, idle_spin=220us:         32.27826551499999       (98 %)
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > ---
> > >  kernel/sched/idle.c |   86
> > ++++++++++++++++++++++++++++++++++++++++++
> > > 
> > >  1 file changed, 86 insertions(+)
> > > 
> > > diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> > > index f5516bae0c1b..bca7656a7ea0 100644
> > > --- a/kernel/sched/idle.c
> > > +++ b/kernel/sched/idle.c
> > > @@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
> > >         rcu_idle_exit();
> > >  }
> > > 
> > > +static unsigned int spin_before_idle_us;
> > > 
> > > +static void do_spin_before_idle(void)
> > > +{
> > > +       ktime_t now, end_spin;
> > > +
> > > +       now = ktime_get();
> > > +       end_spin = ktime_add_ns(now, spin_before_idle_us*1000);
> > > +
> > > +       rcu_idle_enter();
> > > +       local_irq_enable();
> > > +       stop_critical_timings();
> > > +
> > > +       do {
> > > +               cpu_relax();
> > > +               now = ktime_get();
> > > +       } while (!tif_need_resched() && ktime_before(now, end_spin));
> > > +
> > > +       start_critical_timings();
> > > +       rcu_idle_exit();
> > > +       local_irq_disable();
> > > +}
> > > +
> > >  /*
> > >   * Generic idle loop implementation
> > >   *
> > > @@ -259,6 +282,8 @@ static void do_idle(void)
> > >                         tick_nohz_idle_restart_tick();
> > >                         cpu_idle_poll();
> > >                 } else {
> > > +                       if (spin_before_idle_us)
> > > +                               do_spin_before_idle();
> > >                         cpuidle_idle_call();
> > >                 }
> > >                 arch_cpu_idle_exit();
> > > @@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
> > >         .switched_to            = switched_to_idle,
> > >         .update_curr            = update_curr_idle,
> > >  };
> > > +
> > > +
> > > +static ssize_t store_idle_spin(struct kobject *kobj,
> > > +                              struct kobj_attribute *attr,
> > > +                              const char *buf, size_t count)
> > > +{
> > > +       unsigned int val;
> > > +
> > > +       if (kstrtouint(buf, 10, &val) < 0)
> > > +               return -EINVAL;
> > > +
> > > +       if (val > USEC_PER_SEC)
> > > +               return -EINVAL;
> > > +
> > > +       spin_before_idle_us = val;
> > > +       return count;
> > > +}
> > > +
> > > +static ssize_t show_idle_spin(struct kobject *kobj,
> > > +                             struct kobj_attribute *attr,
> > > +                             char *buf)
> > > +{
> > > +       ssize_t ret;
> > > +
> > > +       ret = sprintf(buf, "%d\n", spin_before_idle_us);
> > > +
> > > +       return ret;
> > > +}
> > > +
> > > +static struct kobj_attribute idle_spin_attr =
> > > +       __ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
> > > +
> > > +static struct attribute *sched_attrs[] = {
> > > +       &idle_spin_attr.attr,
> > > +       NULL,
> > > +};
> > > +
> > > +static const struct attribute_group sched_attr_group = {
> > > +       .attrs = sched_attrs,
> > > +};
> > > +
> > > +static struct kobject *sched_kobj;
> > > +
> > > +static int __init sched_sysfs_init(void)
> > > +{
> > > +       int error;
> > > +
> > > +       sched_kobj = kobject_create_and_add("sched", kernel_kobj);
> > > +       if (!sched_kobj)
> > > +               return -ENOMEM;
> > > +
> > > +       error = sysfs_create_group(sched_kobj, &sched_attr_group);
> > > +       if (error)
> > > +               goto err;
> > > +       return 0;
> > > +
> > > +err:
> > > +       kobject_put(sched_kobj);
> > > +       return error;
> > > +}
> > > +postcore_initcall(sched_sysfs_init);
> 



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrer: Christian Schlaeger, Ralf Herbrich
Ust-ID: DE 289 237 879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-13 11:51     ` Raslan, KarimAllah
@ 2019-05-13 12:30       ` Boris Ostrovsky
  2019-05-15  1:45       ` Wanpeng Li
  1 sibling, 0 replies; 19+ messages in thread
From: Boris Ostrovsky @ 2019-05-13 12:30 UTC (permalink / raw)
  To: Raslan, KarimAllah, mtosatti, konrad.wilk, kernellwp
  Cc: kvm, linux-kernel, mingo, tglx, bsd, aarcange, pbonzini, ankur.a.arora

On 5/13/19 7:51 AM, Raslan, KarimAllah wrote:
> On Mon, 2019-05-13 at 07:31 -0400, Konrad Rzeszutek Wilk wrote:
>> On May 13, 2019 5:20:37 AM EDT, Wanpeng Li <kernellwp@gmail.com> wrote:
>>> On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com>
>>> wrote:
>>>>
>>>>
>>>> Certain workloads perform poorly on KVM compared to baremetal
>>>> due to baremetal's ability to perform mwait on NEED_RESCHED
>>>> bit of task flags (therefore skipping the IPI).
>>> KVM supports expose mwait to the guest, if it can solve this?
>>>
>>
>> There is a bit of problem with that. The host will see 100% CPU utilization even if the guest is idle and taking long naps..
>>
>> Which depending on your dashboard can look like the machine is on fire.
> This can also be fixed. I have a patch that kind of expose proper information 
> about the *real* utilization here if that would be help.

Yes, that would certainly be interesting to see. Thanks.


--boris


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-13  9:20 ` Wanpeng Li
  2019-05-13 11:31   ` Konrad Rzeszutek Wilk
@ 2019-05-14 13:50   ` Marcelo Tosatti
  2019-05-14 15:20     ` Konrad Rzeszutek Wilk
  2019-05-15 18:42     ` Ankur Arora
  1 sibling, 2 replies; 19+ messages in thread
From: Marcelo Tosatti @ 2019-05-14 13:50 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: kvm-devel, LKML, Thomas Gleixner, Ingo Molnar, Andrea Arcangeli,
	Bandan Das, Paolo Bonzini

On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >
> >
> > Certain workloads perform poorly on KVM compared to baremetal
> > due to baremetal's ability to perform mwait on NEED_RESCHED
> > bit of task flags (therefore skipping the IPI).
> 
> KVM supports expose mwait to the guest, if it can solve this?
> 
> Regards,
> Wanpeng Li

Unfortunately mwait in guest is not feasible (incompatible with multiple
guests). Checking whether a paravirt solution is possible.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-14 13:50   ` Marcelo Tosatti
@ 2019-05-14 15:20     ` Konrad Rzeszutek Wilk
  2019-05-14 17:42       ` Marcelo Tosatti
  2019-05-15 18:42     ` Ankur Arora
  1 sibling, 1 reply; 19+ messages in thread
From: Konrad Rzeszutek Wilk @ 2019-05-14 15:20 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Wanpeng Li, kvm-devel, LKML, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Tue, May 14, 2019 at 10:50:23AM -0300, Marcelo Tosatti wrote:
> On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> > On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > >
> > >
> > > Certain workloads perform poorly on KVM compared to baremetal
> > > due to baremetal's ability to perform mwait on NEED_RESCHED
> > > bit of task flags (therefore skipping the IPI).
> > 
> > KVM supports expose mwait to the guest, if it can solve this?
> > 
> > Regards,
> > Wanpeng Li
> 
> Unfortunately mwait in guest is not feasible (uncompatible with multiple
> guests). Checking whether a paravirt solution is possible.

There is the obvious problem with that the guest can be malicious and
provide via the paravirt solution bogus data. That is it expose 0% CPU
usage but in reality be mining and using 100%.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-14 15:20     ` Konrad Rzeszutek Wilk
@ 2019-05-14 17:42       ` Marcelo Tosatti
  2019-05-15  1:42         ` Wanpeng Li
  0 siblings, 1 reply; 19+ messages in thread
From: Marcelo Tosatti @ 2019-05-14 17:42 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Wanpeng Li, kvm-devel, LKML, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Tue, May 14, 2019 at 11:20:15AM -0400, Konrad Rzeszutek Wilk wrote:
> On Tue, May 14, 2019 at 10:50:23AM -0300, Marcelo Tosatti wrote:
> > On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> > > On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > >
> > > >
> > > > Certain workloads perform poorly on KVM compared to baremetal
> > > > due to baremetal's ability to perform mwait on NEED_RESCHED
> > > > bit of task flags (therefore skipping the IPI).
> > > 
> > > KVM supports expose mwait to the guest, if it can solve this?
> > > 
> > > Regards,
> > > Wanpeng Li
> > 
> > Unfortunately mwait in guest is not feasible (uncompatible with multiple
> > guests). Checking whether a paravirt solution is possible.
> 
> There is the obvious problem with that the guest can be malicious and
> provide via the paravirt solution bogus data. That is it expose 0% CPU
> usage but in reality be mining and using 100%.

The idea is to have a hypercall for the guest to perform the
need_resched=1 bit set. It can only hurt itself.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-14 17:42       ` Marcelo Tosatti
@ 2019-05-15  1:42         ` Wanpeng Li
  2019-05-15 20:26           ` Marcelo Tosatti
  0 siblings, 1 reply; 19+ messages in thread
From: Wanpeng Li @ 2019-05-15  1:42 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Konrad Rzeszutek Wilk, kvm-devel, LKML, Thomas Gleixner,
	Ingo Molnar, Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Wed, 15 May 2019 at 02:20, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> On Tue, May 14, 2019 at 11:20:15AM -0400, Konrad Rzeszutek Wilk wrote:
> > On Tue, May 14, 2019 at 10:50:23AM -0300, Marcelo Tosatti wrote:
> > > On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> > > > On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > >
> > > > >
> > > > > Certain workloads perform poorly on KVM compared to baremetal
> > > > > due to baremetal's ability to perform mwait on NEED_RESCHED
> > > > > bit of task flags (therefore skipping the IPI).
> > > >
> > > > KVM supports expose mwait to the guest, if it can solve this?
> > > >
> > > > Regards,
> > > > Wanpeng Li
> > >
> > > Unfortunately mwait in guest is not feasible (uncompatible with multiple
> > > guests). Checking whether a paravirt solution is possible.
> >
> > There is the obvious problem with that the guest can be malicious and
> > provide via the paravirt solution bogus data. That is it expose 0% CPU
> > usage but in reality be mining and using 100%.
>
> The idea is to have a hypercall for the guest to perform the
> need_resched=1 bit set. It can only hurt itself.

This lets me recall the patchset from aliyun
https://lkml.org/lkml/2017/6/22/296 They poll after
__current_set_polling() in do_idle() so avoid this hypercall I think.
Btw, do you get SAP HANA by 5-10% bonus even if adaptive halt-polling
is enabled?

Regards,
Wanpeng Li

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-13 11:51     ` Raslan, KarimAllah
  2019-05-13 12:30       ` Boris Ostrovsky
@ 2019-05-15  1:45       ` Wanpeng Li
  1 sibling, 0 replies; 19+ messages in thread
From: Wanpeng Li @ 2019-05-15  1:45 UTC (permalink / raw)
  To: Raslan, KarimAllah
  Cc: mtosatti, konrad.wilk, kvm, linux-kernel, mingo, boris.ostrovsky,
	tglx, bsd, aarcange, pbonzini, ankur.a.arora

On Mon, 13 May 2019 at 19:52, Raslan, KarimAllah <karahmed@amazon.de> wrote:
>
> On Mon, 2019-05-13 at 07:31 -0400, Konrad Rzeszutek Wilk wrote:
> > On May 13, 2019 5:20:37 AM EDT, Wanpeng Li <kernellwp@gmail.com> wrote:
> > >
> > > On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com>
> > > wrote:
> > > >
> > > >
> > > >
> > > > Certain workloads perform poorly on KVM compared to baremetal
> > > > due to baremetal's ability to perform mwait on NEED_RESCHED
> > > > bit of task flags (therefore skipping the IPI).
> > >
> > > KVM supports expose mwait to the guest, if it can solve this?
> > >
> >
> >
> > There is a bit of problem with that. The host will see 100% CPU utilization even if the guest is idle and taking long naps..
> >
> > Which depending on your dashboard can look like the machine is on fire.
>
> This can also be fixed. I have a patch that kind of expose proper information
> about the *real* utilization here if that would be help.

You can have a post.

Regards,
Wanpeng Li

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-14 13:50   ` Marcelo Tosatti
  2019-05-14 15:20     ` Konrad Rzeszutek Wilk
@ 2019-05-15 18:42     ` Ankur Arora
  2019-05-15 20:43       ` Marcelo Tosatti
  2019-05-16  1:07       ` Wanpeng Li
  1 sibling, 2 replies; 19+ messages in thread
From: Ankur Arora @ 2019-05-15 18:42 UTC (permalink / raw)
  To: Marcelo Tosatti, Wanpeng Li
  Cc: kvm-devel, LKML, Thomas Gleixner, Ingo Molnar, Andrea Arcangeli,
	Bandan Das, Paolo Bonzini

On 5/14/19 6:50 AM, Marcelo Tosatti wrote:
> On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
>> On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>
>>>
>>> Certain workloads perform poorly on KVM compared to baremetal
>>> due to baremetal's ability to perform mwait on NEED_RESCHED
>>> bit of task flags (therefore skipping the IPI).
>>
>> KVM supports expose mwait to the guest, if it can solve this?
>>
>> Regards,
>> Wanpeng Li
> 
> Unfortunately mwait in guest is not feasible (incompatible with multiple
> guests). Checking whether a paravirt solution is possible.

Hi Marcelo,

I was also looking at making MWAIT available to guests in a safe manner:
whether through emulation or a PV-MWAIT. My (unsolicited) thoughts
follow.

We basically want to handle this sequence:

     monitor(monitor_address);
     if (*monitor_address == base_value)
          mwaitx(max_delay);

Emulation seems problematic because, AFAICS this would happen:

     guest                                   hypervisor
     =====                                   ====

     monitor(monitor_address);
         vmexit  ===>                        monitor(monitor_address)
     if (*monitor_address == base_value)
          mwait();
               vmexit    ====>               mwait()

There's a context switch back to the guest in this sequence which seems
problematic. Both the AMD and Intel specs list system calls and
far calls as events which would lead to the MWAIT being woken up: 
"Voluntary transitions due to fast system call and far calls (occurring 
prior to issuing MWAIT but after setting the monitor)".


We could do this instead:

     guest                                   hypervisor
     =====                                   ====

     monitor(monitor_address);
         vmexit  ===>                        cache monitor_address
     if (*monitor_address == base_value)
          mwait();
               vmexit    ====>              monitor(monitor_address)
                                            mwait()

But, this would miss the "if (*monitor_address == base_value)" check in
the host which is problematic if *monitor_address changed simultaneously
when monitor was executed.
(Similar problem if we cache both the monitor_address and
*monitor_address.)


So, AFAICS, the only thing that would work is the guest offloading the
whole PV-MWAIT operation.

AFAICS, that could be a paravirt operation which needs three parameters:
(monitor_address, base_value, max_delay.)

This would allow the guest to offload this whole operation to
the host:
     monitor(monitor_address);
     if (*monitor_address == base_value)
          mwaitx(max_delay);

I'm guessing you are thinking on similar lines?


High level semantics: If the CPU doesn't have any runnable threads, then
we actually do this version of PV-MWAIT -- arming a timer if necessary
so we only sleep until the time-slice expires or the MWAIT max_delay does.

If the CPU has any runnable threads then this could still finish its 
time-quanta or we could just do a schedule-out.


So the semantics guaranteed to the host would be that PV-MWAIT returns 
after >= max_delay OR with the *monitor_address changed.



Ankur

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-15  1:42         ` Wanpeng Li
@ 2019-05-15 20:26           ` Marcelo Tosatti
  0 siblings, 0 replies; 19+ messages in thread
From: Marcelo Tosatti @ 2019-05-15 20:26 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Konrad Rzeszutek Wilk, kvm-devel, LKML, Thomas Gleixner,
	Ingo Molnar, Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Wed, May 15, 2019 at 09:42:48AM +0800, Wanpeng Li wrote:
> On Wed, 15 May 2019 at 02:20, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >
> > On Tue, May 14, 2019 at 11:20:15AM -0400, Konrad Rzeszutek Wilk wrote:
> > > On Tue, May 14, 2019 at 10:50:23AM -0300, Marcelo Tosatti wrote:
> > > > On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> > > > > On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > > > >
> > > > > >
> > > > > > Certain workloads perform poorly on KVM compared to baremetal
> > > > > > due to baremetal's ability to perform mwait on NEED_RESCHED
> > > > > > bit of task flags (therefore skipping the IPI).
> > > > >
> > > > > KVM supports expose mwait to the guest, if it can solve this?
> > > > >
> > > > > Regards,
> > > > > Wanpeng Li
> > > >
> > > > Unfortunately mwait in guest is not feasible (incompatible with multiple
> > > > guests). Checking whether a paravirt solution is possible.
> > >
> > > There is the obvious problem with that the guest can be malicious and
> > > provide via the paravirt solution bogus data. That is it expose 0% CPU
> > > usage but in reality be mining and using 100%.
> >
> > The idea is to have a hypercall for the guest to set the
> > need_resched=1 bit. It can only hurt itself.
> 
> This lets me recall the patchset from aliyun
> https://lkml.org/lkml/2017/6/22/296 

Thanks for the pointer.

"The background is that we(Alibaba Cloud) do get more and more
complaints from our customers in both KVM and Xen compared to bare-metal.
After investigations, the root cause is known to us: big cost in message 
passing workload(David show it in KVM forum 2015) 

A typical message workload like below: 
vcpu 0                             vcpu 1 
1. send ipi                     2.  doing hlt 
3. go into idle                 4.  receive ipi and wake up from hlt 
5. write APIC time twice        6.  write APIC time twice to 
    to stop sched timer              reprogram sched timer 
7. doing hlt                    8.  handle task and send ipi to 
                                     vcpu 0 
9. same to 4.                   10. same to 3"

This is very similar to the client/server example pair 
included in the first message.

 
> They poll after
> __current_set_polling() in do_idle() so avoid this hypercall I think.

Yes, i was thinking about a variant without poll.

> Btw, do you get SAP HANA by 5-10% bonus even if adaptive halt-polling
> is enabled?

host			   = 31.18 
halt_poll_ns set to 200000 = 38.55	(80%)
halt_poll_ns set to 300000 = 33.28	(93%)
idle_spin set to 220000 = 32.22 	(96%)

So avoiding the IPI VM-exits is faster. 

300000 is the optimal value for this workload. Haven't checked
adaptive halt-polling.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-15 18:42     ` Ankur Arora
@ 2019-05-15 20:43       ` Marcelo Tosatti
  2019-05-17  4:32         ` Ankur Arora
  2019-05-16  1:07       ` Wanpeng Li
  1 sibling, 1 reply; 19+ messages in thread
From: Marcelo Tosatti @ 2019-05-15 20:43 UTC (permalink / raw)
  To: Ankur Arora
  Cc: Wanpeng Li, kvm-devel, LKML, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Wed, May 15, 2019 at 11:42:56AM -0700, Ankur Arora wrote:
> On 5/14/19 6:50 AM, Marcelo Tosatti wrote:
> >On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> >>On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >>>
> >>>
> >>>Certain workloads perform poorly on KVM compared to baremetal
> >>>due to baremetal's ability to perform mwait on NEED_RESCHED
> >>>bit of task flags (therefore skipping the IPI).
> >>
> >>KVM supports expose mwait to the guest, if it can solve this?
> >>
> >>Regards,
> >>Wanpeng Li
> >
> >Unfortunately mwait in guest is not feasible (incompatible with multiple
> >guests). Checking whether a paravirt solution is possible.

Hi Ankur,

> 
> Hi Marcelo,
> 
> I was also looking at making MWAIT available to guests in a safe manner:
> whether through emulation or a PV-MWAIT. My (unsolicited) thoughts

What use-case are you interested in? 

> 
> We basically want to handle this sequence:
> 
>     monitor(monitor_address);
>     if (*monitor_address == base_value)
>          mwaitx(max_delay);
> 
> Emulation seems problematic because, AFAICS this would happen:
> 
>     guest                                   hypervisor
>     =====                                   ====
> 
>     monitor(monitor_address);
>         vmexit  ===>                        monitor(monitor_address)
>     if (*monitor_address == base_value)
>          mwait();
>               vmexit    ====>               mwait()
> 
> There's a context switch back to the guest in this sequence which seems
> problematic. Both the AMD and Intel specs list system calls and
> far calls as events which would lead to the MWAIT being woken up:
> "Voluntary transitions due to fast system call and far calls
> (occurring prior to issuing MWAIT but after setting the monitor)".
> 
> 
> We could do this instead:
> 
>     guest                                   hypervisor
>     =====                                   ====
> 
>     monitor(monitor_address);
>         vmexit  ===>                        cache monitor_address
>     if (*monitor_address == base_value)
>          mwait();
>               vmexit    ====>              monitor(monitor_address)
>                                            mwait()
> 
> But, this would miss the "if (*monitor_address == base_value)" check in
> the host which is problematic if *monitor_address changed simultaneously
> when monitor was executed.
> (Similar problem if we cache both the monitor_address and
> *monitor_address.)
> 
> 
> So, AFAICS, the only thing that would work is the guest offloading the
> whole PV-MWAIT operation.
> 
> AFAICS, that could be a paravirt operation which needs three parameters:
> (monitor_address, base_value, max_delay.)
> 
> This would allow the guest to offload this whole operation to
> the host:
>     monitor(monitor_address);
>     if (*monitor_address == base_value)
>          mwaitx(max_delay);
> 
> I'm guessing you are thinking on similar lines?

Sort of: only trying to avoid the IPI to wake a remote vCPU.

Problem is that MWAIT works only on a contiguous range 
of bits in memory (512 bits max on current CPUs).

So if you execute mwait on the host on behalf of the guest,
the region of memory monitored must include both host
and guest bits.

> 
> 
> High level semantics: If the CPU doesn't have any runnable threads, then
> we actually do this version of PV-MWAIT -- arming a timer if necessary
> so we only sleep until the time-slice expires or the MWAIT max_delay does.

That would kill the sched_wake_idle_without_ipi optimization for the
host.

> If the CPU has any runnable threads then this could still finish its
> time-quanta or we could just do a schedule-out.
> 
> 
> So the semantics guaranteed to the host would be that PV-MWAIT
> returns after >= max_delay OR with the *monitor_address changed.
> 
> 
> 
> Ankur

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-15 18:42     ` Ankur Arora
  2019-05-15 20:43       ` Marcelo Tosatti
@ 2019-05-16  1:07       ` Wanpeng Li
  2019-05-17  2:06         ` Ankur Arora
  1 sibling, 1 reply; 19+ messages in thread
From: Wanpeng Li @ 2019-05-16  1:07 UTC (permalink / raw)
  To: Ankur Arora
  Cc: Marcelo Tosatti, kvm-devel, LKML, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Thu, 16 May 2019 at 02:42, Ankur Arora <ankur.a.arora@oracle.com> wrote:
>
> On 5/14/19 6:50 AM, Marcelo Tosatti wrote:
> > On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> >> On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >>>
> >>>
> >>> Certain workloads perform poorly on KVM compared to baremetal
> >>> due to baremetal's ability to perform mwait on NEED_RESCHED
> >>> bit of task flags (therefore skipping the IPI).
> >>
> >> KVM supports expose mwait to the guest, if it can solve this?
> >>
> >> Regards,
> >> Wanpeng Li
> >
> > Unfortunately mwait in guest is not feasible (incompatible with multiple
> > guests). Checking whether a paravirt solution is possible.
>
> Hi Marcelo,
>
> I was also looking at making MWAIT available to guests in a safe manner:
> whether through emulation or a PV-MWAIT. My (unsolicited) thoughts

MWAIT emulation is not simple, here is some research
https://www.contrib.andrew.cmu.edu/~somlo/OSXKVM/mwait.html

Regards,
Wanpeng Li

> follow.
>
> We basically want to handle this sequence:
>
>      monitor(monitor_address);
>      if (*monitor_address == base_value)
>           mwaitx(max_delay);
>
> Emulation seems problematic because, AFAICS this would happen:
>
>      guest                                   hypervisor
>      =====                                   ====
>
>      monitor(monitor_address);
>          vmexit  ===>                        monitor(monitor_address)
>      if (*monitor_address == base_value)
>           mwait();
>                vmexit    ====>               mwait()
>
> There's a context switch back to the guest in this sequence which seems
> problematic. Both the AMD and Intel specs list system calls and
> far calls as events which would lead to the MWAIT being woken up:
> "Voluntary transitions due to fast system call and far calls (occurring
> prior to issuing MWAIT but after setting the monitor)".
>
>
> We could do this instead:
>
>      guest                                   hypervisor
>      =====                                   ====
>
>      monitor(monitor_address);
>          vmexit  ===>                        cache monitor_address
>      if (*monitor_address == base_value)
>           mwait();
>                vmexit    ====>              monitor(monitor_address)
>                                             mwait()
>
> But, this would miss the "if (*monitor_address == base_value)" check in
> the host which is problematic if *monitor_address changed simultaneously
> when monitor was executed.
> (Similar problem if we cache both the monitor_address and
> *monitor_address.)
>
>
> So, AFAICS, the only thing that would work is the guest offloading the
> whole PV-MWAIT operation.
>
> AFAICS, that could be a paravirt operation which needs three parameters:
> (monitor_address, base_value, max_delay.)
>
> This would allow the guest to offload this whole operation to
> the host:
>      monitor(monitor_address);
>      if (*monitor_address == base_value)
>           mwaitx(max_delay);
>
> I'm guessing you are thinking on similar lines?
>
>
> High level semantics: If the CPU doesn't have any runnable threads, then
> we actually do this version of PV-MWAIT -- arming a timer if necessary
> so we only sleep until the time-slice expires or the MWAIT max_delay does.
>
> If the CPU has any runnable threads then this could still finish its
> time-quanta or we could just do a schedule-out.
>
>
> So the semantics guaranteed to the host would be that PV-MWAIT returns
> after >= max_delay OR with the *monitor_address changed.
>
>
>
> Ankur

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-16  1:07       ` Wanpeng Li
@ 2019-05-17  2:06         ` Ankur Arora
  0 siblings, 0 replies; 19+ messages in thread
From: Ankur Arora @ 2019-05-17  2:06 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Marcelo Tosatti, kvm-devel, LKML, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On 2019-05-15 6:07 p.m., Wanpeng Li wrote:
> On Thu, 16 May 2019 at 02:42, Ankur Arora <ankur.a.arora@oracle.com> wrote:
>>
>> On 5/14/19 6:50 AM, Marcelo Tosatti wrote:
>>> On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
>>>> On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>>>
>>>>>
>>>>> Certain workloads perform poorly on KVM compared to baremetal
>>>>> due to baremetal's ability to perform mwait on NEED_RESCHED
>>>>> bit of task flags (therefore skipping the IPI).
>>>>
>>>> KVM supports expose mwait to the guest, if it can solve this?
>>>>
>>>> Regards,
>>>> Wanpeng Li
>>>
>>> Unfortunately mwait in guest is not feasible (incompatible with multiple
>>> guests). Checking whether a paravirt solution is possible.
>>
>> Hi Marcelo,
>>
>> I was also looking at making MWAIT available to guests in a safe manner:
>> whether through emulation or a PV-MWAIT. My (unsolicited) thoughts
> 
> MWAIT emulation is not simple, here is some research
> https://www.contrib.andrew.cmu.edu/~somlo/OSXKVM/mwait.html
Agreed. I had outlined my attempt to do that below and come
to the conclusion that we would need a PV-MWAIT.

Ankur

> 
> Regards,
> Wanpeng Li
> 
>> follow.
>>
>> We basically want to handle this sequence:
>>
>>       monitor(monitor_address);
>>       if (*monitor_address == base_value)
>>            mwaitx(max_delay);
>>
>> Emulation seems problematic because, AFAICS this would happen:
>>
>>       guest                                   hypervisor
>>       =====                                   ====
>>
>>       monitor(monitor_address);
>>           vmexit  ===>                        monitor(monitor_address)
>>       if (*monitor_address == base_value)
>>            mwait();
>>                 vmexit    ====>               mwait()
>>
>> There's a context switch back to the guest in this sequence which seems
>> problematic. Both the AMD and Intel specs list system calls and
>> far calls as events which would lead to the MWAIT being woken up:
>> "Voluntary transitions due to fast system call and far calls (occurring
>> prior to issuing MWAIT but after setting the monitor)".
>>
>>
>> We could do this instead:
>>
>>       guest                                   hypervisor
>>       =====                                   ====
>>
>>       monitor(monitor_address);
>>           vmexit  ===>                        cache monitor_address
>>       if (*monitor_address == base_value)
>>            mwait();
>>                 vmexit    ====>              monitor(monitor_address)
>>                                              mwait()
>>
>> But, this would miss the "if (*monitor_address == base_value)" check in
>> the host which is problematic if *monitor_address changed simultaneously
>> when monitor was executed.
>> (Similar problem if we cache both the monitor_address and
>> *monitor_address.)
>>
>>
>> So, AFAICS, the only thing that would work is the guest offloading the
>> whole PV-MWAIT operation.
>>
>> AFAICS, that could be a paravirt operation which needs three parameters:
>> (monitor_address, base_value, max_delay.)
>>
>> This would allow the guest to offload this whole operation to
>> the host:
>>       monitor(monitor_address);
>>       if (*monitor_address == base_value)
>>            mwaitx(max_delay);
>>
>> I'm guessing you are thinking on similar lines?
>>
>>
>> High level semantics: If the CPU doesn't have any runnable threads, then
>> we actually do this version of PV-MWAIT -- arming a timer if necessary
>> so we only sleep until the time-slice expires or the MWAIT max_delay does.
>>
>> If the CPU has any runnable threads then this could still finish its
>> time-quanta or we could just do a schedule-out.
>>
>>
>> So the semantics guaranteed to the host would be that PV-MWAIT returns
>> after >= max_delay OR with the *monitor_address changed.
>>
>>
>>
>> Ankur


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-15 20:43       ` Marcelo Tosatti
@ 2019-05-17  4:32         ` Ankur Arora
  2019-05-17 17:49           ` Marcelo Tosatti
  0 siblings, 1 reply; 19+ messages in thread
From: Ankur Arora @ 2019-05-17  4:32 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Wanpeng Li, kvm-devel, LKML, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On 2019-05-15 1:43 p.m., Marcelo Tosatti wrote:
> On Wed, May 15, 2019 at 11:42:56AM -0700, Ankur Arora wrote:
>> On 5/14/19 6:50 AM, Marcelo Tosatti wrote:
>>> On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
>>>> On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>>>
>>>>>
>>>>> Certain workloads perform poorly on KVM compared to baremetal
>>>>> due to baremetal's ability to perform mwait on NEED_RESCHED
>>>>> bit of task flags (therefore skipping the IPI).
>>>>
>>>> KVM supports expose mwait to the guest, if it can solve this?
>>>>
>>>> Regards,
>>>> Wanpeng Li
>>>
>>> Unfortunately mwait in guest is not feasible (incompatible with multiple
>>> guests). Checking whether a paravirt solution is possible.
> 
> Hi Ankur,
> 
>>
>> Hi Marcelo,
>>
>> I was also looking at making MWAIT available to guests in a safe manner:
>> whether through emulation or a PV-MWAIT. My (unsolicited) thoughts
> 
> What use-case are you interested in?
Currently Oracle does not make MWAIT available to guests in cloud
environments. My interest is 1) allow guests to avoid the IPI and
2) allow the waiting to be in deeper C-states so that other cores
could get the benefit of turbo-boost etc.


> 
>>
>> We basically want to handle this sequence:
>>
>>      monitor(monitor_address);
>>      if (*monitor_address == base_value)
>>           mwaitx(max_delay);
>>
>> Emulation seems problematic because, AFAICS this would happen:
>>
>>      guest                                   hypervisor
>>      =====                                   ====
>>
>>      monitor(monitor_address);
>>          vmexit  ===>                        monitor(monitor_address)
>>      if (*monitor_address == base_value)
>>           mwait();
>>                vmexit    ====>               mwait()
>>
>> There's a context switch back to the guest in this sequence which seems
>> problematic. Both the AMD and Intel specs list system calls and
>> far calls as events which would lead to the MWAIT being woken up:
>> "Voluntary transitions due to fast system call and far calls
>> (occurring prior to issuing MWAIT but after setting the monitor)".
>>
>>
>> We could do this instead:
>>
>>      guest                                   hypervisor
>>      =====                                   ====
>>
>>      monitor(monitor_address);
>>          vmexit  ===>                        cache monitor_address
>>      if (*monitor_address == base_value)
>>           mwait();
>>                vmexit    ====>              monitor(monitor_address)
>>                                             mwait()
>>
>> But, this would miss the "if (*monitor_address == base_value)" check in
>> the host which is problematic if *monitor_address changed simultaneously
>> when monitor was executed.
>> (Similar problem if we cache both the monitor_address and
>> *monitor_address.)
>>
>>
>> So, AFAICS, the only thing that would work is the guest offloading the
>> whole PV-MWAIT operation.
>>
>> AFAICS, that could be a paravirt operation which needs three parameters:
>> (monitor_address, base_value, max_delay.)
>>
>> This would allow the guest to offload this whole operation to
>> the host:
>>      monitor(monitor_address);
>>      if (*monitor_address == base_value)
>>           mwaitx(max_delay);
>>
>> I'm guessing you are thinking on similar lines?
> 
> Sort of: only trying to avoid the IPI to wake a remote vCPU.
> 
> Problem is that MWAIT works only on a contiguous range
> of bits in memory (512 bits max on current CPUs).
> 
> So if you execute mwait on the host on behalf of the guest,
> the region of memory monitored must include both host
> and guest bits.
Yeah, an MWAITv would have come pretty handy here ;).

My idea of PV-MWAIT didn't include waiting on behalf of the host. I
was thinking of waiting in the host but exclusively on behalf of the
guest, until the guest is woken up or when it's time-quanta expires.

Waiting on behalf of both the guest and the host would clearly be better.

If we can do mwait for both the guest and host (say they share a 512
bit region), then the host will need some protection from the guest.
Maybe the waking guest-thread could just do a hypercall to wake up
the remote vCPU? Or maybe it could poke the monitored region,
but that is handled as a special page-fault?
The hypercall-to-wake would also allow us to move guest-threads across
CPUs. That said, I'm not sure how expensive either of these would be.

Assuming host/guest can share a monitored region safely, the host's
idle could monitor some region other than its &thread_info->flags.
Maybe we could setup a mwait notifier with a percpu waiting area which
could be registered by idle, guests etc.

Though on second thoughts, if the remote thread will do a
hypercall/page-fault then the handling could just as easily be: mark
the guest's remote thread runnable and set the resched bit.

> 
>>
>>
>> High level semantics: If the CPU doesn't have any runnable threads, then
>> we actually do this version of PV-MWAIT -- arming a timer if necessary
>> so we only sleep until the time-slice expires or the MWAIT max_delay does.
> 
> That would kill the sched_wake_idle_without_ipi optimization for the
> host.
Yeah, I was thinking in terms of the MWAIT being exclusively on behalf
of the guest so in a sense the guest was still scheduled just waiting.

Ankur

> 
>> If the CPU has any runnable threads then this could still finish its
>> time-quanta or we could just do a schedule-out.
>>
>>
>> So the semantics guaranteed to the host would be that PV-MWAIT
>> returns after >= max_delay OR with the *monitor_address changed.
>>
>>
>>
>> Ankur


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] sched: introduce configurable delay before entering idle
  2019-05-17  4:32         ` Ankur Arora
@ 2019-05-17 17:49           ` Marcelo Tosatti
  0 siblings, 0 replies; 19+ messages in thread
From: Marcelo Tosatti @ 2019-05-17 17:49 UTC (permalink / raw)
  To: Ankur Arora
  Cc: Wanpeng Li, kvm-devel, LKML, Thomas Gleixner, Ingo Molnar,
	Andrea Arcangeli, Bandan Das, Paolo Bonzini

On Thu, May 16, 2019 at 09:32:06PM -0700, Ankur Arora wrote:
> On 2019-05-15 1:43 p.m., Marcelo Tosatti wrote:
> >On Wed, May 15, 2019 at 11:42:56AM -0700, Ankur Arora wrote:
> >>On 5/14/19 6:50 AM, Marcelo Tosatti wrote:
> >>>On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:
> >>>>On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >>>>>
> >>>>>
> >>>>>Certain workloads perform poorly on KVM compared to baremetal
> >>>>>due to baremetal's ability to perform mwait on NEED_RESCHED
> >>>>>bit of task flags (therefore skipping the IPI).
> >>>>
> >>>>KVM supports exposing mwait to the guest, if it can solve this?
> >>>>
> >>>>Regards,
> >>>>Wanpeng Li
> >>>
> >>>Unfortunately mwait in guest is not feasible (incompatible with multiple
> >>>guests). Checking whether a paravirt solution is possible.
> >
> >Hi Ankur,
> >
> >>
> >>Hi Marcelo,
> >>
> >>I was also looking at making MWAIT available to guests in a safe manner:
> >>whether through emulation or a PV-MWAIT. My (unsolicited) thoughts
> >
> >What use-case are you interested in?
> Currently Oracle does not make MWAIT available to guests in cloud
> environments. My interest is 1) allow guests to avoid the IPI and
> 2) allow the waiting to be in deeper C-states so that other cores
> could get the benefit of turbo-boost etc.
> 
> 
> >
> >>
> >>We basically want to handle this sequence:
> >>
> >>     monitor(monitor_address);
> >>     if (*monitor_address == base_value)
> >>          mwaitx(max_delay);
> >>
> >>Emulation seems problematic because, AFAICS this would happen:
> >>
> >>     guest                                   hypervisor
> >>     =====                                   ====
> >>
> >>     monitor(monitor_address);
> >>         vmexit  ===>                        monitor(monitor_address)
> >>     if (*monitor_address == base_value)
> >>          mwait();
> >>               vmexit    ====>               mwait()
> >>
> >>There's a context switch back to the guest in this sequence which seems
> >>problematic. Both the AMD and Intel specs list system calls and
> >>far calls as events which would lead to the MWAIT being woken up:
> >>"Voluntary transitions due to fast system call and far calls
> >>(occurring prior to issuing MWAIT but after setting the monitor)".
> >>
> >>
> >>We could do this instead:
> >>
> >>     guest                                   hypervisor
> >>     =====                                   ====
> >>
> >>     monitor(monitor_address);
> >>         vmexit  ===>                        cache monitor_address
> >>     if (*monitor_address == base_value)
> >>          mwait();
> >>               vmexit    ====>              monitor(monitor_address)
> >>                                            mwait()
> >>
> >>But, this would miss the "if (*monitor_address == base_value)" check in
> >>the host which is problematic if *monitor_address changed simultaneously
> >>when monitor was executed.
> >>(Similar problem if we cache both the monitor_address and
> >>*monitor_address.)
> >>
> >>
> >>So, AFAICS, the only thing that would work is the guest offloading the
> >>whole PV-MWAIT operation.
> >>
> >>AFAICS, that could be a paravirt operation which needs three parameters:
> >>(monitor_address, base_value, max_delay.)
> >>
> >>This would allow the guest to offload this whole operation to
> >>the host:
> >>     monitor(monitor_address);
> >>     if (*monitor_address == base_value)
> >>          mwaitx(max_delay);
> >>
> >>I'm guessing you are thinking on similar lines?
> >
> >Sort of: only trying to avoid the IPI to wake a remote vCPU.
> >
> >Problem is that MWAIT works only on a contiguous range
> >of bits in memory (512 bits max on current CPUs).
> >
> >So if you execute mwait on the host on behalf of the guest,
> >the region of memory monitored must include both host
> >and guest bits.
> Yeah, an MWAITv would have come pretty handy here ;).
> 
> My idea of PV-MWAIT didn't include waiting on behalf of the host. I
> was thinking of waiting in the host but exclusively on behalf of the
> guest, until the guest is woken up or when its time-quanta expires.
> 
> Waiting on behalf of both the guest and the host would clearly be better.
> 
> If we can do mwait for both the guest and host (say they share a 512
> bit region), then the host will need some protection from the guest.
> Maybe the waking guest-thread could just do a hypercall to wake up
> the remote vCPU? Or maybe it could poke the monitored region,
> but that is handled as a special page-fault?
>
> The hypercall-to-wake would also allow us to move guest-threads across
> CPUs. That said, I'm not sure how expensive either of these would be.
> 
> Assuming host/guest can share a monitored region safely, the host's
> idle could monitor some region other than its &thread_info->flags.
> Maybe we could setup a mwait notifier with a percpu waiting area which
> could be registered by idle, guests etc.
> 
> Though on second thoughts, if the remote thread will do a
> hypercall/page-fault then the handling could just as easily be: mark
> the guest's remote thread runnable and set the resched bit.

Yes, arrived at the same conclusion...

However, it seems avoiding the exit in the first place via busy spinning
provides the largest performance benefit (avoiding the exit 
on both the sender and receiver sides).

See cpuidle driver just posted. 

mwait instruction that worked on multiple addresses would be ideal
for virtualization.

> >>High level semantics: If the CPU doesn't have any runnable threads, then
> >>we actually do this version of PV-MWAIT -- arming a timer if necessary
> >>so we only sleep until the time-slice expires or the MWAIT max_delay does.
> >
> >That would kill the sched_wake_idle_without_ipi optimization for the
> >host.
> Yeah, I was thinking in terms of the MWAIT being exclusively on behalf
> of the guest so in a sense the guest was still scheduled just waiting.
> 
> Ankur
> 
> >
> >>If the CPU has any runnable threads then this could still finish its
> >>time-quanta or we could just do a schedule-out.
> >>
> >>
> >>So the semantics guaranteed to the host would be that PV-MWAIT
> >>returns after >= max_delay OR with the *monitor_address changed.
> >>
> >>
> >>
> >>Ankur

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2019-05-17 17:50 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-07 18:56 [PATCH] sched: introduce configurable delay before entering idle Marcelo Tosatti
2019-05-07 22:15 ` Peter Zijlstra
2019-05-07 23:44   ` Marcelo Tosatti
2019-05-13  9:20 ` Wanpeng Li
2019-05-13 11:31   ` Konrad Rzeszutek Wilk
2019-05-13 11:51     ` Raslan, KarimAllah
2019-05-13 12:30       ` Boris Ostrovsky
2019-05-15  1:45       ` Wanpeng Li
2019-05-14 13:50   ` Marcelo Tosatti
2019-05-14 15:20     ` Konrad Rzeszutek Wilk
2019-05-14 17:42       ` Marcelo Tosatti
2019-05-15  1:42         ` Wanpeng Li
2019-05-15 20:26           ` Marcelo Tosatti
2019-05-15 18:42     ` Ankur Arora
2019-05-15 20:43       ` Marcelo Tosatti
2019-05-17  4:32         ` Ankur Arora
2019-05-17 17:49           ` Marcelo Tosatti
2019-05-16  1:07       ` Wanpeng Li
2019-05-17  2:06         ` Ankur Arora

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).