From: Peter Zijlstra <peterz@infradead.org>
To: Josh Don <joshdon@google.com>, g@hirez.programming.kicks-ass.net
Cc: Ingo Molnar <mingo@redhat.com>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	"David S. Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Eric Dumazet <edumazet@google.com>,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	kvm@vger.kernel.org, Xi Wang <xii@google.com>
Subject: Re: [PATCH 1/3] sched: better handling for busy polling loops
Date: Fri, 23 Oct 2020 09:19:05 +0200	[thread overview]
Message-ID: <20201023071905.GL2611@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20201023032944.399861-1-joshdon@google.com>

On Thu, Oct 22, 2020 at 08:29:42PM -0700, Josh Don wrote:
> Busy polling loops in the kernel such as network socket poll and kvm
> halt polling have performance problems related to process scheduler load
> accounting.

AFAICT you're not actually fixing the load accounting issue at all.

> This change also disables preemption for the duration of the busy
> polling loop. This is important, as it ensures that if a polling thread
> decides to end its poll to relinquish cpu to another thread, the polling
> thread will actually exit the busy loop and potentially block. When it
> later becomes runnable, it will have the opportunity to find an idle cpu
> via wakeup cpu selection.

At the cost of inducing a sleep+wake cycle, which is mucho expensive.
So this could go either way. No data presented.

> +void prepare_to_busy_poll(void)
> +{
> +	struct rq __maybe_unused *rq = this_rq();
> +	unsigned long __maybe_unused flags;
> +
> +	/* Preemption will be reenabled by end_busy_poll() */
> +	preempt_disable();
> +
> +#ifdef CONFIG_SMP
> +	raw_spin_lock_irqsave(&rq->lock, flags);
> +	/* preemption disabled; only one thread can poll at a time */
> +	WARN_ON_ONCE(rq->busy_polling);
> +	rq->busy_polling++;
> +	raw_spin_unlock_irqrestore(&rq->lock, flags);
> +#endif

Explain to me the purpose of that rq->lock usage.
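
AFAICT the only writer is the local CPU with preemption disabled, and
remote readers are racy no matter what you do, so a plain store would
seem to suffice. Something like (a sketch, not tested):

	/* local CPU only; preemption already disabled */
	WRITE_ONCE(this_rq()->busy_polling, 1);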

> +}
> +EXPORT_SYMBOL(prepare_to_busy_poll);

_GPL

> +
> +int continue_busy_poll(void)
> +{
> +	if (!single_task_running())
> +		return 0;

Why? If there's more, we'll end up in the below condition anyway.

> +
> +	/* Important that we check this, since preemption is disabled */
> +	if (need_resched())
> +		return 0;
> +
> +	return 1;
> +}
> +EXPORT_SYMBOL(continue_busy_poll);

_GPL
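
For reference, the usage pattern these helpers imply (reconstructed
here, since the call sites in patches 2/3 aren't quoted) is roughly:

	prepare_to_busy_poll();
	while (continue_busy_poll()) {
		if (poll_for_event())	/* hypothetical driver hook */
			break;
		cpu_relax();
	}
	end_busy_poll(false);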

> +
> +/*
> + * Restore any state modified by prepare_to_busy_poll(), including re-enabling
> + * preemption.
> + *
> + * @allow_resched: If true, this potentially calls schedule() as part of
> + * enabling preemption. A busy poll loop can use false in order to have an
> + * opportunity to block before rescheduling.
> + */
> +void end_busy_poll(bool allow_resched)
> +{
> +#ifdef CONFIG_SMP
> +	struct rq *rq = this_rq();
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&rq->lock, flags);
> +	BUG_ON(!rq->busy_polling); /* not paired with prepare() */
> +	rq->busy_polling--;
> +	raw_spin_unlock_irqrestore(&rq->lock, flags);
> +#endif

Again, please explain this lock usage.

> +
> +	/*
> +	 * preemption needs to be kept disabled between prepare_to_busy_poll()
> +	 * and end_busy_poll().
> +	 */
> +	BUG_ON(preemptible());
> +	if (allow_resched)
> +		preempt_enable();
> +	else
> +		preempt_enable_no_resched();

NAK on @allow_resched
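
preempt_enable_no_resched() here swallows a pending reschedule until
the next preemption point. If the caller wants to block instead of
getting preempted, it can do that explicitly; a sketch, where
poll_event_pending() is a hypothetical recheck of the poll condition:

	end_busy_poll();	/* plain preempt_enable() inside */
	set_current_state(TASK_INTERRUPTIBLE);
	if (!poll_event_pending())
		schedule();
	__set_current_state(TASK_RUNNING);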

> +}
> +EXPORT_SYMBOL(end_busy_poll);

_GPL

> +
>  #ifdef CONFIG_CGROUP_SCHED
>  /* task_group_lock serializes the addition/removal of task groups */
>  static DEFINE_SPINLOCK(task_group_lock);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1a68a0536add..58e525c74cc6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5460,6 +5460,11 @@ static int sched_idle_cpu(int cpu)
>  {
>  	return sched_idle_rq(cpu_rq(cpu));
>  }
> +
> +static int sched_idle_or_polling_cpu(int cpu)
> +{
> +	return sched_idle_cpu(cpu) || polling_cpu(cpu);
> +}
>  #endif
>  
>  /*
> @@ -5880,6 +5885,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
>  	u64 latest_idle_timestamp = 0;
>  	int least_loaded_cpu = this_cpu;
>  	int shallowest_idle_cpu = -1;
> +	int found_polling = 0;
>  	int i;
>  
>  	/* Check if we have any choice: */
> @@ -5914,10 +5920,14 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
>  				shallowest_idle_cpu = i;
>  			}
>  		} else if (shallowest_idle_cpu == -1) {
> +			int polling = polling_cpu(i);
> +
>  			load = cpu_load(cpu_rq(i));
> -			if (load < min_load) {
> +			if ((polling == found_polling && load < min_load) ||
> +			    (polling && !found_polling)) {
>  				min_load = load;
>  				least_loaded_cpu = i;
> +				found_polling = polling;
>  			}
>  		}
>  	}
> @@ -6085,7 +6095,7 @@ static int select_idle_smt(struct task_struct *p, int target)
>  	for_each_cpu(cpu, cpu_smt_mask(target)) {
>  		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
>  			continue;
> -		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +		if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
>  			return cpu;
>  	}
>  
> @@ -6149,7 +6159,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>  	for_each_cpu_wrap(cpu, cpus, target) {
>  		if (!--nr)
>  			return -1;
> -		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +		if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
>  			break;
>  	}
>  
> @@ -6179,7 +6189,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  	for_each_cpu_wrap(cpu, cpus, target) {
>  		unsigned long cpu_cap = capacity_of(cpu);
>  
> -		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> +		if (!available_idle_cpu(cpu) && !sched_idle_or_polling_cpu(cpu))
>  			continue;
>  		if (task_fits_capacity(p, cpu_cap))
>  			return cpu;
> @@ -6223,14 +6233,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	}
>  
>  symmetric:
> -	if (available_idle_cpu(target) || sched_idle_cpu(target))
> +	if (available_idle_cpu(target) || sched_idle_or_polling_cpu(target))
>  		return target;
>  
>  	/*
>  	 * If the previous CPU is cache affine and idle, don't be stupid:
>  	 */
>  	if (prev != target && cpus_share_cache(prev, target) &&
> -	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
> +	    (available_idle_cpu(prev) || sched_idle_or_polling_cpu(prev)))
>  		return prev;
>  
>  	/*
> @@ -6252,7 +6262,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	if (recent_used_cpu != prev &&
>  	    recent_used_cpu != target &&
>  	    cpus_share_cache(recent_used_cpu, target) &&
> -	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> +	    (available_idle_cpu(recent_used_cpu) ||
> +	     sched_idle_or_polling_cpu(recent_used_cpu)) &&
>  	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
>  		/*
>  		 * Replace recent_used_cpu with prev as it is a potential


None of this affects load-tracking.
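
Also, polling_cpu() isn't in the quoted hunks; presumably it just reads
the new rq field, something like:

	static inline int polling_cpu(int cpu)
	{
	#ifdef CONFIG_SMP
		return READ_ONCE(cpu_rq(cpu)->busy_polling);
	#else
		return 0;
	#endif
	}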

> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 28709f6b0975..45de468d0ffb 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1003,6 +1003,8 @@ struct rq {
>  
>  	/* This is used to determine avg_idle's max value */
>  	u64			max_idle_balance_cost;
> +
> +	unsigned int		busy_polling;

This is a good location, cache-wise, because?

>  #endif /* CONFIG_SMP */
>  
>  #ifdef CONFIG_IRQ_TIME_ACCOUNTING


Thread overview: 9+ messages
2020-10-23  3:29 [PATCH 1/3] sched: better handling for busy polling loops Josh Don
2020-10-23  3:29 ` [PATCH 2/3] kvm: better handling for kvm halt polling Josh Don
2020-10-23  3:29 ` [PATCH 3/3] net: better handling for network busy poll Josh Don
2020-10-23  7:19 ` Peter Zijlstra [this message]
2020-10-23  9:33   ` [PATCH 1/3] sched: better handling for busy polling loops Paolo Bonzini
2020-10-27 23:58   ` Josh Don
2020-10-23  7:33 ` Vincent Guittot
2020-10-23 17:48 ` Jakub Kicinski
2020-10-28  0:29   ` Josh Don
