Re: [PATCH v11 3/6] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

From: Frederic Weisbecker <frederic@kernel.org>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: atomlin@atomlin.com, cl@linux.com, tglx@linutronix.de,
	mingo@kernel.org, peterz@infradead.org, pauld@redhat.com,
	neelx@redhat.com, oleksandr@natalenko.name,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: Re: [PATCH v11 3/6] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full
Date: Fri, 23 Dec 2022 15:41:50 +0100	[thread overview]
Message-ID: <20221223144150.GA79369@lothringen> (raw)
In-Reply-To: <20221221170436.330627967@redhat.com>

On Wed, Dec 21, 2022 at 01:58:04PM -0300, Marcelo Tosatti wrote:
> @@ -194,21 +195,50 @@ void fold_vm_numa_events(void)
>  #endif
>  
>  #ifdef CONFIG_SMP
> -static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
> +
> +struct vmstat_dirty {
> +	bool dirty;
> +	bool cpuhotplug;

Maybe call it "online" for clarity. Also, should it depend on CONFIG_FLUSH_WORK_ON_RESUME_USER?

> +};
> +
> +static DEFINE_PER_CPU_ALIGNED(struct vmstat_dirty, vmstat_dirty_pcpu);
> +static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
> +int sysctl_stat_interval __read_mostly = HZ;
>  
>  static inline void vmstat_mark_dirty(void)
>  {
> -	this_cpu_write(vmstat_dirty, true);
> +	struct vmstat_dirty *vms = this_cpu_ptr(&vmstat_dirty_pcpu);
> +
> +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER

Please avoid ifdeffery in the middle of a function when possible.
This block could be in a different function or use IS_ENABLED()
for example.

> +	int cpu = smp_processor_id();
> +
> +	if (tick_nohz_full_cpu(cpu) && !vms->dirty) {
> +		struct delayed_work *dw;
> +
> +		dw = this_cpu_ptr(&vmstat_work);
> +		if (!delayed_work_pending(dw) && !vms->cpuhotplug) {
> +			unsigned long delay;
> +
> +			delay = round_jiffies_relative(sysctl_stat_interval);
> +			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> +		}
> +	}
> +#endif
> +	vms->dirty = true;
>  }
>  
>  static inline void vmstat_clear_dirty(void)
>  {
> -	this_cpu_write(vmstat_dirty, false);
> +	struct vmstat_dirty *vms = this_cpu_ptr(&vmstat_dirty_pcpu);
> +
> +	vms->dirty = false;

You could keep this_cpu_write() here, e.g. this_cpu_write(vmstat_dirty_pcpu.dirty, false).

>  }
>  
>  static inline bool is_vmstat_dirty(void)
>  {
> -	return this_cpu_read(vmstat_dirty);
> +	struct vmstat_dirty *vms = this_cpu_ptr(&vmstat_dirty_pcpu);
> +
> +	return vms->dirty;

Ditto with this_cpu_read()?

>  }
>  
>  int calculate_pressure_threshold(struct zone *zone)
> @@ -1981,13 +2008,18 @@ void quiet_vmstat(void)
>  	if (!is_vmstat_dirty())
>  		return;
>  
> +	refresh_cpu_vm_stats(false);
> +
> +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER

This can use IS_ENABLED()

> +	if (!user)
> +		return;
>  	/*
> -	 * Just refresh counters and do not care about the pending delayed
> -	 * vmstat_update. It doesn't fire that often to matter and canceling
> -	 * it would be too expensive from this path.
> -	 * vmstat_shepherd will take care about that for us.
> +	 * If the tick is stopped, cancel any delayed work to avoid
> +	 * interruptions to this CPU in the future.
>  	 */
> -	refresh_cpu_vm_stats(false);
> +	if (delayed_work_pending(this_cpu_ptr(&vmstat_work)))
> +		cancel_delayed_work(this_cpu_ptr(&vmstat_work));
> +#endif
>  }
>  
>  /*
> @@ -2008,8 +2040,15 @@ static void vmstat_shepherd(struct work_
>  	/* Check processors whose vmstat worker threads have been disabled */
>  	for_each_online_cpu(cpu) {
>  		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> +		struct vmstat_dirty *vms = per_cpu_ptr(&vmstat_dirty_pcpu, cpu);
>  
> -		if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
> +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER

Same here.

> +		/* NOHZ full CPUs manage their own vmstat flushing */
> +		if (tick_nohz_full_cpu(cpu))
> +			continue;
> +#endif
> +
> +		if (!delayed_work_pending(dw) && vms->dirty)
>  			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
>  
>  		cond_resched();
> @@ -2053,8 +2111,15 @@ static int vmstat_cpu_online(unsigned in
>  	return 0;
>  }
>  
> +/*
> + * ONLINE: The callbacks are invoked on the hotplugged CPU from the per CPU
> + * hotplug thread with interrupts and preemption enabled.

This is the OFFLINE (down_prep) callback, not ONLINE, and the reason behind that comment is confusing.

> + */
>  static int vmstat_cpu_down_prep(unsigned int cpu)
>  {
> +	struct vmstat_dirty *vms = per_cpu_ptr(&vmstat_dirty_pcpu, cpu);
> +
> +	vms->cpuhotplug = true;

this_cpu_write()?

>  	cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
>  	return 0;
>  }
> +config FLUSH_WORK_ON_RESUME_USER
> +	bool "Flush per-CPU vmstats on user return (for nohz full CPUs)"
> +	depends on NO_HZ_FULL
> +	default y
> +
> +	help
> +	  By default, nohz full CPUs flush per-CPU vm statistics on return
> +	  to userspace (to avoid additional interferences when executing
> +	  userspace code). This has a small but measurable impact on
> +	  system call performance. You can disable this to improve system call
> +	  performance, at the expense of potential interferences to userspace
> +	  execution.

Can you move that below config CPU_ISOLATION?

Thanks!

> +
>  # multi-gen LRU {
>  config LRU_GEN
>  	bool "Multi-Gen LRU"
> 
>