Re: [PATCH v8 4/6] lib/dlock-list: Make sibling CPUs share the same linked list

From: Jan Kara <jack@suse.cz>
To: Waiman Long <longman@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
	Jan Kara <jack@suse.com>, Jeff Layton <jlayton@poochiereds.net>,
	"J. Bruce Fields" <bfields@fieldses.org>,
	Tejun Heo <tj@kernel.org>,
	Christoph Lameter <cl@linux-foundation.org>,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Andi Kleen <andi@firstfloor.org>,
	Dave Chinner <dchinner@redhat.com>,
	Boqun Feng <boqun.feng@gmail.com>,
	Davidlohr Bueso <dave@stgolabs.net>
Subject: Re: [PATCH v8 4/6] lib/dlock-list: Make sibling CPUs share the same linked list
Date: Wed, 1 Nov 2017 09:38:19 +0100	[thread overview]
Message-ID: <20171101083819.GK26128@quack2.suse.cz> (raw)
In-Reply-To: <1509475860-16139-5-git-send-email-longman@redhat.com>

On Tue 31-10-17 14:50:58, Waiman Long wrote:
> The dlock list needs one list for each of the CPUs available. However,
> for sibling CPUs, they are sharing the L2 and probably L1 caches
> too. As a result, there is not much to gain in term of avoiding
> cacheline contention while increasing the cacheline footprint of the
> L1/L2 caches as separate lists may need to be in the cache.
> 
> This patch makes all the sibling CPUs share the same list, thus
> reducing the number of lists that need to be maintained in each
> dlock list without having any noticeable impact on performance. It
> also improves dlock list iteration performance as fewer lists need
> to be iterated.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>

The patch looks good to me. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  lib/dlock-list.c | 74 ++++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 59 insertions(+), 15 deletions(-)
> 
> diff --git a/lib/dlock-list.c b/lib/dlock-list.c
> index 17ced06..a4ddecc 100644
> --- a/lib/dlock-list.c
> +++ b/lib/dlock-list.c
> @@ -25,31 +25,65 @@
>   * The distributed and locked list is a distributed set of lists each of
>   * which is protected by its own spinlock, but acts like a single
>   * consolidated list to the callers. For scaling purpose, the number of
> - * lists used is equal to the number of possible CPUs in the system to
> - * minimize contention.
> + * lists used is equal to the number of possible cores in the system to
> + * minimize contention. All threads of the same CPU core will share the
> + * same list.
>   *
> - * However, it is possible that individual CPU numbers may be equal to
> - * or greater than the number of possible CPUs when there are holes in
> - * the CPU number list. As a result, we need to map the CPU number to a
> - * list index.
> + * We need to map each CPU number to a list index.
>   */
>  static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx);
> +static int nr_dlock_lists __read_mostly;
>  
>  /*
> - * Initialize cpu2idx mapping table
> + * Initialize cpu2idx mapping table & nr_dlock_lists.
>   *
>   * It is possible that a dlock-list can be allocated before the cpu2idx is
>   * initialized. In this case, all the cpus are mapped to the first entry
>   * before initialization.
>   *
> + * All the sibling CPUs of a sibling group will map to the same dlock list so
> + * as to reduce the number of dlock lists to be maintained while minimizing
> + * cacheline contention.
> + *
> + * As the sibling masks are set up in the core initcall phase, this function
> + * has to be done in the postcore phase to get the right data.
>   */
>  static int __init cpu2idx_init(void)
>  {
>  	int idx, cpu;
> +	struct cpumask *sibling_mask;
> +	static struct cpumask mask __initdata;
>  
> +	cpumask_clear(&mask);
>  	idx = 0;
> -	for_each_possible_cpu(cpu)
> -		per_cpu(cpu2idx, cpu) = idx++;
> +	for_each_possible_cpu(cpu) {
> +		int scpu;
> +
> +		if (cpumask_test_cpu(cpu, &mask))
> +			continue;
> +		per_cpu(cpu2idx, cpu) = idx;
> +		cpumask_set_cpu(cpu, &mask);
> +
> +		sibling_mask = topology_sibling_cpumask(cpu);
> +		if (sibling_mask) {
> +			for_each_cpu(scpu, sibling_mask) {
> +				per_cpu(cpu2idx, scpu) = idx;
> +				cpumask_set_cpu(scpu, &mask);
> +			}
> +		}
> +		idx++;
> +	}
> +
> +	/*
> +	 * nr_dlock_lists can only be set after cpu2idx is properly
> +	 * initialized.
> +	 */
> +	smp_mb();
> +	nr_dlock_lists = idx;
> +	WARN_ON(nr_dlock_lists > nr_cpu_ids);
> +
> +	pr_info("dlock-list: %d head entries per dlock list.\n",
> +		nr_dlock_lists);
>  	return 0;
>  }
>  postcore_initcall(cpu2idx_init);
> @@ -67,19 +101,23 @@ static int __init cpu2idx_init(void)
>   *
>   * Dynamically allocated locks need to have their own special lock class
>   * to avoid lockdep warning.
> + *
> + * Since nr_dlock_lists will always be <= nr_cpu_ids, having more lists
> + * than necessary allocated is not a problem other than some wasted memory.
> + * The extra lists will not be ever used as all the cpu2idx entries will be
> + * 0 before initialization.
>   */
>  int __alloc_dlock_list_heads(struct dlock_list_heads *dlist,
>  			     struct lock_class_key *key)
>  {
> -	int idx;
> +	int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;
>  
> -	dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head),
> -			       GFP_KERNEL);
> +	dlist->heads = kcalloc(cnt, sizeof(struct dlock_list_head), GFP_KERNEL);
>  
>  	if (!dlist->heads)
>  		return -ENOMEM;
>  
> -	for (idx = 0; idx < nr_cpu_ids; idx++) {
> +	for (idx = 0; idx < cnt; idx++) {
>  		struct dlock_list_head *head = &dlist->heads[idx];
>  
>  		INIT_LIST_HEAD(&head->list);
> @@ -117,7 +155,10 @@ bool dlock_lists_empty(struct dlock_list_heads *dlist)
>  {
>  	int idx;
>  
> -	for (idx = 0; idx < nr_cpu_ids; idx++)
> +	/* Shouldn't be called before nr_dlock_lists is initialized */
> +	WARN_ON_ONCE(!nr_dlock_lists);
> +
> +	for (idx = 0; idx < nr_dlock_lists; idx++)
>  		if (!list_empty(&dlist->heads[idx].list))
>  			return false;
>  	return true;
> @@ -199,6 +240,9 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
>  	struct dlock_list_node *next;
>  	struct dlock_list_head *head;
>  
> +	/* Shouldn't be called before nr_dlock_lists is initialized */
> +	WARN_ON_ONCE(!nr_dlock_lists);
> +
>  restart:
>  	if (iter->entry) {
>  		spin_unlock(&iter->entry->lock);
> @@ -209,7 +253,7 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
>  	/*
>  	 * Try next list
>  	 */
> -	if (++iter->index >= nr_cpu_ids)
> +	if (++iter->index >= nr_dlock_lists)
>  		return NULL;	/* All the entries iterated */
>  
>  	if (list_empty(&iter->head[iter->index].list))
> -- 
> 1.8.3.1
> 
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR