Re: [PATCH 1/3] powerpc/smp: Parse ibm,thread-groups with multiple properties

From: Gautham R Shenoy <ego@linux.vnet.ibm.com>
To: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>,
	"Gautham R. Shenoy" <ego@linux.vnet.ibm.com>,
	Michael Neuling <mikey@neuling.org>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Peter Zijlstra <peterz@infradead.org>,
	linux-kernel@vger.kernel.org, Nicholas Piggin <npiggin@gmail.com>,
	linuxppc-dev@lists.ozlabs.org,
	Valentin Schneider <valentin.schneider@arm.com>
Subject: Re: [PATCH 1/3] powerpc/smp: Parse ibm,thread-groups with multiple properties
Date: Tue, 8 Dec 2020 22:55:40 +0530	[thread overview]
Message-ID: <20201208172540.GA14206@in.ibm.com> (raw)
In-Reply-To: <20201207121042.GH528281@linux.vnet.ibm.com>

Hello Srikar,

Thanks for taking a look at the patch.

On Mon, Dec 07, 2020 at 05:40:42PM +0530, Srikar Dronamraju wrote:
> * Gautham R. Shenoy <ego@linux.vnet.ibm.com> [2020-12-04 10:18:45]:
> 
> > From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> 
> <snipped>
> 
> > 
> >  static int parse_thread_groups(struct device_node *dn,
> > -			       struct thread_groups *tg,
> > -			       unsigned int property)
> > +			       struct thread_groups_list *tglp)
> >  {
> > -	int i;
> > -	u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE];
> > +	int i = 0;
> > +	u32 *thread_group_array;
> >  	u32 *thread_list;
> >  	size_t total_threads;
> > -	int ret;
> > +	int ret = 0, count;
> > +	unsigned int property_idx = 0;
> 
> NIT:
> tglx mentions in one of his recent comments to try keep a reverse fir tree
> ordering of variables where possible.

I suppose you mean moving the longer local variable declarations to to
the top and shorter ones to the bottom. Thanks. Will fix this.

> 
> > 
> > +	count = of_property_count_u32_elems(dn, "ibm,thread-groups");
> > +	thread_group_array = kcalloc(count, sizeof(u32), GFP_KERNEL);
> >  	ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > -					 thread_group_array, 3);
> > +					 thread_group_array, count);
> >  	if (ret)
> > -		return ret;
> > -
> > -	tg->property = thread_group_array[0];
> > -	tg->nr_groups = thread_group_array[1];
> > -	tg->threads_per_group = thread_group_array[2];
> > -	if (tg->property != property ||
> > -	    tg->nr_groups < 1 ||
> > -	    tg->threads_per_group < 1)
> > -		return -ENODATA;
> > +		goto out_free;
> > 
> > -	total_threads = tg->nr_groups * tg->threads_per_group;
> > +	while (i < count && property_idx < MAX_THREAD_GROUP_PROPERTIES) {
> > +		int j;
> > +		struct thread_groups *tg = &tglp->property_tgs[property_idx++];
> 
> NIT: same as above.

Ok.
> 
> > 
> > -	ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > -					 thread_group_array,
> > -					 3 + total_threads);
> > -	if (ret)
> > -		return ret;
> > +		tg->property = thread_group_array[i];
> > +		tg->nr_groups = thread_group_array[i + 1];
> > +		tg->threads_per_group = thread_group_array[i + 2];
> > +		total_threads = tg->nr_groups * tg->threads_per_group;
> > +
> > +		thread_list = &thread_group_array[i + 3];
> > 
> > -	thread_list = &thread_group_array[3];
> > +		for (j = 0; j < total_threads; j++)
> > +			tg->thread_list[j] = thread_list[j];
> > +		i = i + 3 + total_threads;
> 
> 	Can't we simply use memcpy instead?

We could. But this one makes it more explicit.

> 
> > +	}
> > 
> > -	for (i = 0 ; i < total_threads; i++)
> > -		tg->thread_list[i] = thread_list[i];
> > +	tglp->nr_properties = property_idx;
> > 
> > -	return 0;
> > +out_free:
> > +	kfree(thread_group_array);
> > +	return ret;
> >  }
> > 
> >  /*
> > @@ -805,24 +827,39 @@ static int get_cpu_thread_group_start(int cpu, struct thread_groups *tg)
> >  	return -1;
> >  }
> > 
> > -static int init_cpu_l1_cache_map(int cpu)
> > +static int init_cpu_cache_map(int cpu, unsigned int cache_property)
> > 
> >  {
> >  	struct device_node *dn = of_get_cpu_node(cpu, NULL);
> > -	struct thread_groups tg = {.property = 0,
> > -				   .nr_groups = 0,
> > -				   .threads_per_group = 0};
> > +	struct thread_groups *tg = NULL;
> >  	int first_thread = cpu_first_thread_sibling(cpu);
> >  	int i, cpu_group_start = -1, err = 0;
> > +	cpumask_var_t *mask;
> > +	struct thread_groups_list *cpu_tgl = &tgl[cpu];
> 
> NIT: same as 1st comment.

Sure, will fix this.

> 
> > 
> >  	if (!dn)
> >  		return -ENODATA;
> > 
> > -	err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1);
> > -	if (err)
> > -		goto out;
> > +	if (!(cache_property == THREAD_GROUP_SHARE_L1))
> > +		return -EINVAL;
> > 
> > -	cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
> > +	if (!cpu_tgl->nr_properties) {
> > +		err = parse_thread_groups(dn, cpu_tgl);
> > +		if (err)
> > +			goto out;
> > +	}
> > +
> > +	for (i = 0; i < cpu_tgl->nr_properties; i++) {
> > +		if (cpu_tgl->property_tgs[i].property == cache_property) {
> > +			tg = &cpu_tgl->property_tgs[i];
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (!tg)
> > +		return -EINVAL;
> > +
> > +	cpu_group_start = get_cpu_thread_group_start(cpu, tg);
> 
> This whole hunk should be moved to a new function and called before
> init_cpu_cache_map. It will simplify the logic to great extent.

I suppose you are referring to the part where we select the correct
tg. Yeah, that can move to a different helper.

> 
> > 
> >  	if (unlikely(cpu_group_start == -1)) {
> >  		WARN_ON_ONCE(1);
> > @@ -830,11 +867,12 @@ static int init_cpu_l1_cache_map(int cpu)
> >  		goto out;
> >  	}
> > 
> > -	zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> > -				GFP_KERNEL, cpu_to_node(cpu));
> > +	mask = &per_cpu(cpu_l1_cache_map, cpu);
> > +
> > +	zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
> > 
> 
> This hunk (and the next hunk) should be moved to next patch.
>

The next patch is only about introducing  THREAD_GROUP_SHARE_L2. Hence
I put in any other code in this patch, since it seems to be a logical
place to collate whatever we have in a generic form.

> >  	for (i = first_thread; i < first_thread + threads_per_core; i++) {
> > -		int i_group_start = get_cpu_thread_group_start(i, &tg);
> > +		int i_group_start = get_cpu_thread_group_start(i, tg);
> > 
> >  		if (unlikely(i_group_start == -1)) {
> >  			WARN_ON_ONCE(1);
> > @@ -843,7 +881,7 @@ static int init_cpu_l1_cache_map(int cpu)
> >  		}
> > 
> >  		if (i_group_start == cpu_group_start)
> > -			cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu));
> > +			cpumask_set_cpu(i, *mask);
> >  	}
> > 
> >  out:
> > @@ -924,7 +962,7 @@ static int init_big_cores(void)
> >  	int cpu;
> > 
> >  	for_each_possible_cpu(cpu) {
> > -		int err = init_cpu_l1_cache_map(cpu);
> > +		int err = init_cpu_cache_map(cpu, THREAD_GROUP_SHARE_L1);
> > 
> >  		if (err)
> >  			return err;
> > -- 
> > 1.9.4
> > 
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju