On Mon, Jul 11, 2016 at 01:32:11PM -0400, Waiman Long wrote:
> The percpu APIs are extensively used in the Linux kernel to reduce
> cacheline contention and improve performance. For some use cases, the
> percpu APIs may be too fine-grained for distributed resources, whereas
> a per-node allocation may be too coarse, as there can be dozens of
> CPUs in a NUMA node on some high-end systems.
>
> This patch introduces simple per-subnode APIs where each of the
> distributed resources will be shared by only a handful of CPUs within
> a NUMA node. The per-subnode APIs are built on top of the percpu APIs
> and hence require the same amount of memory as if the percpu APIs
> were used. However, they help to reduce the total number of separate
> resources that need to be managed. As a result, they can speed up
> code that needs to iterate over all the resources compared with using
> the percpu APIs. Cacheline contention, however, will increase
> slightly as each resource is shared by more than one CPU. As long as
> the number of CPUs in each subnode is small, the performance impact
> won't be significant.
>
> In this patch, at most 2 sibling groups can be put into a subnode. For
> an x86-64 CPU, at most 4 CPUs will be in a subnode when HT is enabled
> and 2 when it is not.
>
> Signed-off-by: Waiman Long
> ---
>  include/linux/persubnode.h |   80 +++++++++++++++++++++++++++++
>  init/main.c                |    2 +
>  lib/Makefile               |    2 +
>  lib/persubnode.c           |  119 ++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 203 insertions(+), 0 deletions(-)
>  create mode 100644 include/linux/persubnode.h
>  create mode 100644 lib/persubnode.c
>
> diff --git a/include/linux/persubnode.h b/include/linux/persubnode.h
> new file mode 100644
> index 0000000..b777daa
> --- /dev/null
> +++ b/include/linux/persubnode.h
> @@ -0,0 +1,80 @@
> +/*
> + * Per-subnode definitions
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
> + *
> + * Authors: Waiman Long
> + */
> +#ifndef __LINUX_PERSUBNODE_H
> +#define __LINUX_PERSUBNODE_H
> +
> +#include <linux/cpumask.h>
> +#include <linux/percpu.h>
> +
> +/*
> + * Per-subnode APIs
> + */
> +#define __persubnode			__percpu
> +#define nr_subnode_ids			nr_cpu_ids
> +#define alloc_persubnode(type)		alloc_percpu(type)
> +#define free_persubnode(var)		free_percpu(var)
> +#define for_each_subnode(snode)		for_each_cpu(snode, subnode_mask)
> +#define per_subnode_ptr(ptr, subnode)	per_cpu_ptr(ptr, subnode)
> +#define per_subnode(var, subnode)	per_cpu(var, subnode)
> +
> +#ifdef CONFIG_SMP
> +
> +extern struct cpumask __subnode_mask __read_mostly;
> +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_subnode_id);
> +
> +#define subnode_mask	(&__subnode_mask)
> +
> +static inline int this_cpu_to_subnode(void)
> +{
> +	return *this_cpu_ptr(&cpu_subnode_id);
> +}
> +
> +/*
> + * For safety, preemption should be disabled before using this_subnode_ptr().
> + */
> +#define this_subnode_ptr(ptr)			\
> +({						\
> +	int _snid = this_cpu_to_subnode();	\
> +	per_cpu_ptr(ptr, _snid);		\
> +})
> +
> +#define get_subnode_ptr(ptr)			\
> +({						\
> +	preempt_disable();			\
> +	this_subnode_ptr(ptr);			\
> +})
> +
> +#define put_subnode_ptr(ptr)			\
> +do {						\
> +	(void)(ptr);				\
> +	preempt_enable();			\
> +} while (0)
> +
> +extern void __init subnode_early_init(void);
> +
> +#else /* CONFIG_SMP */
> +
> +#define subnode_mask		cpu_possible_mask
> +#define this_subnode_ptr(ptr)	this_cpu_ptr(ptr)
> +#define get_subnode_ptr(ptr)	get_cpu_ptr(ptr)
> +#define put_subnode_ptr(ptr)	put_cpu_ptr(ptr)
> +
> +static inline void subnode_early_init(void) { }
> +
> +#endif /* CONFIG_SMP */
> +#endif /* __LINUX_PERSUBNODE_H */
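To check that I understand the intended usage, here is what a
hypothetical per-subnode event counter built on these APIs might look
like (untested; the 'snode_stat' names are made up for illustration,
and <linux/atomic.h> is assumed):

	struct snode_stat {
		atomic_long_t events;	/* shared by every CPU in the subnode */
	};

	static struct snode_stat __persubnode *stats;

	static int __init stat_init(void)
	{
		stats = alloc_persubnode(struct snode_stat);
		return stats ? 0 : -ENOMEM;
	}

	static void stat_inc(void)
	{
		struct snode_stat *s = get_subnode_ptr(stats);

		/* atomic, since several CPUs may share this slot */
		atomic_long_inc(&s->events);
		put_subnode_ptr(s);
	}

	static long stat_sum(void)
	{
		long sum = 0;
		int snode;

		/* visits one slot per subnode, not one per possible CPU */
		for_each_subnode(snode)
			sum += atomic_long_read(&per_subnode_ptr(stats,
								 snode)->events);
		return sum;
	}

If that is right, the per-cpu slots of the non-leader CPUs are indeed
allocated but never touched, which matches the "wasted per-cpu memory"
note in the lib/persubnode.c comment below.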
> diff --git a/init/main.c b/init/main.c
> index 4c17fda..28e4425 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -81,6 +81,7 @@
>  #include
>  #include
>  #include
> +#include <linux/persubnode.h>
>
>  #include
>  #include
> @@ -524,6 +525,7 @@ asmlinkage __visible void __init start_kernel(void)
>  					NULL, set_init_arg);
>
>  	jump_label_init();
> +	subnode_early_init();
>
>  	/*
>  	 * These use large bootmem allocations and must precede
> diff --git a/lib/Makefile b/lib/Makefile
> index 92e8c38..440152c 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -232,3 +232,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
>  obj-$(CONFIG_UBSAN) += ubsan.o
>
>  UBSAN_SANITIZE_ubsan.o := n
> +
> +obj-$(CONFIG_SMP) += persubnode.o
> diff --git a/lib/persubnode.c b/lib/persubnode.c
> new file mode 100644
> index 0000000..9febe7c
> --- /dev/null
> +++ b/lib/persubnode.c
> @@ -0,0 +1,119 @@
> +/*
> + * Per-subnode APIs
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
> + *
> + * Authors: Waiman Long
> + */
> +
> +/*
> + * The per-subnode APIs work on top of the per-cpu APIs. Instead of
> + * having to manage n separate resources on an n-cpu system, users can
> + * now manage only n/m separate resources, where m is the number of
> + * CPUs in each subnode. This cuts down the time needed to traverse
> + * all the resources, at the expense of some wasted per-cpu memory,
> + * as part of the per-cpu memory will not be used.
> + *
> + * All the CPUs in the same subnode must come from the same NUMA node.
> + * However, there can be more than one subnode in each NUMA node.
> + *
> + * As the per-subnode APIs can be used early in the bootup process
> + * while not all the information needed for initialization may be
> + * available at that time, the initialization is separated into two
> + * steps:
> + * 1) an early init that is called directly from start_kernel(); and
> + * 2) a postcore init.
> + *
> + * Before initialization, all the subnode IDs of the CPUs will be
> + * zero, so they will all use the same resource at subnode 0. The
> + * early init copies cpu_possible_mask to subnode_mask, causing
> + * resource initialization to be done for all the per-cpu resources
> + * allocated. At the postcore init, some bits of subnode_mask will be
> + * cleared and the corresponding cpu-to-subnode-ID mappings will be
> + * set accordingly.
> + */
> +#include <linux/persubnode.h>
> +
> +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_subnode_id);
> +EXPORT_PER_CPU_SYMBOL(cpu_subnode_id);
> +
> +struct cpumask __subnode_mask __read_mostly;
> +EXPORT_SYMBOL(__subnode_mask);
> +
> +/*
> + * Iterates all the CPUs from the given starting CPU
> + */
> +#define for_each_cpu_from(from, cpu, mask)		\
> +	for ((cpu) = (from); (cpu) < nr_cpu_ids;	\
> +	     (cpu) = cpumask_next((cpu), (mask)))
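One small observation on for_each_cpu_from(): unlike for_each_cpu(),
the first iteration visits @from whether or not @from is set in @mask,
since the loop initializer doesn't consult the mask. If I read it
right, with a mask of {1, 3, 5} and a made-up do_something():

	int cpu;

	for_each_cpu_from(3, cpu, mask)	/* visits 3, then 5 */
		do_something(cpu);

	for_each_cpu_from(2, cpu, mask)	/* visits 2 (!), 3, then 5 */
		do_something(cpu);

That is fine for the callers in this patch, which always start from a
CPU known to be in the mask, but it may be worth a comment.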
> +
> +/*
> + * Early subnode initialization to be called early in the boot process.
> + */
> +void __init subnode_early_init(void)
> +{
> +	/* Make subnode_mask the same as cpu_possible_mask */
> +	cpumask_copy(subnode_mask, cpu_possible_mask);
> +}
> +
> +/*
> + * Initialize the subnodes
> + *
> + * All the sibling CPUs will be in the same subnode. On top of that,
> + * we will put at most 2 sibling groups into the same subnode. The
> + * percpu topology_sibling_cpumask() and topology_core_cpumask() are
> + * used for grouping CPUs into subnodes. The subnode ID is the CPU
> + * number of the first CPU in the subnode.
> + */
> +static int __init subnode_init(void)
> +{
> +	int cpu;
> +	int nr_subnodes = 0;
> +	const int subnode_nr_cpus = 2;
> +
> +	/*
> +	 * Some of the bits in the subnode_mask will be cleared as we
> +	 * proceed.
> +	 */
> +	for_each_cpu(cpu, subnode_mask) {
> +		int ccpu, scpu;
> +		int cpucnt = 0;
> +
> +		cpumask_var_t core_mask = topology_core_cpumask(cpu);
> +		cpumask_var_t sibling_mask;
> +
> +		/*
> +		 * Put subnode_nr_cpus of CPUs and their siblings into
> +		 * each subnode.
> +		 */
> +		for_each_cpu_from(cpu, ccpu, core_mask) {
> +			sibling_mask = topology_sibling_cpumask(ccpu);
> +			for_each_cpu_from(ccpu, scpu, sibling_mask) {
> +				/*
> +				 * Clear the bits of the higher CPUs.
> +				 */
> +				if (scpu > cpu)
> +					cpumask_clear_cpu(scpu, subnode_mask);

Do we also need to clear 'core_mask' here?

Consider a core consisting of two sibling groups, each with two CPUs:
CPUs 0-1 in one group and CPUs 2-3 in the other, so 'core_mask' is
0b1111 at the beginning of the outer loop (for_each_cpu_from(cpu,
ccpu, core_mask)).

At the beginning of the inner loop the first time: 'ccpu' is 0, so
'sibling_mask' is 0b0011, and in this pass we set the
'cpu_subnode_id' of CPUs 0 and 1 to 0.

At the beginning of the inner loop the second time: 'ccpu' is 1,
because we never cleared CPU 1 from 'core_mask'. 'sibling_mask' is
therefore still 0b0011, so in this pass we set the 'cpu_subnode_id'
of CPUs 0 and 1 again, and 'cpucnt' reaches subnode_nr_cpus before
the second sibling group (CPUs 2-3) has been looked at.

Am I missing something here?

Regards,
Boqun

> +				per_cpu(cpu_subnode_id, scpu) = cpu;
> +			}
> +			if (++cpucnt == subnode_nr_cpus)
> +				break;	/* One subnode formed */
> +		}
> +		nr_subnodes++;
> +	}
> +	pr_info("Number of subnodes initialized = %d\n", nr_subnodes);
> +
> +	/*
> +	 * CPU 0 must be mapped to subnode 0
> +	 */
> +	BUG_ON(per_cpu(cpu_subnode_id, 0) != 0);
> +	return 0;
> +}
> +postcore_initcall(subnode_init);
> -- 
> 1.7.1
>
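If the above is indeed a problem, one (untested) way out might be to
drop each processed sibling group from a local scratch copy of the
core mask, so that the outer loop can never land in the same sibling
group twice. 'core_todo' below is a made-up name, and the rest of
subnode_init() stays as is:

	static struct cpumask core_todo __initdata;
	...
		cpumask_copy(&core_todo, topology_core_cpumask(cpu));
		for_each_cpu_from(cpu, ccpu, &core_todo) {
			const struct cpumask *sibling_mask =
					topology_sibling_cpumask(ccpu);

			for_each_cpu_from(ccpu, scpu, sibling_mask) {
				if (scpu > cpu)
					cpumask_clear_cpu(scpu, subnode_mask);
				per_cpu(cpu_subnode_id, scpu) = cpu;
			}
			/* don't visit this sibling group again */
			cpumask_andnot(&core_todo, &core_todo, sibling_mask);
			if (++cpucnt == subnode_nr_cpus)
				break;
		}

Copying also avoids writing to the mask returned by
topology_core_cpumask(), which is shared topology state that other
callers rely on.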