Re: [PATCH v12 4/5] arm64, numa: Add NUMA support for arm64 platforms.

From: Will Deacon <will.deacon@arm.com>
To: David Daney <ddaney.cavm@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org,
	Rob Herring <robh+dt@kernel.org>, Pawel Moll <pawel.moll@arm.com>,
	Mark Rutland <mark.rutland@arm.com>,
	Ian Campbell <ijc+devicetree@hellion.org.uk>,
	Kumar Gala <galak@codeaurora.org>,
	devicetree@vger.kernel.org,
	Ard Biesheuvel <ard.biesheuvel@linaro.org>,
	Frank Rowand <frowand.list@gmail.com>,
	Grant Likely <grant.likely@linaro.org>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Matt Fleming <matt@codeblueprint.co.uk>,
	linux-efi@vger.kernel.org,
	Ganapatrao Kulkarni <gkulkarni@caviumnetworks.com>,
	Robert Richter <rrichter@cavium.com>,
	linux-kernel@vger.kernel.org,
	David Daney <david.daney@cavium.com>
Subject: Re: [PATCH v12 4/5] arm64, numa: Add NUMA support for arm64 platforms.
Date: Fri, 26 Feb 2016 18:53:41 +0000	[thread overview]
Message-ID: <20160226185341.GN29125@arm.com> (raw)
In-Reply-To: <1456192703-2274-5-git-send-email-ddaney.cavm@gmail.com>

On Mon, Feb 22, 2016 at 05:58:22PM -0800, David Daney wrote:
> From: Ganapatrao Kulkarni <gkulkarni@caviumnetworks.com>
> 
> Attempt to get the memory and CPU NUMA node via of_numa.  If that
> fails, default the dummy NUMA node and map all memory and CPUs to node
> 0.
> 
> Tested-by: Shannon Zhao <shannon.zhao@linaro.org>
> Reviewed-by: Robert Richter <rrichter@cavium.com>
> Signed-off-by: Ganapatrao Kulkarni <gkulkarni@caviumnetworks.com>
> Signed-off-by: David Daney <david.daney@cavium.com>

[...]

> @@ -133,10 +151,15 @@ static void __init arm64_memory_present(void)
>  static void __init arm64_memory_present(void)
>  {
>  	struct memblock_region *reg;
> +	int nid = 0;
>  
> -	for_each_memblock(memory, reg)
> -		memory_present(0, memblock_region_memory_base_pfn(reg),
> -			       memblock_region_memory_end_pfn(reg));
> +	for_each_memblock(memory, reg) {
> +#ifdef CONFIG_NUMA
> +		nid = reg->nid;
> +#endif
> +		memory_present(nid, memblock_region_memory_base_pfn(reg),
> +				memblock_region_memory_end_pfn(reg));
> +	}
>  }
>  #endif
>  
> @@ -181,7 +204,6 @@ void __init arm64_memblock_init(void)
>  	dma_contiguous_reserve(arm64_dma_phys_limit);
>  
>  	memblock_allow_resize();
> -	memblock_dump_all();
>  }
>  
>  void __init bootmem_init(void)
> @@ -193,6 +215,9 @@ void __init bootmem_init(void)
>  
>  	early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
>  
> +	max_pfn = max_low_pfn = max;
> +
> +	arm64_numa_init();
>  	/*
>  	 * Sparsemem tries to allocate bootmem in memory_present(), so must be
>  	 * done after the fixed reservations.
> @@ -203,7 +228,6 @@ void __init bootmem_init(void)
>  	zone_sizes_init(min, max);
>  
>  	high_memory = __va((max << PAGE_SHIFT) - 1) + 1;
> -	max_pfn = max_low_pfn = max;
>  }
>  
>  #ifndef CONFIG_SPARSEMEM_VMEMMAP
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 58faeaa..44e3854 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -463,6 +463,7 @@ void __init paging_init(void)
>  	zero_page = early_alloc(PAGE_SIZE);
>  
>  	bootmem_init();
> +	memblock_dump_all();
>  
>  	empty_zero_page = virt_to_page(zero_page);
>  
> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
> new file mode 100644
> index 0000000..604e886
> --- /dev/null
> +++ b/arch/arm64/mm/numa.c
> @@ -0,0 +1,403 @@
> +/*
> + * NUMA support, based on the x86 implementation.
> + *
> + * Copyright (C) 2015 Cavium Inc.
> + * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/bootmem.h>
> +#include <linux/memblock.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +
> +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
> +EXPORT_SYMBOL(node_data);
> +nodemask_t numa_nodes_parsed __initdata;
> +static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
> +
> +static int numa_off;
> +static int numa_distance_cnt;
> +static u8 *numa_distance;
> +
> +static __init int numa_parse_early_param(char *opt)
> +{
> +	if (!opt)
> +		return -EINVAL;
> +	if (!strncmp(opt, "off", 3)) {
> +		pr_info("%s\n", "NUMA turned off");
> +		numa_off = 1;
> +	}
> +	return 0;
> +}
> +early_param("numa", numa_parse_early_param);

Curious, but when is this option actually useful?

> +
> +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
> +EXPORT_SYMBOL(node_to_cpumask_map);
> +
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +
> +/*
> + * Returns a pointer to the bitmask of CPUs on Node 'node'.
> + */
> +const struct cpumask *cpumask_of_node(int node)
> +{
> +	if (WARN_ON(node >= nr_node_ids))
> +		return cpu_none_mask;
> +
> +	if (WARN_ON(node_to_cpumask_map[node] == NULL))
> +		return cpu_online_mask;
> +
> +	return node_to_cpumask_map[node];
> +}
> +EXPORT_SYMBOL(cpumask_of_node);
> +
> +#endif
> +
> +static void map_cpu_to_node(unsigned int cpu, int nid)
> +{
> +	set_cpu_numa_node(cpu, nid);
> +	if (nid >= 0)
> +		cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
> +}
> +
> +static void unmap_cpu_to_node(unsigned int cpu)
> +{
> +	int nid = cpu_to_node(cpu);
> +
> +	if (nid >= 0)
> +		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
> +	set_cpu_numa_node(cpu, NUMA_NO_NODE);
> +}

How do you end up with negative nids this late in the game?

> +
> +void numa_clear_node(unsigned int cpu)
> +{
> +	unmap_cpu_to_node(cpu);

Why don't you just inline this function?

> +}
> +
> +/*
> + * Allocate node_to_cpumask_map based on number of available nodes
> + * Requires node_possible_map to be valid.
> + *
> + * Note: cpumask_of_node() is not valid until after this is done.
> + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
> + */
> +static void __init setup_node_to_cpumask_map(void)
> +{
> +	unsigned int cpu;
> +	int node;
> +
> +	/* setup nr_node_ids if not done yet */
> +	if (nr_node_ids == MAX_NUMNODES)
> +		setup_nr_node_ids();
> +
> +	/* allocate and clear the mapping */
> +	for (node = 0; node < nr_node_ids; node++) {
> +		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
> +		cpumask_clear(node_to_cpumask_map[node]);
> +	}
> +
> +	for_each_possible_cpu(cpu)
> +		set_cpu_numa_node(cpu, NUMA_NO_NODE);
> +
> +	/* cpumask_of_node() will now work */
> +	pr_debug("NUMA: Node to cpumask map for %d nodes\n", nr_node_ids);
> +}
> +
> +/*
> + *  Set the cpu to node and mem mapping
> + */
> +void numa_store_cpu_info(unsigned int cpu)
> +{
> +	map_cpu_to_node(cpu, numa_off ? 0 : cpu_to_node_map[cpu]);
> +}
> +
> +void __init early_map_cpu_to_node(unsigned int cpu, int nid)
> +{
> +	/* fallback to node 0 */
> +	if (nid < 0 || nid >= MAX_NUMNODES)
> +		nid = 0;
> +
> +	cpu_to_node_map[cpu] = nid;
> +}
> +
> +/**
> + * numa_add_memblk - Set node id to memblk
> + * @nid: NUMA node ID of the new memblk
> + * @start: Start address of the new memblk
> + * @size:  Size of the new memblk
> + *
> + * RETURNS:
> + * 0 on success, -errno on failure.
> + */
> +int __init numa_add_memblk(int nid, u64 start, u64 size)
> +{
> +	int ret;
> +
> +	ret = memblock_set_node(start, size, &memblock.memory, nid);
> +	if (ret < 0) {
> +		pr_err("NUMA: memblock [0x%llx - 0x%llx] failed to add on node %d\n",
> +			start, (start + size - 1), nid);
> +		return ret;
> +	}
> +
> +	node_set(nid, numa_nodes_parsed);
> +	pr_info("NUMA: Adding memblock [0x%llx - 0x%llx] on node %d\n",
> +			start, (start + size - 1), nid);
> +	return ret;
> +}
> +EXPORT_SYMBOL(numa_add_memblk);

But this is marked __init... (and you've done this elsewhere in the patch
too).

Will